In [5]:
import pandas as pd

In [7]:
# Mention-count table, parsed with pandas' default (comma) delimiter.
m = pd.read_csv(filepath_or_buffer="../../data/twitter_analysis/mentions_count.csv")

# Labeled table: semicolon-delimited and decoded as ISO-8859-1.
ml = pd.read_csv(
    "../../data/twitter_analysis/mentions_count_labeled.csv",
    encoding="ISO-8859-1",
    sep=';',
)

In [18]:
# Rename the columns of the mention-count frame in place so they line up with
# the 'name' / 'mention' keys of the labeled frame.
m.columns = ['name', 'mention', 'count']

# Only the last expression of a cell is rendered, so this two-row preview is
# evaluated and then discarded; the labeled frame below is what gets shown.
m.head(2)
ml

Unnamed: 0,name,mention,label
0,repcummings,elijah e. cummings,1
1,teammoulton,seth moulton,1
2,larryhogan,larry hogan,1
3,repcummings,house oversightdems,0
4,pramilajayapal,pramila jayapal,1
...,...,...,...
296,govsandoval,nevada dot,0
297,repcummings,oversight committee,0
298,repmckinley,paul ryan,0
299,repmeehan,paul ryan,0


In [17]:
# Left-join the labels onto every (name, mention) row of the count table.
# Rows with no matching label receive the sentinel 2, and the column is stored
# as pandas' nullable unsigned 8-bit integer dtype.
m_merged = pd.merge(m, ml, how='left', on=['name', 'mention']).assign(
    label=lambda frame: frame['label'].fillna(2).astype('UInt8')
)

# Positional slice of rows 195..201 for a quick look.
m_merged.iloc[195:202]

Unnamed: 0,name,mention,count,label
195,gopleader,kevin mccarthy,270,2
196,repkclark,katherine clark,270,2
197,repratcliffe,john ratcliffe,268,2
198,senjeffmerkley,senator jeff merkley,267,2
199,repraulgrijalva,progressive caucus,267,2
200,repveasey,rep. marc veasey,266,2
201,governortomwolf,governor tom wolf,265,2


In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import Levenshtein
from fuzzywuzzy import fuzz
import re

def preprocess_text(text):
    """Reduce a Twitter name/handle to a bare lowercase ``a``-``z`` string.

    Args:
        text: Any scalar. Values that ``pd.isna`` reports as missing
            (``None``, ``NaN``, ``pd.NA``) are treated as absent.

    Returns:
        ``""`` for a missing value; otherwise ``str(text)`` lowercased with
        every character outside ``a``-``z`` removed. Digits, punctuation,
        whitespace and non-ASCII letters are all dropped.
    """
    # Stringify first so non-string scalars (e.g. numbers) are handled too.
    return "" if pd.isna(text) else re.sub(r'[^a-z]', '', str(text).lower())

def calculate_similarity_features(name, mention):
    """Compute string-similarity features for one (name, mention) pair.

    Both inputs are first normalized with ``preprocess_text``; every metric is
    computed on the normalized strings.

    Args:
        name: Account name/handle (any scalar accepted by ``preprocess_text``).
        mention: Mention text to compare against ``name``.

    Returns:
        A five-element list, in this order:
        ``[Levenshtein.distance, Levenshtein.ratio, fuzz.token_sort_ratio / 100,
        fuzz.partial_ratio / 100, abs(length difference)]``.
        If either normalized string is empty the result is ``[0, 0, 0, 0, 0]``.
    """
    left = preprocess_text(name)
    right = preprocess_text(mention)

    # Nothing meaningful to compare when either side normalizes to "".
    if not (left and right):
        return [0, 0, 0, 0, 0]

    # NOTE(review): preprocess_text removes whitespace, so the strings given to
    # token_sort_ratio contain no separators -- presumably leaving it nothing to
    # reorder. Confirm whether that is intended.
    return [
        Levenshtein.distance(left, right),
        Levenshtein.ratio(left, right),
        # The fuzz scores are divided by 100 before being returned.
        fuzz.token_sort_ratio(left, right) / 100,
        fuzz.partial_ratio(left, right) / 100,
        abs(len(left) - len(right)),
    ]

def create_features(df):
    """Build the model feature matrix, one row per row of ``df``.

    Args:
        df: DataFrame providing 'name', 'mention' and 'count' columns.

    Returns:
        A new DataFrame (default integer index) whose columns are the five
        values from ``calculate_similarity_features`` followed by the row's
        'count' value.
    """
    columns = ['levenshtein_dist', 'levenshtein_ratio', 'token_sort_ratio',
               'partial_ratio', 'len_diff', 'count']

    # Similarity features first, then the raw count appended as the last feature.
    records = [
        calculate_similarity_features(record['name'], record['mention']) + [record['count']]
        for _, record in df.iterrows()
    ]

    return pd.DataFrame(records, columns=columns)

In [20]:

def train_and_predict(df, unlabeled_label=2):
    """Train a random forest on the labeled rows and score every row of ``df``.

    Args:
        df: DataFrame with 'name', 'mention', 'count' and 'label' columns.
        unlabeled_label: Value in 'label' that marks a row as having no label.
            Defaults to 2. Rows whose label is missing (NA/NaN) are treated
            as unlabeled as well.

    Returns:
        ``(df_results, rf, scaler)`` where ``df_results`` is a copy of ``df``
        with two added columns -- 'predicted_label' and 'confidence' (the
        largest class probability for the row) -- ``rf`` is the fitted
        ``RandomForestClassifier`` and ``scaler`` the fitted ``StandardScaler``.

    Raises:
        ValueError: If ``df`` contains no labeled rows.

    Side effects:
        Prints the validation classification report and the feature
        importances.
    """
    # Feature matrix; it carries a fresh default index, independent of df.index.
    X = create_features(df)

    # Use a positional NumPy mask: a boolean Series indexed by df.index cannot
    # be aligned with X when df does not have a default RangeIndex.
    labels = df['label']
    labeled_mask = (
        (labels.notna() & (labels != unlabeled_label))
        .fillna(False)
        .to_numpy(dtype=bool)
    )
    if not labeled_mask.any():
        raise ValueError("train_and_predict needs at least one labeled row")

    X_labeled = X.loc[labeled_mask]
    y_labeled = labels[labeled_mask]
    # Hand the classifier plain integers; nullable integer labels (e.g. 'UInt8')
    # would otherwise surface as float classes such as 0.0 / 1.0.
    if pd.api.types.is_integer_dtype(y_labeled.dtype):
        y_labeled = y_labeled.astype('int64')

    # Hold out 20% of the labeled rows for validation (fixed seed).
    X_train, X_val, y_train, y_val = train_test_split(
        X_labeled, y_labeled, test_size=0.2, random_state=42
    )

    # Fit the scaler on the training split only, then reuse it everywhere else.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train_scaled, y_train)

    val_predictions = rf.predict(X_val_scaled)
    print("\nValidation Results:")
    print(classification_report(y_val, val_predictions))

    # Score every row -- labeled (including the training rows) and unlabeled.
    X_all_scaled = scaler.transform(X)
    all_predictions = rf.predict(X_all_scaled)
    all_probabilities = rf.predict_proba(X_all_scaled)

    # Arrays are assigned positionally, so they line up with df's row order.
    df_results = df.copy()
    df_results['predicted_label'] = all_predictions
    df_results['confidence'] = np.max(all_probabilities, axis=1)

    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\nFeature Importance:")
    print(feature_importance)

    return df_results, rf, scaler

In [23]:
# Example usage
if __name__ == "__main__":
    # The merged frame from the earlier cell supplies the 'label' column.
    df = m_merged

    # Fit on the labeled rows and score every row.
    results_df, model, scaler = train_and_predict(df)

    print("\nSample Results:")
    print(results_df.loc[:, ['name', 'mention', 'label', 'predicted_label', 'confidence']])


Validation Results:
              precision    recall  f1-score   support

         0.0       0.97      0.97      0.97        33
         1.0       0.96      0.96      0.96        26

    accuracy                           0.97        59
   macro avg       0.97      0.97      0.97        59
weighted avg       0.97      0.97      0.97        59


Feature Importance:
             feature  importance
1  levenshtein_ratio    0.297019
2   token_sort_ratio    0.245179
3      partial_ratio    0.238063
0   levenshtein_dist    0.133182
5              count    0.080098
4           len_diff    0.006460

Sample Results:
                   name              mention  label  predicted_label  \
0           repcummings   elijah e. cummings      1              1.0   
1           teammoulton         seth moulton      1              1.0   
2            larryhogan          larry hogan      1              1.0   
3           repcummings  house oversightdems      0              0.0   
4        pramilajayapal

In [24]:
# Write the scored frame to the working directory.
# NOTE(review): the file name looks like a typo for "results.csv". It is left
# unchanged in case something else reads this exact path -- confirm before renaming.
results_df.to_csv(path_or_buf="resuls.csv")

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Edge list for the mention network, taken from the scored frame. The per-pair
# weight lives in the 'count' column of results_df. Rows without a name or a
# mention cannot form an edge and are skipped.
edges = results_df[['name', 'mention', 'count']].dropna(subset=['name', 'mention'])

# Initialize directed graph: an edge points from the account to what it mentions.
G = nx.DiGraph()

# Add edges with weights (a repeated pair overwrites the earlier weight).
for source, target, weight in zip(edges['name'], edges['mention'], edges['count']):
    G.add_edge(source, target, weight=weight)

# Calculate centrality for scaling nodes
centrality = nx.degree_centrality(G)

# Scale edge widths relative to the heaviest edge; raw counts multiplied by a
# constant would produce unusably thick lines once counts reach the hundreds.
edge_weights = [G[u][v]['weight'] for u, v in G.edges()]
max_weight = max(edge_weights, default=1) or 1
edge_widths = [0.5 + 4.5 * w / max_weight for w in edge_weights]

# Draw the network
plt.figure(figsize=(10, 8))
pos = nx.spring_layout(G)  # Spring layout for better visualization

# Draw nodes, edges, and labels with scaling based on centrality and weight
nx.draw_networkx_nodes(G, pos, node_size=[v * 1000 for v in centrality.values()], node_color="skyblue")
nx.draw_networkx_edges(G, pos, width=edge_widths, arrowstyle='->', arrowsize=20)
nx.draw_networkx_labels(G, pos, font_size=10, font_color="black")

plt.title("Network Analysis of Mentions")
plt.show()
