# Network Analysis

In [1]:
import ast

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd

Create a username → user_id lookup from the preprocessed data

In [2]:
data = pd.read_csv('../output/preprocessed.csv')

# Lookup table: username -> user_id.
# NOTE: a username that occurs with more than one user_id keeps only the
# last one seen, because later dict entries overwrite earlier ones.
user_id_dict = dict(zip(data['user'], data['user_id']))

# Preview the first five entries of the lookup
dict(list(user_id_dict.items())[:5])

{'danielwoodard': 1077866112,
 'nelsonjacqueline': 1089670430,
 'ihooper': 1007478642,
 'wrightnicholas': 1039258480,
 'michael51': 1022492390}

In [3]:
# Function to convert mentions from usernames to user_ids
def convert_mentions_to_ids(mentions, user_id_dict):
    """Translate a stringified list of usernames into a list of user_ids.

    Parameters
    ----------
    mentions : str
        String representation of a Python list of usernames,
        e.g. "['alice', 'bob']".
    user_id_dict : dict
        Mapping from username to user_id.

    Returns
    -------
    list
        The user_id of every mentioned username that is a key of
        ``user_id_dict``, in the original order. Usernames without an
        entry are silently dropped.

    Raises
    ------
    ValueError, SyntaxError
        If ``mentions`` is not a valid Python literal.
    """
    # ast.literal_eval only accepts literals, so text coming from the CSV can
    # never execute code the way eval() would.
    usernames = ast.literal_eval(mentions)
    return [user_id_dict[name] for name in usernames if name in user_id_dict]

# Replace each row's stringified username list with the matching user_id list.
# NOTE: this overwrites data['mentions'], so the cell is meant to run once
# per load of the data.
data['mentions'] = data['mentions'].apply(convert_mentions_to_ids, args=(user_id_dict,))

data.head(7)

Unnamed: 0,timestamp,text,text_id,user,user_id,hashtags,mentions,emojis
0,00:00:00,run business mean juggle countless administrat...,2018569761,danielwoodard,1077866112,"['#HRtech', '#businessmanagement']",[],[]
1,00:00:00,liz truss walk linger shadow predecessor charl...,2092717718,nelsonjacqueline,1089670430,['#politics'],[],[]
2,00:00:00,uk brace war government building london raise ...,2059143248,ihooper,1007478642,"['#Ukrainewashed', '#WarPreparedness']",[],['🇺🇦']
3,00:00:00,marry second cousin remove not taboo think gen...,2008209828,wrightnicholas,1039258480,"['#FamilyTree', '#GeneticFacts']",[],['🧬']
4,00:00:00,truly disgraceful indian national congress sto...,2001239278,michael51,1021455936,['#RationChorCongress'],[],"['🤦', '♂']"
5,00:00:00,school teach kid climate change healthcare pre...,2068121373,ihooper,1007478642,['#RealEducation'],[1086130221],[]
6,00:00:00,make day mrs stephanie wood profitable strateg...,2018541552,ahenderson,1072983630,"['#investment', '#profitable']",[],[]


In [5]:
# Read the graph file (one row per source/target pair)
graph_data = pd.read_csv('../data/graph.csv')

# Count the graph endpoints (source or target) that do not occur among the
# user_id values of the lookup built from the preprocessed data
missing_users = (set(graph_data['source']) | set(graph_data['target'])).difference(user_id_dict.values())
len(missing_users)

9185

In [40]:
import ast

def create_weighted_network(df):
    """Build a directed mention graph with normalized edge weights.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``user_id`` column (the mentioning user) and a
        ``mentions`` column holding either a list of mentioned ids or the
        string representation of such a list.

    Returns
    -------
    networkx.DiGraph
        One edge ``source -> target`` per distinct (user_id, mention) pair.
        The ``weight`` attribute is the number of times the pair occurs
        divided by the largest such count, so weights lie in (0, 1].
        If no row contains a mention, an empty graph is returned.
    """
    # Create an empty directed graph
    G = nx.DiGraph()

    # (source, target) -> number of times source mentioned target
    interaction_counts = {}

    for _, row in df.iterrows():
        source = row['user_id']
        mentions = row['mentions']
        # Already-parsed containers are used as they are; anything else is
        # parsed from its string form. Round-tripping a real list through
        # str() would fail for elements whose repr is not a Python literal.
        if not isinstance(mentions, (list, tuple, set)):
            mentions = ast.literal_eval(str(mentions))

        # Count one interaction per mention
        for target in mentions:
            pair = (source, target)
            interaction_counts[pair] = interaction_counts.get(pair, 0) + 1

    # Without any mention there is nothing to normalize; max() of an empty
    # sequence would raise ValueError.
    if not interaction_counts:
        return G

    # Find maximum interactions for normalization
    max_interactions = max(interaction_counts.values())

    # Add normalized weighted edges
    for (source, target), count in interaction_counts.items():
        G.add_edge(source, target, weight=count / max_interactions)

    return G

# Usage: build the weighted, directed mention graph from the data frame whose
# 'mentions' column was converted to user_ids above
G = create_weighted_network(data)