In [2]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import regex as re
import networkx as nx
import random
from scipy.sparse import csr_matrix
from scipy.spatial import distance
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from scipy.spatial.distance import cosine
from sklearn.preprocessing import StandardScaler
import os
from tqdm import tqdm
import plotly.graph_objects as go
from PIL import Image
from scipy.stats import spearmanr

from networkx.algorithms.community import louvain_communities
from networkx.algorithms.community.quality import modularity

from functions import calculate_monthly_velocities_cosine, get_similarities, get_matrix

In [2]:
# Load the three Parquet files (tweets, retweets, users) as Arrow Tables
tweets, retweets, users = (
    pq.read_table(parquet_path)
    for parquet_path in (
        'data/tweets_light.parquet',
        'data/retweets_light.parquet',
        'data/users_tw+rt_light.parquet',
    )
)

# Materialise each Arrow Table as a pandas DataFrame
df_tweets, df_retweets, df_users = (
    arrow_table.to_pandas() for arrow_table in (tweets, retweets, users)
)

In [None]:
# Normalise both join keys to 64-bit integers before merging.
# The IDs in this dataset reach ~1.5e18, which is above 2**53, so a float64 key
# cannot represent them exactly and distinct accounts could collapse onto the
# same value. The explicit 'int64' also avoids the platform-dependent width of
# plain `int`.
# NOTE(review): astype raises if df_users['id'] holds missing or non-integer
# values - confirm the users table has none.
df_tweets['author_id'] = df_tweets['author_id'].astype('int64')
df_users['id'] = df_users['id'].astype('int64')

# Inner join: attach the author's account record to every tweet
df_users_tweets = pd.merge(df_tweets, df_users, left_on="author_id", right_on="id")

In [None]:
# The merge leaves two `id` columns: `id_x` (the tweet id) and `id_y` (the user
# id, which duplicates `author_id`). Drop the duplicate and give the remaining
# suffixed / generic columns explicit names.
column_renames = {
    "created_at_x": "tweet_created_at",
    "id_x": "original_post_id",
    "created_at_y": "account_created_at",
    "name": "author_name",
    "username": "author_username",
}
df_users_tweets = df_users_tweets.drop(columns="id_y").rename(columns=column_renames)

In [None]:
# Inner-join every tweet (with its author) to its retweets: the tweet side is
# keyed by original_post_id, the retweet side by post_id. Afterwards discard the
# redundant join key and the account fields that are not needed.
df_all = (
    pd.merge(df_users_tweets, df_retweets, left_on="original_post_id", right_on="post_id")
    .drop(columns=["post_id", "url", "location", "verified"])
)

In [None]:
# Store the ID columns as 64-bit integers.
# The width is spelled out because plain `int` can resolve to a 32-bit type on
# some platforms / NumPy versions, which cannot hold IDs of this magnitude.
df_all['original_post_id'] = df_all['original_post_id'].astype('int64')
df_all['retweeter_id'] = df_all['retweeter_id'].astype('int64')

In [None]:
# Tweet content, engagement metrics and account details that are discarded
# here; the columns that remain are the timestamps, IDs and author labels.
unused_columns = [
    'lang', 'text',
    'possibly_sensitive', 'referenced_id', 'reference_type',
    'public_metrics.like_count', 'public_metrics.quote_count',
    'public_metrics.reply_count', 'public_metrics.retweet_count',
    'account_created_at', 'description', 'name',
]
df_all = df_all.drop(columns=unused_columns)

In [None]:
# Keep only complete rows: any row with at least one missing value is removed
df_all = df_all.dropna(how="any")

In [None]:
# Summary statistics of the cleaned frame (sanity check on date range and ID magnitudes)
df_all.describe()

Unnamed: 0,tweet_created_at,original_post_id,author_id,retweeter_id
count,29235029,29235030.0,29235030.0,29235030.0
mean,2020-06-25 17:59:32.733049,1.276245e+18,1.18561e+17,4.458498e+17
min,2017-12-31 23:11:09,9.476212e+17,5893702.0,12.0
25%,2019-03-09 14:13:47,1.1044e+18,14060260.0,575423900.0
50%,2020-06-02 07:19:34,1.267752e+18,150725700.0,2895642000.0
75%,2021-09-21 22:00:00,1.440548e+18,1024976000.0,9.851892e+17
max,2022-12-31 22:19:02,1.609328e+18,1.555225e+18,1.666974e+18
std,,1.928046e+17,3.106293e+17,5.506337e+17


In [None]:
# Report the (rows, columns) shape of the frame after cleaning
print("df has shape:",df_all.shape)

df has shape: (29235029, 6)


In [None]:
# Preview the first rows: one row per (original tweet, retweeter) pair
df_all.head()

Unnamed: 0,tweet_created_at,original_post_id,author_id,author_name,author_username,retweeter_id
0,2018-12-31 22:46:18,1079886497279561728,622354597,Salvo Di Grazia,MedBunker,951848540
1,2018-12-31 22:46:18,1079886497279561728,622354597,Salvo Di Grazia,MedBunker,135554444
2,2018-12-31 22:46:18,1079886497279561728,622354597,Salvo Di Grazia,MedBunker,433418060
3,2018-12-31 22:46:18,1079886497279561728,622354597,Salvo Di Grazia,MedBunker,1668533642
4,2018-12-31 22:46:18,1079886497279561728,622354597,Salvo Di Grazia,MedBunker,1623208790


In [3]:
checkpoint_path = "data/df_checkpoint.parquet"

# Checkpointing: the first run persists the frame built by the cells above;
# every later run reads the cached file instead, so those cells can be skipped.
if not os.path.exists(checkpoint_path):
    print("Saving df to checkpoint...")
    df_all.to_parquet(checkpoint_path)
else:
    print("Loading df from checkpoint...")
    df_all = pd.read_parquet(checkpoint_path)

Loading df from checkpoint...


### Get Statistics

In [4]:
# Distinct calendar years present in the data, in order of first appearance
tweet_years = df_all["tweet_created_at"].dt.year.unique()

# 2017 is left out of the per-year analysis (the earliest timestamp in the
# data is 2017-12-31, so presumably it has only a sliver of data)
years = [tweet_year for tweet_year in tweet_years if tweet_year != 2017]

In [5]:
# Month number (1-12) -> English month name, in calendar order
month_mapping = dict(enumerate(
    [
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    ],
    start=1,
))

In [6]:
def get_month(month_num:int, year:int, data=None):
    '''
    Return the rows whose ``tweet_created_at`` falls in one calendar month.

    Parameters
    ----------
    month_num : int
        Month number, 1-12.
    year : int
        Calendar year.
    data : pandas.DataFrame, optional
        Frame to filter; it must have a datetime ``tweet_created_at`` column.
        When omitted, the module-level ``df`` is used, which keeps existing
        ``get_month(month_num=..., year=...)`` calls working.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``data`` (empty when nothing matches).
    '''
    if data is None:
        # Legacy behaviour: fall back to the notebook-level frame
        data = df

    # Compare the year and month components directly instead of converting the
    # whole column to periods and matching against a parsed string
    stamps = data['tweet_created_at'].dt
    return data[(stamps.year == year) & (stamps.month == month_num)]

In [7]:
def author_mapping(month_name:str):
    '''
    Relabel one month's similarity matrix with author names.

    Reads the module-level ``month_data`` and ``similarities_copy`` dicts
    (both keyed by month name). The rows and columns of
    ``similarities_copy[month_name]`` are renamed in place to the author names,
    taken in order of first appearance of each ``author_id`` in that month's
    data. Nothing is returned.
    '''
    frame = month_data[month_name]

    # author_id -> author_name lookup (if an id has several names, the last one wins)
    name_by_id = frame.set_index('author_id')['author_name'].to_dict()

    # One label per distinct author, in order of first appearance
    # NOTE(review): assumes the matrix rows/columns follow this same order - confirm against get_matrix
    labels = [name_by_id[author] for author in frame['author_id'].unique()]

    # Apply the labels to both axes of the month's similarity matrix
    matrix = similarities_copy[month_name]
    matrix.columns = labels
    matrix.index = labels

In [8]:
# Per-year summary statistics, keyed by year and filled by the loop below
year_stats = {}

# Month names in calendar order. Each name is also the suffix appended to that
# month's similarity columns, which is how a month's rows are selected later.
month_names = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]

# Consecutive month pairs in calendar order; used to order the velocity columns
column_order = [
    'January-February', 'February-March', 'March-April', 'April-May',
    'May-June', 'June-July', 'July-August', 'August-September',
    'September-October', 'October-November', 'November-December'
]

for year in years:

    # Filtered dataframe for the current year.
    # get_month() reads this module-level `df`, so the name must stay as it is.
    df = df_all[df_all["tweet_created_at"].dt.year == year]

    # One dataframe per calendar month of the current year.
    # author_mapping() reads `month_data`, so this name must stay as well.
    month_data = {
        month_name: get_month(month_num=month_num, year=year)
        for month_num, month_name in month_mapping.items()
    }

    # One matrix per month, built by the project helper get_matrix
    month_matrices = {
        month_name: get_matrix(month_frame)
        for month_name, month_frame in month_data.items()
    }

    # Pairwise cosine similarities per month
    similarities = {}

    for month_name, month_matrix in tqdm(month_matrices.items(), desc=f"Calculating similarities for {year}"):
        similarities[month_name] = get_similarities(month_matrix=month_matrix, metric='cosine')

    # Shallow copy: the dict is new but the DataFrames inside are shared, so the
    # relabelling and suffixing below also change the frames in `similarities`.
    similarities_copy = similarities.copy()

    # map author names to rows and columns
    for month in similarities_copy.keys():
        author_mapping(month_name=month)

    # merge similarities using full outer join
    merged_similarities = pd.DataFrame()

    for month, similarity_df in similarities_copy.items():
        # Add the month as a suffix manually to avoid column-name collisions
        similarity_df.columns = [f"{col}_{month}" for col in similarity_df.columns]
        merged_similarities = pd.merge(
            merged_similarities, similarity_df, how="outer", left_index=True, right_index=True).fillna(0)

    # After the transpose, rows are '<author>_<Month>' and columns are authors
    merged_similarities = merged_similarities.T

    avg_std_per_month = {}
    avg_mean_per_month = {}

    for month in month_names:
        # Exact suffix match: a substring / case-insensitive match would also
        # pick up rows of other months whose author name contains e.g. '_may'
        filtered = merged_similarities[merged_similarities.index.str.endswith(f'_{month}')].sort_index()
        std_per_column = filtered.std(axis=0, skipna=True)
        mean_per_column = filtered.mean(axis=0, skipna=True)
        avg_std_per_month[month] = std_per_column.mean()
        avg_mean_per_month[month] = mean_per_column.mean()

    # Calculate velocities using cosine distance
    velocities_cosine_full = calculate_monthly_velocities_cosine(merged_similarities, month_names)

    # Stack the per-pair results and turn the (month pair, node) index into columns
    velocities_df_full = pd.concat(velocities_cosine_full, axis=0).reset_index()

    # Rename the columns for clarity
    velocities_df_full.columns = ['Month Pair', 'Node', 'Velocity']

    # Pivot to one row per node and one column per month pair, in calendar order
    velocities_df_full = velocities_df_full.pivot(index='Node', columns='Month Pair', values='Velocity')
    velocities_df_full = velocities_df_full[column_order]

    # Prepare velocity means and stds for the same x-axis
    veloc_means_arr = np.array([velocities_df_full[col].mean(skipna=True) for col in column_order])
    veloc_stds_arr = np.array([velocities_df_full[col].std(skipna=True) for col in column_order])

    modularity_per_month = {}

    for month in month_names:
        # Rows of this month only (exact suffix match, see above)
        filtered = merged_similarities[merged_similarities.index.str.endswith(f'_{month}')].sort_index()
        # Strip the trailing '_<Month>' so row labels equal the column labels
        filtered.index = filtered.index.str.replace(r'_[^_]+$', '', regex=True)

        # Square author-by-author matrix restricted to this month's authors
        filtered = filtered.loc[filtered.index, filtered.index]
        G = nx.from_pandas_adjacency(filtered)

        # Louvain communities and modularity, weighted by the similarity values
        communities = louvain_communities(G, weight='weight', seed=42)
        mod = modularity(G, communities, weight='weight')
        modularity_per_month[month] = mod

    months = list(modularity_per_month.keys())
    modularities = list(modularity_per_month.values())

    # Average modularity of each pair of consecutive months, to line up with
    # the month-pair velocity columns
    consecutive_modularity_averages = [
        (current + following) / 2
        for current, following in zip(modularities, modularities[1:])
    ]
    consecutive_month_pairs = [
        f"{current}-{following}"
        for current, following in zip(months, months[1:])
    ]

    # Store stats for this year, now including avg_std_per_month and avg_mean_per_month
    year_stats[year] = (
        veloc_means_arr,
        veloc_stds_arr,
        consecutive_modularity_averages,
        avg_std_per_month,
        avg_mean_per_month
    )

# After the loop, create a DataFrame with one row per year
stats_df = pd.DataFrame([
    {
        'year': year,
        'veloc_means_arr': veloc_means_arr,
        'veloc_stds_arr': veloc_stds_arr,
        'consecutive_modularity_averages': mod_avgs,
        'sim_avg_std_per_month': avg_std_per_month,
        'sim_avg_mean_per_month': avg_mean_per_month
    }
    for year, (veloc_means_arr, veloc_stds_arr, mod_avgs, avg_std_per_month, avg_mean_per_month) in year_stats.items()
])

Calculating similarities for 2018: 100%|██████████| 12/12 [01:13<00:00,  6.12s/it]
Calculating similarities for 2019: 100%|██████████| 12/12 [00:40<00:00,  3.35s/it]
Calculating similarities for 2020: 100%|██████████| 12/12 [00:54<00:00,  4.55s/it]
Calculating similarities for 2021: 100%|██████████| 12/12 [00:45<00:00,  3.83s/it]
Calculating similarities for 2022: 100%|██████████| 12/12 [00:45<00:00,  3.77s/it]


In [9]:
# Preview the per-year summary: one row per year; the cells hold arrays, lists and dicts
stats_df.head()

Unnamed: 0,year,veloc_means_arr,veloc_stds_arr,consecutive_modularity_averages,sim_avg_std_per_month,sim_avg_mean_per_month
0,2018,"[0.051214008463286664, 0.05491317332766376, 0....","[0.05299022692465565, 0.0890761021085431, 0.07...","[0.5680653341085709, 0.5783983258293317, 0.569...","{'January': 0.05438617324026037, 'February': 0...","{'January': 0.01591008195557077, 'February': 0..."
1,2019,"[0.04648863740507882, 0.04981199329265121, 0.0...","[0.05207777475178581, 0.05488856551853223, 0.0...","[0.5111162670097047, 0.5347026476729275, 0.551...","{'January': 0.06722109384721667, 'February': 0...","{'January': 0.024296180851497488, 'February': ..."
2,2020,"[0.04632057040232594, 0.04101189044380686, 0.0...","[0.03383944252611081, 0.02733676694251247, 0.0...","[0.51963245106091, 0.4870908112851826, 0.47975...","{'January': 0.06090538976340603, 'February': 0...","{'January': 0.020997684324814293, 'February': ..."
3,2021,"[0.03419731597636642, 0.03098708858310372, 0.0...","[0.0330341457502581, 0.02991915562206083, 0.04...","[0.5112193727677552, 0.5272539271862956, 0.530...","{'January': 0.06033276184962287, 'February': 0...","{'January': 0.020772162064979664, 'February': ..."
4,2022,"[0.039197716300539656, 0.04606048233657261, 0....","[0.028935671284474378, 0.03648860027168035, 0....","[0.5118907310648402, 0.5256815905362684, 0.542...","{'January': 0.05742891722244035, 'February': 0...","{'January': 0.018737144726381, 'February': 0.0..."


In [3]:
stats_checkpoint_path = "stats_checkpoint.parquet"

# Checkpointing: the first run persists the computed stats_df (without its
# index); every later run reads the cached file instead of recomputing.
if not os.path.exists(stats_checkpoint_path):
    print("Saving stats_df to checkpoint...")
    stats_df.to_parquet(stats_checkpoint_path, index=False)
else:
    print("Loading stats_df from checkpoint...")
    stats_df = pd.read_parquet(stats_checkpoint_path)

Loading stats_df from checkpoint...
