In [5]:
import igraph as ig
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import json

import sys
sys.path.append('../src')

from graph_utils import degree_centralization

In [6]:
def get_usergraph(edges: pd.DataFrame) -> ig.Graph:
    """Build a directed, weighted user-to-user graph from stitch URL pairs.

    The first ``@handle`` found in each URL identifies the user. Every distinct
    (stitcher, stitchee) pair of handles becomes one directed edge whose
    ``weight`` attribute is the number of rows in ``edges`` with that pair.

    Parameters
    ----------
    edges : pd.DataFrame
        Must contain the string columns ``stitcher_url`` and ``stitchee_url``.
        The frame is not modified; any existing ``stitcher`` / ``stitchee``
        columns in the caller's frame are left untouched.

    Returns
    -------
    ig.Graph
        Directed graph with handles as vertex names and a ``weight`` edge
        attribute.

    Raises
    ------
    IndexError
        If a URL does not contain any ``@handle`` match.
    """
    expression = re.compile(r'@[\w\d\.]+')

    def first_handle(url: str) -> str:
        # Only the first match is used; a URL with no match raises IndexError.
        return expression.findall(url)[0]

    # assign() returns a new frame, so the caller's DataFrame is never mutated.
    user_edges = edges.assign(
        stitcher=edges['stitcher_url'].map(first_handle),
        stitchee=edges['stitchee_url'].map(first_handle),
    )

    # One row per distinct (stitcher, stitchee) pair, with its multiplicity.
    weighted_edges = (
        user_edges
        .groupby(['stitcher', 'stitchee'])
        .size()
        .reset_index(name='weight')
    )

    # The third item of every tuple is stored under the 'weight' edge attribute,
    # so no separate attribute assignment (and no ordering assumption) is needed.
    edge_tuples = list(
        weighted_edges[['stitcher', 'stitchee', 'weight']].itertuples(index=False, name=None)
    )
    G = ig.Graph.TupleList(edge_tuples, directed=True, edge_attrs=['weight'])

    return G

In [7]:
def output_summary_statistics(G: ig.Graph, min_component_size: int = 3) -> pd.DataFrame:
    """Compute a table of summary statistics for a graph.

    Parameters
    ----------
    G : ig.Graph
        Graph to summarise. Directed metrics (assortativity, diameter,
        reciprocity) are taken from ``G`` itself; component and centralization
        metrics are taken from a simplified undirected copy.
    min_component_size : int, default 3
        Components with fewer vertices than this are left out of the
        per-component degree-centralization statistics.

    Returns
    -------
    pd.DataFrame
        Two columns, ``Metric`` and ``Value``, one row per statistic. The three
        component-level centralization rows are NaN when no component reaches
        ``min_component_size``.
    """
    # simplify() modifies the graph it is called on, so it is applied to a
    # dedicated undirected copy. Component membership and unweighted diameter
    # are unaffected by removing self-loops and multi-edges, so this one graph
    # serves every undirected metric below.
    G_simple = G.as_undirected()
    G_simple.simplify()

    components = G_simple.components()
    component_sizes = components.sizes()

    # Sizes and centralizations of the components large enough to be considered,
    # kept in the same order so that index i refers to the same component.
    c_sizes = []
    c_centralizations = []
    for members in components:
        if len(members) >= min_component_size:
            c_sizes.append(len(members))
            c_centralizations.append(degree_centralization(G_simple.subgraph(members)))

    if c_sizes:
        lcc_centralization = c_centralizations[int(np.argmax(c_sizes))]
        mean_centralization = np.mean(c_centralizations)
        weighted_centralization = np.average(c_centralizations, weights=c_sizes)
    else:
        # np.argmax and the weighted average both raise on empty input; report
        # "not available" instead of failing the whole summary.
        lcc_centralization = np.nan
        mean_centralization = np.nan
        weighted_centralization = np.nan

    summary_data = {
        'Metric': [
            'Number of vertices',
            'Number of edges',
            'Number of components',
            'Largest component size',
            'Degree assortativity',
            'Clustering coefficient',
            'Diameter',
            'Undirected diameter',
            'Reciprocity',
            'Global degree centralization',
            'Largest component degree centralization',
            'Avg local degree centralization',
            'Weighted avg local degree centralization'
        ],
        'Value': [
            G.vcount(),
            G.ecount(),
            len(components),
            max(component_sizes, default=0),
            G.assortativity_degree(directed=True),
            G.transitivity_undirected(),
            G.diameter(),
            G_simple.diameter(),
            G.reciprocity(),
            degree_centralization(G_simple),
            lcc_centralization,
            mean_centralization,
            weighted_centralization
        ]
    }

    return pd.DataFrame(summary_data)

In [21]:
import pandas as pd
import json
import igraph as ig

# Hashtags whose stitch networks are summarised; each one becomes a table column.
hashtags = [
    'anime',
    'asmr',
    'biden2024',
    'challenge',
    'conspiracy',
    'election',
    'gym',
    'israel',
    'jazz',
    'kpop',
    'learnontiktok',
    'lgbt',
    'maga',
    'makeup',
    'minecraft',
    'movie',
    'palestine',
    'plantsoftiktok',
    'storytime',
    'tiktoknews',
    'trump2024',
    'watermelon'
]

# Per-hashtag summary columns, concatenated once after the loop instead of
# growing a DataFrame on every iteration.
video_summaries = []
user_summaries = []

# Hashtags that could not be fully processed, mapped to the exception that
# stopped them, so that missing table columns can be explained afterwards.
skipped_hashtags = {}

# Iterate over each hashtag
for hashtag in hashtags:
    try:

        # Read vertex data. The parsed content is not used below, but a missing
        # or malformed file still stops processing of this hashtag.
        with open(f'../data/hashtags/stitch/vertices/{hashtag}.json', 'r') as f:
            vertices = json.load(f)

        # Read edges from file
        edges = pd.read_csv(f'../data/hashtags/stitch/edges/{hashtag}_edges.txt', header=None)
        edges.columns = ['stitcher_url', 'stitchee_url']

        # Remove all None edges
        edges = edges.dropna()

        # The last path segment of each URL is used as the integer video id.
        # The width is spelled out so it does not depend on the platform default.
        edges['stitcher'] = edges['stitcher_url'].apply(lambda x: x.split('/')[-1]).astype('int64')
        edges['stitchee'] = edges['stitchee_url'].apply(lambda x: x.split('/')[-1]).astype('int64')

        # Construct graph for videos; every edge gets weight 1
        G_video = ig.Graph.TupleList(edges[['stitcher', 'stitchee']].values, directed=True, edge_attrs=['weight'])
        G_video.es['weight'] = 1

        # Summary statistics for the video graph, indexed by metric name and
        # with the value column labelled by the current hashtag
        df_video = output_summary_statistics(G_video)
        df_video = df_video.set_index(df_video.columns[0])
        df_video.columns = [hashtag]
        video_summaries.append(df_video)

        # Repeat for user graph
        G_user = get_usergraph(edges)
        df_user = output_summary_statistics(G_user)
        df_user = df_user.set_index(df_user.columns[0])
        df_user.columns = [hashtag]
        user_summaries.append(df_user)

    except Exception as e:
        # Keep going with the remaining hashtags, but leave a visible trace of
        # what failed instead of dropping the column silently.
        skipped_hashtags[hashtag] = e
        print(f"Could not finish '{hashtag}': {type(e).__name__}: {e}")
        continue

# Combine the per-hashtag columns horizontally (axis=1), in hashtag order
df_video_combined = pd.concat(video_summaries, axis=1) if video_summaries else pd.DataFrame()
df_user_combined = pd.concat(user_summaries, axis=1) if user_summaries else pd.DataFrame()

In [26]:
# Print both summary tables as LaTeX with two-decimal floats: videos first, then users.
for summary_table in (df_video_combined, df_user_combined):
    latex_source = summary_table.to_latex(float_format='%.2f')
    print(latex_source)

\begin{tabular}{lrrrrrrrrrrrrrrrrrrrr}
\toprule
 & anime & biden2024 & challenge & conspiracy & election & gym & israel & kpop & learnontiktok & lgbt & maga & makeup & minecraft & movie & palestine & plantsoftiktok & storytime & tiktoknews & trump2024 & watermelon \\
Metric &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\midrule
Number of vertices & 2351.00 & 281.00 & 668.00 & 435.00 & 401.00 & 929.00 & 816.00 & 1100.00 & 792.00 & 2111.00 & 1311.00 & 1069.00 & 169.00 & 782.00 & 1550.00 & 140.00 & 2330.00 & 150.00 & 1120.00 & 339.00 \\
Number of edges & 1363.00 & 145.00 & 345.00 & 227.00 & 210.00 & 480.00 & 433.00 & 737.00 & 413.00 & 1183.00 & 689.00 & 626.00 & 94.00 & 433.00 & 841.00 & 71.00 & 1385.00 & 76.00 & 600.00 & 180.00 \\
Number of components & 988.00 & 136.00 & 323.00 & 208.00 & 191.00 & 449.00 & 383.00 & 363.00 & 379.00 & 928.00 & 622.00 & 443.00 & 75.00 & 349.00 & 709.00 & 69.00 & 945.00 & 74.00 & 520.00 & 159.00 \\
Largest component size & 113.00 & 4.00 & 5.