In [None]:
import csv
import sys
import time
from itertools import combinations
from math import comb

import igraph as ig
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd

pd.set_option('display.width', 500)

In [None]:
# Loads an edge list into an igraph graph; called further down to build `g`, but slow on the full edge list
def create_igraph_graph_from_edgelist(fn, nrows=None):
    """Read a CSV edge list with pandas and turn it into an igraph graph.

    Parameters
    ----------
    fn : path of the CSV file, handed to ``pd.read_csv``.
    nrows : optional cap on the number of rows read (``None`` reads them all).

    Returns
    -------
    The graph produced by ``ig.Graph.TupleList`` from the rows of the frame.

    The time spent reading and the time spent building are printed.
    """
    print("reading edges ... ", end='')
    t0 = time.time()
    edges = pd.read_csv(fn, nrows=nrows)
    read_secs = time.time() - t0
    print("read {:_} lines (took {:.1f}s)".format(len(edges), read_secs))

    print("creating graph ... ", end='')
    t0 = time.time()
    graph = ig.Graph.TupleList(edges.values)
    build_secs = time.time() - t0
    print("created graph with {:_} nodes and {:_} edges (took {:.1f}s)".format(len(graph.vs), len(graph.es), build_secs))
    return graph

In [None]:
# Not in use, takes too long to load even part of entire graph from edge list
def create_networkx_graph_from_edgelist(fn, nrows=None):
    """Read a CSV edge list with pandas and turn it into a networkx graph.

    Parameters
    ----------
    fn : path of the CSV file; it must provide 'source' and 'target' columns,
        which are the column names passed to ``nx.from_pandas_edgelist``.
    nrows : optional cap on the number of rows read (``None`` reads them all).

    Returns
    -------
    The graph built by ``nx.from_pandas_edgelist``.

    The time spent reading and the time spent building are printed.
    """
    print("reading edges ... ", end='')
    t0 = time.time()
    edge_frame = pd.read_csv(fn, nrows=nrows)
    read_secs = time.time() - t0
    print("read {:_} lines (took {:.1f}s)".format(len(edge_frame), read_secs))

    print("creating graph ... ", end='')
    t0 = time.time()
    graph = nx.from_pandas_edgelist(edge_frame, source='source', target='target')
    build_secs = time.time() - t0
    print("created graph with {:_} nodes and {:_} edges (took {:.1f}s)".format(len(graph.nodes), len(graph.edges), build_secs))
    return graph

In [None]:
def get_tweets_dataframe(dataset_fn="dataset/TweetsCOV19.tsv"):
    """Load the tweets TSV and return the hashtag-bearing tweets of the study window.

    Parameters
    ----------
    dataset_fn : str, optional
        Path of the header-less, tab-separated dataset. Defaults to the path
        this notebook has always read from.

    Returns
    -------
    pandas.DataFrame
        Indexed by "Tweet Id", with the columns "Username", "Timestamp"
        (timezone-aware), "Hashtags" (list of str, "null;" placeholders
        removed), "Sentiment_pos" and "Sentiment_neg" (int). Only tweets with
        2019-12-01 <= Timestamp < 2020-03-01 (UTC) that carry between 1 and 59
        hashtags are kept.

    Progress and row counts are printed along the way.
    """
    # Import dataset from tsv file
    header = ["Tweet Id", "Username", "Timestamp", "Followers", "Friends", "Retweets", "Favorites", "Entities", "Sentiment", "Mentions", "Hashtags", "URLs", "EXTRA"]
    dtype = {"Tweet Id":"string", "Username":"string", "Timestamp":"string", "Followers":int, "Friends":int, "Retweets":int, "Favorites":int, "Entities":"string", "Sentiment":"string", "Mentions":"string", "Hashtags":"string", "URLs":"string", "EXTRA":"string"}
    print("Importing dataset from tsv file ...", end='')
    start = time.time()
    df = pd.read_csv(dataset_fn, sep='\t', names=header, on_bad_lines='warn', dtype=dtype)
    end = time.time()
    print("read {:_} lines (took {:.1f}s)".format(len(df), end-start))
    df = df.set_index('Tweet Id')

    # Convert timestamp column to Timestamp object
    print("Converting timestamp column")
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%a %b %d %H:%M:%S %z %Y')

    # Filter columns and timestamp
    print("Filtering desired columns and between desired dates ... ", end='')
    start_date = pd.to_datetime('2019-12-01 00:00:00 +0000')
    end_date = pd.to_datetime('2020-03-01 00:00:00 +0000')
    in_window = (df['Timestamp'] >= start_date) & (df['Timestamp'] < end_date)
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of df
    dff = df.loc[in_window, ["Username", "Timestamp", "Sentiment", "Hashtags"]].copy()
    # Report the size of the filtered frame, not of the raw import
    print("{:_} rows in dataframe".format(len(dff)))

    # Parse hashtags string into a list, dropping the "null;" placeholder
    print("Parsing hashtags and positive/negative sentiments")
    dff['Hashtags'] = dff['Hashtags'].str.split().apply(lambda x: [name for name in x if name != "null;"] if isinstance(x, list) else [])

    # Split positive and negative sentiments into own columns (and convert to int type)
    dff[['Sentiment_pos', 'Sentiment_neg']] = dff['Sentiment'].str.split(" ", expand=True)
    dff['Sentiment_pos'] = dff['Sentiment_pos'].astype(int)
    dff['Sentiment_neg'] = dff['Sentiment_neg'].astype(int)
    dff = dff.drop(columns="Sentiment")

    # Keep rows that have hashtags, but fewer than the outlier threshold of 60
    print("filtering for tweets that contain hashtags ... ", end='')
    ht = dff[dff['Hashtags'].apply(lambda x: 0 < len(x) < 60)]
    print("{:_} rows in dataframe".format(len(ht)))

    return ht

In [None]:
def track_progess(total, progress, text='progress:', inc=1):
    """Advance a progress counter by one and optionally refresh the progress line.

    Parameters
    ----------
    total : number of items overall.
    progress : number of items finished before this call.
    text : label printed in front of the counter.
    inc : the line is rewritten only when ``progress`` is a multiple of ``inc``.

    Returns
    -------
    tuple
        ``(progress + 1, percentage)`` where percentage is
        ``(progress + 1) / total * 100``.
    """
    done = progress + 1
    perc = (done / total * 100)
    if progress % inc == 0:
        # "\r" returns to the start of the line so the counter overwrites itself
        print("\r {} {:_}/{:_} ({:.5f}%)".format(text, done, total, perc), end='')
    return done, perc

In [None]:
# Variables: where the edge list lives and how many rows to request from it
edges_fn = "../data/edges.csv"  # edge-list CSV, path relative to the notebook
edges_total = 684_732_453 # hardcoded; presumably the row count of edges_fn — TODO confirm

In [None]:
# Build the igraph graph g from the first `perc` percent of the edge list.
# nrows is derived from the hardcoded edges_total, so perc = 100 requests
# edges_total rows.
perc = 100
g = create_igraph_graph_from_edgelist(edges_fn, nrows=int(edges_total*perc/100))

In [None]:
# Display the degree distribution of g: mean degree, number of degree values,
# then a 50-bin histogram. The values are the raw degrees from g.degree().
degree = g.degree()
print(np.mean(degree))
print(len(degree))
plt.hist(degree, bins=50)
plt.show()

In [None]:
# Connected components of g, their sizes, and the position `i` of the largest one
ccs = g.connected_components()
component_sizes = ccs.sizes()
largest_size = max(component_sizes)
print("found {:_} connected components".format(len(ccs)))
print(component_sizes)
print(largest_size)
i = component_sizes.index(largest_size)
print(i)
print("largest connected component has {:_} nodes".format(len(ccs[i])))

In [None]:
# Make subgraph from largest connected component
sg = g.subgraph(ccs[i])
print("largest connected component has {:_} edges".format(len(sg.es)))

# Averaging the full shortest-path matrix needs an n x n table in memory and
# also counts the zero distance of every node to itself, which pulls the mean
# down. average_path_length() averages over pairs of distinct vertices only.
average_shortest_path = sg.average_path_length()
print("Average shortest path in largest connected component:", average_shortest_path)

In [None]:
# Get average shortest path over the whole graph.
# The connected-components cell shows g may be disconnected; unreachable pairs
# have infinite distance, so a plain mean over the distance matrix would be
# inf. unconn=True averages over the pairs that are actually connected.
average_shortest_path = g.average_path_length(unconn=True)
print(average_shortest_path)

In [None]:
########## UP TO HERE!!! ###############

In [None]:
# Load tweets dataframe (filtered by get_tweets_dataframe) and report its
# row count and deep memory footprint in whole mebibytes
ht = get_tweets_dataframe()
df_data_usage = ht.memory_usage(deep=True).sum()
print("rows:    {:_}".format( ht.shape[0] ))
print("size mb: {:_}".format( int(df_data_usage /(1024**2)) ))

In [None]:
# Get df with only tweets in graph g.
# "Tweet Id" is the index of ht (get_tweets_dataframe calls set_index), so the
# membership test runs on ht.index rather than on a column. The result is
# named tweets_in_graph instead of `ig`, which would shadow the igraph module
# that create_igraph_graph_from_edgelist relies on.
# g is built with igraph near the top of the notebook and with networkx in
# later cells, so both node containers are supported here.
# NOTE(review): assumes node names in g are strings like the ht index — confirm
graph_ids = set(g.vs["name"]) if hasattr(g, "vs") else set(g.nodes)
tweets_in_graph = ht[ht.index.isin(graph_ids)]
df_data_usage = tweets_in_graph.memory_usage(deep=True).sum()
print("rows:    {:_}".format( tweets_in_graph.shape[0] ))
print("size mb: {:_}".format( int(df_data_usage /(1024**2)) ))
print(tweets_in_graph.head())

In [None]:
# Setting node values: copy timestamp and sentiment scores from ht onto each node of g
print("")
nodes_handled = 0
total_nodes = len(g.nodes)
for n in g.nodes:
    try:
        row = ht.loc[n]
    except KeyError:
        # Node without a matching tweet: report the node and how far we got,
        # then stop. Only the failed lookup is caught; other errors propagate.
        print(n)
        print(nodes_handled)
        break
    g.nodes[n]['timestamp'] = row['Timestamp']
    g.nodes[n]['positive_sentiment'] = row['Sentiment_pos']
    g.nodes[n]['negative_sentiment'] = row['Sentiment_neg']
    nodes_handled, perc = track_progess(total_nodes, nodes_handled, text='nodes handled:', inc=25)
else:
    # Reached only when every node was processed (no break above)
    print("\nDone.")

In [None]:
# Show one node's attributes, then build the subgraph of nodes whose
# 'timestamp' attribute lies before the cutoff ts
n = list(g.nodes)[0]
print(g.nodes[n])
ts = pd.to_datetime("2020-01-03 00:00:00 +0000")
before_nodes = []
for node in g.nodes:
    if g.nodes[node]['timestamp'] < ts:
        before_nodes.append(node)
print(len(before_nodes))
sg = g.subgraph(before_nodes)
print(sg)

In [None]:
# Adjacency matrix of g; the shape is printed as a quick sanity check.
# NOTE(review): this call expects a networkx graph, while the cells near the
# top of the notebook build g with igraph — confirm which g is intended here.
matrix = nx.adjacency_matrix(g)
print(matrix.shape)

In [None]:
# Connected components of g via networkx, materialised as a list so they can
# be counted here and iterated again in the next cell. This rebinds `ccs`,
# the name the igraph components cell also uses.
ccs = list(nx.connected_components(g))
print(len(ccs))

In [None]:
# Print the size of every component; components with fewer than 18 nodes are
# additionally printed and drawn
for cc in ccs:
    print(len(cc))
    if len(cc) >= 18:
        continue
    print(cc)
    sg = g.subgraph(cc)
    print(sg)
    nx.draw(sg)
    plt.show()

In [None]:
# Degree centralities histogram: nx.degree_centrality gives one value per
# node (a dict keyed by node); the values are plotted in 50 bins
degree_centralities = nx.degree_centrality(g)
values = list(degree_centralities.values())
plt.hist(values, bins=50)
plt.show()

In [None]:
def get_tweets_for_each_hashtag(df):
    """Group tweet ids by the hashtags they use.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a "Hashtags" column whose cells are iterables of hashtag
        strings; the index labels are used as tweet ids.

    Returns
    -------
    dict
        Maps each hashtag to the list of index labels of the tweets using it,
        in row order. A tweet is listed once per hashtag even if it repeats
        the hashtag. Hashtags used by a single tweet are dropped.

    Progress and the hashtag counts are printed.
    """
    array_col = "Hashtags"
    total = len(df)
    # Refresh the progress line about a thousand times in total instead of
    # once per row; printing on every row dominates the runtime on large frames
    report_every = max(1, total // 1000)
    ids_by_hashtag = {}
    print("Getting list of ids per hashtag ...")
    # Walking index and column together avoids building a Series per row
    for i, (tweet_id, hashtags) in enumerate(zip(df.index, df[array_col]), start=1):
        if i % report_every == 0 or i == total:
            print("\r {:_}/{:_} ({:.1f}%)".format(i, total, i / total * 100), end='')
        for term in set(hashtags):
            # append in place; rebuilding the list on every hit is quadratic
            ids_by_hashtag.setdefault(term, []).append(tweet_id)
    print("\nDone.")
    print("Found {:_} unique hashtags".format(len(ids_by_hashtag)))
    shared = { k: v for k, v in ids_by_hashtag.items() if len(v) > 1 }
    print("Found {:_} hashtags with more than 1 associated tweet".format(len(shared)))
    return shared

In [None]:
def make_graph_from_hashtags(ht_dict):
    """Build an undirected graph that links tweets sharing a hashtag.

    Parameters
    ----------
    ht_dict : dict
        Maps a hashtag to the list of tweet ids using it.

    Returns
    -------
    networkx.Graph
        One node per tweet id that appears in a list of length >= 2, and one
        edge for every pair of ids found together under a hashtag. Pairs that
        share several hashtags still yield a single edge.

    The number of hashtags and a summary of the graph are printed.
    """
    print("Creating edges from {} hashtags ...".format(len(ht_dict)))
    g = nx.Graph()
    for ids in ht_dict.values():
        # combinations(ids, 2) yields each unordered pair exactly once, in the
        # same (earlier id, later id) order as nested index loops would
        g.add_edges_from(combinations(ids, 2))
    print(g)
    return g

In [None]:
### BACK TO DATAFRAMES ###
# Load the filtered tweets into df. This runs the full TSV import again; the
# same loader is also called earlier in the notebook to fill ht.
df = get_tweets_dataframe()

In [None]:
# Plain Python list of every tweet timestamp in df, used below for min/max
timestamps = list(df['Timestamp'])

In [None]:
# Inspect the timestamps, then select the tweets of the first of 100
# equal-width windows spanning the observed time range
print(len(timestamps))
print(type(timestamps[0]))
ts_start, ts_end = min(timestamps), max(timestamps)
print(ts_end)
print(ts_start)
ts_inc = (ts_end - ts_start) / 100
window_start = ts_start
window_end = window_start + 1*ts_inc
after_start = window_start <= df['Timestamp']
before_end = df['Timestamp'] < window_end
sel = df[after_start & before_end]
print(sel.shape)

In [None]:
# Map each hashtag of the selected window to the ids of the tweets using it
ht_dict = get_tweets_for_each_hashtag(sel)

In [None]:
# Get number of edges that will be created: one per pair of tweets listed
# under the same hashtag, summed over every hashtag in ht_dict (the previous
# [:-1] slice left the last hashtag out of the count).
# This is an upper bound: a pair of tweets sharing several hashtags is counted
# once per hashtag here but ends up as a single edge in the graph.
edges_n = sum(comb(len(ids), 2) for ids in ht_dict.values())
print("Number of edges that will be created: {:_}".format(edges_n))

In [None]:
# make graph from hashtags of the selected window; this rebinds g, replacing
# whatever graph earlier cells stored under that name
g = make_graph_from_hashtags(ht_dict)

In [None]:
# Save the graph of the first window as an edge list.
# NOTE(review): in networkx the `data` list names edge attributes to append to
# each line; the edges built by make_graph_from_hashtags carry no attributes,
# so this probably does not produce a "source,target" header — confirm the
# file format against whatever reads these files.
nx.write_edgelist(g, "../data/increments/edgelist_1.csv", data=["source", "target"])
#nx.write_edgelist(g, "test.csv", data=["source", "target"])

In [None]:
# Draw the whole graph g without node labels on a 15x8 figure.
# The commented-out lines are a variant that draws only a subset of the nodes.
#nodes = list(g.nodes)[:]
fig, ax = plt.subplots(figsize=(15,8))
#nx.draw_networkx(g.subgraph(nodes), with_labels=False, ax=ax)
nx.draw_networkx(g, with_labels=False, ax=ax)
plt.show()

In [None]:
# Write one edge list per time increment. The windows are cumulative: every
# window starts at the first timestamp, and window k ends k/increments of the
# way through the observed time span.
increments = 100
pause_between_increments = True  # set to False for an unattended Restart & Run All
ts_start, ts_end = df['Timestamp'].min(), df['Timestamp'].max()
# Width of one step follows `increments` rather than a second hardcoded 100
ts_inc = (ts_end - ts_start) / increments
window_start = ts_start
for i in range(increments):
    if i == increments - 1:
        # Final window: end exactly at ts_end and include it, so the tweets
        # carrying the latest timestamp are not left out
        window_end = ts_end
        in_window = (window_start <= df['Timestamp']) & (df['Timestamp'] <= window_end)
    else:
        window_end = window_start + (i+1)*ts_inc
        in_window = (window_start <= df['Timestamp']) & (df['Timestamp'] < window_end)
    sel = df[in_window]
    # Print the shape only; dumping the frame on every iteration floods the output
    print(sel.shape)
    ht_dict = get_tweets_for_each_hashtag(sel)
    g = make_graph_from_hashtags(ht_dict)
    nx.write_edgelist(g, "../data/increments/edgelist_{}.csv".format(i+1), data=["source", "target"])

    if pause_between_increments:
        # Manual checkpoint between increments
        input("...")