In [None]:
# Lines (tweets): 8_151_524
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
pd.set_option('display.width', 500)

In [None]:
def show_distribution(values, one_width_bins=False, item_name="mentions"):
    """Print summary statistics and plot linear and log-scale histograms.

    Parameters
    ----------
    values : sequence of numbers
        The sample to summarise (e.g. number of hashtags per tweet).
    one_width_bins : bool, default False
        If True, use one bin per integer between the minimum and maximum of
        `values`. Otherwise use 25 equal-width bins.
    item_name : str, default "mentions"
        Name of the counted item, inserted into both plot titles.

    Returns
    -------
    None
        Output is printed and plotted; nothing is returned.
    """
    values = np.asarray(values)
    if values.size == 0:
        # min/max are undefined for an empty sample and there is nothing to plot.
        print("\nSTATS:")
        print("(no values)")
        return

    largest = values.max()
    if one_width_bins:
        # Bin edges must extend one past the maximum. The last histogram bin is
        # closed on both sides, so stopping at `max` would merge the counts of
        # `max - 1` and `max` (and give a single, unusable edge when min == max).
        bins = np.arange(values.min(), largest + 2)
    else:
        bins = 25

    print("\nSTATS:")
    print("max: {:_}".format(largest))
    print("mean: {:.2f}".format(np.mean(values)))

    fig, ax = plt.subplots(ncols=2, figsize=(10,4))
    ax[0].hist(values, bins=bins)
    ax[0].set_title('Histogram of amount of {} per tweet'.format(item_name))
    ax[1].hist(values, log=True, bins=bins)
    ax[1].set_title('Logarithmic Histogram of amount of {} per tweet'.format(item_name))
    plt.show()

In [None]:
# Load the tab-separated tweet dump. Column names and dtypes are passed
# explicitly via `names=` / `dtype=`; malformed lines only produce a warning.
dataset_fn = "dataset/TweetsCOV19.tsv"
header = [
    "Tweet Id", "Username", "Timestamp",
    "Followers", "Friends", "Retweets", "Favorites",
    "Entities", "Sentiment", "Mentions", "Hashtags", "URLs", "EXTRA",
]
# The four counter columns are integers; everything else is read as text.
int_columns = {"Followers", "Friends", "Retweets", "Favorites"}
dtype = {column: (int if column in int_columns else "string") for column in header}
df = pd.read_csv(
    dataset_fn,
    sep='\t',
    names=header,
    dtype=dtype,
    on_bad_lines='warn',
).set_index('Tweet Id')
for preview in (df.shape, df.head(), df.tail()):
    print(preview)

In [None]:
# Turn the textual "Timestamp" column into datetimes. The format string reads:
# weekday, month, day, time, UTC offset, year.
timestamp_format = '%a %b %d %H:%M:%S %z %Y'
parsed_timestamps = pd.to_datetime(df['Timestamp'], format=timestamp_format)
df['Timestamp'] = parsed_timestamps
print(df.head())

In [None]:
# Keep only the columns used further down and restrict the data to tweets
# with start_date <= Timestamp < end_date.
start_date = pd.to_datetime('2019-12-01 00:00:00 +0000')
end_date = pd.to_datetime('2020-03-01 00:00:00 +0000')
kept_columns = ["Username", "Timestamp", "Sentiment", "Hashtags"]
in_window = (df['Timestamp'] >= start_date) & (df['Timestamp'] < end_date)
dff = df.loc[in_window, kept_columns]
print(dff.shape)
print(dff.head())

In [None]:
# Parse the whitespace-separated "Hashtags" field into a list per tweet,
# dropping the "null;" placeholder; non-string (missing) values become [].
hashtags_parsed = dff['Hashtags'].str.split().apply(
    lambda terms: [name for name in terms if name != "null;"] if isinstance(terms, list) else []
)

# Split "Sentiment" ("<positive> <negative>") into two integer columns.
sentiment_parts = dff['Sentiment'].str.split(" ", expand=True)
if sentiment_parts.shape[1] != 2:
    raise ValueError(
        "Expected 'Sentiment' to split into 2 parts, got {}".format(sentiment_parts.shape[1])
    )

# Build the result in one expression and rebind `dff` only at the end:
#  * no assignment into a filtered slice (avoids SettingWithCopyWarning);
#  * the cell is atomic - if it is re-run after "Sentiment" was dropped, the
#    KeyError above fires before `dff` is touched, instead of first replacing
#    every parsed hashtag list with [].
dff = dff.assign(
    Hashtags=hashtags_parsed,
    Sentiment_pos=sentiment_parts[0].astype(int),
    Sentiment_neg=sentiment_parts[1].astype(int),
).drop(columns="Sentiment")

In [None]:
# Keep tweets that have at least one hashtag and fewer than 60 of them
# (tweets with 60 or more are treated as outliers).
hashtag_counts = dff['Hashtags'].apply(len)
has_reasonable_count = (hashtag_counts > 0) & (hashtag_counts < 60)
with_hashtags = dff[has_reasonable_count]
print(with_hashtags.shape)

In [None]:
# Plot how many distinct hashtags each tweet carries
print("Extracting amount of hashtags ...")
distinct_counts = with_hashtags['Hashtags'].apply(lambda tags: len(set(tags)))
hashtags_n = np.array(distinct_counts)
print("Sorting ...")
# Largest counts first
hashtags_n = list(hashtags_n)
hashtags_n.sort(reverse=True)
print("Showing distribution ...")
show_distribution(hashtags_n, one_width_bins=True)

In [None]:
# Build an inverted index: hashtag -> list of ids of the tweets that use it.
# NOTE: `dict` shadows the builtin; the name is kept because later cells read
# it. `df` is rebound here as in the original cell.
df = with_hashtags
array_col, id_col = "Hashtags", "Tweet Id"
# "Tweet Id" was moved into the index when the dataset was loaded, so it is not
# a column and row["Tweet Id"] raises KeyError. Read the ids from the index
# when it carries that name, otherwise from the column.
tweet_ids = df.index if df.index.name == id_col else df[id_col]
dict = {}
print("Getting list of ids per hashtag ...")
total = len(df)
i = 0
for i, (tweet_id, terms) in enumerate(zip(tweet_ids, df[array_col]), start=1):
    # Refresh the progress line periodically instead of once per row.
    if i % 10_000 == 0 or i == total:
        perc = i / total * 100
        print("\r {:_}/{:_} ({:.1f}%)".format(i, total, perc), end='')
    # set(): a hashtag repeated within one tweet must list the tweet only once.
    for term in set(terms):
        # Append in place; `get(term, []) + [id]` copied the whole list each time.
        dict.setdefault(term, []).append(tweet_id)
print("\nDone.")
print("Found {:_} unique hashtags".format(len(dict)))

In [None]:
# Keep only the hashtags that are associated with more than one tweet
dictf = {}
for term, term_ids in dict.items():
    if len(term_ids) > 1:
        dictf[term] = term_ids
print("Found {:_} hashtags with more than 1 associated tweet".format(len(dictf)))

In [None]:
# Show the (up to) five hashtags with the most associated tweets
keys_sorted = sorted(dict.keys(), reverse=True, key=lambda key: len(dict[key]))
# Slice instead of indexing range(5): no IndexError when fewer than 5 hashtags exist.
for key in keys_sorted[:5]:
    print("key: '{}' number of tweets: {:_}".format(key, len(dict[key])))

In [None]:
# Count the edges that will be created: every hashtag contributes one edge per
# unordered pair of its tweets, i.e. C(len(ids), 2).
from math import comb
# Sum over ALL hashtags; the previous `[:-1]` slice silently skipped the last one.
edges_n = sum(comb(len(ids), 2) for ids in dictf.values())
print("Number of edges that will be created: {:_}".format(edges_n))

In [None]:
# Write one "<id_a> <id_b>" line for every pair of tweets that share a hashtag.
# The loop stops right after the first hashtag whose progress percentage
# exceeds 0.01, so only the edges of the first few hashtags are written.
print("Creating edges ...")
edges_fn = "data/edges.txt"
with open(edges_fn, 'w') as f:
    edges_created = 0
    total_terms = len(dictf)
    for done, ids in enumerate(dictf.values()):
        perc = (done+1) / total_terms * 100
        print("\r {:_}/{:_} ({:.3f}%) edges: {:_}".format(done+1, total_terms, perc, edges_created), end='')
        # Pair every id with each id that comes after it in the list
        for position, source in enumerate(ids):
            for target in ids[position+1:]:
                f.write("{} {}\n".format(source, target))
                edges_created += 1
        if perc > 0.01:
            break
print("\nDone!")

In [None]:
# Write one "source,target" CSV row for every pair of tweets that share a hashtag.
edges_csv_fn = "data/edges.csv"
print("Creating edges ...")
# Start the counter here: previously it was inherited from the edges.txt cell,
# so the reported total was wrong (or a NameError if that cell had not run).
edges_created = 0
# `with` closes the file even if the loop raises or is interrupted.
with open(edges_csv_fn, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(["source", "target"])
    for done, (_, ids) in enumerate(dictf.items()):
        perc = (done+1) / len(dictf) * 100
        print("\r {:_}/{:_} ({:.3f}%) edges: {:_}".format(done+1, len(dictf), perc, edges_created), end='')
        # Pair every id with each id that comes after it in the list
        for i in range(len(ids)):
            for j in range(i+1, len(ids)):
                csvwriter.writerow([ids[i], ids[j]])
                edges_created += 1
print("\nDone!")

In [None]:
# Small self-contained check of the edge-building logic on a toy dataframe.
# NOTE: this cell rebinds `df`, `dict` and `edges_fn`, as the original did.
import random
from pathlib import Path

df2 = pd.DataFrame()
n = 5
# random.sample guarantees distinct ids; independent random draws could collide
# and silently merge two test tweets into one node.
df2['id'] = random.sample(range(10000), n)
df2['Mentions'] = [ ['a'], ['b', 'c'], ['a', 'c'], ['b', 'd'], ['e', 'd', 'a'] ]
mentions_n = np.array(df2['Mentions'].apply(len))
print(type(mentions_n))
print(mentions_n)

# GET DICT: mention -> list of ids of the rows that contain it
df = df2
array_col, id_col = "Mentions", "id"
dict = {}
print("Getting list of ids per mention ...")
i = 0
for i, (row_id, mentions) in enumerate(zip(df[id_col], df[array_col]), start=1):
    perc = i / len(df) * 100
    print("\r {:_}/{:_} ({:.1f}%)".format(i, len(df), perc), end='')
    for mention in set(mentions):
        # Append in place rather than rebuilding the list on every insert.
        dict.setdefault(mention, []).append(row_id)
print("\nDone.")
print("Found {:_} unique mentions".format(len(dict)))

# CREATE AND SAVE EDGES
print("Creating edges ...")
# Undirected, de-duplicated edges: each pair is stored once as (smaller, larger).
edges = set()
for ids in dict.values():
    for i in range(len(ids)):
        for j in range(i+1, len(ids)):
            edges.add(tuple(sorted((ids[i], ids[j]))))

# NOTE(review): this is the same path the real edge list is written to, so
# running this cell overwrites it - confirm that is intended.
edges_fn = "data/edges.txt"
Path(edges_fn).parent.mkdir(parents=True, exist_ok=True)
with open(edges_fn, 'w') as f:
    # The original opened (and thereby truncated) the file but never wrote the
    # edges; write them out, sorted so the output is stable for a given id set.
    for source, target in sorted(edges):
        f.write("{} {}\n".format(source, target))
print("\nDone!")