In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.width', 500)
import time
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import igraph as ig
from fun.fun import *

In [None]:
def get_tweets_dataframe(dataset_fn="../dataset/TweetsCOV19.tsv"):
    """Load the TweetsCOV19 TSV dump and return the tweets that carry hashtags.

    The file is read without a header row, the ``Timestamp`` column is parsed,
    and rows are restricted to the window 2019-12-01 (inclusive) to
    2020-03-01 (exclusive), both at +0000. The ``Hashtags`` column is split on
    whitespace into a list (the ``"null;"`` placeholder is dropped) and the
    ``Sentiment`` column ("<pos> <neg>") is split into two integer columns.

    Parameters
    ----------
    dataset_fn : str
        Path of the tab-separated dataset. Defaults to the location the
        notebook has always used.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Tweet Id`` with the columns ``Username``, ``Timestamp``,
        ``Hashtags``, ``Sentiment_pos`` and ``Sentiment_neg``; only rows with
        at least one and fewer than 60 hashtags are kept.
    """
    header = ["Tweet Id", "Username", "Timestamp", "Followers", "Friends", "Retweets", "Favorites", "Entities", "Sentiment", "Mentions", "Hashtags", "URLs", "EXTRA"]
    dtype = {"Tweet Id":"string", "Username":"string", "Timestamp":"string", "Followers":int, "Friends":int, "Retweets":int, "Favorites":int, "Entities":"string", "Sentiment":"string", "Mentions":"string", "Hashtags":"string", "URLs":"string", "EXTRA":"string"}
    max_hashtags = 60  # rows with this many hashtags or more are treated as outliers

    # Import dataset from tsv file
    print("Importing dataset from tsv file ...", end='')
    start = time.time()
    df = pd.read_csv(dataset_fn, sep='\t', names=header, on_bad_lines='warn', dtype=dtype)
    end = time.time()
    print("read {:_} lines (took {:.1f}s)".format(len(df), end-start))
    df = df.set_index('Tweet Id')

    # Convert timestamp column to Timestamp objects
    print("Converting timestamp column")
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%a %b %d %H:%M:%S %z %Y')

    # Keep only the needed columns and the rows inside the date window.
    # .copy() gives an independent frame so the column assignments below do
    # not write into a slice of `df`.
    print("Filtering desired columns and between desired dates ... ", end='')
    start_date = pd.to_datetime('2019-12-01 00:00:00 +0000')
    end_date = pd.to_datetime('2020-03-01 00:00:00 +0000')
    in_window = (df['Timestamp'] >= start_date) & (df['Timestamp'] < end_date)
    dff = df.loc[in_window, ["Username", "Timestamp", "Sentiment", "Hashtags"]].copy()
    # Report the size of the filtered frame, not of the raw one
    print("{:_} rows in dataframe".format(len(dff)))

    # Parse the hashtags column into a list per tweet (missing -> empty list)
    print("Parsing hashtags and positive/negative sentiments")
    dff['Hashtags'] = dff['Hashtags'].str.split().apply(
        lambda tags: [tag for tag in tags if tag != "null;"] if isinstance(tags, list) else []
    )

    # Split positive and negative sentiment into their own integer columns.
    # reindex guarantees both columns exist even when no row survived the
    # date filter (split(expand=True) then yields a frame without columns).
    sentiment = dff['Sentiment'].str.split(" ", expand=True).reindex(columns=[0, 1])
    dff['Sentiment_pos'] = sentiment[0].astype(int)
    dff['Sentiment_neg'] = sentiment[1].astype(int)
    dff = dff.drop(columns="Sentiment")

    # Keep rows that have hashtags (and fewer than the outlier threshold)
    print("filtering for tweets that contain hashtags ... ", end='')
    n_tags = dff['Hashtags'].map(len)
    ht = dff[(n_tags > 0) & (n_tags < max_hashtags)]
    print("{:_} rows in dataframe".format(len(ht)))

    return ht

In [None]:
# Variables
# Relative path of the edge-list CSV that the cells below read with pd.read_csv.
edges_fn = "../data/edges.csv"
# Hardcoded row count; the read cells multiply it by `perc`/100 to get `nrows`.
# NOTE(review): presumably the total number of edge rows in edges.csv —
# confirm and update if the file is regenerated.
edges_total = 684_732_453 # hardcoded

In [None]:
# Load the edge list into a dataframe.
# `perc` is the percentage of `edges_total` rows to read (column dtypes are
# left to pandas' inference here).
perc = 100
nrows = int(edges_total * perc / 100)
print("reading edges ... ", end='')
start = time.time()
df = pd.read_csv(filepath_or_buffer=edges_fn, nrows=nrows)
end = time.time()
print("read {:_} lines (took {:.1f}s)".format(df.shape[0], end - start))

In [None]:
# Show which dtype the 'source' column currently has
print(df.dtypes['source'])

In [None]:
# Read the edge list again, this time forcing both endpoint columns to int32
# instead of relying on the inferred dtype.
perc = 100
nrows = int(edges_total * perc / 100)
edge_dtypes = {'source': 'int32', 'target': 'int32'}
print("reading edges ... ", end='')
start = time.time()
df = pd.read_csv(filepath_or_buffer=edges_fn, nrows=nrows, dtype=edge_dtypes)
end = time.time()
print("read {:_} lines (took {:.1f}s)".format(df.shape[0], end - start))

In [None]:
# Build an igraph graph from the rows of the edge dataframe and time it
print("creating graph ... ", end='')
start = time.time()
g = ig.Graph.TupleList(df.to_numpy())
end = time.time()
print(
    "created graph with {:_} nodes and {:_} edges (took {:.1f}s)".format(
        g.vcount(), g.ecount(), end - start
    )
)

In [None]:
# Calculate degree centrality of graph via edge list.
# Every appearance of a node id in either the 'source' or the 'target' column
# counts as one unit of degree, so the degree is simply the frequency of the id
# in the concatenation of both columns. value_counts does that counting in
# vectorised code instead of a Python-level loop over every edge endpoint, and
# it no longer overwrites the global `perc` (the read-percentage setting) with
# a progress value.
degree_cent = (
    pd.concat([df['source'], df['target']], ignore_index=True)
    .value_counts(sort=False)  # no need to order by frequency
    .to_dict()                 # {node id: degree}
)
print("\nDone.")
print(len(degree_cent))

In [None]:
# Estimate the size of a node-list representation of the graph:
# each node contributes `base` units for itself plus `base` units per unit of
# degree. The printed figure is the total divided by 1_000_000.
# NOTE(review): `base = 19` is presumably a per-entry size in bytes — confirm.
base = 19
size = sum(base * (1 + degree) for degree in degree_cent.values())
print("{:_}".format(size/1000000))

In [None]:
# Histogram of the per-node degrees using 50 bins
values = degree_cent.values()
fig, ax = plt.subplots()
ax.hist(values, bins=50)
plt.show()

In [None]:
def plot_power_law_fit(values, title='power law fit check', figsize=(5,5), ci=0.95):
    """Plot a log-log cumulative count of ``values`` with an OLS regression line.

    40 geometrically spaced thresholds are placed between the minimum and the
    maximum of ``values``; for each threshold ``x`` the number of values
    ``<= x`` is counted. ln(count) is regressed on ln(x) with statsmodels OLS.
    The figure shows the samples, the fitted line, and the fitted line shifted
    to the lower and upper confidence bound of the intercept.

    Parameters
    ----------
    values : iterable of numbers
        Strictly positive, finite samples (any iterable, e.g. a dict view).
    title : str
        Title of the figure.
    figsize : tuple
        Figure size passed to ``plt.subplots``.
    ci : float
        Confidence level of the intercept interval (``alpha = 1 - ci``).

    Raises
    ------
    ValueError
        If ``values`` is empty, contains non-positive or non-finite entries,
        or consists of a single distinct value (no line can be fitted).
    """
    # Materialise once and sort so the cumulative counts can be obtained with
    # a binary search instead of rescanning all values for every threshold.
    data = np.sort(np.fromiter(values, dtype=float))
    if data.size == 0:
        raise ValueError("values must not be empty")
    v_min, v_max = data[0], data[-1]  # NaN sorts last, so it shows up in v_max
    if not (v_min > 0 and np.isfinite(v_max)):
        raise ValueError("values must be strictly positive and finite for a log-log fit")
    if v_min == v_max:
        raise ValueError("values must contain at least two distinct values")

    # geomspace pins the first/last threshold to exactly v_min/v_max, so the
    # first count can never be 0 (which would put log(0) = -inf into the fit).
    X = np.geomspace(v_min, v_max, 40)
    Y = np.searchsorted(data, X, side='right')  # number of values <= x

    lnX, lnY = np.log(X), np.log(Y)

    lnX_con = sm.add_constant(lnX)
    lr = sm.OLS(lnY, lnX_con).fit()
    y_int, grad = lr.params
    # First row of conf_int belongs to the constant (intercept) term
    (y_int_lower, y_int_upper), _ = lr.conf_int(1-ci)

    _, ax = plt.subplots(figsize=figsize)
    ax.plot(lnX, lnY, 'r.', label='samples')
    ax.plot(lnX, y_int + grad*lnX, 'g', label='regression line')
    ax.plot(lnX, y_int_lower + grad*lnX, 'b', label='lower ci ({})'.format(ci))
    ax.plot(lnX, y_int_upper + grad*lnX, 'b', label='upper ci ({})'.format(ci))

    ax.set_title(title)
    # NOTE(review): the x axis carries ln(threshold) and the y axis
    # ln(cumulative count); the label texts are kept as they were — confirm
    # they say what is intended.
    ax.set_ylabel('log(n)')
    ax.set_xlabel('log(p_k)')
    ax.legend(loc=0, fontsize=10)

    plt.show()

In [None]:
# Power-law check on the degree values.
# NOTE(review): the title says 1% while the edge-reading cells set perc = 100 —
# confirm which sample size this figure is meant to describe.
plot_power_law_fit(
    values=values,
    title='1% of whole graph',
)