In [1]:
# Functions: Analyse the evoltion of average positive and negative
import numpy as np
import pandas as pd
import dask.dataframe as dd
import igraph as ig
import matplotlib.pyplot as plt
import time
from fun.fun import *

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
# Filenames
dataset_fn = "../dataset/TweetsCOV19.tsv"
plot_fn = "../images/plots/transitivity_evolution.png"

In [3]:
# -> IN : Read Edge List
edges_fn = "../data/edges.parquet"
print("reading edges ... ", end='')
start = time.time()
df = dd.read_parquet("../data/edges.parquet")
df[['source', 'target']], df[['source', 'target']].astype('int32')
end = time.time()
print("read {:_} lines (took {:.1f}s)".format(len(df), (end-start)))
print(df.dtypes)
df.head()

reading edges ... read 684_732_453 lines (took 0.1s)
source    int64
target    int64
dtype: object


Unnamed: 0,source,target
0,13,103151
1,13,214293
2,103151,214293
3,13,138731
4,13,42023


In [None]:
# create adjacency matrix
nodes = dd.concat([df['source'], df['target']]).unique().compute()
num_nodes = len(nodes)
print("found {:_} unique nodes".format(num_nodes))

adj_matrix = np.zeros((num_nodes, num_nodes), dtype=int)
for i, j in df.values:
    adj_matrix[i, j] = 1
    adj_matrix[j, i] = 1

In [3]:
# -> IN : Load tweets dataframe
tw = get_filtered_tweets_dataframe(dataset_fn)
print("Loaded {:_} tweets".format(len(tw)))
tw.head()

Importing dataset from tsv file ...

In [5]:
# -> IN : Read list of nodes
nodes = pd.read_csv('../data/node_ids.csv', index_col='twitter id')
print("Number of nodes: {:_}".format(len(nodes)))
nodes.head()


Number of nodes: 410_885


Unnamed: 0_level_0,index
twitter id,Unnamed: 1_level_1
1200927495186505729,0
1200927503201816576,1
1200927507828097026,2
1200927511087067136,3
1200927514216062976,4


In [None]:
# GRAPH (ALL EDGES) : Loading edges into iGraph
print("Creating iGraph ...")
g = ig.Graph()
g.add_vertices(len(nodes))
print("Processing {} partitions ...".format(df.npartitions))
for i, partition in enumerate(df.to_delayed()):
    p_df = partition.compute()
    g.add_edges(p_df.values.tolist())
    _, perc = track_progress(df.npartitions, i, text="partitions processed:")
    #if perc > 1: break
    #if input("...") == 'b': break
    if i+1 >= 5: break
print("\nDone.")
print("\nGRAPH CREATED:")
print("  nodes: {:_}".format(len(g.vs)))
print("  edges: {:_}".format(len(g.es)))

In [None]:
# get transitivity
glob_transitivity = g.transitivity_undirected()
print(glob_transitivity)

In [None]:
from math import comb
n = 410000
triplets = comb(n,3)
triangles = glob_transitivity * triplets
print(f"{triplets:_}")
print(f"{triangles:_}")

In [6]:
# Compute timestamp bins
increments = 100
ts_min, ts_max = min(tw['Timestamp']), max(tw['Timestamp'])
ts_inc = (ts_max - ts_min) / 100
ts_bins = [ ts_min + (i+1)*ts_inc for i in range(increments) ]

print("Min timestamp:", ts_min)
print("Max timestamp:", ts_max)
print("Timestamp increment:", ts_inc)

Min timestamp: 2019-12-01 00:00:01+00:00
Max timestamp: 2020-02-29 23:59:49+00:00
Timestamp increment: 0 days 21:50:23.880000


In [7]:
# Compute average cumulative sentiment for timestamp increments
ts = ts_min + 10*ts_inc
tweets = tw[tw['Timestamp'] < ts]
len(tweets)

47303