In [1]:
# Functions: Analyse the evoltion of average positive and negative
import numpy as np
import pandas as pd
import dask.dataframe as dd
import igraph as ig
import matplotlib.pyplot as plt
import time
from fun.fun import *

pd.set_option('display.width', 500)

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
# Filenames
dataset_fn = "../dataset/TweetsCOV19.tsv"
plot_fn = "../images/plots/transitivity_evolution.png"

In [3]:
# -> IN : Read Edge List
edges_fn = "../data/edges.parquet"
print("reading edges ... ", end='')
start = time.time()
df = dd.read_parquet("../data/edges.parquet")
df[['source', 'target']] = df[['source', 'target']].astype('int32')
end = time.time()
print("read {:_} lines (took {:.1f}s)".format(len(df), (end-start)))
print(df.dtypes)
df.head()

reading edges ... read 684_732_453 lines (took 0.1s)
source    int64
target    int64
dtype: object


Unnamed: 0,source,target
0,13,103151
1,13,214293
2,103151,214293
3,13,138731
4,13,42023


In [4]:
# -> IN : Load tweets dataframe
tw = get_filtered_tweets_dataframe(dataset_fn)
print("Loaded {:_} tweets".format(len(tw)))
tw.head()

Importing dataset from tsv file ...read 8_077_794 lines (took 59.9s)
Converting timestamp column
Filtering desired columns and between desired dates ... 8_077_794 rows in dataframe
Parsing hashtags and positive/negative sentiments
filtering for tweets that contain hashtags ... 8_077_794 rows in dataframe
Loaded 462_901 tweets


Unnamed: 0_level_0,Username,Timestamp,Hashtags,Sentiment_pos,Sentiment_neg
Tweet Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1200928806757752833,83c182d0ee195dc692900d7ff7328171,2019-12-01 00:05:14+00:00,"[rgvwx, rgv, txwx, spi]",1,-2
1200931476642115586,73c3e83574f1eb0a5201c71cf46c732c,2019-12-01 00:15:50+00:00,[KeDezembaBoss],2,-4
1200941770835521536,4efca14ff68aa1edd9aea0c78c615ef2,2019-12-01 00:56:45+00:00,"[fame., FreeBritney]",2,-2
1200948819015454721,a53fd620b4178d04a1d2c4e37e85e0ae,2019-12-01 01:24:45+00:00,[FakeNews],1,-2
1200955132944560135,215a5a37e161f7ceaa0c532e73f62c40,2019-12-01 01:49:51+00:00,[KabukiPolitics],1,-1


In [95]:
# -> IN : Read list of nodes
nodes = pd.read_csv('../data/node_ids.csv', index_col='twitter id')
print("Number of nodes: {:_}".format(len(nodes)))
nodes.head()

Number of nodes: 410_885


Unnamed: 0_level_0,index
twitter id,Unnamed: 1_level_1
1200927495186505729,0
1200927503201816576,1
1200927507828097026,2
1200927511087067136,3
1200927514216062976,4


In [96]:
# Compute timestamp bins
steps = 100
ts_min, ts_max = min(tw['Timestamp']), max(tw['Timestamp'])
ts_inc = (ts_max - ts_min) / 100
ts_bins = [ ts_min + i*ts_inc for i in range(steps+1) ]

print("Min timestamp:", ts_min)
print("Max timestamp:", ts_max)
print("Timestamp increment:", ts_inc)

Min timestamp: 2019-12-01 00:00:01+00:00
Max timestamp: 2020-02-29 23:59:49+00:00
Timestamp increment: 0 days 21:50:23.880000


In [97]:
# Limit tweet ids to between timeframe and fetch index values
tw_sel = pd.DataFrame()
tw_sel['ids'] = np.array( tw[ (ts_bins[1] <= tw['Timestamp']) & (tw['Timestamp'] < ts_bins[2]) ].index )
tw_sel['ids'] = tw_sel['ids'].astype('int64')
tw_sel = tw_sel[tw_sel['ids'].isin(nodes.index)]
tw_sel['index'] = tw_sel['ids'].apply( lambda id: nodes.loc[id]['index'] )
tw_sel['index'] = tw_sel['index'].astype('int32')
#tw_sel.set_index('ids', inplace=True)
print(len(tw_sel))
node_index = { node: index for index, node in enumerate( sorted(tw_sel['index'].tolist()) ) }
print(node_index.values())

3253
dict_values([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 21

In [98]:
# fetch edges of nodes
edges = df[ (df['source'].isin(tw_sel['index'])) & (df['target'].isin(tw_sel['index'])) ].compute()
edges['source_i'] = edges['source'].apply( lambda x: node_index[x] )
edges['target_i'] = edges['target'].apply( lambda x: node_index[x] )
edges.head()
#edges.values.tolist()

Unnamed: 0,source,target,source_i,target_i
9999,3158,3380,4,226
65406,5484,4175,2330,1021
65408,5484,4116,2330,962
65410,5484,3375,2330,221
65414,5484,3710,2330,556


In [107]:
df = edges.copy()
df.drop(['source_i', 'target_i'], axis=1, inplace=True)
adjls = df.groupby('source')['target'].apply(list, meta=('target', 'object')).compute()
adjls

TypeError: list() takes no keyword arguments

In [62]:
indeces = tw_sel['index'].tolist()
for s, t in edges.values.tolist():
    #print(s, t)
    if s not in indeces or t not in indeces:
        print(s)
        break
    #print(t in tw_sel['index'])
    #print( not (s in tw_sel['index'] and t in tw_sel['index']) )
    #break
    #if not (s in tw_sel['index'] and t in tw_sel['index']):
    #    print(s, t)
    
print("done")

done


In [101]:
# iGraph
g = ig.Graph()
#vertices = tw_sel['index'].tolist()
g.add_vertices(len(node_index))
g.add_edges( edges[['source_i', 'target_i']].values.tolist() )
#g.add_edges(edges.values.tolist())
ig.summary(g)
gt = g.transitivity_undirected()
print(gt)

IGRAPH U--- 3253 21268 -- 
0.8945713453158072


In [91]:
g = ig.Graph()
nodes = [2,3,4,6,7,8,9]
ss = [3,4,6,7,8]
ts = [2,3,9,8,9]
node_index = {node: index for index, node in enumerate(nodes)}
edges = pd.DataFrame(data={'source' : ss, 'target' : ts})
edges['source_i'] = edges['source'].apply( lambda x: node_index[x] )
edges['target_i'] = edges['target'].apply( lambda x: node_index[x] )
edges
g.add_vertices(len(nodes))
g.add_edges(edges[['source_i', 'target_i']].values.tolist())
print(g)

IGRAPH U--- 7 5 --
+ edges:
0--1 1--2 3--6 4--5 5--6
