In [1]:
# Functions:
# 1) Reads degree centrality values from csv,
# 2) displays histograms and 
# 3) analyses power law fit
import time
from math import comb

import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm

from fun.fun import *

pd.set_option('display.width', 500)

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [4]:
# File locations used by this notebook (relative paths)
dataset_fn = '../dataset/TweetsCOV19.tsv'   # tweets dataset (TSV) loaded below
edges_fn = '../data/edges.csv'              # edge list, CSV
edges_prq = '../data/edges.parquet'         # edge list, Parquet

In [5]:
# -> IN : build the tweets dataframe from the TSV dataset.
# The helper is not defined in this notebook (presumably star-imported from
# fun.fun); it logs its own loading/filtering steps while it runs.
tw = get_filtered_tweets_dataframe(dataset_fn)
n_tweets = len(tw)
print("Loaded {:_} tweets".format(n_tweets))
# Leave the frame as the last expression so the notebook renders it
tw

Importing dataset from tsv file ...read 8_077_794 lines (took 38.3s)
Converting timestamp column
Filtering desired columns and between desired dates ... 8_077_794 rows in dataframe
Parsing hashtags and positive/negative sentiments
filtering for tweets that contain hashtags ... 8_077_794 rows in dataframe
Loaded 462_901 tweets


Unnamed: 0_level_0,Username,Timestamp,Hashtags,Sentiment_pos,Sentiment_neg
Tweet Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1200928806757752833,83c182d0ee195dc692900d7ff7328171,2019-12-01 00:05:14+00:00,"[rgvwx, rgv, txwx, spi]",1,-2
1200931476642115586,73c3e83574f1eb0a5201c71cf46c732c,2019-12-01 00:15:50+00:00,[KeDezembaBoss],2,-4
1200941770835521536,4efca14ff68aa1edd9aea0c78c615ef2,2019-12-01 00:56:45+00:00,"[fame., FreeBritney]",2,-2
1200948819015454721,a53fd620b4178d04a1d2c4e37e85e0ae,2019-12-01 01:24:45+00:00,[FakeNews],1,-2
1200955132944560135,215a5a37e161f7ceaa0c532e73f62c40,2019-12-01 01:49:51+00:00,[KabukiPolitics],1,-1
...,...,...,...,...,...
1233891411109597185,aac81411d4a07a67c96b72655974ed12,2020-02-29 23:06:51+00:00,"[ai, ml, dl]",2,-1
1233896872961552384,4df9baa19ac46918148219d1090740e0,2020-02-29 23:28:33+00:00,"[SKHwy17:, SKHwy3,]",1,-1
1233898288413388807,a19f22a61cd4250367ef67fe2e229f77,2020-02-29 23:34:11+00:00,"[ThinkBIGSundayWithMarsha, SundayThoughts, Sun...",3,-1
1233903008154517510,55b8516279faf5f0c30d2dd81a1dc4b9,2020-02-29 23:52:56+00:00,"[Coronavirius, DNCisCorrupt, DNCRigging]",1,-1


In [8]:
# Count how often each hashtag occurs across all tweets:
# `hashtags` maps hashtag -> number of occurrences.
# NOTE(review): a hashtag repeated inside a single tweet is counted once per
# repetition — confirm the loader de-duplicates per tweet if this count is
# meant to be "number of tweets using the hashtag".
hashtags = {}
total, i = len(tw), 0
# Walk the 'Hashtags' column directly; iterrows() would build a full Series
# for every row only to read this one field.
for tweet_hashtags in tw['Hashtags']:
    for ht in tweet_hashtags:
        hashtags[ht] = hashtags.get(ht, 0) + 1
    # track_progress returns the updated counter and a percentage value
    i, perc = track_progress(total, i)
print("\nDone")

 progress: 462_901/462_901 (100.00000%)

Done


In [29]:
# Rank hashtags from most to least frequent, then keep only those that
# were counted more than once.
keys_sorted = sorted(hashtags, key=hashtags.get, reverse=True)
print("Hashtags used: {:_}".format(len(keys_sorted)))
keys_sorted = [tag for tag in keys_sorted if hashtags[tag] > 1]
print("Hashtags with more than 1 tweet: {:_}".format(len(keys_sorted)))

Hashtags used: 292_264
Hashtags with more than 1 tweet: 81_931


In [26]:
# Build a per-hashtag table, ordered as in keys_sorted:
#   hashtag - the hashtag text
#   freq    - its count taken from the `hashtags` dict
#   edges   - C(freq, 2) = freq * (freq - 1) / 2, the number of unordered
#             pairs that can be formed from `freq` items
htt = pd.DataFrame({'hashtag': keys_sorted})
# Dictionary lookup for the whole column at once (every key comes from
# `hashtags`, so no value is missing)
htt['freq'] = htt['hashtag'].map(hashtags)
# Vectorised closed form of "freq choose 2"; the product of two consecutive
# integers is always even, so floor division is exact
htt['edges'] = htt['freq'] * (htt['freq'] - 1) // 2
print(len(htt))
htt.head(10)

81931


Unnamed: 0,hashtag,freq,edges
0,coronavirus,27570,380038665
1,China,10033,50325528
2,spotifywrapped,8004,32028006
3,ShowStopperAsim,7838,30713203
4,COVID19,7370,27154765
5,Coronavirus,6327,20012301
6,CoronavirusOutbreak,6107,18644671
7,1.,5553,15415128
8,NoMeat_NoCoronaVirus,3926,7704775
9,Wuhan,3863,7459453
