In [1]:
import pandas as pd
import numpy as np
import json
import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import LabelEncoder
from dateutil.relativedelta import relativedelta
from matplotlib import pyplot as plt
import math
import seaborn as sns
sns.set()

### Files locations

In [251]:
FILE_USERS = "../../data/raw_users.csv"
FILE_USERS_TWEETS = "../../data/raw_tweets.csv"
FILE_RETWEETERS = "../../data/retweets.csv"
FILE_RETWEETERS_USERS = "../../data/retweets_users.csv"
FINAL_DATASET = "../../data/tweets_2020_2021.csv"

### Load Datasets

In [204]:
df_users = pd.read_csv(filepath_or_buffer=FILE_USERS, sep=",")
df_tweets = pd.read_csv(filepath_or_buffer=FILE_USERS_TWEETS, sep=",")

# df_users = df_users[:100]
# df_tweets = df_tweets[:100]

  df_users = pd.read_csv(filepath_or_buffer=FILE_USERS, sep=",")


In [234]:
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1072387 entries, 0 to 1072386
Data columns (total 18 columns):
 #   Column             Non-Null Count    Dtype 
---  ------             --------------    ----- 
 0   index              1072387 non-null  int64 
 1   Unnamed: 0         1072387 non-null  int64 
 2   tweet_id           1072387 non-null  int64 
 3   text               1072387 non-null  object
 4   user_id            1072387 non-null  int64 
 5   timestamp          1072387 non-null  object
 6   tweet_type         1072387 non-null  object
 7   like_count         1072387 non-null  int64 
 8   reply_count        1072387 non-null  int64 
 9   retweet_count      1072387 non-null  int64 
 10  quote_count        1072387 non-null  int64 
 11  device             1037514 non-null  object
 12  lang               1072387 non-null  object
 13  topics_ids         366686 non-null   object
 14  topics             366686 non-null   object
 15  referenced_tweets  257787 non-null   object
 16  

### Sort tweets by timestamp

In [207]:
df_tweets = df_tweets.sort_values(by='timestamp', ascending=True).reset_index()
df_tweets.head(5)

Unnamed: 0.1,index,Unnamed: 0,tweet_id,text,user_id,timestamp,tweet_type,like_count,reply_count,retweet_count,quote_count,device,lang,topics_ids,topics,referenced_tweets
0,1028700,1028700,1212176422770544641,2️⃣0️⃣1️⃣9️⃣ WHAT. A. YEAR. 🌟 #bestnine2019 #t...,29885607,2020-01-01 00:59:14,Original Tweet,35,0,1,0,iPhone,en,[],[],
1,1028701,1028701,1212176421659062274,Gold N Hot 🤪 https://t.co/VeO2k4Mq4h,1130217871286636545,2020-01-01 00:59:14,Quote Tweet,0,0,0,0,iPhone,en,[],[],"[ReferencedTweet(id: 1212167212120920072, type..."
2,1028698,1028698,1212176423189909506,and he still cheating https://t.co/zYAQ8wc6Vg,3322006549,2020-01-01 00:59:15,Quote Tweet,2,2,1,7,iPhone,en,"['10', '60']","['Person', 'Athlete']","[ReferencedTweet(id: 1211878384961228802, type..."
3,1028697,1028697,1212176425916280833,Got my hot coco bc it cold out siiiiide boii h...,18707915,2020-01-01 00:59:15,Original Tweet,1,1,0,0,iPhone,en,[],[],
4,1028696,1028696,1212176426088181760,Disgruntled Pope Francis pulls himself free fr...,858762062,2020-01-01 00:59:15,Original Tweet,0,0,0,0,Android,en,"['10', '46', '47']","['Person', 'Brand Category', 'Brand']",


### Delete duplicate users

In [209]:
df_users = df_users.drop_duplicates(subset=['user_id'], keep="first")
df_users

Unnamed: 0.1,Unnamed: 0,user_id,name,username,followers,following,tweet_count,verified,created_at
0,0,874735375,ARobles,anthonysradical,278,203,3844,False,2012-10-12 01:55:26+00:00
1,1,2331261888,James Smith Jr,iamjamesnumber1,2292,4490,96610,False,2014-02-07 04:06:30+00:00
2,2,1043073024092766208,"STOP SAYING POC, IT’s A CUSS WORD",ADOSALLY1,1076,1190,13969,False,2018-09-21 09:42:43+00:00
3,3,1350969232897552385,Pure Rap Podcast,PureRapPodcast,2465,2111,27373,False,2021-01-18 00:52:30+00:00
4,4,811604288443187200,April Showers ⛈🌪💫,moonlitefire,1570,874,27642,False,2016-12-21 16:08:36+00:00
...,...,...,...,...,...,...,...,...,...
1072375,1072375,324616347,1056043,TwitTitty10,388,251,38688,False,2011-06-26 23:10:32
1072376,1072376,249105428,1056044,gwarmothwftv,10598,189,17492,True,2011-02-08 11:19:53
1072381,1072381,395058107,1056049,sgeezy99,572,773,6236,False,2011-10-21 02:06:15
1072382,1072382,608420462,1056050,Skeeter_Bentley,388,0,5846,False,2012-06-14 19:30:38


## Calculate tweet reach

#### These values indicate the maximum possible of people that could have seen a given tweet

To do this, a sum of all the original tweet user's followers is made with the followers from all the users that retweeted/quoted the tweet

Note: this assumes that if a retweet is then shared by retweet again, those next audiences are not taken into account

In [210]:
df_retweeters = pd.read_csv(filepath_or_buffer=FILE_RETWEETERS_USERS, sep=",")
df_retweets = pd.read_csv(filepath_or_buffer=FILE_RETWEETERS, sep=",")

In [211]:
df_retweets.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,text,user_id,timestamp,tweet_type,like_count,reply_count,retweet_count,quote_count,device,lang,topics_ids,topics,referenced_tweets
0,0,1475758320568971271,RT @Keiththebat: Zack Snyder's back must be so...,3457095793,2021-12-28 09:19:36,Retweet Tweet,0,0,4,0,Android,en,[],[],"[ReferencedTweet(id: 1473095800364351490, type..."
1,1,1473588933883801601,RT @Keiththebat: Zack Snyder's back must be so...,1206541010408333313,2021-12-22 09:39:14,Retweet Tweet,0,0,4,0,Bot,en,[],[],"[ReferencedTweet(id: 1473095800364351490, type..."
2,2,1473588921749573635,RT @Keiththebat: Zack Snyder's back must be so...,1369228958739263488,2021-12-22 09:39:11,Retweet Tweet,0,0,4,0,Android,en,[],[],"[ReferencedTweet(id: 1473095800364351490, type..."
3,3,1473430854680838144,RT @gameallniteshow: Getting in some Coffee Tr...,3223132542,2021-12-21 23:11:05,Retweet Tweet,0,0,2,0,Android,en,[],[],"[ReferencedTweet(id: 1473095644709535745, type..."
4,4,1473324608963100679,RT @Tails7354: Sonic X: FIU Eye-Catch Card: Fl...,1256240643258671105,2021-12-21 16:08:54,Retweet Tweet,0,0,2,0,iPhone,en,[],[],"[ReferencedTweet(id: 1473200836071329793, type..."


In [212]:
df_retweeters.head()

Unnamed: 0.1,Unnamed: 0,user_id,name,username,followers,following,tweet_count,verified,created_at
0,0,3457095793,SALEH ALOTAIBI,SALEHAL49202807,29,0,7473,False,2015-09-05 07:49:00
1,1,1206541010408333313,DC Tweets,RetweetDc,7851,40,406311,False,2019-12-16 11:47:16
2,2,1369228958739263488,Cottencandy,Cottenc57931689,100,309,30127,False,2021-03-09 10:10:16
3,3,3223132542,アマノ,sikkoukakari,3383,4707,10649,False,2015-05-22 10:48:05
4,4,1256240643258671105,💙Sonic The Dragonhog🌙 (On hiatus),SDragonhog,657,1017,121701,False,2020-05-01 15:15:05


### Removing duplicated retweeters

37432 retweeters will be removed

In [213]:
df_retweeters.shape

(278349, 9)

In [214]:
df_retweeters = df_retweeters.drop_duplicates(subset=['user_id'], keep="first")
df_retweeters

Unnamed: 0.1,Unnamed: 0,user_id,name,username,followers,following,tweet_count,verified,created_at
0,0,3457095793,SALEH ALOTAIBI,SALEHAL49202807,29,0,7473,False,2015-09-05 07:49:00
1,1,1206541010408333313,DC Tweets,RetweetDc,7851,40,406311,False,2019-12-16 11:47:16
2,2,1369228958739263488,Cottencandy,Cottenc57931689,100,309,30127,False,2021-03-09 10:10:16
3,3,3223132542,アマノ,sikkoukakari,3383,4707,10649,False,2015-05-22 10:48:05
4,4,1256240643258671105,💙Sonic The Dragonhog🌙 (On hiatus),SDragonhog,657,1017,121701,False,2020-05-01 15:15:05
...,...,...,...,...,...,...,...,...,...
278344,278344,3736161195,‏َ,noahsoIo,4020,290,29116,False,2015-09-22 06:42:48
278345,278345,1220520282017976320,Ju Quer Abraçar O Josh🇨🇦♥️,uniter_juju,11,79,57,False,2020-01-24 01:35:01
278346,278346,42664231,Gilbert Economic Dev,GilbertAZEcoDev,3215,557,3657,True,2009-05-26 16:51:49
278347,278347,86356349,"Gilbert, Arizona",GilbertYourTown,25992,2666,22969,True,2009-10-30 17:18:49


In [215]:
df_retweets.shape

(278349, 15)

In [216]:
df_retweets = df_retweets.sort_values(by='timestamp', ascending=True).reset_index()
df_retweets.head(5)

Unnamed: 0.1,index,Unnamed: 0,tweet_id,text,user_id,timestamp,tweet_type,like_count,reply_count,retweet_count,quote_count,device,lang,topics_ids,topics,referenced_tweets
0,267872,267872,1212176450679377921,This is the way and only way Jay Armstrong Cut...,833382201785978882,2020-01-01 00:59:21,Quote Tweet,1,0,1,0,Android,en,[],[],"[ReferencedTweet(id: 1212175890785996801, type..."
1,267871,267871,1212176611627458560,RT @LexingtonPD: TRAFFIC ALERT - A collision o...,3028245479,2020-01-01 01:00:00,Retweet Tweet,0,0,4,0,iPhone,en,[],[],"[ReferencedTweet(id: 1212176549769830401, type..."
2,267870,267870,1212176685002571776,RT @alsilicaniyours: This is the way and only ...,833382201785978882,2020-01-01 01:00:17,Retweet Tweet,0,0,1,0,Android,en,[],[],"[ReferencedTweet(id: 1212176450679377921, type..."
3,267869,267869,1212176700861227009,Let’s not https://t.co/gMH6UWVB6q,27785930,2020-01-01 01:00:21,Quote Tweet,0,3,0,0,iPhone,en,[],[],"[ReferencedTweet(id: 1212176423189909506, type..."
4,267868,267868,1212176835603288064,RT @LexingtonPD: TRAFFIC ALERT - A collision o...,1012475880248143873,2020-01-01 01:00:53,Retweet Tweet,0,0,4,0,Android,en,[],[],"[ReferencedTweet(id: 1212176549769830401, type..."


### Merge retweets datasets

In [217]:
df_retweets_info = pd.merge(df_retweets[['tweet_id', 'user_id', 'topics', 'referenced_tweets']], df_retweeters[['user_id', 'followers', 'following']], how='left', on="user_id")
df_retweets_info.head(5)

Unnamed: 0,tweet_id,user_id,topics,referenced_tweets,followers,following
0,1212176450679377921,833382201785978882,[],"[ReferencedTweet(id: 1212175890785996801, type...",20,622
1,1212176611627458560,3028245479,[],"[ReferencedTweet(id: 1212176549769830401, type...",11059,4125
2,1212176685002571776,833382201785978882,[],"[ReferencedTweet(id: 1212176450679377921, type...",20,622
3,1212176700861227009,27785930,[],"[ReferencedTweet(id: 1212176423189909506, type...",10230,6114
4,1212176835603288064,1012475880248143873,[],"[ReferencedTweet(id: 1212176549769830401, type...",9685,10633


### Removing non-direct retweets

When colecting the retweets some tweets that mencioned the url of the original tweet and because of that were also collected as "retweets".
Since this tipe of tweets don't count to the retweet count of the original tweet and they only represent 0,0012% of the retweets collected they will be removed

In [218]:
df_retweets_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 278349 entries, 0 to 278348
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   tweet_id           278349 non-null  int64 
 1   user_id            278349 non-null  int64 
 2   topics             278349 non-null  object
 3   referenced_tweets  278019 non-null  object
 4   followers          278349 non-null  int64 
 5   following          278349 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 14.9+ MB


In [219]:
df_retweets_info[pd.isna(df_retweets_info['referenced_tweets'])]

Unnamed: 0,tweet_id,user_id,topics,referenced_tweets,followers,following
465,1213173157735141376,786792941658382339,[],,503,504
635,1213652399782604806,876320380792844292,[],,539,398
645,1213731859131064320,3315742299,[],,326,464
685,1213908969183424513,3215222674,[],,405,446
2325,1216434627549847555,1089894708208373761,[],,1017,1244
...,...,...,...,...,...,...
277556,1489616172404420614,838245040858222592,[],,1214,2244
277584,1490216196716875780,1456396368361279492,[],,715,386
277593,1490692560209207298,1468588637415944192,[],,109,193
277759,1492506389503254537,1328709172423249921,[],,99,538


In [220]:
df_retweets[df_retweets['tweet_id'] == 1213652399782604806]['text']

635    #FlyEaglesFly #PhillyvsEveryone https://t.co/G...
Name: text, dtype: object

In [221]:
df_retweets_info = df_retweets_info[df_retweets_info['referenced_tweets'].notna()]

In [222]:
def get_ref_tweet_id(ref_tweets):
    g = str(ref_tweets)
    if isinstance(g, str) and g != 'nan' and not pd.isna(g):
        return int(g.split()[1].replace(',', ''))
    else: return None

In [223]:
df_retweets_info['ref_tweed_id'] = df_retweets_info['referenced_tweets'].apply(lambda ref_tweet: get_ref_tweet_id(ref_tweet))
df_retweets_info.head()

Unnamed: 0,tweet_id,user_id,topics,referenced_tweets,followers,following,ref_tweed_id
0,1212176450679377921,833382201785978882,[],"[ReferencedTweet(id: 1212175890785996801, type...",20,622,1212175890785996801
1,1212176611627458560,3028245479,[],"[ReferencedTweet(id: 1212176549769830401, type...",11059,4125,1212176549769830401
2,1212176685002571776,833382201785978882,[],"[ReferencedTweet(id: 1212176450679377921, type...",20,622,1212176450679377921
3,1212176700861227009,27785930,[],"[ReferencedTweet(id: 1212176423189909506, type...",10230,6114,1212176423189909506
4,1212176835603288064,1012475880248143873,[],"[ReferencedTweet(id: 1212176549769830401, type...",9685,10633,1212176549769830401


In [224]:
df_tweets['ref_tweed_id'] = df_tweets['referenced_tweets'].apply(lambda ref_tweet: get_ref_tweet_id(ref_tweet))
# converting the reference ids that exist to int
df_tweets['ref_tweed_id'] = pd.to_numeric(df_tweets['ref_tweed_id'], errors='coerce').fillna(0).astype(np.int64)
df_tweets.head()

Unnamed: 0.1,index,Unnamed: 0,tweet_id,text,user_id,timestamp,tweet_type,like_count,reply_count,retweet_count,quote_count,device,lang,topics_ids,topics,referenced_tweets,ref_tweed_id
0,1028700,1028700,1212176422770544641,2️⃣0️⃣1️⃣9️⃣ WHAT. A. YEAR. 🌟 #bestnine2019 #t...,29885607,2020-01-01 00:59:14,Original Tweet,35,0,1,0,iPhone,en,[],[],,0
1,1028701,1028701,1212176421659062274,Gold N Hot 🤪 https://t.co/VeO2k4Mq4h,1130217871286636545,2020-01-01 00:59:14,Quote Tweet,0,0,0,0,iPhone,en,[],[],"[ReferencedTweet(id: 1212167212120920072, type...",1212167212120920064
2,1028698,1028698,1212176423189909506,and he still cheating https://t.co/zYAQ8wc6Vg,3322006549,2020-01-01 00:59:15,Quote Tweet,2,2,1,7,iPhone,en,"['10', '60']","['Person', 'Athlete']","[ReferencedTweet(id: 1211878384961228802, type...",1211878384961228800
3,1028697,1028697,1212176425916280833,Got my hot coco bc it cold out siiiiide boii h...,18707915,2020-01-01 00:59:15,Original Tweet,1,1,0,0,iPhone,en,[],[],,0
4,1028696,1028696,1212176426088181760,Disgruntled Pope Francis pulls himself free fr...,858762062,2020-01-01 00:59:15,Original Tweet,0,0,0,0,Android,en,"['10', '46', '47']","['Person', 'Brand Category', 'Brand']",,0


In [225]:
def calculate_tweet_reach(tweet_id, user_id, topics, retweets_count, ref_tweed_id):
    total_reach = 0
    total_reach += df_users[df_users['user_id'] == user_id]['followers'].sum()
    if topics != '[]' and retweets_count > 0:
        if ref_tweed_id is not None and ref_tweed_id != 0:
            total_reach += df_retweets_info[(df_retweets_info['ref_tweed_id'] == tweet_id) | (df_retweets_info['ref_tweed_id'] == ref_tweed_id)]['followers'].sum()
        else:
            total_reach += df_retweets_info[df_retweets_info['ref_tweed_id'] == tweet_id]['followers'].sum()
    return total_reach

In [25]:
df_tweets['reach'] = df_tweets.apply(lambda tweet_row: calculate_tweet_reach(tweet_row['tweet_id'], tweet_row['user_id'], tweet_row['topics'], tweet_row['retweet_count'], tweet_row['ref_tweed_id']), axis=1)
df_tweets.head(5)

# faster alternative: # Iterating over multiple columns - differing data type
# result = [f(row[0], ..., row[n]) for row in zip(df['col1'], ..., df['coln'])]

NameError: name 'calculate_tweet_reach' is not defined

## Topic analysis

### Loading the previously trained topic analysis model and it's evaluations of each tweet text

In [227]:
df_tweets.head(5)

Unnamed: 0.1,index,Unnamed: 0,tweet_id,text,user_id,timestamp,tweet_type,like_count,reply_count,retweet_count,quote_count,device,lang,topics_ids,topics,referenced_tweets,ref_tweed_id,reach
0,1028700,1028700,1212176422770544641,2️⃣0️⃣1️⃣9️⃣ WHAT. A. YEAR. 🌟 #bestnine2019 #t...,29885607,2020-01-01 00:59:14,Original Tweet,35,0,1,0,iPhone,en,[],[],,0,7545
1,1028701,1028701,1212176421659062274,Gold N Hot 🤪 https://t.co/VeO2k4Mq4h,1130217871286636545,2020-01-01 00:59:14,Quote Tweet,0,0,0,0,iPhone,en,[],[],"[ReferencedTweet(id: 1212167212120920072, type...",1212167212120920064,4008
2,1028698,1028698,1212176423189909506,and he still cheating https://t.co/zYAQ8wc6Vg,3322006549,2020-01-01 00:59:15,Quote Tweet,2,2,1,7,iPhone,en,"['10', '60']","['Person', 'Athlete']","[ReferencedTweet(id: 1211878384961228802, type...",1211878384961228800,13837
3,1028697,1028697,1212176425916280833,Got my hot coco bc it cold out siiiiide boii h...,18707915,2020-01-01 00:59:15,Original Tweet,1,1,0,0,iPhone,en,[],[],,0,247
4,1028696,1028696,1212176426088181760,Disgruntled Pope Francis pulls himself free fr...,858762062,2020-01-01 00:59:15,Original Tweet,0,0,0,0,Android,en,"['10', '46', '47']","['Person', 'Brand Category', 'Brand']",,0,26108


In [228]:
def process_topic(topics:str):
    s = topics
    for i in range(topics.count('\'')):
        s = s.replace('\'', '"')
    t = json.loads(s)
    if len(t) > 0:
        return t[0]
    return None

In [229]:
df_tweets['topics'] = df_tweets['topics'].apply(lambda topics: process_topic(topics))
df_tweets['topics_ids'] = df_tweets['topics_ids'].apply(lambda topics: process_topic(topics))
df_tweets.head(5)

Unnamed: 0.1,index,Unnamed: 0,tweet_id,text,user_id,timestamp,tweet_type,like_count,reply_count,retweet_count,quote_count,device,lang,topics_ids,topics,referenced_tweets,ref_tweed_id,reach
0,1028700,1028700,1212176422770544641,2️⃣0️⃣1️⃣9️⃣ WHAT. A. YEAR. 🌟 #bestnine2019 #t...,29885607,2020-01-01 00:59:14,Original Tweet,35,0,1,0,iPhone,en,,,,0,7545
1,1028701,1028701,1212176421659062274,Gold N Hot 🤪 https://t.co/VeO2k4Mq4h,1130217871286636545,2020-01-01 00:59:14,Quote Tweet,0,0,0,0,iPhone,en,,,"[ReferencedTweet(id: 1212167212120920072, type...",1212167212120920064,4008
2,1028698,1028698,1212176423189909506,and he still cheating https://t.co/zYAQ8wc6Vg,3322006549,2020-01-01 00:59:15,Quote Tweet,2,2,1,7,iPhone,en,10.0,Person,"[ReferencedTweet(id: 1211878384961228802, type...",1211878384961228800,13837
3,1028697,1028697,1212176425916280833,Got my hot coco bc it cold out siiiiide boii h...,18707915,2020-01-01 00:59:15,Original Tweet,1,1,0,0,iPhone,en,,,,0,247
4,1028696,1028696,1212176426088181760,Disgruntled Pope Francis pulls himself free fr...,858762062,2020-01-01 00:59:15,Original Tweet,0,0,0,0,Android,en,10.0,Person,,0,26108


### Grouping the topics in broader categories

In [20]:
def group_topics(topic):
    if pd.isna(topic):
        return None

    if not isinstance(topic, str):
        print(topic)

    if 'Brand' in topic or 'Product' in topic:
        return 'Brand'
    elif 'Person' in topic:
        return 'Person'
    elif 'Sport' in topic or 'Athlete' in topic or 'Coach' in topic or 'Hockey' in topic or 'Football' in topic or 'NFL' in topic:
        return 'Sport'
    elif 'TV' in topic or 'Movie' in topic or 'Award' in topic or 'Actor' in topic or 'Fictional Character' in topic\
            or 'Entertainment' in topic:
        return 'TV and Movies'
    elif 'Music' in topic or 'Musician' in topic or 'Concert' in topic or 'Song' in topic or 'Radio' in topic:
        return 'Music'
    elif 'Book' in topic:
        return 'Book'
    elif 'Hobbies' in topic:
        return 'Interest and Hobbies'
    elif 'Video Game' in topic or 'Esports' in topic or 'eSport' in topic:
        return 'Video Game'
    elif 'Political' in topic or 'Politicians' in topic:
        return 'Political'
    elif 'Holiday' in topic:
        return 'Holiday'
    elif 'News' in topic:
        return 'News'
    elif 'Entities' in topic:
        return 'Entities'
    else:
        return 'Other'

In [21]:
df_tweets['topics_cleaned'] = df_tweets['topics'].apply(lambda topic: group_topics(topic))
df_tweets.head(5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,index,tweet_id,text,timestamp,user_id,like_count,retweet_count,quote_count,...,day_phase,week_idx,day_phase_enc,day_of_week_enc,month_enc,year_enc,sentiment_enc,verified_enc,seniority,topics_cleaned
0,0,0,0,1212176422770544641,2️⃣0️⃣1️⃣9️⃣ WHAT. A. YEAR. 🌟 #bestnine2019 #t...,2020-01-01 00:59:14,29885607,35,1,0,...,Dawn,2020-01,1,6,4,0,1,0,13,
1,1,1,1,1212176421659062274,Gold N Hot 🤪 https://t.co/VeO2k4Mq4h,2020-01-01 00:59:14,1130217871286636545,0,0,0,...,Dawn,2020-01,1,6,4,0,1,0,2,
2,2,2,2,1215030407881805824,Damn I wonder why https://t.co/mtzuGWyafL,2020-01-08 21:59:58,1130217871286636545,0,0,0,...,Night,2020-02,4,6,4,0,0,0,2,
3,3,3,3,1223606768430911493,No fucking way I’m bring no yogurt parfait. Pu...,2020-02-01 13:59:21+00:00,1130217871286636545,3,0,0,...,Afternoon,2020-05,0,2,3,0,0,0,2,Entities
4,4,4,4,1266534733447651328,No purpose. Just Doing Shit https://t.co/UmcuH...,2020-05-30 00:59:46+00:00,1130217871286636545,0,0,0,...,Dawn,2020-22,1,2,8,0,0,0,2,


In [22]:
df_tweets['topics_cleaned'].value_counts(normalize=True)

Person                  0.218579
TV and Movies           0.186972
Brand                   0.169682
Interest and Hobbies    0.147919
Entities                0.100866
News                    0.061112
Sport                   0.042161
Holiday                 0.027320
Video Game              0.012365
Other                   0.011718
Music                   0.010431
Political               0.008880
Book                    0.001994
Name: topics_cleaned, dtype: float64

## Sentiment analysis

In [239]:
sid_obj = SentimentIntensityAnalyzer()

In [240]:
def sentiment_scores(sentence, prints):
    if prints: print("\nSentence:", sentence)

    sentiment_dict = sid_obj.polarity_scores(sentence)

    if sentiment_dict['compound'] >= 0.05:
        if prints: print("Positive")
        return "Positive"
    elif sentiment_dict['compound'] <= - 0.05:
        if prints: print("Positive")
        return "Negative"
    else:
        if prints: print("Neutral")
        return "Neutral"

In [241]:
df_tweets['sentiment'] = df_tweets['text'].apply(lambda tweet_text: sentiment_scores(tweet_text, False))

In [242]:
df_tweets.head(5)

Unnamed: 0.1,index,Unnamed: 0,tweet_id,text,user_id,timestamp,tweet_type,like_count,reply_count,retweet_count,quote_count,device,lang,topics_ids,topics,referenced_tweets,ref_tweed_id,reach,sentiment
0,1028700,1028700,1212176422770544641,2️⃣0️⃣1️⃣9️⃣ WHAT. A. YEAR. 🌟 #bestnine2019 #t...,29885607,2020-01-01 00:59:14,Original Tweet,35,0,1,0,iPhone,en,,,,0,7545,Neutral
1,1028701,1028701,1212176421659062274,Gold N Hot 🤪 https://t.co/VeO2k4Mq4h,1130217871286636545,2020-01-01 00:59:14,Quote Tweet,0,0,0,0,iPhone,en,,,"[ReferencedTweet(id: 1212167212120920072, type...",1212167212120920064,4008,Neutral
2,1028698,1028698,1212176423189909506,and he still cheating https://t.co/zYAQ8wc6Vg,3322006549,2020-01-01 00:59:15,Quote Tweet,2,2,1,7,iPhone,en,10.0,Person,"[ReferencedTweet(id: 1211878384961228802, type...",1211878384961228800,13837,Negative
3,1028697,1028697,1212176425916280833,Got my hot coco bc it cold out siiiiide boii h...,18707915,2020-01-01 00:59:15,Original Tweet,1,1,0,0,iPhone,en,,,,0,247,Neutral
4,1028696,1028696,1212176426088181760,Disgruntled Pope Francis pulls himself free fr...,858762062,2020-01-01 00:59:15,Original Tweet,0,0,0,0,Android,en,10.0,Person,,0,26108,Positive


## Tweet popularity

To define tweet popularity, each tweet with at least 1 retweet or 1 quote tweet was considered popular (1)
if the criteria wasn't met it was marked as unpopular (0)

In [243]:
def tweet_popularity_label(retweet_count, quote_count):
    if retweet_count > 0 or quote_count > 0:
        return 1
    else:
        return 0

In [244]:
df_tweets['popularity'] = df_tweets.apply(lambda row: tweet_popularity_label(row['retweet_count'], row['quote_count']), axis=1)

In [246]:
df_tweets['popularity'].value_counts(normalize=True)

0    0.809545
1    0.190455
Name: popularity, dtype: float64

## Combine the two datasets into the final one

For every tweet select the relevant columns and merge the corresponding information from the original user

In [None]:
df = pd.merge(df_tweets[['tweet_id', 'text', 'timestamp', 'user_id', 'like_count', 'retweet_count', 'quote_count',
                        'reply_count', 'reach', 'topics_ids', 'topics', 'sentiment', 'popularity']],
                        df_users[['user_id', 'followers', 'following', 'tweet_count', 'verified', 'created_at']],
                        on="user_id").reset_index()
df.head(5)

## Process tweets timestamps

In [4]:
df = pd.read_csv(filepath_or_buffer='../../data/tweets_2020_2021.csv', sep=",")

In [5]:
df['timestamp'] = pd.to_datetime(df['timestamp'])

### Categorize by year, month, week days and phases of the day

In [6]:
def get_day_phase(hour):
    if 0 <= hour < 7:
        return "Dawn"
    elif 7 <= hour < 13:
        return "Morning"
    elif 13 <= hour < 16:
        return "Afternoon"
    elif 16 <= hour < 20:
        return "Evening"
    elif 20 <= hour < 24:
        return "Night"

In [8]:
df['year'] = df['timestamp'].apply(lambda x: x.year)
df['month'] = df['timestamp'].apply(lambda x: x.strftime('%B'))
df['day_of_week'] = df['timestamp'].apply(lambda x: x.strftime('%A'))
df['day_phase'] = df['timestamp'].apply(lambda x: get_day_phase(int(x.hour)))
df['week_idx'] = df['timestamp'].apply(lambda x: '%s-%s' % (x.year, '{:02d}'.format(x.isocalendar()[1])))
df

Unnamed: 0.1,Unnamed: 0,index,tweet_id,text,timestamp,user_id,like_count,retweet_count,quote_count,reply_count,...,followers,following,tweet_count,verified,created_at,year,month,day_of_week,day_phase,week_idx
0,0,0,1212176422770544641,2️⃣0️⃣1️⃣9️⃣ WHAT. A. YEAR. 🌟 #bestnine2019 #t...,2020-01-01 00:59:14,29885607,35,1,0,0,...,7545,3335,11473,False,2009-04-09 01:39:24,2020,January,Wednesday,Dawn,2020-01
1,1,1,1212176421659062274,Gold N Hot 🤪 https://t.co/VeO2k4Mq4h,2020-01-01 00:59:14,1130217871286636545,0,0,0,0,...,4008,1488,141802,False,2019-05-19 21:05:13+00:00,2020,January,Wednesday,Dawn,2020-01
2,2,2,1215030407881805824,Damn I wonder why https://t.co/mtzuGWyafL,2020-01-08 21:59:58,1130217871286636545,0,0,0,0,...,4008,1488,141802,False,2019-05-19 21:05:13+00:00,2020,January,Wednesday,Night,2020-02
3,3,3,1223606768430911493,No fucking way I’m bring no yogurt parfait. Pu...,2020-02-01 13:59:21+00:00,1130217871286636545,3,0,0,3,...,4008,1488,141802,False,2019-05-19 21:05:13+00:00,2020,February,Saturday,Afternoon,2020-05
4,4,4,1266534733447651328,No purpose. Just Doing Shit https://t.co/UmcuH...,2020-05-30 00:59:46+00:00,1130217871286636545,0,0,0,0,...,4008,1488,141802,False,2019-05-19 21:05:13+00:00,2020,May,Saturday,Dawn,2020-22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1072382,1072382,1072382,1477036824749125633,Classic Golden Girls. #ripbettywhite #RIPBetty...,2021-12-31 21:59:55+00:00,21630146,5,0,0,0,...,4561,2218,60595,False,2009-02-23 04:12:29+00:00,2021,December,Friday,Night,2021-52
1072383,1072383,1072383,1477036825579507714,No one told me I’d have a real life princess. ...,2021-12-31 21:59:55+00:00,38572410,0,0,0,0,...,437,1046,7665,False,2009-05-08 01:25:54+00:00,2021,December,Friday,Night,2021-52
1072384,1072384,1072384,1477036838217035777,Fast And Professional Vehicle Lockout Service ...,2021-12-31 21:59:58+00:00,210241982,0,0,0,0,...,394,465,21897,False,2010-10-30 23:06:33+00:00,2021,December,Friday,Night,2021-52
1072385,1072385,1072385,1477036840024567809,Very much this https://t.co/b3bgtstLfr,2021-12-31 21:59:59+00:00,883576549,3,0,0,0,...,702,1263,20166,False,2012-10-16 03:01:52+00:00,2021,December,Friday,Night,2021-52


In [9]:

cols_to_transform = ['day_phase', 'day_of_week', 'month', 'year', 'sentiment', 'verified']

for col in cols_to_transform:
    enc = LabelEncoder()
    enc.fit(df[col])
    df[col + '_enc'] = enc.transform(df[col])

# Day phase: Dawn -> 1 | Morning -> 3 | Afternoon -> 0 | Evening -> 2 | Night -> 4

# Days of the week:
#   Monday -> 1 | Tuesday -> 5 | Wednesday -> 6
#   Thursday -> 4 | Friday -> 0 | Saturday -> 2 | Sunday -> 3

# Months
#   January -> 4 | February -> 3 | March -> 7 | April -> 0
#   May -> 8 | June -> 6 | July -> 5 | August -> 1
#   September -> 11 | October -> 10 | November -> 9 | December -> 2

# Years
#   2020 -> 0 | 2021 -> 1

# Sentiment
#  Negative -> 0 | Neutral -> 1 | Positive -> 2

# Verified
#  False -> 0 | True -> 1

df.head(5)

Unnamed: 0.1,Unnamed: 0,index,tweet_id,text,timestamp,user_id,like_count,retweet_count,quote_count,reply_count,...,month,day_of_week,day_phase,week_idx,day_phase_enc,day_of_week_enc,month_enc,year_enc,sentiment_enc,verified_enc
0,0,0,1212176422770544641,2️⃣0️⃣1️⃣9️⃣ WHAT. A. YEAR. 🌟 #bestnine2019 #t...,2020-01-01 00:59:14,29885607,35,1,0,0,...,January,Wednesday,Dawn,2020-01,1,6,4,0,1,0
1,1,1,1212176421659062274,Gold N Hot 🤪 https://t.co/VeO2k4Mq4h,2020-01-01 00:59:14,1130217871286636545,0,0,0,0,...,January,Wednesday,Dawn,2020-01,1,6,4,0,1,0
2,2,2,1215030407881805824,Damn I wonder why https://t.co/mtzuGWyafL,2020-01-08 21:59:58,1130217871286636545,0,0,0,0,...,January,Wednesday,Night,2020-02,4,6,4,0,0,0
3,3,3,1223606768430911493,No fucking way I’m bring no yogurt parfait. Pu...,2020-02-01 13:59:21+00:00,1130217871286636545,3,0,0,3,...,February,Saturday,Afternoon,2020-05,0,2,3,0,0,0
4,4,4,1266534733447651328,No purpose. Just Doing Shit https://t.co/UmcuH...,2020-05-30 00:59:46+00:00,1130217871286636545,0,0,0,0,...,May,Saturday,Dawn,2020-22,1,2,8,0,0,0


### Calculate users seniority

In [None]:
df['created_at'] = pd.to_datetime(df['created_at'], utc=True).dt.strftime("%Y-%m-%d")
df['created_at'] = pd.to_datetime(df['created_at'])
df['seniority'] = df['created_at'].apply(lambda x: relativedelta(datetime.datetime.now(), x).years)

## Outliers

In [None]:
var_outliers = ['like_count', 'retweet_count', 'quote_count', 'reply_count', 'reach', 'topics_ids', 'sentiment_enc', 'day_phase_enc', 'day_of_week_enc', 'month_enc', 'popularity', 'followers', 'following', 'tweet_count', 'verified_enc', 'seniority']

fig, axes = plt.subplots(2, math.ceil(len(var_outliers) / 2), figsize=(28, 15))

for ax, feat in zip(axes.flatten(), var_outliers):
    sns.boxplot(x=df[feat], ax=ax, color='steelblue')

title = "Variables' Box Plots"
plt.suptitle(title, y=0.95, fontsize=20)
plt.show()

In [None]:
outliers_filter = (((df['followers'] < 10000) & (df['following'] < 70000)) & ((df['retweet_count'] < 100) & (df['like_count'] < 4000) & (df['seniority'] < 17)))
df_no_outliers = df[outliers_filter].copy()
df_no_outliers.shape

In [None]:
print('Percentage of data kept after removing outliers:', np.round(df_no_outliers.shape[0] / df.shape[0], 4) * 100, '%')
print('Percentage of data removed:', np.round((1 - (df_no_outliers.shape[0] / df.shape[0])) * 100, 4), '%')
df = df_no_outliers

In [None]:
fig, axes = plt.subplots(2, math.ceil(len(var_outliers) / 2), figsize=(28, 15))

for ax, feat in zip(axes.flatten(), var_outliers):
    sns.boxplot(x=df[feat], ax=ax, color='steelblue')

title = "Variables' Box Plots"
plt.suptitle(title, y=0.95, fontsize=20)
plt.show()

## Save the final version of the dataset

In [None]:
df.to_csv('../../data/tweets_2020_2021_v2.csv', sep=',', date_format='%Y-%m-%d %H:%M:%S')