# EDA

In [3]:
# internal imports

# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload
# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

import sys, os
sys.path.insert(0, os.path.abspath('..'))
from src.data import extract_media_data as emd
from src.data import checker
from src.data import preprocessor
from src.models import topic_modeling as tm

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/johnnywang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/johnnywang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import numpy as np
import pandas as pd
import seaborn as sn
from typing import List
from tqdm import tqdm

In [5]:
#Visualizations
import seaborn as sns
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.sklearn
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

#Natural Language Processing (NLP)
import spacy
import gensim
from spacy.tokenizer import Tokenizer
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import CoherenceModel
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
# from wordcloud import STOPWORDS
# stopwords = set(STOPWORDS)

In [6]:
# data before the war
data_before = pd.read_csv('../data/interim/new_data.csv')
# data after the war
data_after = pd.read_csv('../data/interim/data_cleaned_version_1.csv', index_col=[0])

## Overview

In [9]:
data_before.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105599 entries, 0 to 105598
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   userid          105599 non-null  int64 
 1   username        105599 non-null  object
 2   following       105599 non-null  int64 
 3   followers       105599 non-null  int64 
 4   totaltweets     105599 non-null  int64 
 5   usercreatedts   105599 non-null  object
 6   tweetcreatedts  105599 non-null  object
 7   retweetcount    105599 non-null  int64 
 8   text            105599 non-null  object
 9   hashtags        105599 non-null  object
 10  language        105599 non-null  object
 11  favorite_count  105599 non-null  int64 
dtypes: int64(6), object(6)
memory usage: 9.7+ MB


In [10]:
# checker.check_data(data_before)
data_before.head()

Unnamed: 0,userid,username,following,followers,totaltweets,usercreatedts,tweetcreatedts,retweetcount,text,hashtags,language,favorite_count
0,1235244517307166725,InformazioneA,103,864,7999,2020-03-04 16:43:53,2022-01-17 00:00:01,1,#Russia #Ukraine #US \nIn the midst of the cri...,"['russia', 'ukraine', 'us', 'moscow', 'kiev', ...",en,1
1,429580982,Sec_Cyber,1105,16344,246636,2011-12-06 04:14:04,2022-01-17 00:00:09,0,Ukraine claims Russia behind #cyberattack in '...,['cyberattack'],en,0
2,31106730,samiaji,120,463,83876,2009-04-14 12:08:45,2022-01-17 00:00:35,1,Ukraine says Russia behind cyberattack in 'hyb...,[],en,0
3,16028382,WESH,353,222861,427358,2008-08-28 15:42:42,2022-01-17 00:01:05,0,Ukraine says Russia behind cyberattack in 'hyb...,[],en,2
4,9285512,whatsinitforme,3421,1332,26311,2007-10-06 22:23:39,2022-01-17 00:01:19,0,@BowesChay @rickyspanish81 @Russia @Rus_Emb_Ir...,[],en,3


In [11]:
data_after.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 835202 entries, 0 to 835201
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   userid          835202 non-null  int64 
 1   username        835202 non-null  object
 2   following       835202 non-null  int64 
 3   followers       835202 non-null  int64 
 4   totaltweets     835202 non-null  int64 
 5   usercreatedts   835202 non-null  object
 6   tweetcreatedts  835202 non-null  object
 7   retweetcount    835202 non-null  int64 
 8   text            835202 non-null  object
 9   hashtags        835202 non-null  object
 10  language        835202 non-null  object
 11  favorite_count  835202 non-null  int64 
dtypes: int64(6), object(6)
memory usage: 82.8+ MB


## Concate two dataframe together

In [7]:
df = pd.concat([data_before, data_after])
df.reset_index(inplace=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940801 entries, 0 to 940800
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   index           940801 non-null  int64 
 1   userid          940801 non-null  int64 
 2   username        940801 non-null  object
 3   following       940801 non-null  int64 
 4   followers       940801 non-null  int64 
 5   totaltweets     940801 non-null  int64 
 6   usercreatedts   940801 non-null  object
 7   tweetcreatedts  940801 non-null  object
 8   retweetcount    940801 non-null  int64 
 9   text            940801 non-null  object
 10  hashtags        940801 non-null  object
 11  language        940801 non-null  object
 12  favorite_count  940801 non-null  int64 
dtypes: int64(7), object(6)
memory usage: 93.3+ MB


In [9]:
# only these three columns have missing values
# luckily they are not too important for our model building, so we can abandon these columns
checker.check_missing_value(df, df.columns)

Unnamed: 0,Feature,NA Count,NA Rate


In [10]:
# most tweets do not get likes
checker.check_zeros(df, df.columns)

Unnamed: 0,Feature,Value Count,Rate
0,favorite_count,711778,0.756566
1,retweetcount,596378,0.633905
2,following,4884,0.005191
3,index,2,2e-06


# Preprocessing

## Get Media list

In [19]:
url = "https://memeburn.com/2010/09/the-100-most-influential-news-media-twitter-accounts"
media_account_dict = emd.get_media_dict(url)



  soup = BeautifulSoup(urllib.request.urlopen(url).read())


## Filter tweets from media and ordinary people

In [20]:
media_tweets_df = df[df.username.isin(media_account_dict.values())]

In [21]:
media_tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 63 entries, 2440 to 704729
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   index           63 non-null     int64 
 1   userid          63 non-null     int64 
 2   username        63 non-null     object
 3   following       63 non-null     int64 
 4   followers       63 non-null     int64 
 5   totaltweets     63 non-null     int64 
 6   usercreatedts   63 non-null     object
 7   tweetcreatedts  63 non-null     object
 8   retweetcount    63 non-null     int64 
 9   text            63 non-null     object
 10  hashtags        63 non-null     object
 11  language        63 non-null     object
 12  favorite_count  63 non-null     int64 
dtypes: int64(7), object(6)
memory usage: 6.9+ KB


Since there are not too many tweets (only 63) which were generated from the media list. We can also filter out Twitter account whose followers is larger than a threshold, for example 1 milliom.

In [11]:
follower_threshold = 100000
influencer_tweets = df[df.followers>=follower_threshold].copy()
normal_tweets = df[df.followers>=follower_threshold].copy()

In [12]:
# convert to pandas date time for easy processing
df['tweetcreatedts'] = pd.to_datetime(df['tweetcreatedts'])
# reset the index
df = df.set_index('tweetcreatedts', drop=False)

In [13]:
date_list = [str(i) for i in np.unique(df.index.date)]
df[df.followers>follower_threshold].loc[date_list[0]]

Unnamed: 0_level_0,index,userid,username,following,followers,totaltweets,usercreatedts,tweetcreatedts,retweetcount,text,hashtags,language,favorite_count
tweetcreatedts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2022-01-17 00:01:05,3,16028382,WESH,353,222861,427358,2008-08-28 15:42:42,2022-01-17 00:01:05,0,Ukraine says Russia behind cyberattack in 'hyb...,[],en,2
2022-01-17 00:03:00,6,16086928,komonews,2318,453474,309608,2008-09-01 19:48:55,2022-01-17 00:03:00,2,Ukraine said Sunday that Russia was behind a c...,[],en,2
2022-01-17 00:03:18,7,1115874631,CGTNOfficial,74,13391764,213027,2013-01-24 03:18:59,2022-01-17 00:03:18,6,Russian troops to stay near Ukrainian border a...,[],en,23
2022-01-17 00:10:00,18,16558796,7News,1151,352439,361513,2008-10-02 13:11:54,2022-01-17 00:10:00,1,Ukraine says Russia behind cyberattack in ‘hyb...,[],en,2
2022-01-17 00:11:06,21,15922073,WXII,1300,129832,379847,2008-08-20 19:42:50,2022-01-17 00:11:06,0,Ukraine says Russia behind cyberattack in 'hyb...,[],en,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-01-17 23:19:04,2838,1339166129110065152,GBNEWS,43,375894,35262,2020-12-16 11:11:23,2022-01-17 23:19:04,3,📰 Dominic Frisby reads through tomorrow's fron...,[],en,18
2022-01-17 23:32:44,2879,15535860,nbcsandiego,4191,252100,268044,2008-07-22 19:34:30,2022-01-17 23:32:44,1,Russia Denies Looking for Pretext to Invade Uk...,[],en,0
2022-01-17 23:50:00,2914,34655603,TheSun,367,1959368,656505,2009-04-23 16:23:40,2022-01-17 23:50:00,5,Putin ‘sending ballistic missiles and snipers’...,[],en,10
2022-01-17 23:52:37,2918,860123351154556928,DeadlineWH,178,390270,12609,2017-05-04 13:26:02,2022-01-17 23:52:37,70,"""I hope Putin comes to his senses because an i...",[],en,219


## Data Split

In [19]:
# split all the tweets to tweets per day
date_list = [str(i) for i in np.unique(df.index.date)]
text_dict = {}
media_text_dict = {}
normal_text_dict = {}
for date in date_list:
    text_dict[date] = list(df.loc[date].text)
    media_text_dict[date] = list(df[df.followers>follower_threshold].loc[date].text)
    normal_text_dict[date] = list(df[df.followers<=follower_threshold].loc[date].text)

## Data Cleaning

In [20]:
for date, media_text, normal_text in tqdm(zip(date_list, media_text_dict.values(), normal_text_dict.values())):
    media_cleaned_text = [preprocessor.clean_message(i) for i in media_text]
    normal_cleaned_text = [preprocessor.clean_message(i) for i in normal_text]
    media_text_dict[date] = media_cleaned_text
    normal_text_dict[date] = normal_cleaned_text
    text_dict[date] = media_cleaned_text + normal_cleaned_text


25it [40:04, 96.20s/it] 


In [45]:
for date, text_list in tqdm(text_dict.items()):
    # TODO: loop through all the text_list instead of 1000 entries
    cleaned_text = [preprocessor.clean_message(i) for i in text_list[:1000]]
    text_dict[date] = cleaned_text


100%|██████████| 25/25 [00:40<00:00,  1.62s/it]


# Modeling

## Baseline

In [21]:
# let's first get the baseline model from tweets data on 01-17
tweet_list = text_dict['2022-01-17']

In [22]:
vectorizer = CountVectorizer(
analyzer='word',       
min_df=3,# minimum required occurences of a word 
lowercase=True,# convert all words to lowercase
token_pattern='[a-zA-Z0-9]{3,}',# num chars > 3
max_features=5000,# max number of unique words
                            )
data_matrix = vectorizer.fit_transform(tweet_list)

In [23]:
# I will use LDA to create topics along with the probability distribution for each word in our vocabulary for each topic
lda_model = LatentDirichletAllocation(
n_components=5, # Number of topics
learning_method='online',
random_state=20,       
n_jobs = -1  # Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(data_matrix)

## Evaluation

In [24]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(data_matrix))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_matrix))

# See model parameters
pprint(lda_model.get_params())

Log Likelihood:  -312682.32767904125
Perplexity:  806.2552006106661
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 5,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 20,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


## Visualization

In [25]:
#pyLDAvis extracts information from a fitted LDA topic model to inform an interactive web-based visualization
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda_model, data_matrix, vectorizer, mds='tsne')

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [26]:
#  top 5 most frequent words from each topic that found by LDA
for i,topic in enumerate(lda_model.components_):
    print('Top 5 words for topic:',i)
    print([vectorizer.get_feature_names()[i] for i in topic.argsort()[-5:]])
    print('\n')

Top 5 words for topic: 0
['nato', 'russian', 'russia', 'war', 'ukrain']


Top 5 words for topic: 1
['russian', 'invad', 'nato', 'russia', 'ukrain']


Top 5 words for topic: 2
['troop', 'border', 'war', 'russia', 'ukrain']


Top 5 words for topic: 3
['the', 'russia', 'amp', 'nato', 'ukrain']


Top 5 words for topic: 4
['fear', 'russiaukrain', 'europ', 'crisi', 'war']






## Hyperparameter Tuning

In [27]:
# Define Search Param
search_params = {'n_components': [3, 5, 10, 15], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_matrix)

GridSearchCV(estimator=LatentDirichletAllocation(),
             param_grid={'learning_decay': [0.5, 0.7, 0.9],
                         'n_components': [3, 5, 10, 15]})

In [28]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_matrix))

Best Model's Params:  {'learning_decay': 0.5, 'n_components': 3}
Best Log Likelihood Score:  -69647.76385084036
Model Perplexity:  761.7509505221232


## Topic Modeling Per Day

In [33]:
daily_topics = tm.generate_daily_topic(text_dict, 3, 0.5)

100%|██████████| 25/25 [07:49<00:00, 18.77s/it]


In [29]:
media_daily_topics = tm.generate_daily_topic(media_text_dict, 3, 0.5)

100%|██████████| 25/25 [00:13<00:00,  1.92it/s]


In [30]:
normal_daily_topics = tm.generate_daily_topic(normal_text_dict, 3, 0.5)

100%|██████████| 25/25 [07:33<00:00, 18.15s/it]


In [34]:
daily_topics

Unnamed: 0,topic_0,topic_1,topic_2
2022-01-17,"['russia', 'russian', 'nato', 'war', 'ukrain']","['russian', 'nato', 'invad', 'russia', 'ukrain']","['border', 'nato', 'russia', 'war', 'ukrain']"
2022-01-18,"['invad', 'nato', 'war', 'russia', 'ukrain']","['belaru', 'border', 'ukrain', 'russian', 'tro...","['move', 'troop', 'nato', 'russia', 'ukrain']"
2022-01-24,"['troop', 'border', 'russia', 'war', 'ukrain']","['war', 'invad', 'nato', 'russia', 'ukrain']","['8500', 'alert', 'nato', 'ukrain', 'troop']"
2022-01-25,"['nato', 'russian', 'troop', 'border', 'ukrain']","['nato', 'invad', 'war', 'russia', 'ukrain']","['europ', 'nato', 'russia', 'troop', 'ukrain']"
2022-02-21,"['order', 'putin', 'russian', 'ukrain', 'troop']","['putin', 'nato', 'war', 'russia', 'ukrain']","['russia', 'troop', 'border', 'russian', 'ukra..."
2022-02-22,"['the', 'world', 'standwithukrain', 'war', 'uk...","['russia', 'putin', 'russian', 'ukrain', 'troop']","['invad', 'putin', 'nato', 'russia', 'ukrain']"
2022-02-24,"['stand', 'war', 'peopl', 'ukrain', 'standwith...","['russia', 'border', 'troop', 'russian', 'ukra...","['putin', 'war', 'nato', 'russia', 'ukrain']"
2022-02-25,"['kyiv', 'ukrainian', 'troop', 'ukrain', 'russ...","['amp', 'peopl', 'standwithukrain', 'war', 'uk...","['war', 'putin', 'russia', 'nato', 'ukrain']"
2022-02-28,"['the', 'war', 'russia', 'putin', 'ukrain']","['ukrainerussiawar', 'kyiv', 'russian', 'russi...","['ukrainerussiawar', 'peopl', 'help', 'amp', '..."
2022-03-01,"['war', 'russia', 'ukrainerussiawar', 'putin',...","['war', 'putin', 'amp', 'russia', 'ukrain']","['ukrainian', 'kyiv', 'russia', 'russian', 'uk..."


In [31]:
media_daily_topics

Unnamed: 0,topic_0,topic_1,topic_2
2022-01-17,"['russia', 'russian', 'troop', 'ukrain', 'bord...","['nato', 'war', 'invad', 'russia', 'ukrain']","['new', 'amp', 'war', 'nato', 'ukrain']"
2022-01-18,"['invad', 'war', 'nato', 'russia', 'ukrain']","['invit', 'tension', 'talk', 'nato', 'ukrain']","['belaru', 'russian', 'russia', 'troop', 'ukra..."
2022-01-24,"['russian', 'troop', 'russia', 'nato', 'ukrain']","['european', 'war', 'border', 'russia', 'ukrain']","['put', '8500', 'alert', 'ukrain', 'troop']"
2022-01-25,"['border', 'russian', 'troop', 'nato', 'ukrain']","['russian', 'deploy', 'russia', 'troop', 'ukra...","['putin', 'biden', 'war', 'russia', 'ukrain']"
2022-02-21,"['say', 'russia', 'russian', 'ukrain', 'border']","['order', 'russian', 'putin', 'troop', 'ukrain']","['secur', 'biden', 'invad', 'russia', 'ukrain']"
2022-02-22,"['russia', 'putin', 'russian', 'ukrain', 'troop']","['russian', 'the', 'sanction', 'russia', 'ukra...","['say', 'russia', 'war', 'border', 'ukrain']"
2022-02-24,"['chernobyl', 'ukrainian', 'ukrain', 'troop', ...","['war', 'putin', 'troop', 'russia', 'ukrain']","['troop', 'ukrainian', 'ukrain', 'border', 'ru..."
2022-02-25,"['first', 'cross', 'the', 'border', 'ukrain']","['kyiv', 'ukrainian', 'troop', 'ukrain', 'russ...","['nato', 'russian', 'russia', 'troop', 'ukrain']"
2022-02-28,"['invas', 'the', 'russian', 'russia', 'ukrain']","['war', 'presid', 'putin', 'russia', 'ukrain']","['student', 'sanction', 'countri', 'russia', '..."
2022-03-01,"['russiaukrainewar', 'putin', 'russian', 'russ...","['the', 'invas', 'russian', 'ukrain', 'russia']","['india', 'russiaukrainewar', 'student', 'indi..."


In [32]:
normal_daily_topics

Unnamed: 0,topic_0,topic_1,topic_2
2022-01-17,"['move', 'europ', 'russia', 'ukrain', 'war']","['russian', 'war', 'nato', 'russia', 'ukrain']","['troop', 'russian', 'nato', 'russia', 'ukrain']"
2022-01-18,"['border', 'russian', 'russia', 'troop', 'ukra...","['invad', 'nato', 'war', 'russia', 'ukrain']","['putin', 'war', 'nato', 'russia', 'ukrain']"
2022-01-24,"['nato', 'invad', 'war', 'russia', 'ukrain']","['8500', 'alert', 'nato', 'ukrain', 'troop']","['war', 'troop', 'biden', 'border', 'ukrain']"
2022-01-25,"['russian', 'russia', 'nato', 'troop', 'ukrain']","['invad', 'nato', 'war', 'russia', 'ukrain']","['amp', 'troop', 'russian', 'border', 'ukrain']"
2022-02-21,"['order', 'putin', 'ukrain', 'russian', 'troop']","['presid', 'biden', 'putin', 'russia', 'ukrain']","['putin', 'war', 'russia', 'nato', 'ukrain']"
2022-02-22,"['putin', 'russia', 'russian', 'ukrain', 'troop']","['war', 'invad', 'nato', 'russia', 'ukrain']","['standwithukrain', 'trump', 'putin', 'war', '..."
2022-02-24,"['ukrainian', 'border', 'troop', 'ukrain', 'ru...","['stand', 'peopl', 'standwithukrain', 'war', '...","['war', 'putin', 'nato', 'russia', 'ukrain']"
2022-02-25,"['invad', 'putin', 'russia', 'nato', 'ukrain']","['stand', 'peopl', 'standwithukrain', 'war', '...","['ukrainian', 'border', 'troop', 'ukrain', 'ru..."
2022-02-28,"['ukrainian', 'ukrainerussiawar', 'russia', 'r...","['peopl', 'amp', 'war', 'putin', 'ukrain']","['nato', 'putin', 'russian', 'ukrain', 'russia']"
2022-03-01,"['amp', 'war', 'putin', 'ukrain', 'russia']","['amp', 'war', 'putin', 'peopl', 'ukrain']","['ukrainerussiawar', 'kyiv', 'russia', 'russia..."


## Data persistence

In [35]:
daily_topics.to_csv('../data/processed/daily_topics.csv')
media_daily_topics.to_csv('../data/processed/influencer_daily_topics.csv')
normal_daily_topics.to_csv('../data/processed/normal_daily_topics.csv')