In [193]:
import pandas as pd
import numpy as np
import os
import glob
import re

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from estnltk import Text

sns.set_theme(style="whitegrid")

In [None]:
pd.options.display.max_colwidth = 1000

# 1. Leaders

Leaders' tweets are in 13 time intervals starting from 01.01.2022 and ending in 31.01.2023.  
Reading in leaders' data to remove them from the public.

In [None]:
# import politicians' data
df_politicians = pd.concat(map(pd.read_csv, glob.glob('../leaders_data/*.csv')))
df_politicians = df_politicians.reset_index(drop = True)
df_politicians

In [26]:
# import municipality leaders data (were left out from the previous leaders' Twitter script run)
df_municipality = pd.concat(map(pd.read_csv, glob.glob('../new_data_24_02_kov/*.csv')))
df_municipality = df_municipality.reset_index(drop = True)

# after manual checking of twitter accounts, these 5 will remain
options = open("options.txt", "r")
df_municipality = df_municipality[df_municipality['user'].isin(options)]
df_municipality.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58 entries, 13 to 689
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   user              58 non-null     object
 1   tweet_id          58 non-null     int64 
 2   tweet_text        58 non-null     object
 3   tweet_lang        58 non-null     object
 4   start             58 non-null     object
 5   end               58 non-null     object
 6   retweet_count     58 non-null     int64 
 7   reply_count       58 non-null     int64 
 8   like_count        58 non-null     int64 
 9   quote_count       58 non-null     int64 
 10  impression_count  58 non-null     int64 
 11  created_at        58 non-null     object
dtypes: int64(6), object(6)
memory usage: 5.9+ KB


In [27]:
# merge politicians + municipality leaders
df_leaders = pd.concat([df_politicians, df_municipality])
print(df_leaders.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128445 entries, 0 to 689
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   user              128445 non-null  object
 1   tweet_id          128445 non-null  int64 
 2   tweet_text        128443 non-null  object
 3   tweet_lang        128445 non-null  object
 4   start             128445 non-null  object
 5   end               128445 non-null  object
 6   retweet_count     128445 non-null  int64 
 7   reply_count       128445 non-null  int64 
 8   like_count        128445 non-null  int64 
 9   quote_count       128445 non-null  int64 
 10  impression_count  128445 non-null  int64 
 11  created_at        58 non-null      object
dtypes: int64(6), object(6)
memory usage: 12.7+ MB
None


In [28]:
# import data from parties official accounts
df_parties = pd.concat(map(pd.read_csv, glob.glob('../new_data_24_02_erakonnad/*.csv')))
df_parties = df_parties.reset_index(drop = True)
df_parties.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4977 entries, 0 to 4976
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   user              4977 non-null   object
 1   tweet_id          4977 non-null   int64 
 2   tweet_text        4977 non-null   object
 3   tweet_lang        4977 non-null   object
 4   start             4977 non-null   object
 5   end               4977 non-null   object
 6   retweet_count     4977 non-null   int64 
 7   reply_count       4977 non-null   int64 
 8   like_count        4977 non-null   int64 
 9   quote_count       4977 non-null   int64 
 10  impression_count  4977 non-null   int64 
 11  created_at        4977 non-null   object
dtypes: int64(6), object(6)
memory usage: 466.7+ KB


In [29]:
# merge parties with df_leaders to get the final df of all LEADERS
df_leaders = pd.concat([df_leaders, df_parties])
df_leaders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 133422 entries, 0 to 4976
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   user              133422 non-null  object
 1   tweet_id          133422 non-null  int64 
 2   tweet_text        133420 non-null  object
 3   tweet_lang        133422 non-null  object
 4   start             133422 non-null  object
 5   end               133422 non-null  object
 6   retweet_count     133422 non-null  int64 
 7   reply_count       133422 non-null  int64 
 8   like_count        133422 non-null  int64 
 9   quote_count       133422 non-null  int64 
 10  impression_count  133422 non-null  int64 
 11  created_at        5035 non-null    object
dtypes: int64(6), object(6)
memory usage: 13.2+ MB


...133 662 rows of tweets from given time intervals

In [33]:
# drop duplicates
df_leaders = df_leaders.drop_duplicates()
df_leaders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 133422 entries, 0 to 133421
Data columns (total 20 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   user                 133422 non-null  object 
 1   tweet_id             133422 non-null  int64  
 2   tweet_text           133420 non-null  object 
 3   tweet_lang           133422 non-null  object 
 4   start                133422 non-null  object 
 5   end                  133422 non-null  object 
 6   retweet_count        133422 non-null  int64  
 7   reply_count          133422 non-null  int64  
 8   like_count           133422 non-null  int64  
 9   quote_count          133422 non-null  int64  
 10  impression_count     133422 non-null  int64  
 11  created_at           5035 non-null    object 
 12  nimi                 133422 non-null  object 
 13  erakond              51835 non-null   object 
 14  from                 133316 non-null  object 
 15  media            

In [35]:
# save final df of LEADERS
df_leaders.to_csv('../FINALS/leaders_tweets_final.csv', index=False)

# 2. Public

In [36]:
def is_non_zero_file(fpath):  
    return os.path.isfile(fpath) and os.path.getsize(fpath) > 0


In [37]:
csv_files = glob.glob('../keywords_data/*.csv')
column_names = ['tweetId', 'authorId', 'tweetText', 'lang', 'created_at', 'geo', 'isRetweet', 'retweetCount', 'replyCount', 'likeCount', 'quoteCount', 'impressionCount']
df_keywords = pd.DataFrame([], columns=column_names)
chunksize = 10 ** 6

# import data from keywords search
for f in csv_files:
    if is_non_zero_file(f):
        print(f)
        # read the csv file
        
        for chunk in pd.read_csv(f, chunksize=chunksize, names=column_names):
            df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
        

../keywords_data\tweet_keyword_agressioon.csv
../keywords_data\tweet_keyword_eesti putin.csv
../keywords_data\tweet_keyword_eesti ukraina.csv
../keywords_data\tweet_keyword_eesti venemaa.csv
../keywords_data\tweet_keyword_immigratsioon ukraina.csv
../keywords_data\tweet_keyword_lääneriigid.csv
../keywords_data\tweet_keyword_lääs ukraina.csv
../keywords_data\tweet_keyword_lääs.csv
../keywords_data\tweet_keyword_pagulane ukraina.csv
../keywords_data\tweet_keyword_putini sõda.csv
../keywords_data\tweet_keyword_põgenik.csv


  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df

../keywords_data\tweet_keyword_sõda putin.csv
../keywords_data\tweet_keyword_sõda.csv


  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df

../keywords_data\tweet_keyword_sõjakuritegu.csv
../keywords_data\tweet_keyword_sõjakuriteod.csv
../keywords_data\tweet_keyword_sõjapõgenik.csv
../keywords_data\tweet_keyword_ukraina abi.csv
../keywords_data\tweet_keyword_ukraina aggressioon.csv
../keywords_data\tweet_keyword_ukraina agressor.csv
../keywords_data\tweet_keyword_ukraina elu eesti.csv
../keywords_data\tweet_keyword_ukraina erioperatsioon.csv


  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df

../keywords_data\tweet_keyword_ukraina inimene.csv
../keywords_data\tweet_keyword_ukraina inimese.csv
../keywords_data\tweet_keyword_ukraina inimesega.csv
../keywords_data\tweet_keyword_ukraina inimesele.csv
../keywords_data\tweet_keyword_ukraina inimesena.csv
../keywords_data\tweet_keyword_ukraina inimest.csv
../keywords_data\tweet_keyword_ukraina inimeste.csv
../keywords_data\tweet_keyword_ukraina inimestega.csv
../keywords_data\tweet_keyword_ukraina inimestel.csv
../keywords_data\tweet_keyword_ukraina inimestest.csv
../keywords_data\tweet_keyword_ukraina julgeolek.csv
../keywords_data\tweet_keyword_ukraina konflikt.csv


  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)


../keywords_data\tweet_keyword_ukraina koostöö.csv
../keywords_data\tweet_keyword_ukraina liitlane.csv
../keywords_data\tweet_keyword_ukraina migrant.csv
../keywords_data\tweet_keyword_ukraina pagulane.csv
../keywords_data\tweet_keyword_ukraina putin.csv


  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df

../keywords_data\tweet_keyword_ukraina putini.csv
../keywords_data\tweet_keyword_ukraina põgenik.csv
../keywords_data\tweet_keyword_ukraina põgenikku.csv
../keywords_data\tweet_keyword_ukraina põgeniku.csv
../keywords_data\tweet_keyword_ukraina põgenikuks.csv
../keywords_data\tweet_keyword_ukraina põgenikule.csv
../keywords_data\tweet_keyword_ukraina põgenikuna.csv
../keywords_data\tweet_keyword_ukraina põgenikust.csv
../keywords_data\tweet_keyword_ukraina ründama.csv
../keywords_data\tweet_keyword_ukraina ründas.csv


  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df

../keywords_data\tweet_keyword_ukraina rünnak.csv
../keywords_data\tweet_keyword_ukraina sanktsioon.csv
../keywords_data\tweet_keyword_ukraina sõda.csv
../keywords_data\tweet_keyword_ukraina sõja.csv
../keywords_data\tweet_keyword_ukraina sõjaga.csv
../keywords_data\tweet_keyword_ukraina sõjaks.csv
../keywords_data\tweet_keyword_ukraina sõjal.csv
../keywords_data\tweet_keyword_ukraina sõjale.csv
../keywords_data\tweet_keyword_ukraina sõjani.csv
../keywords_data\tweet_keyword_ukraina sõjapõgenik.csv
../keywords_data\tweet_keyword_ukraina sõjapõgeniku.csv
../keywords_data\tweet_keyword_ukraina sõjas.csv


  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df

../keywords_data\tweet_keyword_ukraina sõjast.csv
../keywords_data\tweet_keyword_ukraina toetamine.csv
../keywords_data\tweet_keyword_ukraina toetus.csv
../keywords_data\tweet_keyword_ukraina vaba.csv
../keywords_data\tweet_keyword_ukraina vabastamine.csv
../keywords_data\tweet_keyword_ukraina venemaa.csv
../keywords_data\tweet_keyword_ukraina.csv


  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df

../keywords_data\tweet_keyword_ukrainlane.csv
../keywords_data\tweet_keyword_varjupaiga ukraina.csv
../keywords_data\tweet_keyword_varjupaik ukraina.csv
../keywords_data\tweet_keyword_venemaa agressioon.csv
../keywords_data\tweet_keyword_venemaa agressiooni.csv
../keywords_data\tweet_keyword_venemaa agressor.csv
../keywords_data\tweet_keyword_venemaa erioperatsioon.csv
../keywords_data\tweet_keyword_venemaa konflikt.csv
../keywords_data\tweet_keyword_venemaa putin.csv


  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df

../keywords_data\tweet_keyword_venemaa putini.csv
../keywords_data\tweet_keyword_venemaa putinil.csv
../keywords_data\tweet_keyword_venemaa ründama.csv
../keywords_data\tweet_keyword_venemaa ründamine.csv
../keywords_data\tweet_keyword_venemaa ründas.csv
../keywords_data\tweet_keyword_venemaa rünnak.csv
../keywords_data\tweet_keyword_venemaa sanktsioon.csv
../keywords_data\tweet_keyword_venemaa sõda.csv
../keywords_data\tweet_keyword_venemaa.csv


  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)
  df_keywords = df_keywords.append(chunk[chunk['lang'] == 'et'],ignore_index=True)


In [38]:
df_keywords.columns

Index(['tweetId', 'authorId', 'tweetText', 'lang', 'created_at', 'geo',
       'isRetweet', 'retweetCount', 'replyCount', 'likeCount', 'quoteCount',
       'impressionCount'],
      dtype='object')

In [39]:
df_keywords['lang'].unique()

array(['et'], dtype=object)

In [105]:
df_keywords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72465 entries, 0 to 72464
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   tweetId          72465 non-null  int64 
 1   authorId         72465 non-null  int64 
 2   tweetText        72465 non-null  object
 3   lang             72465 non-null  object
 4   created_at       72465 non-null  object
 5   geo              465 non-null    object
 6   isRetweet        72465 non-null  bool  
 7   retweetCount     72465 non-null  int64 
 8   replyCount       72465 non-null  int64 
 9   likeCount        72465 non-null  int64 
 10  quoteCount       72465 non-null  int64 
 11  impressionCount  72465 non-null  int64 
dtypes: bool(1), int64(7), object(4)
memory usage: 6.2+ MB


In [41]:
# save to file
df_keywords.to_csv('../FINALS/public_all_keywords_data.csv', index=False)

## Get leaders userIds through Twitter API

In [43]:
df_leaders = pd.read_csv('../FINALS/leaders_tweets_final.csv')

  df_leaders = pd.read_csv('../FINALS/leaders_tweets_final.csv')


In [28]:
leaders_twitter_usernames = df_leaders.user.unique()

In [None]:
import tweepy
import configparser
config = configparser.RawConfigParser()
config.read('tweet.ini')
api_key = config['twitter']['api_key']
api_key_secret = config['twitter']['api_key_secret']
bearer_token =  config['twitter']['bearer_token']

client = tweepy.Client(bearer_token, wait_on_rate_limit = True)

auth = tweepy.OAuth2BearerHandler(bearer_token)
api = tweepy.API(auth, wait_on_rate_limit= True, retry_count=12, retry_delay=5, retry_errors=set([503, 500]))

In [30]:
len(leaders_twitter_usernames)

247

In [37]:
results = {}
for username in leaders_twitter_usernames:
    for response in tweepy.Paginator(client.get_users,usernames=[username]):
        if (response.data != None):
            results[username] = response.data[0].id

Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pag

Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pagination_token
Unexpected parameter: pag

In [47]:
df_data = {'username': list(results.keys()), 'author_id': list(results.values())}
new_leaders_id_df = pd.DataFrame(data=df_data)

In [79]:
# save to file
new_leaders_id_df.to_csv('./FINALS/leaders_username_userid.csv')

## Filter leaders tweets out of public

In [457]:
new_leaders_id_df = pd.read_csv('../FINALS/leaders_username_userid.csv')

In [91]:
# first we get an array of twitter usernames where the user has a media value (is a journalist)
media_usernames = df_leaders[df_leaders['media'].notnull()]['twitter_usernames'].unique()

# next we filter our leaders_id_df and remove media usernames from there 
# (we will use politicans only df to remove politican tweets from public tweets)
politicians_only = new_leaders_id_df[~new_leaders_id_df.username.isin(media_usernames)]

In [47]:
# public tweets
df_keywords = pd.read_csv('../FINALS/public_all_keywords_data.csv')

In [100]:
politicians_only_ids = politicians_only.author_id

In [101]:
# ~df_keywords.authorId.isin the ~ character means that instead of isin, we have is not in
public_tweets = df_keywords[~df_keywords.authorId.isin(politicians_only_ids)]

In [102]:
public_tweets = public_tweets.drop_duplicates()
public_tweets = public_tweets.reset_index()
public_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48805 entries, 0 to 48804
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   index            48805 non-null  int64 
 1   tweetId          48805 non-null  int64 
 2   authorId         48805 non-null  int64 
 3   tweetText        48805 non-null  object
 4   lang             48805 non-null  object
 5   created_at       48805 non-null  object
 6   geo              324 non-null    object
 7   isRetweet        48805 non-null  bool  
 8   retweetCount     48805 non-null  int64 
 9   replyCount       48805 non-null  int64 
 10  likeCount        48805 non-null  int64 
 11  quoteCount       48805 non-null  int64 
 12  impressionCount  48805 non-null  int64 
dtypes: bool(1), int64(8), object(4)
memory usage: 4.5+ MB


In [106]:
# save to file
public_tweets.to_csv('../FINALS/public_final.csv', index=False)

## Preprocess

In [868]:
public_final = pd.read_csv('../FINALS/public_final.csv')

In [870]:
public_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48805 entries, 0 to 48804
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   index            48805 non-null  int64 
 1   tweetId          48805 non-null  int64 
 2   authorId         48805 non-null  int64 
 3   tweetText        48805 non-null  object
 4   lang             48805 non-null  object
 5   created_at       48805 non-null  object
 6   geo              324 non-null    object
 7   isRetweet        48805 non-null  bool  
 8   retweetCount     48805 non-null  int64 
 9   replyCount       48805 non-null  int64 
 10  likeCount        48805 non-null  int64 
 11  quoteCount       48805 non-null  int64 
 12  impressionCount  48805 non-null  int64 
dtypes: bool(1), int64(8), object(4)
memory usage: 4.5+ MB


In [871]:
public_final['isRetweet'].value_counts()

False    29326
True     19479
Name: isRetweet, dtype: int64

In [872]:
public_final['lang'].value_counts() # lies!

et    48805
Name: lang, dtype: int64

In [873]:
# filter out rows where isRetweet = True
public_final = public_final[public_final['isRetweet'] == False]

In [874]:
len(public_final)

29326

In [875]:
public_final = public_final.reset_index()

In [876]:
# estonian stopwords: https://datadoi.ee/handle/33/78
with open('../estonian-stopwords.txt', encoding='utf-8') as f:
    stop_words_est = [word for line in f for word in line.split()]

In [877]:
# english words 1
eng_words_nltk = set(nltk.corpus.words.words()) 

In [878]:
# english words 2: 
# SCOWL, http://wordlist.aspell.net/dicts/ --> https://sourceforge.net/projects/wordlist/files/speller/2020.12.07/
with open('en_US-large.dic', encoding='utf-8') as f:
    eng_words_scowl = [word for line in f for word in line.split()]
    eng_words_scowl = [item.lower() for item in eng_words_scowl]
    eng_words_scowl = [item.split('/') for item in eng_words_scowl]
    eng_words_scowl = [item[0] for item in eng_words_scowl]

In [879]:
# lemmas for ENG words (to easily remove them)
lemmatizer = WordNetLemmatizer()

def lemmatize_text(tweet):
    empty = []
    for word, tag in pos_tag(word_tokenize(tweet)):
        word_tag = tag[0].lower()
        #print(word, word_tag)
        word_tag = word_tag if word_tag in ['a', 'r', 'n', 'v'] else None
        if not word_tag:
            lemma = word
            empty.append(lemma)
        else:
            lemma = lemmatizer.lemmatize(word, word_tag)
            empty.append(lemma)
    return ' '.join(empty)

In [880]:
%%time

# preprocessing function
def text_preprocess_est(x):
    x = x.lower() # lower casing
    x = re.sub(r'[\r\n]+', '', x) # remove return and newline characters
    x = re.sub(r'.*[\u0E00-\u0E7F].*', '', x) #remove thai characters
    x = re.sub(r'.*[ëç].*', '', x) # remove albanian, containing character ë
    x = re.sub(r'.*\b(?!sõda)\w*soda\w*\b.*','', x) # remove all text containing 'soda'
    x = re.sub('.*[\u0400-\u04FF].*', '', x) # remove Cyrillic characters
    x = re.sub(r'https*\S+', ' ', x) # remove urls
    x = re.sub(r'@\S+', ' ', x) # remove mentions
    x = re.sub('[0-9]+', '', x) # remove numbers
    x = re.sub(r'[^\w\s]+', '', x) # remove punctuations
    x = re.sub(r'\s{2,}', ' ', x) # replace the over spaces
    x = ' '.join([word for word in x.split(' ') if word not in stop_words_est]) # remove stop words

    return x

# remove ENG words from posts
def remove_eng_words(x):
    x = " ".join(w for w in nltk.wordpunct_tokenize(x) if w not in eng_words_nltk)
    x = " ".join([w for w in x.split(' ') if w not in eng_words_scowl])
    x = re.sub(r'\s*([.,:;!?)])', r'\1', x) # Remove spaces before punctuation marks
    x = re.sub(r'([(])\s*', r'\1', x)
    return x


public_final['clean_text'] = public_final['tweetText'].apply(text_preprocess_est)
public_final['clean_text'] = public_final['clean_text'].apply(lemmatize_text)
public_final['clean_text'] = public_final['clean_text'].apply(remove_eng_words)


#sub['clean_text'] = sub['tweetText'].apply(text_preprocess_est)
#sub['clean_text'] = sub['clean_text'].apply(lemmatize_text)
#sub['clean_text'] = sub['clean_text'].apply(remove_eng_words)

CPU times: total: 15min 32s
Wall time: 15min 34s


In [881]:
public_final['tweetText'].loc[7]

'Vene agressioon on reaalne oht meie riigile ja me peame olema valmis selleks, et kaitsta oma riiki, meie inimesi ja meie vabadusi. https://t.co/SbgwJn0bDE'

In [882]:
public_final['clean_text'].loc[7]

'vene agressioon reaalne oht riigile valmis kaitsta riiki inimesi vabadusi'

In [885]:
public_final = public_final[public_final['clean_text'] != '']

In [886]:
len(public_final)

19207

In [887]:
%%time

# lemmatization EST
def getLemmaSentence(sentence):
    return ' '.join(map(lambda lemma_word: lemma_word[0], sentence.tag_layer().lemma))

# create estnltk's text object 
public_final['estnltk_text'] = public_final['clean_text'].apply(Text)

# add lemma sentences to df
public_final['lemma_text'] = public_final['estnltk_text'].apply(getLemmaSentence)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CPU times: total: 2min 8s
Wall time: 2min 12s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [888]:
public_final['lemma_text'].loc[7]

'vene agressioon reaalne oht riik valmis kaitsma riik inimene vabadus'

## BERT preprocess

In [890]:
%%time

# preprocessing function
# preprocessing function
def text_preprocess_bert(x):
    x = re.sub(r'[\r\n]+', '', x) # remove return and newline characters
    x = re.sub(r'.*[\u0E00-\u0E7F].*', '', x) #remove thai characters
    x = re.sub(r'.*[ëç].*', '', x) # remove albanian, containing character ë,ç
    x = re.sub(r'.*\b(?!sõda)\w*soda\w*\b.*','', x) # remove all text containing 'soda'
    x = re.sub('.*[\u0400-\u04FF].*', '', x) # remove Cyrillic characters
    x = re.sub(r'https*\S+', ' ', x) # remove urls
    x = re.sub(r'@\S+', ' ', x) # remove mentions
    x = re.sub(r'\s{2,}', ' ', x) # replace the over spaces
    
    return x

public_final['bert_text'] = public_final['tweetText'].apply(text_preprocess_bert)
public_final['bert_text'] = public_final['bert_text'].apply(remove_eng_words)

#sub['bert_text'] = sub['tweetText'].apply(text_preprocess_bert)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CPU times: total: 24min 19s
Wall time: 24min 20s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [891]:
# save file
public_final.to_excel('public_bert_not_final.xlsx', index = False)

In [957]:
public_final = pd.read_excel('public_bert_not_final.xlsx')

In [958]:
len(public_final)

19207

In [959]:
public_final = public_final[~public_final['bert_text'].str.contains('aplikasi')]
public_final = public_final[~public_final['bert_text'].str.contains('fiktio')]

In [960]:
len(public_final)

19173

In [961]:
public_final = public_final[~public_final['bert_text'].str.contains('Cherson')]

In [962]:
len(public_final)

19167

In [963]:
public_final = public_final[~public_final['bert_text'].str.contains('Quo vadis')]

In [964]:
len(public_final)

19158

In [965]:
public_final = public_final[~public_final['bert_text'].str.contains('Norberg')]

In [966]:
len(public_final)

19152

In [967]:
public_final = public_final[~public_final['bert_text'].str.contains('krimea')] # 2
public_final = public_final[~public_final['bert_text'].str.contains('türk')] # 17
public_final = public_final[~public_final['bert_text'].str.contains('Crimea')] # 3
public_final = public_final[~public_final['bert_text'].str.contains('Tse ')] # 2
public_final = public_final[~public_final['bert_text'].str.contains(' är ')] # 24
public_final = public_final[~public_final['bert_text'].str.contains('krim ')] # 4

In [968]:
len(public_final)

19098

In [969]:
public_final = public_final[~public_final['bert_text'].str.contains('Slavamuie')] #7
public_final = public_final[~public_final['bert_text'].str.contains(' näin ')] # 4
public_final = public_final[~public_final['clean_text'].str.contains('polen')] # 4
public_final = public_final[~public_final['clean_text'].str.contains(' ke u')] # 25
public_final = public_final[~public_final['bert_text'].str.contains('qe ')] # 90
public_final = public_final[~public_final['bert_text'].str.contains('miksi')] # 2
public_final = public_final[~public_final['bert_text'].str.contains(' bej')] # 3
public_final = public_final[~public_final['bert_text'].str.contains('dajjal')] # 1
public_final = public_final[~public_final['bert_text'].str.contains('dhe')] # 101
public_final = public_final[~public_final['bert_text'].str.contains('tilannekatsaus')] # 18
public_final = public_final[~public_final['clean_text'].str.contains('vittorio')] # 7
public_final = public_final[~public_final['clean_text'].str.contains('sieg')] # 7
public_final = public_final[~public_final['clean_text'].str.contains('fratelli')] # 6
public_final = public_final[~public_final['clean_text'].str.contains('matteo')] # 5
public_final = public_final[~public_final['clean_text'].str.contains('igapäevuussona')] # 12
public_final = public_final[~public_final['clean_text'].str.contains('riihisoft')] # 2
public_final = public_final[~public_final['clean_text'].str.contains('cirkus')] # 4
public_final = public_final[~public_final['clean_text'].str.contains('irgc')] # 172

In [970]:
len(public_final)

18433

In [971]:
public_final = public_final[~public_final['clean_text'].str.contains('natoon')] # 18
public_final = public_final[~public_final['clean_text'].str.contains(' nuk ')] # 13
public_final = public_final[~public_final['clean_text'].str.contains('natossa')] # 6
public_final = public_final[~public_final['clean_text'].str.contains('kosoven')] # 3
public_final = public_final[~public_final['clean_text'].str.contains('ismagilov')] # 17
public_final = public_final[~public_final['clean_text'].str.contains('mufti')] #1
public_final = public_final[~public_final['clean_text'].str.contains('pesan')]
public_final = public_final[~public_final['clean_text'].str.contains(' filmiki ')] # 15
public_final = public_final[~public_final['clean_text'].str.contains(' rosja ')] # 1
public_final = public_final[~public_final['clean_text'].str.contains(' fanart ')] # 1
public_final = public_final[~public_final['clean_text'].str.contains('street')] # 6
public_final = public_final[~public_final['clean_text'].str.contains(' versi ')] # 7
public_final = public_final[~public_final['clean_text'].str.contains('tni ')] # 6
public_final = public_final[~public_final['clean_text'].str.contains('verry')]  # 4
public_final = public_final[~public_final['clean_text'].str.contains('gabanelli')]  # 4
public_final = public_final[~public_final['clean_text'].str.contains('näin ')]  # 16

In [972]:
len(public_final)

18309

In [973]:
public_final = public_final[~public_final['clean_text'].str.contains('jadi')] # 7
public_final = public_final[~public_final['clean_text'].str.contains(' vann ')] # 8
public_final = public_final[~public_final['clean_text'].str.contains('sõjalaul')] # 6

In [974]:
len(public_final)

18288

In [975]:
public_final = public_final.drop_duplicates(subset = 'clean_text')
len(public_final)

17208

In [976]:
public_final = public_final[~public_final['clean_text'].str.contains('veinide')] # 4

In [977]:
len(public_final)

17204

In [978]:
public_final = public_final[~public_final['clean_text'].str.contains(' merr ')] # 3

In [979]:
len(public_final)

17201

In [983]:
tweets_delete = pd.read_excel('filter_out.xlsx')
tweets_delete = tweets_delete[tweets_delete['bert_topic_id'] == 'tags']
tweets_delete = tweets_delete['index'].unique()

In [986]:
public_final = public_final[~public_final['index'].isin(tweets_delete)]

In [987]:
len(public_final) 

16927

In [588]:
#from datetime import datetime, timezone
#public_final['created_at'] = public_final['created_at'].dt.tz_localize(None)

In [988]:
# save file
public_final.to_excel('public_bert.xlsx', index = False)

## Describe df

In [590]:
# get nr of posts per month

public_final['created_at'] = public_final['created_at'].apply(pd.to_datetime) 

placeholder = public_final[['created_at']]
placeholder = placeholder.groupby(pd.Grouper(key='created_at', freq='1M')).size() # groupby each 1 month
placeholder.index = placeholder.index.strftime('%B')
placeholder

created_at
January       196
February     1977
March        3839
April        2066
May          1715
June          847
July          781
August       1027
September    1174
October      1304
November      938
December     1248
January      1651
dtype: int64