# Unsupervised Learning Project - Natural Language Processing

In [11]:
pip install -U textblob

Requirement already up-to-date: textblob in /Users/gabriellanemeth/opt/anaconda3/lib/python3.8/site-packages (0.15.3)
Note: you may need to restart the kernel to use updated packages.


In [12]:
import csv
import json
import re
import string
from datetime import datetime, timedelta
from getpass import getpass

import pandas as pd
import tweepy

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from textblob import TextBlob
import matplotlib.pyplot as plt

In [13]:
"""
INPUTS:
    consumer_key, consumer_secret, access_token, access_token_secret: codes 
    telling twitter that we are authorized to access this data
    hashtag_phrase: the combination of hashtags to search for
OUTPUTS:
    none, simply save the tweet info to a spreadsheet
"""
def search_for_hashtags(consumer_key, consumer_secret, access_token, access_token_secret, hashtag_phrase):
    """Search Twitter for ``hashtag_phrase`` (retweets excluded) and save the hits to a CSV file.

    The file is named after the hashtags found in ``hashtag_phrase`` joined with
    '_' (e.g. '#Hungary #Hungarian' -> 'Hungary_Hungarian.csv') and is created
    relative to the current working directory, overwriting any existing file.

    Args:
        consumer_key, consumer_secret, access_token, access_token_secret:
            Twitter API credentials passed to tweepy's OAuth handler.
        hashtag_phrase: the search query; its '#tags' also determine the file name.

    Returns:
        None. One row per tweet is written with the columns
        timestamp, tweet_text, username, all_hashtags, followers_count.
    """
    #create authentication for accessing Twitter
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)

    #initialize Tweepy API
    api = tweepy.API(auth)

    #get the name of the spreadsheet we will write to
    fname = '_'.join(re.findall(r"#(\w+)", hashtag_phrase))

    #open the spreadsheet we will write to
    # newline='' is what the csv module requires to avoid blank/garbled rows on some
    # platforms; utf-8 is set explicitly because the hashtag list may hold non-ASCII text.
    with open('%s.csv' % (fname), 'w', newline='', encoding='utf-8') as file:

        w = csv.writer(file)

        #write header row to spreadsheet
        w.writerow(['timestamp', 'tweet_text', 'username', 'all_hashtags', 'followers_count'])

        #for each tweet matching our hashtags, write relevant info to the spreadsheet
        # NOTE(review): tweepy >= 4.0 renamed API.search to API.search_tweets — confirm
        # the installed version before re-running.
        search_cursor = tweepy.Cursor(api.search, q=hashtag_phrase+' -filter:retweets',
                                      tweet_mode='extended')
        for tweet in search_cursor.items(50000):
            # NOTE(review): the .encode() calls produce bytes, which csv.writer stores as
            # their repr (b'...'). The cleaning functions later in this notebook rely on
            # that format, so it is kept unchanged here.
            w.writerow([
                tweet.created_at,
                tweet.full_text.replace('\n',' ').encode('ascii','ignore'),
                tweet.user.screen_name.encode('utf-8'),
                [e['text'] for e in tweet._json['entities']['hashtags']],
                tweet.user.followers_count,
            ])

In [14]:
# Secrets are read with getpass instead of input() so that they are not echoed
# into the cell output (and therefore not saved inside the notebook file).
consumer_key = getpass('Consumer Key ')
consumer_secret = getpass('Consumer Secret ')
access_token = getpass('Access Token ')
access_token_secret = getpass('Access Token Secret ')

# The search phrase is not sensitive, so a plain prompt is fine.
hashtag_phrase = input('Hashtag Phrase ')

if __name__ == '__main__':
    search_for_hashtags(consumer_key, consumer_secret, access_token, access_token_secret, hashtag_phrase)

KeyboardInterrupt: Interrupted by user

In [15]:
df = pd.read_csv('Hungary_Hungarian 22.48.21_OLD.csv')

# Text preprocessing

In [16]:
pd.set_option('display.max_colwidth', None)

In [17]:
df.sample(5)

Unnamed: 0,timestamp,tweet_text,username,all_hashtags,followers_count
57,2021-07-15 15:25:44,b'@MattiaSimone97 The best bit is #poland would be absolutely fine outside the block as it stands so would #Hungary. If you punish them they will definitely say F off &amp; go . Interesting times Hope they leave .',b'brexitbadboy',"['poland', 'Hungary']",668
384,2021-07-13 11:14:58,b'Good morning Budapest \xf0\x9f\x98\x8d\xf0\x9f\x98\x8d\xf0\x9f\x98\x8d #Parlament #orsz\xc3\xa1gh\xc3\xa1z #budapest #hungary #sunrise #Parliament #GoodMorningTwitterWorld #WednesdayMotivation #photography #photoofhungary #throwback #cityscape #hungary https://t.co/QSK8hkotyw',b'AgotaSimonPhot1',"['Parlament', 'országház', 'budapest', 'hungary', 'sunrise', 'Parliament', 'GoodMorningTwitterWorld', 'WednesdayMotivation', 'photography', 'photoofhungary', 'throwback', 'cityscape', 'hungary']",8
141,2021-07-15 11:12:24,b'Good news for #Hungary. https://t.co/PV1kIke89k',b'Visegrad_PostEN',['Hungary'],945
1215,2021-07-08 11:48:18,"b'Fun experiment with Reetta. Will continue with these little studies and change of palette, and see where it leads. Reetta Riikonen (EF 91) Oil on panel, 18cm x 23cm #painting #dance #celf #dawns #experiment #wales #hungary #finland #howdoestitaniumwhitework? https://t.co/rLrECFOt4m'",b'carlchapple',"['painting', 'dance', 'celf', 'dawns', 'experiment', 'wales', 'hungary', 'finland', 'howdoestitaniumwhitework']",738
669,2021-07-11 09:10:11,b'#EFOpen #Hungary Results race 2. #MsportXtra @EF_Open https://t.co/lg1Pu9qV7o',b'MsportXtra',"['EFOpen', 'Hungary', 'MsportXtra']",6864


In [18]:
df.shape

(1395, 5)

In [19]:
df.columns

Index(['timestamp', 'tweet_text', 'username', 'all_hashtags',
       'followers_count'],
      dtype='object')

In [20]:
df.timestamp.min()

'2021-07-07 16:27:29'

In [21]:
df.timestamp.max()

'2021-07-15 21:36:51'

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1395 entries, 0 to 1394
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   timestamp        1395 non-null   object
 1   tweet_text       1395 non-null   object
 2   username         1395 non-null   object
 3   all_hashtags     1395 non-null   object
 4   followers_count  1395 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 54.6+ KB


In [23]:
df.username.value_counts().head(10)

b'DNewsHungary'       56
b'bbj_hu'             33
b'VOSTeurope'         23
b'XpatLoop'           13
b'AATSEEL_Tweets'     11
b'AllSportDB'         10
b'tamasivankovacs'     9
b'Susan_Larson_TN'     9
b'FreelanceJobRSS'     9
b'ggreilinger'         8
Name: username, dtype: int64

In [24]:
df.followers_count.max()

4723059

In [25]:
df.followers_count.argmax()

1203

In [26]:
df.iloc[1203]['tweet_text']

'b"In the wake of #Hungary\'s dehumanizing new anti-#LGBT law, is the EU finally getting serious about the authoritarian government in its member state?   Daily Brief: https://t.co/KqPDIh6Wbt https://t.co/JVusHQQOkg"'

In [27]:
df.tweet_text.sample(5)

97     b'Overheated race for #public #money, where only a 14 minutes window of opportunity was available for submission. It reminds me of High-Frequency Trading, where #data transmission speed makes a difference. #EU #Hungary #funding  blob:https://t.co/DjnmSykh4a'
83                                                                        b"@EU_Justice Are you getting all the info on #Hungary? If you read Europa, don't miss out on what 8 other sources have to say #LGBTQ #Politics #EuropeanUnion #Poland https://t.co/2yX3fFrpjV"
561                                                                  b'FOURTH WAVE POSSIBLE IN #HUNGARY SAYS VIROLOGIST  *************** Enjoy More News, Information &amp; Inspiration on https://t.co/qbyJG5JWsE - Your Expat Community Portal https://t.co/jV8Cp6xC0T'
750                                                                                                                         b'My dinner is served! #Hungarian #cuisine \xe2\x9d\xa4\xef\xb8\x8f\xf0\x9f\x8

# Cleaning

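During the cleaning process the following changes are applied to each tweet:
 - removed: URLs, punctuation, tokens containing digits, English stopwords, words starting with 'x' (leftovers of escaped non-ASCII bytes such as `\xf0`), and the search terms "hungary" / "hungarian"
 - converted to lowercase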

In [28]:
def remove_URL(tweet):
    """Strip the bytes-literal prefix and every URL token from a stored tweet.

    The tweets in the CSV are stored as the repr of a bytes object
    (``b'...'`` / ``b"..."``). The leading ``b`` is dropped only when it really
    is that prefix, so plain text does not lose its first character.

    Args:
        tweet: the raw tweet text as read from the CSV.

    Returns:
        The whitespace-separated words of ``tweet`` that do not contain
        'http', joined by single spaces.
    """
    if tweet.startswith(("b'", 'b"')):
        tweet = tweet[1:]
    # URL tokens are skipped outright instead of being replaced by empty strings,
    # which used to leave stray double/trailing spaces in the result.
    return ' '.join(word for word in tweet.split() if 'http' not in word)

In [29]:
def alphanumeric(x):
    """Lowercase ``x`` and replace every run of word characters that contains a digit with a space."""
    # Raw string: '\w' and '\d' are invalid escapes in a normal string literal.
    return re.sub(r'\w*\d\w*', ' ', x.lower())


def punc(x):
    """Replace every character of ``string.punctuation`` found in ``x`` with a space."""
    return re.sub('[%s]' % re.escape(string.punctuation), ' ', x)

In [30]:
def remove_stop_words(word_token):
    """Drop English stopwords from a whitespace-separated string.

    Args:
        word_token: text whose words are separated by whitespace.

    Returns:
        The words that are not in NLTK's English stopword list, in their
        original order, joined by single spaces.
    """
    english_stops = set(stopwords.words('english'))
    kept = [token for token in word_token.split() if token not in english_stops]
    return ' '.join(kept)

In [31]:
def extra_cleaning(text):
    """Remove words starting with 'x' and the search terms themselves.

    Args:
        text: whitespace-separated, already lower-cased tweet text.

    Returns:
        The remaining words joined by single spaces. Words beginning with 'x'
        and the exact words "hungary" / "hungarian" are dropped.
    """
    search_terms = ("hungary", "hungarian")
    return ' '.join(
        token
        for token in text.split()
        if not token.startswith('x') and token not in search_terms
    )

In [32]:
df['tweet_text_cleaned'] = df['tweet_text'].apply(remove_URL)

In [33]:
df['tweet_text_cleaned'] = df.tweet_text_cleaned.map(alphanumeric).map(punc)

In [34]:
df['tweet_text_cleaned'] = df['tweet_text_cleaned'].apply(remove_stop_words)

In [35]:
df['tweet_text_cleaned'] = df['tweet_text_cleaned'].apply(extra_cleaning)

In [36]:
df['tweet_text_cleaned'].sample(5)

569                                                                                                                                                                          comments bad behaviour cost eur football
1201                                                                                      law affecting lgbt rights european union debating bring country line fundamental values activists vowing civil disobedience
475                                              forget treat kindness others treat kindness weekend weekend kindness kindnessmatters wellbeing wellness healthy mondaymotivation motivationmonday motivationalmonday
738     poland yr bond auction bust mum monetary policy non rez ownership w green variety turn earth day eur million issue oversubscribed yield warsaw exchange listed co energy ebrd eur million buy emergingmarkets
1329                                 orangealert due thunderstorm southern great plain central southern transdanubia western transdanubia northe

In [37]:
df2 = df.copy()

In [38]:
# Build a word-count matrix from the cleaned tweets (CountVectorizer with default settings).
corpus = df2['tweet_text_cleaned']

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of get_feature_names_out() — confirm the installed version before re-running.
print(len(vectorizer.get_feature_names()))
print(X.toarray())

6776
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


# Modeling

As the dataset consists of short documents, <b>non-negative matrix factorization</b> is expected to give the best results for identifying the possible topics.

In [39]:
# A fixed random_state keeps the factorization reproducible across runs. The topic
# order matters because the hand-written topic labels later in this notebook are
# assigned by topic index.
# NOTE(review): re-check those labels against the top words after the first seeded run.
nmf = NMF(n_components=10, random_state=42)

# NMF.fit returns the fitted estimator itself, so doc_topic is the same object as nmf.
doc_topic = nmf.fit(X)

In [40]:
tweet_topic_matrix = nmf.transform(X)

In [41]:
tweet_topic_matrix_df = pd.DataFrame(tweet_topic_matrix).add_prefix('topic_')

tweet_topic_matrix_df[['raw_tweets', 'clean_tweets']] = df2[['tweet_text', 'tweet_text_cleaned']]
tweet_topic_matrix_df.head(5)

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,raw_tweets,clean_tweets
0,0.0,0.0,0.016369,0.002946,0.017207,0.0,0.019593,0.012855,0.068564,0.0,"b'#Cocaine stash worth \xe2\x82\xac9m lands on roof of home in Sardinia. ""Rizzo was working as a pilot for a #Hungary company and flying wealthy people around when he was arrested at Cagliari Elmas airport after landing a flight from Germany ."" https://t.co/LBxvqcurKK'",cocaine stash worth lands roof home sardinia rizzo working pilot company flying wealthy people around arrested cagliari elmas airport landing flight germany
1,0.000406,0.0,0.000535,0.000322,0.0,0.001042,0.00363,0.00155,0.0,0.0,b'Hilarious \xf0\x9f\xa4\xa3 - Are you afraid of Sexual Harassment Panda? #Hungary https://t.co/vkFMBk3t8J',hilarious afraid sexual harassment panda
2,0.1416,0.0,0.331099,0.116986,0.0,0.251801,0.0,0.0,0.0,0.0,"b'#EU takes legal action against #Hungary, #Poland over #LGBTQ rights The legal cases could eventually land the two member states in the bloc\xe2\x80\x99s highest court. https://t.co/KFYGbprovF'",eu takes legal action poland lgbtq rights legal cases could eventually land two member states bloc highest court
3,0.154353,0.0,0.13702,0.0,0.013779,0.0,0.0,0.0,0.006473,0.00045,"b'#EU #Poland Somehow I doubt that we will see a #Polexit ... that might be followed by potential #Hungary leaving ... If one or both would leave, the entire organization might fold down #ToBeContinued https://t.co/QoFcSu6v8o'",eu poland somehow doubt see polexit might followed potential leaving one would leave entire organization might fold tobecontinued
4,0.138778,0.0,0.28905,0.0,0.0,0.245385,0.0,0.0,0.0,0.0,b'#EU launches legal action against #Hungary and #Poland for violating #rights_LGBTQ+ citizens World News \xc2\xb7',eu launches legal action poland violating rights lgbtq citizens world news


In [42]:
word_topic_matrix_df = pd.DataFrame(nmf.components_, columns=vectorizer.get_feature_names()).T.add_prefix('topic_')
word_topic_matrix_df.head(5)

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9
aa,0.0,0.0,0.0,0.0,0.00095,0.0,0.000676,0.0,0.002153,0.0
aaahh,0.0,0.0,0.000175,0.0,0.000188,0.000179,0.0,0.0,0.000124,0.0
abandon,0.06391,0.006221,0.0,0.046398,0.0,0.0,0.0,0.0,0.0,0.007665
abandoned,0.0,0.0,0.0,0.0,0.0,0.0,0.034671,0.00907,0.0,0.008787
abazina,0.000331,0.0,0.0,0.0,0.0,1.3e-05,0.0,0.00025,0.001301,0.000423


In [43]:
word_topic_matrix_df.sort_values(by = ['topic_2'], ascending=False).head(5)

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9
poland,0.0,0.0,4.107992,0.000289,0.0,0.0,0.0,0.0,0.265484,0.002136
rights,0.001452,0.055304,2.319103,0.0,0.0,0.062522,0.0,0.166686,0.0,0.0
legal,0.136041,0.175842,1.299287,0.028515,0.0,0.032919,0.057636,0.0,0.0,0.0
commission,0.800417,0.000512,0.940748,0.032783,0.0,0.0,0.30381,0.004203,0.0,0.0
action,0.203963,0.517153,0.863837,0.0,0.0,0.222845,0.0,0.0,0.0,0.0


In [44]:
tweet_topic_matrix_df.sort_values(by = ['topic_2'], ascending=False).head(5)

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,raw_tweets,clean_tweets
135,0.479506,0.0,0.528254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"b'EU founding values: Commission starts legal action against #Hungary and #Poland for violations of fundamental rights of #LGBTIQ people ""Equality and the respect for dignity and human rights are core values of the EU, enshrined in the EU Treaty https://t.co/cW2wBpfNCa #RuleOfLaw'",eu founding values commission starts legal action poland violations fundamental rights lgbtiq people equality respect dignity human rights core values eu enshrined eu treaty ruleoflaw
93,0.300145,0.053566,0.498554,0.03133,0.00037,0.0,0.00067,0.0,0.0,0.0,"b""Both #Hungary and #Poland show no sign of backing down after today's EU legal challenge to anti-gay policies. The Commission's legal case is shaky here, especially against Poland which along with the UK negotiated an exemption to the EU Charter of Fundamental Rights in 2009. https://t.co/audf7i4S1n""",poland show sign backing today eu legal challenge anti gay policies commission legal case shaky especially poland along uk negotiated exemption eu charter fundamental rights
29,0.27611,0.053678,0.464478,0.0,0.006236,0.504273,0.010942,0.023096,0.0,0.0,"b'EU launches legal action against Hungary, Poland over LGBTQ rights. Passers-by stand in front of the Brussels Town Hall and the rest of the histor.... https://t.co/X6vSXvmVvR #asia_news #eu #european_union #hungary #international #lgbt #lgbtq #news #phuket #poland #thaila... https://t.co/nXKHBvtoYt'",eu launches legal action poland lgbtq rights passers stand front brussels town hall rest histor asia news eu european union international lgbt lgbtq news phuket poland thaila
447,0.117763,0.0,0.451565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"b'What Do Poland And Hungary Have In Common? Poland and Hungary are both members of the European Union (EU) and the North Atlantic Treaty Organization (NATO). For more details, visit the link #POLAND #Hungary https://t.co/rn3xZRUiDd'",poland common poland members european union eu north atlantic treaty organization nato details visit link poland
138,0.12338,0.088443,0.441328,0.0,0.0,0.0,0.0,0.035445,0.0,0.0,"b'EU launches legal action against Hungary, Poland over #LGBT rights #EuropeanUnion #Hungary #Poland #HumanRights https://t.co/XCCmszcLTU'",eu launches legal action poland lgbt rights europeanunion poland humanrights


In [45]:
topic_word = pd.DataFrame(nmf.components_.round(3),
             index = ['topic_0','topic_1','topic_2','topic_3','topic_4','topic_5','topic_6','topic_7', 'topic_8',
                      'topic_9'],
             columns = vectorizer.get_feature_names())

In [46]:
topic_word

Unnamed: 0,aa,aaahh,abandon,abandoned,abazina,abazinia,abbottnews,abc,abkhaz,abkhazia,...,zoe,zoltan,zoltanspox,zone,zones,zoo,zsa,zsazsa,zsazsagabor,zygmunt
topic_0,0.0,0.0,0.064,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.007,0.0,0.0,0.0,0.0,0.0,0.0,0.0
topic_1,0.0,0.0,0.006,0.0,0.0,0.0,0.0,0.02,0.0,0.0,...,0.0,0.0,0.002,0.011,0.019,0.0,0.0,0.0,0.0,0.0
topic_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.032,0.172,0.0,0.0,0.0,0.0,0.011
topic_3,0.0,0.0,0.046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.011,0.0,0.0,0.007,0.016,0.0,0.0,0.0,0.0,0.0
topic_4,0.001,0.0,0.0,0.0,0.0,0.0,0.015,0.0,0.0,0.0,...,0.0,0.015,0.0,0.0,0.0,0.015,0.064,0.014,0.0,0.0
topic_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.046,0.0,0.038,0.0,0.0,0.0,0.0,0.0
topic_6,0.001,0.0,0.0,0.035,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.016,0.012,0.008,0.0,0.004,0.0,0.0,0.015
topic_7,0.0,0.0,0.0,0.009,0.0,0.0,0.0,0.003,0.0,0.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0
topic_8,0.002,0.0,0.0,0.0,0.001,0.001,0.0,0.0,0.001,0.001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.004,0.0,0.0,0.002
topic_9,0.0,0.0,0.008,0.009,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.009,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted words of every topic in ``model.components_``.

    Args:
        model: object exposing ``components_``, iterated row by row (one row per topic).
        feature_names: sequence mapping a column index to its word.
        no_top_words: how many of the highest-weighted words to list per topic.
        topic_names: optional sequence of labels; a missing or falsy label falls
            back to the numeric topic index.
    """
    for topic_idx, weights in enumerate(model.components_):
        has_label = bool(topic_names) and bool(topic_names[topic_idx])
        if has_label:
            print("\nTopic: '", topic_names[topic_idx], "'")
        else:
            print("\nTopic ", topic_idx)
        # Indices sorted by descending weight, truncated to the requested count.
        top_indices = weights.argsort()[::-1][:no_top_words]
        top_words = [feature_names[i] for i in top_indices]
        print(", ".join(top_words))

In [48]:
display_topics(nmf, vectorizer.get_feature_names(), 10)


Topic  0
eu, commission, orban, viktor, recovery, like, values, funds, vonderleyen, orb

Topic  1
law, lgbt, anti, action, parliament, demand, take, effect, protest, fight

Topic  2
poland, rights, legal, commission, action, fundamental, lgbtiq, european, people, violations

Topic  3
new, cases, recovered, confirmed, deaths, today, time, coronavirus, increase, update

Topic  4
budapest, summer, parliament, city, travel, dailynewshungary, first, international, tourism, step

Topic  5
lgbtq, orban, schools, let, activists, laws, anti, parliament, says, action

Topic  6
amp, news, community, information, inspiration, enjoy, portal, expat, commission, en

Topic  7
children, book, lgbt, families, fined, bookshop, chain, picture, family, parents

Topic  8
europe, france, romania, austria, canada, germany, greece, italy, portugal, language

Topic  9
great, southern, transdanubia, plain, due, severeweather, cet, ending, central, northern


In [49]:
tweet_topic_matrix_df.columns

Index(['topic_0', 'topic_1', 'topic_2', 'topic_3', 'topic_4', 'topic_5',
       'topic_6', 'topic_7', 'topic_8', 'topic_9', 'raw_tweets',
       'clean_tweets'],
      dtype='object')

In [50]:
tweet_topic_matrix_df = tweet_topic_matrix_df.rename(columns={'topic_0' : 'EU_HU', 'topic_1': 'Anti_LGBT', 'topic_2':'Poland', 
                                      'topic_3':'Coronavirus', 'topic_4':'Tourism', 'topic_5':'Sex_edu',
                                      'topic_6': 'Mix', 'topic_7': 'Book_censure', 'topic_8':'International_news', 
                                      'topic_9': 'Weather'})


In [51]:
# Take the row-wise maximum over numeric columns only: the frame also holds the raw and
# cleaned tweet text, and comparing strings with floats raises a TypeError on pandas >= 2.0.
tweet_topic_matrix_df['main_topic'] = tweet_topic_matrix_df.max(axis=1, numeric_only=True)

In [52]:
tweet_topic_matrix_df['main_topic_id'] = tweet_topic_matrix_df[['EU_HU', 'Anti_LGBT', 'Poland', 'Coronavirus', 
                                                                'Tourism', 'Sex_edu','Mix', 'Book_censure', 
                                                                'International_news', 'Weather']].idxmax(axis=1)

In [53]:
tweet_topic_matrix_df.head(3)

Unnamed: 0,EU_HU,Anti_LGBT,Poland,Coronavirus,Tourism,Sex_edu,Mix,Book_censure,International_news,Weather,raw_tweets,clean_tweets,main_topic,main_topic_id
0,0.0,0.0,0.016369,0.002946,0.017207,0.0,0.019593,0.012855,0.068564,0.0,"b'#Cocaine stash worth \xe2\x82\xac9m lands on roof of home in Sardinia. ""Rizzo was working as a pilot for a #Hungary company and flying wealthy people around when he was arrested at Cagliari Elmas airport after landing a flight from Germany ."" https://t.co/LBxvqcurKK'",cocaine stash worth lands roof home sardinia rizzo working pilot company flying wealthy people around arrested cagliari elmas airport landing flight germany,0.068564,International_news
1,0.000406,0.0,0.000535,0.000322,0.0,0.001042,0.00363,0.00155,0.0,0.0,b'Hilarious \xf0\x9f\xa4\xa3 - Are you afraid of Sexual Harassment Panda? #Hungary https://t.co/vkFMBk3t8J',hilarious afraid sexual harassment panda,0.00363,Mix
2,0.1416,0.0,0.331099,0.116986,0.0,0.251801,0.0,0.0,0.0,0.0,"b'#EU takes legal action against #Hungary, #Poland over #LGBTQ rights The legal cases could eventually land the two member states in the bloc\xe2\x80\x99s highest court. https://t.co/KFYGbprovF'",eu takes legal action poland lgbtq rights legal cases could eventually land two member states bloc highest court,0.331099,Poland


Now that the topics are defined, let's see how they appeared on the different days of the week after the law came into effect.

In [54]:
def date_time_obj(text):
    """Parse the calendar date of a 'YYYY-MM-DD HH:MM:SS' timestamp string.

    Args:
        text: timestamp string whose first ten characters are 'YYYY-MM-DD'.

    Returns:
        A ``datetime`` at midnight of that date (the time of day is discarded).

    Raises:
        ValueError: if the first ten characters are not a valid 'YYYY-MM-DD' date.
    """
    # The full four-digit year is parsed; a two-digit '%y' would guess the century.
    return datetime.strptime(text[:10], '%Y-%m-%d')

In [55]:
df2['timestamp'] = df2['timestamp'].apply(date_time_obj)

In [56]:
df2.head(3)

Unnamed: 0,timestamp,tweet_text,username,all_hashtags,followers_count,tweet_text_cleaned
0,2021-07-15,"b'#Cocaine stash worth \xe2\x82\xac9m lands on roof of home in Sardinia. ""Rizzo was working as a pilot for a #Hungary company and flying wealthy people around when he was arrested at Cagliari Elmas airport after landing a flight from Germany ."" https://t.co/LBxvqcurKK'",b'antaldaniel',"['Cocaine', 'Hungary']",1488,cocaine stash worth lands roof home sardinia rizzo working pilot company flying wealthy people around arrested cagliari elmas airport landing flight germany
1,2021-07-15,b'Hilarious \xf0\x9f\xa4\xa3 - Are you afraid of Sexual Harassment Panda? #Hungary https://t.co/vkFMBk3t8J',b'gaborcihlar',['Hungary'],8,hilarious afraid sexual harassment panda
2,2021-07-15,"b'#EU takes legal action against #Hungary, #Poland over #LGBTQ rights The legal cases could eventually land the two member states in the bloc\xe2\x80\x99s highest court. https://t.co/KFYGbprovF'",b'VassilisTsarnas',"['EU', 'Hungary', 'Poland', 'LGBTQ']",2199,eu takes legal action poland lgbtq rights legal cases could eventually land two member states bloc highest court


In [57]:
df2.groupby('timestamp').count()

Unnamed: 0_level_0,tweet_text,username,all_hashtags,followers_count,tweet_text_cleaned
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-07-07,62,62,62,62,62
2021-07-08,301,301,301,301,301
2021-07-09,208,208,208,208,208
2021-07-10,118,118,118,118,118
2021-07-11,117,117,117,117,117
2021-07-12,160,160,160,160,160
2021-07-13,103,103,103,103,103
2021-07-14,122,122,122,122,122
2021-07-15,204,204,204,204,204


# Sentiment analysis

In [58]:
def sent_polar(text):
    """Return the TextBlob sentiment polarity score of ``text``."""
    return TextBlob(text).sentiment.polarity

In [59]:
df2['sent_polarity'] = df2.tweet_text_cleaned.apply(sent_polar)

In [60]:
def sent_pol_bi(num):
    """Collapse a polarity score into a binary flag.

    Returns 1 when ``num`` is strictly positive and -1 otherwise, so a
    neutral score of exactly 0 is mapped to -1 as well.
    """
    return 1 if num > 0 else -1

In [61]:
df2['sent_pol_bi'] = df2.sent_polarity.apply(sent_pol_bi)

In [62]:
def sent_subj(text):
    """Return the TextBlob sentiment subjectivity score of ``text``."""
    return TextBlob(text).sentiment.subjectivity

In [63]:
df2['sent_subject'] = df2.tweet_text_cleaned.apply(sent_subj)

In [64]:
df2.head(3)

Unnamed: 0,timestamp,tweet_text,username,all_hashtags,followers_count,tweet_text_cleaned,sent_polarity,sent_pol_bi,sent_subject
0,2021-07-15,"b'#Cocaine stash worth \xe2\x82\xac9m lands on roof of home in Sardinia. ""Rizzo was working as a pilot for a #Hungary company and flying wealthy people around when he was arrested at Cagliari Elmas airport after landing a flight from Germany ."" https://t.co/LBxvqcurKK'",b'antaldaniel',"['Cocaine', 'Hungary']",1488,cocaine stash worth lands roof home sardinia rizzo working pilot company flying wealthy people around arrested cagliari elmas airport landing flight germany,0.4,1,0.55
1,2021-07-15,b'Hilarious \xf0\x9f\xa4\xa3 - Are you afraid of Sexual Harassment Panda? #Hungary https://t.co/vkFMBk3t8J',b'gaborcihlar',['Hungary'],8,hilarious afraid sexual harassment panda,0.133333,1,0.911111
2,2021-07-15,"b'#EU takes legal action against #Hungary, #Poland over #LGBTQ rights The legal cases could eventually land the two member states in the bloc\xe2\x80\x99s highest court. https://t.co/KFYGbprovF'",b'VassilisTsarnas',"['EU', 'Hungary', 'Poland', 'LGBTQ']",2199,eu takes legal action poland lgbtq rights legal cases could eventually land two member states bloc highest court,0.166667,1,0.166667


In [65]:
df_total = pd.concat([df2, tweet_topic_matrix_df], axis=1, join="inner")

In [66]:
df_total.columns

Index(['timestamp', 'tweet_text', 'username', 'all_hashtags',
       'followers_count', 'tweet_text_cleaned', 'sent_polarity', 'sent_pol_bi',
       'sent_subject', 'EU_HU', 'Anti_LGBT', 'Poland', 'Coronavirus',
       'Tourism', 'Sex_edu', 'Mix', 'Book_censure', 'International_news',
       'Weather', 'raw_tweets', 'clean_tweets', 'main_topic', 'main_topic_id'],
      dtype='object')

In [67]:
df_total_cleaned = df_total[['timestamp', 'tweet_text','tweet_text_cleaned', 'sent_polarity', 'sent_pol_bi',
                             'sent_subject', 'EU_HU', 'Anti_LGBT', 'Poland', 'Coronavirus','Tourism', 'Sex_edu', 
                             'Mix', 'Book_censure', 'International_news','Weather','main_topic', 'main_topic_id']]

In [68]:
df_total_cleaned.head(3)

Unnamed: 0,timestamp,tweet_text,tweet_text_cleaned,sent_polarity,sent_pol_bi,sent_subject,EU_HU,Anti_LGBT,Poland,Coronavirus,Tourism,Sex_edu,Mix,Book_censure,International_news,Weather,main_topic,main_topic_id
0,2021-07-15,"b'#Cocaine stash worth \xe2\x82\xac9m lands on roof of home in Sardinia. ""Rizzo was working as a pilot for a #Hungary company and flying wealthy people around when he was arrested at Cagliari Elmas airport after landing a flight from Germany ."" https://t.co/LBxvqcurKK'",cocaine stash worth lands roof home sardinia rizzo working pilot company flying wealthy people around arrested cagliari elmas airport landing flight germany,0.4,1,0.55,0.0,0.0,0.016369,0.002946,0.017207,0.0,0.019593,0.012855,0.068564,0.0,0.068564,International_news
1,2021-07-15,b'Hilarious \xf0\x9f\xa4\xa3 - Are you afraid of Sexual Harassment Panda? #Hungary https://t.co/vkFMBk3t8J',hilarious afraid sexual harassment panda,0.133333,1,0.911111,0.000406,0.0,0.000535,0.000322,0.0,0.001042,0.00363,0.00155,0.0,0.0,0.00363,Mix
2,2021-07-15,"b'#EU takes legal action against #Hungary, #Poland over #LGBTQ rights The legal cases could eventually land the two member states in the bloc\xe2\x80\x99s highest court. https://t.co/KFYGbprovF'",eu takes legal action poland lgbtq rights legal cases could eventually land two member states bloc highest court,0.166667,1,0.166667,0.1416,0.0,0.331099,0.116986,0.0,0.251801,0.0,0.0,0.0,0.0,0.331099,Poland


In [69]:
df_total_cleaned.to_csv('df_total.csv')

Please see the visualisation of the results on the Tableau dashboard.