In [None]:
import pandas as pd
import numpy as np
import re

# Imports for plotting
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator

# sentiment discovery imports
from textblob import TextBlob

# Lemmatization for wordclouds
from textblob import Word

# wordcloud library
from wordcloud import WordCloud

# stopwords
from nltk.corpus import stopwords


# Tf-Idf
from sklearn.feature_extraction.text import TfidfVectorizer
import heapq

# utilities
from tqdm.auto import tqdm
import datetime

In [None]:
# Load the pre-processed tweet frame. Later cells read its 'translation',
# 'created_at' and 'language' columns.
# NOTE(review): unpickling can execute arbitrary code — only load a pickle
# file that was produced by a trusted source.
df = pd.read_pickle('pre-processed-data.pkl')

## Initial Language Distribution 

In [None]:
# Tweet export as read from CSV; later cells use its 'language', 'tweet'
# and 'hashtags' columns.
initial = pd.read_csv('TweetsAboutCovid-19.csv')

# Language code of every tweet, coerced to a NumPy string array so it can be
# counted with np.unique in the next cell.
language_column = initial['language']
languages = np.asarray(language_column, dtype=str)

In [None]:
# Count tweets per language code and sort the codes by descending frequency.
lang, count = np.unique(languages, return_counts=True)
order = np.flip(np.argsort(count))

# Bars are drawn at log2(count) + 1 so that a language with a single tweet
# still gets a visible bar of height 1.
bar_heights = 1 + np.log2(count[order])

# Tick at height p corresponds to 2**(p - 1) tweets (inverse of the transform above).
tick_positions = list(range(1, 20, 2))
tick_labels = [2 ** (position - 1) for position in tick_positions]

fig, ax = plt.subplots(figsize=(18, 6))
sns.barplot(x=lang[order], y=bar_heights, ax=ax)
ax.set_yticks(tick_positions)
ax.set_yticklabels(tick_labels)
ax.set_ylabel('Number of Tweets in Dataset')
ax.set_xlabel('Language Code')
ax.set_title('Initial distribution of languages', fontsize=20)
fig.savefig('inital_languages.pdf', bbox_inches='tight')


## Hashtag and Mentions Analysis

In [None]:
# Pattern for @-mentions: an '@' (ASCII or full-width) that is not preceded by
# a word/symbol character, followed by a 1-15 character handle (captured).
mention_pattern = re.compile(
    r'(?:^|[^a-zA-Z0-9_＠!@#$%&*])(?:(?:@|＠)(?!\/))([a-zA-Z0-9/_.]{1,15})(?:\b(?!@|＠)|$)'
)

# One list of captured handles per tweet, in the row order of `initial`.
mentions = [mention_pattern.findall(text) for text in initial['tweet']]

In [None]:
# Store the extracted handles as a new 'mentions' column (one list per tweet);
# `mentions` was built by iterating initial['tweet'], so the rows line up.
initial['mentions'] = mentions

In [None]:
fig = plt.figure(figsize=(12, 8))
plt.subplots_adjust(hspace=0.25)

# 2x2 grid: hashtag bars (top-left), mention bars (top-right) and a scatter
# spanning the whole bottom row.
sub1 = fig.add_subplot(2, 2, 1)
sub2 = fig.add_subplot(2, 2, 2)
sub3 = fig.add_subplot(2, 2, (3, 4))

# Languages compared against the study, with the study's reported ratios
# (same order as `languages`).
languages = ['nl', 'en', 'de', 'fr', 'in', 'ja', 'pt', 'es']
study_hashtags = [0.16, 0.14, 0.25, 0.16, 0.1, 0.04, 0.11, 0.12]
study_mentions = [0.62, 0.5, 0.28, 0.55, 0.77, 0.48, 0.45, 0.62]

# Compute every dataset ratio exactly once. The mean of a boolean mask is the
# share of True values; for a language without any rows it yields NaN instead
# of raising ZeroDivisionError, so the figure can still be produced.
hashtag_ratios = []
mention_ratios = []
for code in languages:
    subset = initial[initial['language'] == code]
    # presumably 'hashtags' holds the textual form of a list, so a length of
    # 2 ("[]") means "no hashtags" — TODO confirm against the CSV
    hashtag_ratios.append((subset['hashtags'].str.len() > 2).mean())
    mention_ratios.append((subset['mentions'].str.len() > 0).mean())

positions = np.arange(len(languages))

# Grouped bars: dataset ratio on the left, study ratio on the right of each tick.
bar_panels = [
    (sub1, hashtag_ratios, study_hashtags,
     'Ratio of Tweets including Hashtags', 'Usage of Hashtags'),
    (sub2, mention_ratios, study_mentions,
     'Ratio of Tweets including Mentions', 'Usage of Mentions'),
]
for axis, dataset_ratios, study_ratios, ylabel, title in bar_panels:
    axis.bar(positions - 0.2, dataset_ratios, width=0.4, color='darkblue', label='Dataset Ratio')
    axis.bar(positions + 0.2, study_ratios, width=0.4, color='darkred', label='Study Ratio')
    axis.set_ylabel(ylabel)
    axis.legend()
    # Ticks and labels are set separately: passing labels to set_xticks is
    # only supported by newer matplotlib releases.
    axis.set_xticks(positions)
    axis.set_xticklabels(languages)
    axis.set_title(title)

# Scatter of change in Hashtag and Mention ratio
sub3.scatter(mention_ratios, hashtag_ratios, color='darkblue', label='Dataset Ratio', marker='x')
sub3.scatter(study_mentions, study_hashtags, color='darkred', label='Study Ratio', marker='x')
for code, mention_ratio, hashtag_ratio, mention_study, hashtag_study in zip(
        languages, mention_ratios, hashtag_ratios, study_mentions, study_hashtags):
    # Label the dataset point and connect it to the corresponding study point.
    sub3.text(x=mention_ratio, y=hashtag_ratio + 0.02, s=code)
    sub3.plot([mention_ratio, mention_study], [hashtag_ratio, hashtag_study],
              color='black', ls='--')

sub3.set_title('Change in Ratios compared to study')
sub3.set_ylabel('Ratio of Hashtags')
sub3.set_xlabel('Ratio of Mentions')
sub3.legend()

fig.suptitle('Comparison of Hashtag and Mention usage to Weerkamp et al (2011)', fontsize=16)
fig.savefig('hashtag-mentions.pdf', bbox_inches='tight');

## Activity and Sentiment Analysis

In [None]:
# perfrom sentiment analysis using textblob
# Does ~4k iterations per second
polarity = []
sentiment = []
for tweet in tqdm(df['translation']):
    blob = TextBlob(tweet)
    pol = 0
    for sentence in blob.sentences:
        pol += sentence.sentiment.polarity
    polarity.append(pol)
    sentiment.append('positive' if pol > 0.5 else 'negative')

In [None]:
# Convert each creation time to a numeric timestamp (via its .timestamp()
# method) so it can be used as a continuous plot axis.
posting_times = []
for created_at in df['created_at']:
    posting_times.append(created_at.timestamp())

In [None]:
# Add columns to dataframe
df['polarity'] = polarity
df['sentiment'] = sentiment
df['timestamps'] = posting_times

In [None]:
sns.set_theme(style="whitegrid")

fig, ax = plt.subplots()

# Horizontal split violins: one row per language, posting time on the x axis,
# the two halves of each violin separated by sentiment label.
sns.violinplot(data=df, y='language', x='timestamps', orient='h', inner=None,
               ax=ax, hue='sentiment', split=True, palette={"positive": "r", "negative": "b"})

# Major ticks: midnight (UTC) of four consecutive days starting 2021-04-21
# (epoch second 1618963200), labelled as YY-MM-DD.
seconds_per_day = 86400
first_day = datetime.datetime(2021, 4, 21, tzinfo=datetime.timezone.utc)
day_starts = [first_day + datetime.timedelta(days=offset) for offset in range(4)]
ax.set_xticks([day.timestamp() for day in day_starts])
ax.set_xticklabels([day.strftime('%y-%m-%d') for day in day_starts])

# Minor ticks every six hours, labelled with the hour of the day.
ax.xaxis.set_minor_locator(MultipleLocator(seconds_per_day / 4))
ax.xaxis.set_minor_formatter(
    lambda x, i: str(int((x % seconds_per_day) // 3600)) + ':00'
)
ax.tick_params(which='minor', labelsize=7)
# Longer major ticks so the date labels sit below the hour labels.
ax.tick_params(which='major', length=12)

# setting labels
ax.set_ylabel('Language')
ax.set_xlabel('Time (UTC)')
ax.set_title('Activity and sentiment in different languages')

fig.savefig('activity-sentiment.pdf', bbox_inches='tight')

## WordCloud creation using TF-IDF

In [None]:
# load stopwords from nltk corpus
s_words = set(stopwords.words())

s_words.add('wa')
s_words.add('is')
s_words.add('time')
s_words.add('year')
s_words.add('today')
s_words.add('day')
s_words.add('amp')

In [None]:
# initialize lists
es = []
en = []
de = []
fr = []

for entry in tqdm(df.iterrows(), total=len(df), desc='Lemmatizing words and building language documents'):
        data = entry[1]

        text_wo_stopwords = ' '.join([i for i in data['translation'].split() if Word(i).lemmatize().lower() not in s_words and len(i) > 1])
        
        if data['language'] == 'es':
            es.append(text_wo_stopwords)
        if data['language'] == 'en':
            en.append(text_wo_stopwords)
        if data['language'] == 'de':
            de.append(text_wo_stopwords)
        if data['language'] == 'fr':
            fr.append(text_wo_stopwords)

In [None]:
# One document per language (all of its filtered tweets joined together),
# in the fixed order es, en, de, fr that later cells rely on.
mat = np.array([' '.join(es), ' '.join(en), ' '.join(de), ' '.join(fr)])

# Unigram TF-IDF over the four language documents; row i of X belongs to mat[i].
vectorizer = TfidfVectorizer(ngram_range=(1, 1))
X = vectorizer.fit_transform(mat)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    X_words = np.asarray(vectorizer.get_feature_names_out(), dtype=str)
else:
    X_words = np.array(vectorizer.get_feature_names())
X, X_words

In [None]:
language_arr = []
k = 25

# For each of the four language documents, pick the k words with the highest
# TF-IDF score and normalise their weights so they sum to one.
for lang_idx in tqdm(range(4), desc='building output'):

    scores = X[lang_idx].toarray()[0]

    # Only non-zero entries can be among the top k; dropping the rest first
    # keeps the selection fast.
    present = scores != 0
    candidate_words = X_words[present]
    candidate_scores = scores[present]
    best = heapq.nlargest(k, enumerate(candidate_scores), key=lambda pair: pair[1])

    # Separate the weights and normalise them.
    raw_weights = [score for _, score in best]
    normalized = np.array(raw_weights) / sum(raw_weights)

    language_arr.append({
        'language': lang_idx,
        # upper case for a uniform appearance
        'words': [candidate_words[position].upper() for position, _ in best],
        # five decimals to save space in an export
        'weights': [float(f'{weight:.5f}') for weight in normalized],
    })

In [None]:
# Show the selected words and normalised weights per language for inspection.
language_arr

In [None]:
f, ax = plt.subplots(2, 2, figsize=(12, 8))

# Panel titles, in the same order in which the language documents were built.
languages = ['Spanish', 'English', 'German', 'French']

for j, (entry, panel_title) in enumerate(zip(language_arr, languages)):
    panel = ax[j // 2, j % 2]

    # Pair words with weights directly instead of indexing range(k): a
    # language may have fewer than k non-zero words, which previously raised
    # IndexError.
    frequencies = dict(zip(entry['words'], entry['weights']))

    # WordCloud cannot be fitted on an empty mapping; leave the panel blank then.
    if frequencies:
        wc = WordCloud(background_color="white", height=400, width=600)
        wc.fit_words(frequencies)
        panel.imshow(wc)

    panel.axis('off')
    panel.set_title(panel_title, fontsize=15)

f.suptitle('Most relevant words in different languages', fontsize=20)
f.savefig('wordclouds.pdf', bbox_inches='tight')
plt.show()