# Visualizzazione Scientifica - Twitter Sentiment e Time Series Analysis - Marco Molinati, 923530


## Import delle librerie utilizzate


In [None]:
from wordcloud import WordCloud, STOPWORDS
from textblob import TextBlob
from string import punctuation
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from PIL import Image
from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from collections import Counter
import chart_studio.plotly as py
import collections
import csv
import cufflinks as cf
import en_core_web_sm
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import re
import seaborn as sns
import spacy
from statistics import mean
from scipy import stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
import pickle
from sklearn.preprocessing import StandardScaler
import string
import tweepy
from tqdm.notebook import tqdm
%matplotlib inline
cf.go_offline()
init_notebook_mode(connected=True)
plt.style.use('seaborn')

colorscale = px.colors.sequential.YlGnBu[1:]
print("Scala colore:", colorscale)

## Data Collection

```python
client = tweepy.Client(bearer_token='')

keyword = input("Please enter keyword or hashtag to search: ")
noOfTweet = int(input("Please enter how many tweets to analyze: "))

tweets = client.search_recent_tweets(query=keyword, max_results=noOfTweet)

tweet_list = []
for tweet in (tweets.data):
    tweet_info = {
        'created_at': tweet.created_at,
        'id' : tweet.id,
        'original': tweet.text,
        'source': tweet.source,
    }
    tweet_list.append(tweet_info)

tweet_list = pd.DataFrame(tweet_list)
```


In [None]:
# Load the raw tweet dump and start the "clean_tweet" column as a copy of
# "content"; the copy is what the cleaning cells below modify.
# NOTE(review): parse_dates=True is given without an index column, so the
# "date" column presumably stays plain text here (it is converted explicitly
# with pd.to_datetime further down) - confirm against the read_csv docs.
tw_list = pd.read_csv(
    'data/Ukraine_war.csv', parse_dates=True)
tw_list["clean_tweet"] = tw_list["content"]
tw_list.head()

## Data Cleaning e Text Preprocessing


### Pulizia colonna "user" del DataFrame

```python
i = 0
for username in tqdm(tw_list.user):
    split = username.split(' ', 4)[3]
    tick = split.replace("'", '')
    comma = tick.replace(",", '')
    tw_list.iloc[i, 3] = comma
    i += 1
```


### Rimozione di hashtag, menzioni e caratteri speciali dal testo dei tweet


In [None]:
def clean_text(df, text_field):
    """Lower-case a text column and strip Twitter noise from it, in place.

    Removed from every value: @mentions, URLs, a leading retweet marker
    ("rt") and every character that is not an ASCII letter, a digit, a space
    or a tab. A hashtag therefore loses its "#" but keeps its word.
    Missing values are left untouched instead of raising.

    Args:
        df: pandas DataFrame holding the tweets; it is modified in place.
        text_field: name of the string column to clean.

    Returns:
        The same DataFrame object that was passed in.
    """
    noise = re.compile(
        # Mentions: handles may contain underscores, so "_" must be part of
        # the match or the tail of "@user_name" survives as a word.
        r"(@[A-Za-z0-9_]+)"
        r"|([^0-9A-Za-z \t])"
        r"|(\w+:\/\/\S+)"
        # Retweet marker only; the word boundary keeps words such as "rtx".
        r"|^rt\b"
        r"|http.+?"
    )
    lowered = df[text_field].str.lower()
    df[text_field] = lowered.str.replace(noise, "", regex=True)
    return df


clean_tweets = clean_text(tw_list, "clean_tweet")

### Rimozione di stopwords e punteggiatura, lemmatizzazione e tokenizzazione delle parole


In [None]:
nlp = en_core_web_sm.load()
tokenizer = RegexpTokenizer(r"\w+")
lemmatizer = WordNetLemmatizer()
stop = set(stopwords.words("english"))
punctuation = list(string.punctuation)
stop.update(punctuation)
w_tokenizer = WhitespaceTokenizer()


def furnished(text):
    """Drop stop words from ``text`` and lemmatize what is left.

    The text is split with the module-level ``w_tokenizer``; a token is kept
    when its lower-cased form is not in the module-level ``stop`` set. Kept
    tokens are lemmatized with ``lemmatizer`` and then lower-cased.

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept = [
        lemmatizer.lemmatize(token).lower()
        for token in w_tokenizer.tokenize(text)
        if token.lower() not in stop
    ]
    return " ".join(kept)


tw_list.clean_tweet = tw_list.clean_tweet.apply(furnished)
tw_list.head()

## Time Series Analysis


In [None]:
tw_list['date'] = pd.to_datetime(tw_list['date'])

print(tw_list.info())

In [None]:
tw_list['Year'] = tw_list.date.dt.year
tw_list['Month'] = tw_list.date.dt.month
tw_list['Weekday'] = tw_list.date.dt.day_name()


tw_list.head()

### Analisi Settimanale e Mensile


In [None]:
cats = ['Monday', 'Tuesday', 'Wednesday',
        'Thursday', 'Friday', 'Saturday', 'Sunday']
weeks_df = tw_list.groupby(['Weekday']).count().reindex(cats)
weeks_df = weeks_df[['content']]
weeks_df

In [None]:
sorter = ['Monday', 'Tuesday', 'Wednesday',
          'Thursday', 'Friday', 'Saturday', 'Sunday']
sorterIndex = dict(zip(sorter, range(len(sorter))))

weeks_df['Day_id'] = weeks_df.index
weeks_df['Day_id'] = weeks_df['Day_id'].map(sorterIndex)
weeks_df.sort_values('Day_id', inplace=True)

In [None]:
fig = go.Figure(go.Scatter(x=weeks_df.Day_id, y=weeks_df.content,
                           line=dict(color=colorscale[4])
                           ))

fig.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=[0, 1, 2, 3, 4, 5, 6],
        ticktext=sorter
    )
)

fig.show()

In [None]:
months_df = tw_list.groupby(['Month']).count()
months_df = months_df[['content']]
months_df[:3]

### Analisi oraria nelle 24h


In [None]:
tw_list['hour'] = pd.DatetimeIndex(tw_list['date']).hour
tw_list['date'] = pd.DatetimeIndex(tw_list['date']).date
tw_list['count'] = 1
tw_list_filtered = tw_list[['hour', 'date', 'count']]
tw_list_filtered.head()

In [None]:
# Tweets per hour of the day. Only the numeric "count" column is summed:
# "date" holds date objects, which cannot be added together, so summing the
# whole frame fails on pandas versions that no longer drop such columns.
df_tweets_hourly = tw_list_filtered.groupby("hour")["count"].sum().reset_index()
df_tweets_hourly.head()

In [None]:
sns.set(style="white")
# The grid is bound to "lm_grid" rather than "plot" so that the `plot`
# function imported from plotly.offline at the top of the notebook is not
# overwritten by this cell.
lm_grid = sns.lmplot(x="hour", y="count", order=2,
                     data=df_tweets_hourly, height=8.27, aspect=11.7/8.27)
plt.xticks(np.arange(0, 24, 1))  # one tick per hour of the day
plt.show()
# lm_grid.savefig('seaborn.png')

# Same data as an interactive scatter plot with an OLS trend line.
fig = px.scatter(df_tweets_hourly, x="hour", y="count",
                 trendline="ols")

fig.update_traces(marker=dict(
    color=colorscale[4]))
fig.show()

In [None]:
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=df_tweets_hourly.hour,
        y=df_tweets_hourly['count'],
        marker=dict(color=colorscale[4]),
        mode="lines+markers"
    ))

fig.add_trace(
    go.Bar(
        x=df_tweets_hourly.hour,
        y=df_tweets_hourly['count'],
        marker=dict(color=df_tweets_hourly['count'], coloraxis="coloraxis")
    ))

fig.update_layout(
    coloraxis=dict(colorscale='YlGnBu'),
    showlegend=False,
    title_text="Volume Orario dei Tweet",
    uniformtext_minsize=8,
)

fig.show()

## Text Classification

### Polarity and subjectivity with TextBlob and NLTK

```python
tw_list[["polarity", "subjectivity"]] = tw_list["clean_tweet"].apply(
    lambda Text: pd.Series(TextBlob(Text).sentiment)
)

for index, row in tw_list["clean_tweet"].iteritems():
    print(index, end='\r')
    score = SentimentIntensityAnalyzer().polarity_scores(row)
    neg = score["neg"]
    neu = score["neu"]
    pos = score["pos"]
    comp = score["compound"]

    if neg > pos:
        tw_list.loc[index, "sentiment"] = "negative"
    elif pos > neg:
        tw_list.loc[index, "sentiment"] = "positive"
    else:
        tw_list.loc[index, "sentiment"] = "neutral"

    tw_list.loc[index, "neg"] = neg
    tw_list.loc[index, "neu"] = neu
    tw_list.loc[index, "pos"] = pos
    tw_list.loc[index, "compound"] = comp
```


In [None]:
# Replace tw_list with the CSV that carries the "sentiment" column used by the
# cells below (presumably the saved output of the scoring step shown above -
# confirm).
# "date" is kept as text on purpose: date_short takes its first 10 characters
# (assumes values start with "YYYY-MM-DD" - TODO confirm) and later cells
# compare "date" against date strings.
tw_list = pd.read_csv(
    'data/Ukraine_war_IntensityAnalyzer.csv', parse_dates=True)
tw_list["date_short"] = tw_list.date.str.slice(0, 10)
tw_list.head()

In [None]:
# Sentiment percentages: overall, before the cut-off date and from the
# cut-off date onwards ("date" is compared as text against "2022-02-20").

is_positive = tw_list["sentiment"] == "positive"
is_negative = tw_list["sentiment"] == "negative"
is_neutral = tw_list["sentiment"] == "neutral"
is_before = tw_list.date < "2022-02-20"
is_after = tw_list.date >= "2022-02-20"

total_tweets = len(tw_list)
total_pos = int(is_positive.sum())
total_neg = int(is_negative.sum())
total_neu = int(is_neutral.sum())

total_before = int(is_before.sum())
before_pos = int((is_positive & is_before).sum())
before_neg = int((is_negative & is_before).sum())
before_neu = int((is_neutral & is_before).sum())

total_after = int(is_after.sum())
current_pos = int((is_positive & is_after).sum())
current_neg = int((is_negative & is_after).sum())
current_neu = int((is_neutral & is_after).sum())

# (message template, matching tweets, size of the reference group)
sentiment_report = (
    ("Total Positive Tweets % : {:.2f}", total_pos, total_tweets),
    ("Total Negative Tweets % : {:.2f}", total_neg, total_tweets),
    ("Total Neutral Tweets % : {:.2f}", total_neu, total_tweets),
    ("Before Positive Tweets % : {:.2f}", before_pos, total_before),
    ("Before Negative Tweets % : {:.2f}", before_neg, total_before),
    ("Before Neutral Tweets % : {:.2f}", before_neu, total_before),
    ("After Positive Tweets % : {:.2f}", current_pos, total_after),
    ("After Negative Tweets % : {:.2f}", current_neg, total_after),
    ("After Neutral Tweets % : {:.2f}", current_neu, total_after),
)
for report_template, report_hits, report_base in sentiment_report:
    print(report_template.format((report_hits / report_base) * 100))

### Grafici a misuratore radiale


In [None]:
fig = go.Figure(go.Indicator(
    domain={'x': [0, 1], 'y': [0, 1]},
    value=total_pos,
    mode="gauge+number+delta",
    title={'text': "Tweets Positivi"},
    delta={'reference': total_neg, 'decreasing': {'color': colorscale[5]}},
    gauge={'bar': {'color': colorscale[1]},
           'axis': {'range': [None, total_tweets]},
           'threshold': {
        'line': {'color': colorscale[5], 'width': 4},
        'thickness': 0.75,
        'value': total_neg}
    }))
fig.show()
#fig.write_image("gauge_pos.png")

In [None]:
fig = go.Figure(go.Indicator(
    domain={'x': [0, 1], 'y': [0, 1]},
    value=total_neg,
    mode="gauge+number+delta",
    title={'text': "Tweets Negativi"},
    delta={'reference': total_pos, 'increasing': {'color': colorscale[5]}},
    gauge={'bar': {'color': colorscale[3]},
           'axis': {'range': [None, total_tweets]},
           'threshold': {
        'line': {'color': colorscale[5], 'width': 4},
        'thickness': 0.75,
        'value': total_pos}
    }))
fig.show()
# fig.write_image("gauge_negative.png")

In [None]:
delta = (total_pos + total_neg) - total_neu
fig = go.Figure(go.Indicator(
    domain={'x': [0, 1], 'y': [0, 1]},
    value=total_neu,
    mode="gauge+number+delta",
    title={'text': "Tweets Neutrali"},
    delta={'reference': delta, 'decreasing': {'color': colorscale[5]}},
    gauge={'bar': {'color': colorscale[7]},
           'axis': {'range': [None, total_tweets]},
           'threshold': {
        'line': {'color': colorscale[5], 'width': 4},
        'thickness': 0.75,
        'value': (total_pos + total_neg)}
    }))
fig.show()
# fig.write_image("gauge_neu.png")

### Grafici a torta


In [None]:
labels = ["Positivo", "Negativo", "Neutrale"]
values_tot = [total_pos, total_neg, total_neu]
values_before = [before_pos, before_neg, before_neu]
values_current = [current_pos, current_neg, current_neu]

fig = make_subplots(rows=1, cols=3, specs=[
                    [{'type': 'domain'}, {'type': 'domain'}, {'type': 'domain'}]],
                    subplot_titles=['Percentuale Totale', "Percentuale prima dell'invasione", "Percentuale durante l'invasione"])
fig.add_trace(go.Pie(labels=labels, values=values_tot, name='Percentuale Totale', textinfo="label+percent"),
              1, 1)
fig.add_trace(go.Pie(labels=labels, values=values_before, name="Percentuale prima dell'invasione", textinfo="label+percent"),
              1, 2)
fig.add_trace(go.Pie(labels=labels, values=values_current, name="Percentuale durante l'invasione", textinfo="label+percent"),
              1, 3)

# Use `hole` to create a donut-like pie chart
fig.update_traces(hole=.4)

fig.update_layout(
    title_text="Percentuale dei sentimenti in funzione del tempo",)
fig.update_traces(marker=dict(colors=colorscale))

fig.show()
# fig.write_image("pie_tot.png")

### Visualizzazione Statica e Animata dell'andamento dei sentimenti (in percentuale) nei tweet in funzione del tempo

In [None]:
# Daily share (in percent) of each sentiment label.
# Every *_list holds (percentage, day) tuples and is sorted by day.
pos_list = []
neg_list = []
neu_list = []
sentiment_buckets = (
    ("positive", pos_list),
    ("negative", neg_list),
    ("neutral", neu_list),
)
for day in tw_list["date_short"].unique():
    day_rows = tw_list[tw_list["date_short"] == day]
    day_total = day_rows.shape[0]
    for sentiment_label, bucket in sentiment_buckets:
        label_rows = day_rows[day_rows["sentiment"] == sentiment_label]
        bucket.append(((label_rows.shape[0] / day_total) * 100, day))

neu_list = sorted(neu_list, key=lambda pair: pair[1])
pos_list = sorted(pos_list, key=lambda pair: pair[1])
neg_list = sorted(neg_list, key=lambda pair: pair[1])

# NOTE: despite the names, x_cord_* holds the percentages and y_cord_* the
# days; the plotting cells pass y_cord_* as x and x_cord_* as y.
x_cord_neg = [pair[0] for pair in neg_list]
y_cord_neg = [pair[1] for pair in neg_list]

x_cord_pos = [pair[0] for pair in pos_list]
y_cord_pos = [pair[1] for pair in pos_list]

x_cord_neu = [pair[0] for pair in neu_list]
y_cord_neu = [pair[1] for pair in neu_list]

In [None]:
def pickleDump(data, name):
    """Pickle ``data`` into the file ``<name>.pk``.

    Args:
        data: object to serialize.
        name: path of the target file without the ".pk" extension.
    """
    with open(name + ".pk", 'wb') as sink:
        pickle.dump(data, sink)


def pickleLoad(name):
    """Load and return the object pickled in the file ``<name>.pk``.

    Args:
        name: path of the source file without the ".pk" extension.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(name + ".pk", 'rb') as source:
        return pickle.load(source)

In [None]:
data = {"X_pos": y_cord_pos, "Y_pos": x_cord_pos}
df = pd.DataFrame(data)

In [None]:
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=y_cord_pos[:1000],
        y=x_cord_pos[:1000],
        mode="lines+markers",
        name="Positive",
        line=dict(color=colorscale[4]),
    )
)
fig.add_trace(
    go.Scatter(
        x=y_cord_neg[:1000],
        y=x_cord_neg[:1000],
        mode="lines+markers",
        name="Negative",
        line=dict(color=colorscale[5]),
    )
)
fig.add_trace(
    go.Scatter(
        x=y_cord_neu[:1000],
        y=x_cord_neu[:1000],
        mode="lines+markers",
        name="Neutral",
        line=dict(color=colorscale[3]),
    )
)
fig.show()
# fig.write_image("timeseries_static.png")

In [None]:
trace1 = go.Scatter(x=y_cord_pos, y=x_cord_pos,
                    mode='lines+markers',
                    line=dict(color=colorscale[4]),
                    name="Positive"
                    )
trace2 = go.Scatter(x=y_cord_neg, y=x_cord_neg,
                    mode='lines+markers',
                    line=dict(color=colorscale[5]),
                    name="Negative"
                    )
trace3 = go.Scatter(x=y_cord_neu, y=x_cord_neu,
                    mode='lines+markers',
                    line=dict(color=colorscale[3]),
                    name="Neutral"
                    )

# One animation frame per growing prefix of the three series, so the lines
# are drawn day by day. k runs up to len - 1 so that the final frame's slice
# [:k+1] includes the last data point (stopping at len - 2 left it out).
frames = [dict(data=[dict(type='scatter',
                          x=y_cord_pos[:k+1],
                          y=x_cord_pos[:k+1]),
                     dict(type='scatter',
                          x=y_cord_neg[:k+1],
                          y=x_cord_neg[:k+1]),
                     dict(type='scatter',
                          x=y_cord_neu[:k+1],
                          y=x_cord_neu[:k+1]),
                     ],
               traces=[0, 1, 2],
               )for k in range(1, len(y_cord_pos))]

layout = go.Layout(
    xaxis=dict(range=['2021-12-31', '2022-03-05'], autorange=False),
    yaxis=dict(range=[0, 90], autorange=False, title="Percentage"),
    title="Percentuale dei sentimenti in funzione del tempo",
    # showlegend=False,
    hovermode='x unified',
    updatemenus=[
        dict(
            type='buttons', showactive=False,
            buttons=[dict(label='Play',
                          method='animate',
                          args=[None,
                                dict(frame=dict(duration=5,
                                                redraw=False),
                                     transition=dict(
                                    duration=5),
                                    fromcurrent=True,
                                    mode='immediate')]
                          ),
                     dict(label='Pause',
                          method='animate',
                          args=[[None], {'frame': {'duration': 0, 'redraw': False},
                                         'mode': 'immediate',
                                         'transition': {'duration': 0}}]
                          )]
        ),
    ]
)

fig = go.Figure(data=[trace1, trace2, trace3],
                frames=frames, layout=layout)
fig.show()

### Bigrammi, Trigrammi e Parole più utilizzate nei tweets

In [None]:
def get_top_n_gram(corpus, ngram_range, n=None):
    """Return the most frequent word n-grams of a text column.

    Args:
        corpus: pandas Series of documents; values are cast to unicode text.
        ngram_range: (min_n, max_n) tuple handed to CountVectorizer.
        n: how many entries to return; None returns all of them.

    Returns:
        A list of (n-gram, total count) tuples sorted by descending count.
    """
    counter = CountVectorizer(
        analyzer='word', ngram_range=ngram_range, stop_words='english')
    doc_term = counter.fit_transform(corpus.astype('U').values)

    # Column totals: one count per vocabulary entry over the whole corpus.
    totals = doc_term.sum(axis=0)
    ranked = sorted(
        [(term, totals[0, column])
         for term, column in counter.vocabulary_.items()],
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

In [None]:
# top words
n = get_top_n_gram(tw_list["clean_tweet"], (1, 1), 30)
n1 = pd.DataFrame(n)
n1.columns = ["words", "occurrences"]

# n2_bigram
n2_bigrams = get_top_n_gram(tw_list["clean_tweet"], (2, 2), 10)
bigrams = pd.DataFrame(n2_bigrams)
bigrams.columns = ["words", "occurrences"]

# n3_trigram
n3_trigrams = get_top_n_gram(tw_list["clean_tweet"], (3, 3), 10)
trigrams = pd.DataFrame(n3_trigrams)
trigrams.columns = ["words", "occurrences"]

In [None]:
fig = make_subplots(
    rows=2,
    cols=2,
    specs=[[{}, {}], [{"colspan": 2}, None]],
    subplot_titles=("Top 10 Bigrams", "Top 10 Trigrams", "Top 30 Words"),
)

fig.add_trace(
    go.Bar(x=bigrams.words, y=bigrams.occurrences, marker=dict(color=bigrams.occurrences,
           coloraxis="coloraxis")),  # marker_color=px.colors.sequential.Darkmint),
    row=1,
    col=1,
)

fig.add_trace(
    go.Bar(x=trigrams.words, y=trigrams.occurrences, marker=dict(color=trigrams.occurrences, coloraxis="coloraxis")
           ),
    row=1,
    col=2,
)

fig.add_trace(
    go.Bar(x=n1.words, y=n1.occurrences, marker=dict(color=n1.occurrences, coloraxis="coloraxis"),
           ),
    row=2,
    col=1,
)

fig.update_layout(
    coloraxis=dict(colorscale=colorscale),
    showlegend=False,
    title_text="Dati sulle parole più utilizzate",
    uniformtext_minsize=8,
    height=900,
)
fig.update_xaxes(tickangle=-45)
fig.update_traces(texttemplate="%{value}")
fig.show()
# fig.write_image("top_words.png")

In [None]:
fig = px.treemap(n1, path=['words'], values='occurrences',
                 color='occurrences', color_continuous_scale=colorscale)
fig.show()

## Clustering

In [None]:
# pickleDump(economy_related_words, "economy_related_words")
# pickleDump(social_related_words, "social_related_words")
# pickleDump(culture_related_words, "culture_related_words")
# pickleDump(health_related_words, "health_related_words")

economy_related_words = pickleLoad("economy_related_words")
social_related_words = pickleLoad("social_related_words")
culture_related_words = pickleLoad("culture_related_words")
health_related_words = pickleLoad("health_related_words")

In [None]:
nlp = en_core_web_sm.load()
tokenizer = RegexpTokenizer(r"\w+")
lemmatizer = WordNetLemmatizer()
stop = set(stopwords.words("english"))
punctuation = list(string.punctuation)
stop.update(punctuation)
w_tokenizer = WhitespaceTokenizer()

# clean the set of words


def furnished(text):
    """Drop stop words from a whitespace-separated string and lemmatize it.

    A word is kept when its lower-cased form is not in the module-level
    ``stop`` set; kept words are lemmatized with ``lemmatizer`` and then
    lower-cased.

    Returns:
        The surviving words joined by single spaces.
    """
    kept = [
        lemmatizer.lemmatize(word).lower()
        for word in text.split()
        if word.lower() not in stop
    ]
    return " ".join(kept)


economy = furnished(economy_related_words)
social = furnished(social_related_words)
culture = furnished(culture_related_words)
health = furnished(health_related_words)

In [None]:
# Delete duplicate words from every topic vocabulary, keeping the first
# occurrence of each word. dict.fromkeys keeps insertion order and removes
# duplicates in one pass, instead of sorting a set by list.index.
economy = " ".join(dict.fromkeys(economy.split()))
social = " ".join(dict.fromkeys(social.split()))
culture = " ".join(dict.fromkeys(culture.split()))
health = " ".join(dict.fromkeys(health.split()))

## Jaccard Similarity

In [None]:
def _token_set(text):
    """Return the set of whitespace-separated words of a string.

    Any other iterable is turned into a set of its items unchanged.
    """
    if isinstance(text, str):
        return set(text.split())
    return set(text)


def jaccard_similarity(group, tweet):
    """Return the Jaccard similarity between a word group and a tweet.

    Strings are compared word by word: calling ``set()`` directly on a string
    would compare single characters instead. Other iterables (lists, sets)
    are compared item by item.

    Args:
        group: reference vocabulary, as a space-separated string or an
            iterable of words.
        tweet: tweet text, as a space-separated string or an iterable of
            words.

    Returns:
        ``len(intersection) / len(union)`` as a float. 0.0 is returned when
        both inputs are empty or when ``tweet`` is not iterable (for example
        a NaN), so callers always receive a number.
    """
    group_tokens = _token_set(group)
    try:
        tweet_tokens = _token_set(tweet)
    except TypeError:
        # Missing tweets show up as non-iterable floats: nothing is shared.
        return 0.0

    union = group_tokens | tweet_tokens
    if not union:
        return 0.0
    return len(group_tokens & tweet_tokens) / len(union)


def get_scores(group, tweets):
    """Score every tweet against one word group.

    Args:
        group: reference vocabulary passed on to ``jaccard_similarity``.
        tweets: iterable of tweets.

    Returns:
        A list with one ``jaccard_similarity(group, tweet)`` result per
        tweet, in input order.
    """
    return [jaccard_similarity(group, tweet) for tweet in tweets]

In [None]:
tw_list = tw_list.dropna()
tw_list.head()

In [None]:
# Similarity of every cleaned tweet to each of the four topic vocabularies.
tweet_texts = tw_list.clean_tweet.to_list()
e_scores = get_scores(economy, tweet_texts)
s_scores = get_scores(social, tweet_texts)
c_scores = get_scores(culture, tweet_texts)
h_scores = get_scores(health, tweet_texts)

# One row per tweet: author plus the four scores.
data = {
    "username": tw_list.user.to_list(),
    "economic_score": e_scores,
    "social_score": s_scores,
    "culture_score": c_scores,
    "health_scores": h_scores,
}
scores_df = pd.DataFrame.from_dict(data, orient='index').transpose()
scores_df.head()

In [None]:
# Actual assigning of classes to the tweets

def get_clusters(l1, l2, l3, l4):
    """Flag, row by row, which of the four score lists holds the maximum.

    Args:
        l1: economic scores.
        l2: social scores.
        l3: culture scores.
        l4: health scores.

    Returns:
        Four lists (economic, social, culture, health) of 0/1 flags, one flag
        per row. A flag is 1 when that score equals the row maximum, so tied
        rows get several 1s.
    """
    econ, socio, cul, heal = [], [], [], []
    flag_lists = (econ, socio, cul, heal)
    for row_scores in zip(l1, l2, l3, l4):
        best = max(*row_scores)
        for flags, score in zip(flag_lists, row_scores):
            flags.append(1 if best == score else 0)

    return econ, socio, cul, heal

In [None]:
l1 = scores_df.economic_score.to_list()
l2 = scores_df.social_score.to_list()
l3 = scores_df.culture_score.to_list()
l4 = scores_df.health_scores.to_list()

econ, socio, cul, heal = get_clusters(l1, l2, l3, l4)

data = {
    "username": scores_df.username.to_list(),
    "economic": econ,
    "social": socio,
    "culture": cul,
    "health": heal,
}
cluster_df = pd.DataFrame(data)
cluster_df.head()

In [None]:
category_cols = ["economic", "social", "culture", "health"]

# c: number of categories tied for the best score of each tweet.
# a: tweets where more than one category is tied.
c = cluster_df[category_cols].sum(axis=1)
a = c > 1

# Split a tied tweet evenly among the tied categories only. Dividing the 0/1
# flags by the tie count keeps the losing categories at 0; assigning 1 / c to
# the whole row would give every category a share, including the losers.
cluster_df[category_cols] = cluster_df[category_cols].astype(float)
cluster_df.loc[a, category_cols] = (
    cluster_df.loc[a, category_cols].div(c[a], axis=0)
)

# Per-user totals; astype(int) truncates any fractional part of the sums.
pivot_clusters = cluster_df.groupby(["username"]).sum()
pivot_clusters[category_cols] = pivot_clusters[category_cols].astype(int)
pivot_clusters["total"] = (
    pivot_clusters["health"]
    + pivot_clusters["culture"]
    + pivot_clusters["social"]
    + pivot_clusters["economic"]
)
pivot_clusters.loc["Total"] = pivot_clusters.sum()  # add a totals row
print(pivot_clusters.shape)
pivot_clusters.tail()

### Grafico a torta Clustering

In [None]:
a = pivot_clusters.drop(["total"], axis=1)
labels = a.columns
values = a.loc["Total"]

fig = go.Figure(
    data=[go.Pie(labels=labels, values=values, textinfo="label+percent")])
fig.update_traces(marker=dict(colors=px.colors.sequential.YlGnBu))
fig.show()
# fig.write_image("pie_categories.png")

### Utenti con più tweets

In [None]:
# Users ranked by number of classified tweets.
d = pivot_clusters.sort_values(by="total", ascending=False)
# Take rows 3-20 of the ranking: the first two rows are skipped, as before.
# Slicing by position replaces the in-place drop on a head() slice, which can
# trigger pandas' chained-assignment warning.
e = d.iloc[2:20]
fig = px.bar(
    e,
    x=e.index,
    y=e.total,
    title="Top 20 tweets per username",
    labels={
        "index": "Usernames",
        "total": "Occurrences",
    },
    color="total",
    color_continuous_scale=px.colors.sequential.YlGnBu[1:],
)
fig.update_traces(texttemplate="%{value}", textposition="outside")
fig.update_layout(uniformtext_minsize=8)
fig.update_layout(xaxis_tickangle=-45)
# fig.write_image("top_user.png")

In [None]:
# Drop the totals row by its label. Dropping "the last row" by position would
# remove a real user every time this cell is re-run; errors="ignore" makes the
# cell safe to run more than once.
pivot_clusters.drop(index="Total", errors="ignore", inplace=True)

In [None]:
pivot_clusters.tail()

### Scatter Matrix

In [None]:
fig = go.Figure(data=go.Splom(
                dimensions=[dict(label='economic',
                                 values=pivot_clusters['economic']),
                            dict(label='social',
                                 values=pivot_clusters['social']),
                            dict(label='culture',
                                 values=pivot_clusters['culture']),
                            dict(label='health',
                                 values=pivot_clusters['health'])],
                diagonal_visible=False,  # remove plots on diagonal
                marker=dict(color=colorscale[4])
                ))

fig.update_layout(
    height=800,
)

fig.show()

### Definizione coppie per K-Means

- Economic - Social
- Social - Culture
- Economic - Health
- Economic - Culture


In [None]:
X_es = pivot_clusters[["economic", "social"]].values
X_sc = pivot_clusters[["social", "culture"]].values
X_eh = pivot_clusters[["economic", "health"]].values
X_ec = pivot_clusters[["economic", "culture"]].values

### Elbow Method + K-Means Clustering

In [None]:
def distortionInertia(features, k_values=range(1, 10)):
    """Compute the elbow-method curves for K-Means on ``features``.

    Args:
        features: 2-D NumPy array with one row per sample.
        k_values: iterable of cluster counts to evaluate; defaults to 1..9.

    Returns:
        A ``(distortions, inertias)`` tuple of lists with one entry per value
        of ``k_values``. Distortion is the mean Euclidean distance from each
        sample to its nearest cluster centre; inertia is the ``inertia_``
        attribute of the fitted model.
    """
    distortions = []
    inertias = []
    n_samples = features.shape[0]

    for k in k_values:
        # Fit once per k: a second fit only repeats the work.
        kmeanModel = KMeans(n_clusters=k).fit(features)

        nearest_centre_dist = np.min(
            cdist(features, kmeanModel.cluster_centers_, 'euclidean'), axis=1)
        distortions.append(sum(nearest_centre_dist) / n_samples)
        inertias.append(kmeanModel.inertia_)
    return distortions, inertias


def elbowPlot(features):
    """Show the elbow-method curves (distortion and inertia) side by side.

    Args:
        features: 2-D NumPy array passed on to ``distortionInertia``.
    """
    distortions, inertias = distortionInertia(features)
    # distortionInertia evaluates k = 1, 2, ...; the x axis must start at 1
    # as well, otherwise every point is drawn one position too far left.
    k_axis = list(range(1, len(distortions) + 1))

    fig = make_subplots(rows=1, cols=2)
    fig.add_trace(
        go.Scatter(x=k_axis, y=distortions, name='Distorsione',
                   line=dict(color=colorscale[3])),
        row=1, col=1
    )
    fig.update_xaxes(title_text="K", row=1, col=1)
    fig.update_yaxes(title_text="Distorsione", row=1, col=1)

    fig.add_trace(
        go.Scatter(x=k_axis, y=inertias, name='Inerzia',
                   line=dict(color=colorscale[5])),
        row=1, col=2
    )
    fig.update_xaxes(title_text="K", row=1, col=2)
    fig.update_yaxes(title_text="Inerzia", row=1, col=2)

    fig.update_layout(  # height=600, width=800,
        title_text="Elbow Method: Distorsione vs. Inerzia")
    fig.show()


def computeKmeans(features, n_clusters):
    """Fit K-Means on a two-column array and show clusters and centroids.

    Args:
        features: 2-D NumPy array; column 0 is plotted on x, column 1 on y.
        n_clusters: number of clusters to fit.
    """
    model = KMeans(n_clusters=n_clusters)
    model.fit(features)
    assignments = model.predict(features)
    centroids = model.cluster_centers_

    # Points are coloured by their cluster label; at most 5000 are drawn.
    points_marker = dict(
        size=8,
        color=assignments,
        colorscale=colorscale,
        line=dict(width=1, color='DarkSlateGrey'),
    )
    centroids_marker = dict(size=20, color='DarkSlateGrey')

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=features[:5000, 0],
        y=features[:5000, 1],
        mode='markers',
        name='Clusters',
        marker=points_marker,
    ))
    fig.add_trace(go.Scatter(
        x=centroids[:5000, 0],
        y=centroids[:5000, 1],
        mode='markers',
        name='Centroidi',
        opacity=0.5,
        marker=centroids_marker,
    ))

    fig.show()

In [None]:
elbowPlot(X_es)
computeKmeans(X_es, 3)

In [None]:
elbowPlot(X_sc)
computeKmeans(X_sc, 2)

In [None]:
elbowPlot(X_eh)
computeKmeans(X_eh, 3)

In [None]:
elbowPlot(X_ec)
computeKmeans(X_ec, 3)

In [None]:
def KmeansPlot(features, n_clusters):
    """Fit K-Means and return the labels and centres; nothing is plotted.

    Args:
        features: 2-D array of samples.
        n_clusters: number of clusters to fit.

    Returns:
        A ``(labels, centers)`` tuple: the predicted cluster index of every
        sample and the array of cluster centres.
    """
    model = KMeans(n_clusters=n_clusters)
    model.fit(features)
    return model.predict(features), model.cluster_centers_

### Visualizzazione compatta K-Means

In [None]:
y_es, centers_es = KmeansPlot(X_es, 3)
y_sc, centers_sc = KmeansPlot(X_sc, 2)
y_eh, centers_eh = KmeansPlot(X_eh, 3)
y_ec, centers_ec = KmeansPlot(X_ec, 3)

fig = make_subplots(
    rows=2, cols=2, subplot_titles=("Economic - Social", "Social - Culture", "Economic - Health", "Economic - Culture")
)

# (points, cluster labels, centroids, subplot row, subplot column) per pair.
# Each panel is coloured with the labels of its OWN K-Means fit; colouring
# every panel with y_es showed the Economic-Social clusters everywhere.
kmeans_panels = [
    (X_es, y_es, centers_es, 1, 1),  # ES
    (X_sc, y_sc, centers_sc, 1, 2),  # SC
    (X_eh, y_eh, centers_eh, 2, 1),  # EH
    (X_ec, y_ec, centers_ec, 2, 2),  # EC
]

for panel_points, panel_labels, panel_centers, panel_row, panel_col in kmeans_panels:
    # Samples (at most 5000, with the matching slice of labels).
    fig.add_trace(
        go.Scatter(
            x=panel_points[:5000, 0],
            y=panel_points[:5000, 1],
            mode="markers",
            showlegend=False,
            marker=dict(color=panel_labels[:5000], colorscale=colorscale,
                        line=dict(width=0.5, color='DarkSlateGrey'))),
        row=panel_row, col=panel_col)
    # Centroids.
    fig.add_trace(
        go.Scatter(
            x=panel_centers[:, 0],
            y=panel_centers[:, 1],
            mode="markers",
            showlegend=False,
            opacity=0.5,
            marker=dict(size=15, color="DarkSlateGrey"),),
        row=panel_row, col=panel_col)

# Update title and height
fig.update_layout(
    title_text="Visualizzazione Compatta K-Means Clustering", height=800)

fig.show()
# fig.write_image("kmeans_summary.png")

## Word Clouds

In [None]:
def _wordcloud_corpus(text):
    """Return ``text`` as one string: strings pass through, iterables are joined."""
    if isinstance(text, str):
        return text
    try:
        return " ".join(str(item) for item in text)
    except TypeError:
        # Not iterable: fall back to its plain string form.
        return str(text)


def create_wordcloud(text, cmap, name):
    """Build a word cloud, save it as ``<name>.png`` and display it.

    Args:
        text: a string, or an iterable of strings (for example the
            ``.values`` of a text column) that is joined with spaces.
        cmap: matplotlib colormap name used to colour the words.
        name: output file name without the ".png" extension.

    The cloud is shaped by the mask image "twitter.png" read from the working
    directory. English NLTK stop words are excluded.
    """
    filename = str(name) + ".png"
    mask = np.array(Image.open("twitter.png"))
    stop = set(stopwords.words('english'))
    wc = WordCloud(background_color="white",
                   mask=mask,
                   max_words=300,
                   stopwords=stop,
                   colormap=cmap,
                   repeat=True,
                   )
    # Join the documents explicitly: str() of a large NumPy array is
    # abbreviated with "..." and would keep only its first and last items.
    wc.generate(_wordcloud_corpus(text))
    wc.to_file(filename)
    display(Image.open(filename))

In [None]:
# Creating wordcloud for all tweets
create_wordcloud(tw_list["clean_tweet"].values, 'tab20', "All")

In [None]:
# Creating wordcloud for positive sentiment
create_wordcloud(tw_list[tw_list.sentiment == 'positive']
                 ["clean_tweet"].values, 'GnBu', "Positive")

In [None]:
# Creating wordcloud for negative sentiment
create_wordcloud(tw_list[tw_list.sentiment == 'negative']
                 ["clean_tweet"].values, 'OrRd', "Negative")