## <b>Sentiment analysis</b>
<i>The code below performs the sentiment analysis with a transformer based on XLM-RoBERTa that is fine-tuned for Twitter sentiment analysis. A different transformer can be used instead (<a href="https://huggingface.co/models" target="_blank">Hugging Face transformers</a>). The transformer converts each text into a vector in a high-dimensional space, so that the computer can process the data.</i>


In [7]:
#Import needed libraries
    #transformer for using the transformers library
    #pipeline for using the sentiment analysis from the transformers library
import pandas as pd
import transformers
from transformers import pipeline

#Read data including newly created topics
    #forward slashes keep the relative path valid on Windows as well as on macOS/Linux
outputgermany = pd.read_csv('Data/topics70_tweets_germany.csv')

#Create dataframe
tweets_germany = pd.DataFrame(outputgermany)

#Define transformer you want to use
model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

#Set model using the sentiment analysis pipeline from huggingface
    #model is your transformer, which means it uses your preferred transformer
    #tokenizer is loaded from the same checkpoint, so the text is tokenized the way the model expects
    #return_all_scores is not passed, so each result is read below as its first entry (label and score)
sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)

#Perform sentiment analysis and add the sentiment and scores to dataframe
    #sentiment keeps the raw pipeline output per tweet
    #label and score are taken from the first entry of that output
tweets_germany = (
    tweets_germany
    .assign(sentiment = lambda frame: frame['emoji_clean_text'].apply(lambda text: sentiment_task(text)))
    .assign(
         label = lambda frame: frame['sentiment'].apply(lambda result: result[0]['label']),
         score = lambda frame: frame['sentiment'].apply(lambda result: result[0]['score'])
    )
)

#Save dataframe as csv
    #header is documented by pandas as a boolean or a list of column names, so True is passed instead of the string 'true'
tweets_germany.to_csv('Data/sentimenttopic70_tweets_germany.csv', index=False, header=True)

#Check if the dataframe has three new columns with data (sentiment, label, and score)
    #this is the last expression of the cell so that the notebook actually displays it
tweets_germany.head()

Create histogram of the sentiment distribution

In [3]:
import pandas as pd

#Read the tweets from the csv that holds the topic and sentiment columns
    #forward slashes keep the relative path valid on Windows as well as on macOS/Linux
outputgermany = pd.read_csv('Data/sentimenttopic70_tweets_germany.csv')

#Create dataframe
tweets_germany = pd.DataFrame(outputgermany)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Plot how often each sentiment label occurs in the tweets
sentiment_labels = tweets_germany['label']
sns.displot(sentiment_labels)

#Title is added through pyplot, which targets the axes that displot just created
plt.title("Distribution of sentiments", color='black', fontsize=20)
plt.show()

Create boxplot of the sentiment scores

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

#Configure the seaborn theme in a single call
    #sns.set is an alias of sns.set_theme, so calling it after sns.set_theme(style="ticks", palette="pastel")
    #reset style and palette to the seaborn defaults; that earlier call never affected the figure and is dropped
    #to really get the ticks style, pass style="ticks" in this same call
sns.set_theme(rc={'figure.figsize':(11.7,8.27),'font.size':20, 'axes.titlesize':20, 'axes.labelsize': 15.0, 'xtick.labelsize': 15.0})

#Draw the score distribution per sentiment on an explicit figure/axes pair
fig, ax = plt.subplots()
sns.boxplot(x="label",
            y="score", palette=["r", "grey", "g"], order=['Negative', 'Neutral', 'Positive'], orient='v',linewidth=1.5,
            data=tweets_germany, ax=ax)
ax.set(xlabel='', ylabel='Score', title='Score distribution for each sentiment', ylim=(0.3,1))

#Save the figure
    #forward slashes keep the relative path valid on Windows as well as on macOS/Linux
fig.savefig("Figures/boxplotsentiments.png")


Boxplot of the sentiments per topic

In [6]:
import numpy as np

#Add the topic name to the tweets
    #every list holds the topic numbers that are merged into one named topic
topic_number = tweets_germany['Topic']
conditions = [
    topic_number == -1,
    topic_number.isin([0, 2, 4, 6, 13]),
    topic_number.isin([1, 5, 10, 14, 15]),
    topic_number.isin([3, 7, 8, 9, 11, 12])
    ]

values = ['outlier', 'coronapolicies','prevention','lockdownactivities']
    #default is passed as the string '0': the implicit default of np.select is the integer 0,
    #which recent NumPy versions refuse to combine with string choices (TypeError),
    #while older versions silently turned it into the string '0'
tweets_germany['nametopic'] = np.select(conditions, values, default='0')

#Convert labels into -1,0,1 so it can be used in array
    #note: a label that matches none of the conditions also ends up as '0'
conditions = [
    (tweets_germany['label'] == 'Negative'),
    (tweets_germany['label'] == 'Positive'),
    (tweets_germany['label'] == 'Neutral')
    ]

values = ['-1', '1','0']
tweets_germany['numbersentiment'] = np.select(conditions, values, default='0')

#Convert topicname into numbers
conditions = [
    (tweets_germany['nametopic'] == 'outlier'),
    (tweets_germany['nametopic'] == 'coronapolicies'),
    (tweets_germany['nametopic'] == 'lockdownactivities'),
    (tweets_germany['nametopic'] == 'prevention')
    ]

values = ['-1', '1','2','3']
tweets_germany['numbertopicname'] = np.select(conditions, values, default='0')

In [8]:
#Split the tweets into one dataframe per named topic
coronapolicies = tweets_germany.loc[tweets_germany['nametopic'].eq('coronapolicies')]
lockdownactivities = tweets_germany.loc[tweets_germany['nametopic'].eq('lockdownactivities')]
prevention = tweets_germany.loc[tweets_germany['nametopic'].eq('prevention')]

In [11]:
import seaborn as sns
import matplotlib.pyplot as plt

def boxplottopic (data, title, png):
    """Draw a boxplot of the sentiment scores per sentiment label and save it as an image.

    Each call draws on its own new figure. Without that, consecutive calls in one
    cell would draw on the same current axes, so later images would also contain
    the boxes of the earlier topics.

    Parameters
    ----------
    data : dataframe with a 'label' column (Negative/Neutral/Positive) and a 'score' column
    title : title shown above the plot
    png : path of the image file the figure is saved to
    """
    #One theme call: sns.set is an alias of sns.set_theme, so the former separate
    #sns.set_theme(style="ticks", palette="pastel") call was reset by it and had no effect
    sns.set_theme(rc={'figure.figsize':(11.7,8.27),'font.size':20, 'axes.titlesize':20, 'axes.labelsize': 15.0, 'xtick.labelsize': 15.0})

    #New figure per call, created after the theme so the configured figure size applies
    fig, ax = plt.subplots()

    sns.boxplot(x="label",
        y="score", palette=["r", "grey", "g"], order=['Negative', 'Neutral', 'Positive'], orient='v',linewidth=1.5,
        data=data, ax=ax)
    ax.set(xlabel='', ylabel='Score', title=title, ylim=(0.3,1))

    fig.savefig(png)

In [None]:
#Create and save one boxplot per topic
    #forward slashes keep the relative paths valid on Windows as well as on macOS/Linux
topic_boxplots = [
    (coronapolicies, 'Corona and policies: score distribution for each sentiment', 'Figures/boxplotcoronapolicies.png'),
    (lockdownactivities, 'Lockdown activities: score distribution for each sentiment', 'Figures/boxplotlockdownactivities.png'),
    (prevention, 'Prevention: score distribution for each sentiment', 'Figures/boxplotprevention.png'),
]

for topic_data, topic_title, topic_png in topic_boxplots:
    boxplottopic(topic_data, topic_title, topic_png)

Create graphs containing the distribution of the scores of each sentiment per topic

In [16]:
#Create classes for the sentiment scores
    #lower bounds are inclusive and the last class also includes 1, so a score of exactly
    #0.2, 0.4, 0.6, 0.8 or 1 gets a class instead of falling through to the default
score = tweets_germany['score']
conditions = [
    (score < 0.2),
    (score >= 0.2) & (score < 0.4),
    (score >= 0.4) & (score < 0.6),
    (score >= 0.6) & (score < 0.8),
    (score >= 0.8) & (score <= 1),
    ]

values = ['0-0.2', '0.2-0.4', '0.4-0.6', '0.6-0.8', '0.8-1']
    #default is passed as the string '0': the implicit integer default of np.select cannot be
    #combined with string choices on recent NumPy versions (TypeError); rows without a match
    #(for example a missing score) keep the value '0' that older versions produced
tweets_germany['class_score'] = np.select(conditions, values, default='0')

In [17]:
#Rebuild the per-topic dataframes so that they contain the class_score column
nametopic = tweets_germany['nametopic']
coronapolicies = tweets_germany[nametopic == 'coronapolicies']
lockdownactivities = tweets_germany[nametopic == 'lockdownactivities']
prevention = tweets_germany[nametopic == 'prevention']

In [18]:
#Order every topic dataframe by score class so the classes appear in ascending order in the figures
coronapolicies = coronapolicies.sort_values(by='class_score')
lockdownactivities = lockdownactivities.sort_values(by='class_score')
prevention = prevention.sort_values(by='class_score')

In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def sentimentscore (data, title):
    """Show the distribution of the score classes, one panel per sentiment label.

    Parameters
    ----------
    data : dataframe with a 'class_score' column and a 'label' column
    title : title placed above the panels
    """
    sns.set(font_scale=1.2)

    #displot returns a grid with one panel per label, in the order Positive, Neutral, Negative
    facets = sns.displot(
        data,
        x='class_score',
        col='label',
        col_order=["Positive", "Neutral", "Negative"],
        height=4,
        stat='percent',
        common_norm=False,
    )

    plt.subplots_adjust(top=1)
    facets.set(xlabel='Sentiment score', ylabel='Percentage')
    plt.ylim(0, 60)

    #y is above 1 so the title sits above the panels
    plt.suptitle(title, fontsize=20, y=1.15, color='black')
    plt.show()

In [None]:
#Show the score-class distribution for every topic
topic_distributions = (
    (coronapolicies, 'Corona and policies: Distribution of sentiment scores'),
    (lockdownactivities, 'Lockdown activities: Distribution of sentiment scores'),
    (prevention, 'Prevention: Distribution of sentiment scores'),
)

for topic_data, topic_title in topic_distributions:
    sentimentscore(topic_data, topic_title)

Create scattertext for positive and negative sentiments

In [None]:
import scattertext as st

#Create df for positive and negative tweets
    #.copy() makes an independent dataframe, so adding the 'parsed' column below does not
    #assign into a filtered slice of tweets_germany (pandas SettingWithCopyWarning)
tweetsposneg = tweets_germany[(tweets_germany['label'] == 'Positive') | (tweets_germany['label'] == 'Negative')].copy()

#Parse text to extract the words
tweetsposneg['parsed'] = tweetsposneg['text_clean'].apply(st.whitespace_nlp_with_sentences)

#Create corpus
    #built from the parsed column with the sentiment label as category,
    #then reduced to unigrams and compacted with AssociationCompactor(2000)
corpus = (
    st.CorpusFromParsedDocuments(
        tweetsposneg,
        parsed_col='parsed',
        category_col='label',
    )
    .build()
    .get_unigram_corpus()
    .compact(st.AssociationCompactor(2000))
)

#Create html for figure
html = st.produce_scattertext_explorer(
    corpus,
    category='Positive', category_name='Positive', not_category_name='Negative',
    minimum_term_frequency=0, pmi_threshold_coefficient=0,
    width_in_pixels=1000,
    transform=st.Scalers.dense_rank
)

#Write the html to disk
    #the with-block closes the file once the bytes are written
with open("sentimentwords.html", 'wb') as html_file:
    html_file.write(html.encode('utf-8'))

Create map containing the points of each tweet displaying the score of that tweet

In [None]:
import geoplot as gplt
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geoplot.crs as gcrs
import pyproj
plt.style.use('seaborn')

#Create geodataframe of the tweets
geotweets = gpd.GeoDataFrame(tweets_germany, geometry=gpd.points_from_xy(tweets_germany.x, tweets_germany.y))

geotweets = geotweets.set_crs('EPSG:4326')

#Create dataframe of the map of Germany
%matplotlib inline
bundeslander = gpd.read_file(r'Data\bundeslander.shp')

#set crs to same projection
bundeslander = bundeslander.to_crs(epsg=4326)

geom_germany = bundeslander
geom_germany['geometry'] = geom_germany.buffer(0)

#Create dataframes of the negative and positive scores
negative = geotweets[geotweets['label'] == 'Negative']

negative_score = negative[['score', 'geometry']]

positive = geotweets[geotweets['label'] == 'Positive']

positive_score = positive[['score', 'geometry']]

#Create maps
proj = gcrs.Mercator()

fig = plt.figure(figsize=(15, 10), )
ax1 = plt.subplot(121, projection=proj,  facecolor='white')
ax2 = plt.subplot(122, projection=proj, facecolor='white')

gplt.pointplot(negative_score,  hue='score', cmap='Blues',legend=True, legend_kwargs={'orientation':'horizontal'}, ax=ax1)
gplt.polyplot(geom_germany, ax=ax1, facecolor='white')
ax1.set_title('Score distribution of the negative tweets', fontsize=20)

gplt.pointplot(positive_score,  hue='score', cmap='Blues',legend=True, legend_kwargs={'orientation':'horizontal'}, ax=ax2)
gplt.polyplot(geom_germany, ax=ax2, facecolor='white')
ax2.set_title('Score distribution of the positive tweets', fontsize=20)