## <b>Public events analysis</b>
<i>This script can be used to analyse whether the differences in topics and sentiments over time and space are caused by public events. In addition, a Kruskal-Wallis test and a Mann-Whitney test are executed to see whether the number of COVID-19 cases has an influence on the sentiment of people.</i>


In [35]:
#Import needed libraries
import pandas as pd
import numpy as np
import geopandas as gpd
import pyproj
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and
# later releases dropped the old names, so pick whichever name this
# installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Read the tweet data with pandas. A forward slash is used as the path
# separator because it resolves on Windows as well as on Linux/macOS, whereas
# a backslash separator only resolves on Windows.
outputgermany = pd.read_csv('Data/sentimenttopic70_tweets_germany.csv')
# Bind the parsed table to the name used by the rest of the script
tweets_germany = pd.DataFrame(outputgermany)

# Split the 't_datetime_goed' strings on whitespace into two new columns
# holding the date part and the time part
tweets_germany[['Date', 'Time']] = tweets_germany['t_datetime_goed'].str.split(expand=True)

# Add topic names again: collapse the numeric topic ids into named themes.
# Each condition lines up with the entry at the same position in `values`.
topic_ids = tweets_germany['Topic']
conditions = [
    topic_ids == -1,
    topic_ids.isin([0, 2, 4, 6, 13]),
    topic_ids.isin([1, 5, 10, 14, 15]),
    topic_ids.isin([3, 7, 8, 9, 11, 12]),
]

values = ['outlier', 'coronapolicies', 'prevention', 'lockdownactivities']

# The default is spelled out as a string: np.select's implicit default is the
# integer 0, which recent NumPy releases refuse to combine with string choices
# (TypeError: no common dtype). '0' is what older releases produced for rows
# that match none of the conditions, so existing results are unchanged.
tweets_germany['nametopic'] = np.select(conditions, values, default='0')

# Convert the sentiment labels into the string codes '-1', '1' and '0'.
# Each condition lines up with the entry at the same position in `values`.
conditions = [
    tweets_germany['label'] == 'Negative',
    tweets_germany['label'] == 'Positive',
    tweets_germany['label'] == 'Neutral',
]

values = ['-1', '1', '0']

# The default is spelled out as a string: np.select's implicit default is the
# integer 0, which recent NumPy releases refuse to combine with string choices
# (TypeError: no common dtype). '0' is what older releases produced for rows
# whose label matches none of the conditions, so existing results are
# unchanged. Note that this is the same code as 'Neutral'.
tweets_germany['numbersentiment'] = np.select(conditions, values, default='0')

# Convert the topic names into string codes.
# Each condition lines up with the entry at the same position in `values`.
conditions = [
    tweets_germany['nametopic'] == 'outlier',
    tweets_germany['nametopic'] == 'coronapolicies',
    tweets_germany['nametopic'] == 'lockdownactivities',
    tweets_germany['nametopic'] == 'prevention',
]

values = ['-1', '1', '2', '3']

# The default is spelled out as a string: np.select's implicit default is the
# integer 0, which recent NumPy releases refuse to combine with string choices
# (TypeError: no common dtype). '0' is what older releases produced for rows
# that match none of the conditions, so existing results are unchanged.
tweets_germany['numbertopicname'] = np.select(conditions, values, default='0')

# Build a GeoDataFrame whose point geometry comes from the 'x' and 'y'
# columns of the tweets, then tag it with the EPSG:4326 coordinate system.
geotweets = gpd.GeoDataFrame(
    tweets_germany,
    geometry=gpd.points_from_xy(tweets_germany['x'], tweets_germany['y']),
).set_crs('EPSG:4326')

Create df of the cases per day per state

In [None]:
# Load the COVID-19 case table. A forward slash is used as the path separator
# because it resolves on Windows as well as on Linux/macOS, whereas a
# backslash separator only resolves on Windows.
cases = pd.read_csv('Data/covid_19_cases.csv', encoding='unicode_escape')

In [None]:
# Load the COVID-19 case table. A forward slash is used as the path separator
# because it resolves on Windows as well as on Linux/macOS, whereas a
# backslash separator only resolves on Windows.
cases = pd.read_csv('Data/covid_19_cases.csv', encoding='unicode_escape')
# Parse the day-month-(two-digit)year strings in 'Date' into a datetime column
cases['Date_goed'] = pd.to_datetime(cases['Date'], format='%d-%m-%y')
# Name the region column 'State', the key the tweets are merged on later
cases = cases.rename(columns={'Region': 'State'})

Create one df of the tweets and cases

In [None]:
# Keep only the columns needed for the join. The explicit copy makes this an
# independent frame, so the column assignment below does not trigger pandas'
# SettingWithCopyWarning.
covidcases = cases[['State', 'Date', 'CurrentlyPositive']].copy()
# Both join keys are cast to strings so the 'Date' values compare equal
covidcases['Date'] = covidcases['Date'].astype(str)
tweets_germany['Date'] = tweets_germany['Date'].astype(str)

# The inner join keeps only tweets that have a case count for their state and date
mergecases = tweets_germany.merge(covidcases, on=['State', 'Date'], how='inner')

#Create column with month and year of tweet
# ('M' is the canonical monthly period alias; the lowercase spelling is
# deprecated in recent pandas releases)
mergecases['month_year'] = pd.to_datetime(mergecases['Date'], format='%d-%m-%y').dt.to_period('M')

Kruskal-Wallis test

In [None]:
import scipy.stats as stats

#Test for Germany

# Case counts as a plain numpy array so they can be indexed with boolean masks
y = np.asarray(mergecases['CurrentlyPositive'])

# Distinct sentiment codes, plus for every row the position of its code in `label`
label, idx = np.unique(mergecases['numbersentiment'], return_inverse=True)

# One array of case counts per distinct sentiment code
groups = [y[idx == position] for position in range(label.size)]

# Each group is passed to the test as its own positional sample
kruskal_germany = stats.kruskal(*groups)
H, p = kruskal_germany

print(H, p)

In [53]:
#Test for the states of Germany

def kruskalwallis(state):
    """Run a Kruskal-Wallis test on the case counts of one state.

    The rows of `mergecases` whose 'State' equals `state` are grouped by
    their 'numbersentiment' value, and the 'CurrentlyPositive' values of the
    groups are compared with `stats.kruskal`. The state, the H statistic and
    the p-value are printed.

    A state with fewer than two sentiment groups (for example a state with no
    rows in `mergecases`) cannot be tested: `stats.kruskal` raises a
    ValueError for it. Such a state is reported and skipped instead.

    Parameters
    ----------
    state : value compared against the 'State' column of `mergecases`.

    Returns
    -------
    None
    """
    df = mergecases[mergecases['State'] == state]
    y = np.asarray(df['CurrentlyPositive'])
    # Distinct sentiment codes, plus for every row the position of its code
    label, idx = np.unique(df['numbersentiment'], return_inverse=True)
    groups = [y[idx == i] for i in range(len(label))]

    # The test needs at least two samples to compare
    if len(groups) < 2:
        print(state, 'skipped: fewer than two sentiment groups')
        return

    H, p = stats.kruskal(*groups)
    print(state, H, p)


In [None]:
# All distinct values of the 'State' column, in order of first appearance
unique_states = geotweets['State'].unique()
states = unique_states.tolist()

# Drop the fifth entry before testing.
# NOTE(review): the index is positional, so which state is removed depends on
# the row order of the data and cannot be told from this code - confirm which
# entry is meant to be excluded.
states.pop(4)

# Run the per-state Kruskal-Wallis test for every remaining state
for state in states:
    kruskalwallis(state)

Mann-Whitney test

In [None]:
#MW for germany

# Both columns are converted to float arrays before testing
case = mergecases['CurrentlyPositive'].to_numpy().astype(float)
sent = mergecases['numbersentiment'].to_numpy().astype(float)

# NOTE(review): the two samples handed to the test are the case counts and the
# sentiment codes themselves - confirm this is the intended comparison.
mannwhitney_germany = stats.mannwhitneyu(case, sent)
print(mannwhitney_germany)

In [76]:
# States for which the Kruskal-Wallis test was significant
states = [
    'Bayern',
    'Hessen',
    'Saarland',
    'Thüringen',
]

In [78]:
# All distinct sentiment labels that occur in the tweets
labels = geotweets['label'].unique().tolist()


def mannwhitney(state, label):
    """Print a Mann-Whitney U test for one state and one sentiment label.

    Only the rows of `mergecases` whose 'State' equals `state` and whose
    'label' equals `label` are used. Their 'CurrentlyPositive' and
    'numbersentiment' values, both converted to float arrays, are the two
    samples passed to `stats.mannwhitneyu`. The state, the label and the test
    result are printed; nothing is returned.

    NOTE(review): the rows are filtered on a single label before the
    sentiment codes are read - confirm that comparing the case counts with
    these codes is the intended test.
    """
    in_state = mergecases['State'] == state
    has_label = mergecases['label'] == label
    selected = mergecases[in_state & has_label]

    case = selected['CurrentlyPositive'].to_numpy().astype(float)
    sent = selected['numbersentiment'].to_numpy().astype(float)

    outcome = stats.mannwhitneyu(case, sent)
    print(state, label, outcome)


In [None]:
# Every (state, label) combination, ordered state by state
state_label_pairs = [(s, l) for s in states for l in labels]

# Run the Mann-Whitney test once per combination
for state, label in state_label_pairs:
    mannwhitney(state, label)