In [None]:
%%capture
!pip install plotly
!pip install nltk
!pip install wordcloud

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
color = sns.color_palette()
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.express as px
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Visualisierungen
- Mean Sentiment over time per party                ✅
- HOF per MP                                        ✅
- Männliche vs. weibliche Politiker im Vergleich    ✅
- Minority-Vergleich (überhaupt möglich?)
- Hate unter Politikern
- Wordclouds

# Data Analysis

## Plot Gender

In [None]:
# Load the predictions CSV into `df` and preview its first two rows.
df = pd.read_csv(filepath_or_buffer="mentions_predicted_exploded.csv")
df.head(n=2)

In [None]:
# Convert the timestamp column to pandas datetimes, then show the element type
# of the row labelled 0 as a quick sanity check.
df["created_at"] = pd.to_datetime(df["created_at"])
type(df["created_at"].loc[0])

In [None]:
# Daily mean of `model_predictions` per gender.
# The rows are grouped by calendar day directly. Pivoting on the raw timestamps
# first and resampling afterwards would average per-timestamp means, giving
# every timestamp the same weight regardless of how many rows share it.
df_pivot = df.pivot_table(
    index=pd.Grouper(key="created_at", freq="D"),
    columns="Geschlecht",
    values="model_predictions",
    aggfunc="mean",
).asfreq("D")  # keep days without any rows as NaN rows on a regular daily index

# Create plot
fig, ax = plt.subplots(figsize=(15, 10))

df_pivot.plot(ax=ax, ylabel="Mean HOF per Day", color=["red", "blue", "green"], xlabel="Date",
              title="Mean-HOF of Genders over time")

# Annotate every data point with its rounded value; days without data for a
# gender (NaN) have no point on the line and are skipped.
for day, row in df_pivot.iterrows():
    for col, value in row.items():
        if pd.notna(value):
            ax.text(day, value, round(value, 2), ha='center', va='bottom', fontsize=8)

plt.show()

### T-Test zur Signifikanz zwischen Mann und Frau

In [None]:
from scipy import stats

# Paired t-test on the daily mean HOF values of the "Frau" and "Herrn" columns.
# Only days that have a value for BOTH groups are kept: ttest_rel propagates NaN
# by default, so a single day missing one group would turn the result into NaN.
paired_days = df_pivot[["Frau", "Herrn"]].dropna()
frau_list = paired_days["Frau"].tolist()
mann_list = paired_days["Herrn"].tolist()

stats.ttest_rel(frau_list, mann_list)
# Author's note (translated): statistically significant, i.e. men in Germany
# receive statistically significantly more hate than women.
# NOTE(review): confirm this against the p-value displayed above.

In [None]:
# Split the rows by the value of `Geschlecht` ("Frau" / "Herrn"), print the size
# of each subset and then the overall distribution of the column.
frau = df.loc[df["Geschlecht"] == "Frau"]
print(frau.shape[0])
mann = df.loc[df["Geschlecht"] == "Herrn"]
print(mann.shape[0])
print(df["Geschlecht"].value_counts())

In [None]:
# Rows predicted as HOF (model_predictions == 1) within each gender subset,
# followed by the two row counts (women first, then men).
frau_hof = frau.loc[frau["model_predictions"] == 1]
mann_hof = mann.loc[mann["model_predictions"] == 1]
for hof_subset in (frau_hof, mann_hof):
    print(len(hof_subset))
# Author's notes (translated):
# 19.6% HOF for women
# 20.8% HOF for men -> mainly because of Karl Lauterbach

# The 5 highest offices:
# 1. Federal President: Steinmeier
# 2. President of the Bundestag: Bärbel Bas
# 3. Federal Chancellor: Olaf Scholz
# 4. President of the Bundesrat: Peter Tschentscher
# 5. President of the Federal Constitutional Court: Stephan Harbarth

In [None]:
print(len(frau.full_name.unique()))

In [None]:
print(len(mann.full_name.unique()))

In [None]:
len(df)

In [None]:
# Copy the raw prediction into `HOF` and add a textual `Label` column:
# prediction 1 -> 'HOF', prediction 0 -> 'NOT'.
df["HOF"] = df["model_predictions"]
for prediction, tag in ((1, 'HOF'), (0, 'NOT')):
    df.loc[df["model_predictions"] == prediction, 'Label'] = tag

In [None]:
# Number of rows per gender label.
df["Geschlecht"].value_counts()

In [None]:
# Count plot of the HOF / NOT labels, one panel per value of `Geschlecht`.
# `x` is passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so a positional "Label" raises a TypeError there.
g = sns.catplot(
    data=df,
    x="Label",
    col="Geschlecht",
    col_wrap=3,
    kind="count",
    height=3.5,
    aspect=.8,
    palette='tab10',
)

plt.show()
g.savefig('./plots/gender_hof_distribution.png')

In [None]:
from time_plots import plot_gender_percentage

In [None]:
# Run the project helper imported from `time_plots` on the full data set.
# NOTE(review): the second argument looks like an output path stem under ./plots;
# confirm against the helper's implementation.
plot_gender_percentage(df,"./plots/gender_hof_distribution")

## Test

In [None]:
# Earliest timestamp in the data set.
df["created_at"].min()

In [None]:
# Slice `df` into nine consecutive windows on `created_at`: everything up to and
# including the first boundary (df1), seven (lower, upper] windows between
# neighbouring boundaries (df2..df8), and everything after the last one (df9).
week_edges = [
    "2022-02-07 00:00:00",
    "2022-02-14 00:00:00",
    "2022-02-21 00:00:00",
    "2022-02-28 00:00:00",
    "2022-03-07 00:00:00",
    "2022-03-14 00:00:00",
    "2022-03-21 00:00:00",
    "2022-03-28 00:00:00",
]
created = df["created_at"]

df1 = df[created <= week_edges[0]]
df2, df3, df4, df5, df6, df7, df8 = (
    df[(created > lower) & (created <= upper)]
    for lower, upper in zip(week_edges, week_edges[1:])
)
df9 = df[created > week_edges[-1]]

In [None]:
# Timestamps that fall into the first window.
df1["created_at"]

In [None]:
# Paired t-test on nine hand-entered (women, men) value pairs.
# NOTE(review): presumably the HOF percentages per time window df1..df9 read off
# the plot_gender_percentage output - confirm and ideally compute them from data.
paired_values = [
    (23.5, 21.8),
    (21.1, 22.5),
    (22.7, 22.1),
    (17.3, 19.5),
    (16.1, 21.5),
    (16.4, 21.3),
    (19.2, 20.6),
    (17.9, 18.6),
    (14.6, 20.1),
]
frau_liste = [frau_value for frau_value, _ in paired_values]
mann_liste = [mann_value for _, mann_value in paired_values]

stats.ttest_rel(frau_liste, mann_liste)
# Author's note (translated, left unfinished in the original): statistically significant:

In [None]:
# Run the project helper on the last time window (df9) only.
# NOTE(review): the path argument is identical to the one used for the full data
# set; if the helper writes a file there, this call overwrites it - confirm.
plot_gender_percentage(df9,"./plots/gender_hof_distribution")

## Hate per MP

In [None]:
# Preview the first five rows.
df.head(5)

In [None]:
# Per-person subsets, selected either by the mentioned handle or by full name.
# The trailing numbers are the author's notes.
# NOTE(review): they look like each person's share of all rows in percent - confirm.
mentioned = df["mentioned_list"]
full_names = df["full_name"]

wagenknecht = df[mentioned == "swagenknecht"]
lauterbach = df[mentioned == "karl_lauterbach"]  # 19.78948
dahmen = df[full_names == "Dr. Janosch Dahmen"]  # 5.37
scholz = df[full_names == "Olaf Scholz"]  # 4.86
buschmann = df[full_names == "Dr. Marco Buschmann"]  # 5.14
brandner = df[full_names == "Stephan Brandner"]  # 1.74

# Total row count and the row count of the last subset.
print(df.shape[0])
print(brandner.shape[0])

In [None]:
percentage = [19.78948, 5.37, 4.86, ]

In [None]:
new = pd.DataFrame()

new["name"] = names
new["percentage"] = percentage

In [None]:
names = val_counts[:10].index.tolist()
names

In [None]:
hof = df[df["Label"] == "HOF"]
no = df[df["Label"] == "NOT"]

In [None]:
val_counts = pd.DataFrame(df.full_name.value_counts())
val_counts[:10]

In [None]:
val_counts["percentage"] = val_counts["full_name"] / len(df)
val_counts[:10]

In [None]:
hof_val_counts = pd.DataFrame(hof.full_name.value_counts())
hof_val_counts[:10]

In [None]:
not_val_counts = pd.DataFrame(no.full_name.value_counts())
not_val_counts

In [None]:
# Move the names from the index into an 'index' column so the two count tables
# can be joined on it. The guard makes the cell safe to re-run: a second
# reset_index() would otherwise add 'level_0' columns to both frames.
if "index" not in hof_val_counts.columns:
    hof_val_counts = hof_val_counts.reset_index()
if "index" not in not_val_counts.columns:
    not_val_counts = not_val_counts.reset_index()

# Inner join on the name; the NOT count column gets a trailing underscore and
# both count columns are then given descriptive names.
merge = hof_val_counts.merge(not_val_counts, on="index", suffixes=("", "_"))
merge = merge.rename(columns={"full_name": "hof_mentions", "full_name_": "not_mentions"})
merge

In [None]:
# Use the name column as the row index of the merged table.
merge = merge.set_index('index')
merge

In [None]:
# Back-to-back horizontal bar chart for the first 15 rows of `merge`:
# HOF mentions on the left (mirrored), NOT mentions on the right.
merges = merge[:15]

# Styling constants
font_color = '#525252'
hfont = {'fontname':'Calibri'}
facecolor = '#eaeaf2'
color_red = '#fd625e'
color_blue = '#01b8aa'

# Data and titles of the two panels
index = merges.index
column0, column1 = merges['hof_mentions'], merges['not_mentions']
title0, title1 = 'HOF Mentions', 'NOT Mentions'

fig, axes = plt.subplots(figsize=(10,5), facecolor=facecolor, ncols=2, sharey=True)
fig.tight_layout()

# One bar panel per column, each with a title in the colour of its bars
panels = zip(axes, (column0, column1), (title0, title1), (color_red, color_blue))
for panel_ax, values, panel_title, bar_color in panels:
    panel_ax.barh(index, values, align='center', color=bar_color, zorder=10)
    panel_ax.set_title(panel_title, fontsize=18, pad=15, color=bar_color, **hfont)

# Mirror the left panel so its bars grow away from the shared middle axis
axes[0].invert_xaxis()

# Flip the y axis of the current axes so rows read top-down in the frame's order
plt.gca().invert_yaxis()

axes[0].set(yticks=merges.index, yticklabels=merges.index)
axes[0].yaxis.tick_left()
axes[0].tick_params(axis='y', colors='black')  # tick color

# Fixed x tick positions and their abbreviated labels, per panel
tick_specs = (
    (axes[0], [30000,60000,90000,120000,150000], ["30k", "60k", "90k", "120k", "150k"]),
    (axes[1], [100000,200000,300000,400000], ["100k", "200k", "300k", "400k"]),
)
for panel_ax, positions, tick_texts in tick_specs:
    panel_ax.set_xticks(positions)
    panel_ax.set_xticklabels(tick_texts)

# Same font size, colour and family for every tick label of both panels
for panel_ax in axes:
    for label in (panel_ax.get_xticklabels() + panel_ax.get_yticklabels()):
        label.set(fontsize=13, color=font_color, **hfont)

plt.subplots_adjust(wspace=0, top=0.85, bottom=0.1, left=0.18, right=0.95)

filename = 'hof_not_on_mps_bidirectional'
plt.savefig(filename+'.png', facecolor=facecolor)

In [None]:
# HOF rows split by the value of `party`. The party strings carry a leading
# space, exactly as they are compared here.
party = hof["party"]

von_afd = hof[party == " AfD"]
von_spd = hof[party == " SPD"]
von_csu = hof[party == " CDU/CSU"]
von_linke = hof[party == " Die Linke"]
von_grüne = hof[party == " Bündnis 90/Die Grünen"]
von_fdp = hof[party == " FDP"]

# Distribution of `mentioned_party` within the AfD subset.
von_afd["mentioned_party"].value_counts()

In [None]:
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# set font
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = 'Helvetica'

# set the style of the axes and the text color
plt.rcParams['axes.edgecolor']='#333F4B'
plt.rcParams['axes.linewidth']=0.8
plt.rcParams['xtick.color']='#333F4B'
plt.rcParams['ytick.color']='#333F4B'
plt.rcParams['text.color']='#333F4B'

val_counts = val_counts[:10]
# we first need a numeric placeholder for the y axis
my_range=list(range(1,len(val_counts.index)+1))

fig, ax = plt.subplots(figsize=(5,3.5))

# create for each expense type an horizontal line that starts at x = 0 with the length 
# represented by the specific expense percentage value.
plt.hlines(y=my_range, xmin=0, xmax=val_counts['percentage'], color='#007ACC', alpha=0.2, linewidth=5)

# create for each expense type a dot at the level of the expense percentage value
plt.plot(val_counts['percentage'], my_range, "o", markersize=5, color='#007ACC', alpha=0.6)

# set labels
ax.set_xlabel('Mention Count in %', fontsize=15, fontweight='black', color = '#333F4B')
ax.set_ylabel('')

# set axis
ax.tick_params(axis='both', which='major', labelsize=12)
plt.yticks(my_range, val_counts.index)

# add an horizonal label for the y axis 
fig.text(-0.23, 0.96, 'Mentioned in a Tweet', fontsize=15, fontweight='black', color = '#333F4B')

# change the style of the axis spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

ax.spines['left'].set_bounds((1, len(my_range)))
#ax.set_xlim(0,25)

ax.spines['left'].set_position(('outward', 8))
ax.spines['bottom'].set_position(('outward', 5))

plt.savefig('plots/mentioned_in_a_tweet.png', dpi=300, bbox_inches='tight')

In [None]:
# Dot-and-line chart of the NOT-mention counts for the first ten rows of
# `not_val_counts`.

# set font
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = 'Helvetica'

# set the style of the axes and the text color
plt.rcParams['axes.edgecolor']='#333F4B'
plt.rcParams['axes.linewidth']=0.8
plt.rcParams['xtick.color']='#333F4B'
plt.rcParams['ytick.color']='#333F4B'
plt.rcParams['text.color']='#333F4B'

# NOTE(review): this rebinds `hof_val_counts` to the NOT counts (kept so the
# notebook state is unchanged); a dedicated name would be less misleading.
hof_val_counts = not_val_counts[:10]
# numeric placeholder positions for the y axis (one per name)
my_range=list(range(1,len(hof_val_counts.index)+1))

# `not_val_counts` was flattened with reset_index() earlier, so the names live
# in its 'index' column and the row index is just 0..n-1. Take the tick labels
# from that column when it exists; fall back to the row index otherwise.
if 'index' in hof_val_counts.columns:
    name_labels = hof_val_counts['index']
else:
    name_labels = hof_val_counts.index

fig, ax = plt.subplots(figsize=(5,3.5))

# one horizontal line per name, from x = 0 to that name's NOT-mention count
plt.hlines(y=my_range, xmin=0, xmax=hof_val_counts['full_name'], color='#007ACC', alpha=0.2, linewidth=5)

# one dot per name at its NOT-mention count
plt.plot(hof_val_counts['full_name'], my_range, "o", markersize=5, color='#007ACC', alpha=0.6)

# set labels
ax.set_xlabel('Mention Count', fontsize=15, fontweight='black', color = '#333F4B')
ax.set_ylabel('')

# set axis
ax.tick_params(axis='both', which='major', labelsize=12)
plt.yticks(my_range, name_labels)

# add a horizontal label for the y axis
fig.text(-0.23, 0.96, 'Mentioned in a NOT Tweet', fontsize=15, fontweight='black', color = '#333F4B')

# change the style of the axis spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

ax.spines['left'].set_bounds((1, len(my_range)))

ax.spines['left'].set_position(('outward', 8))
ax.spines['bottom'].set_position(('outward', 5))

plt.savefig('plots/not_on_mps.png', dpi=300, bbox_inches='tight')