In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

In [2]:
# Location of the input CSV, relative to the notebook's working directory.
DATA_PATH = "data_train/data.csv"

# Read the whole file into a single DataFrame used by every later cell.
df = pd.read_csv(DATA_PATH)

In [3]:
# Text length in *characters* (not words). `.str.len()` leaves a missing text
# as NaN, whereas `map(len)` raises TypeError on a NaN (float) entry.
df["length"] = df["text"].str.len()

# Split by label for the per-class plots further down. Going by the trace
# names used in those plots, label 1 is "vulgar" and label 0 is "non-vulgar".
df_vulgar = df[df["label"] == 1]
df_non_vulgar = df[df["label"] == 0]

In [4]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,text,label,source,length
0,gleich an die wand stellen und erschiessen..,1,fb-hate-speech,44
1,Ihr seids empfindlich . . Is beste wos uns pas...,1,fb-hate-speech,207
2,"Mei bitte, sie sollen sich alle gegenseitig ab...",1,fb-hate-speech,70
3,"Unglaublich, was für ausländerfeindliche Komme...",1,fb-hate-speech,152
4,Supie! Sie löschen sich gegenseitig aus! Hoffe...,1,fb-hate-speech,58


In [10]:
# Total number of rows in the dataset.
df.shape[0]

98107

# Distribution of vulgar vs. non-vulgar texts

In [5]:
# Per-label number of non-null values in each remaining column.
label_groups = df.groupby("label")
df_grouped = label_groups.count()

In [8]:
# Display the grouped counts table currently bound to `df_grouped`
# (one row per group, one non-null count per column).
df_grouped

Unnamed: 0_level_0,text,source,length
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,83075,83075,83075
1,15032,15032,15032


In [14]:
# Recompute the per-label counts locally instead of reading `df_grouped`:
# that name is rebound to the per-source grouping later in this notebook, so
# re-running this cell after that one would plot the wrong numbers under
# hard-coded labels.
label_counts = df.groupby("label")["text"].count()

# Derive the tick labels from the index rather than assuming its order;
# unexpected label values are shown verbatim instead of being mislabelled.
label_names = {0: "non-vulgar", 1: "vulgar"}
tick_labels = [label_names.get(label, str(label)) for label in label_counts.index]

fig = go.Figure([go.Bar(x=tick_labels, y=label_counts)])
fig.update_layout(
    title="We see clearly that there are more examples of non-vulgar text, the dataset is highly imbalanced",
    xaxis_title="label",
    yaxis_title="number of texts",
)
fig.show()

# Distribution of data sources

In [7]:
# Per-source number of non-null values in each remaining column.
# Note: this rebinds `df_grouped`, which earlier held the per-label counts.
source_groups = df.groupby("source")
df_grouped = source_groups.count()

In [8]:
# Display the grouped counts table currently bound to `df_grouped`
# (one row per group, one non-null count per column).
df_grouped

Unnamed: 0_level_0,index,text,label,length
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
fb-hate-speech,666,666,666,666
germeval2018,8407,8407,8407,8407
germeval2021,4150,4150,4150,4150
rp-mod-crowd,84515,84515,84515,84515
tweets-refugees,369,369,369,369


In [15]:
# Compute the per-source counts locally so this cell does not depend on which
# grouping was last assigned to the shared `df_grouped` name (it is also used
# for the per-label counts earlier in the notebook).
source_counts = df.groupby("source")["text"].count()

fig = go.Figure([go.Bar(x=source_counts.index, y=source_counts)])
fig.update_layout(
    title="RP is by far the largest data source.",
    xaxis_title="source",
    yaxis_title="number of texts",
)
fig.show()

In [10]:
# Per-source non-null counts, computed separately for each label subset.
vulgar_by_source = df_vulgar.groupby("source")
non_vulgar_by_source = df_non_vulgar.groupby("source")

df_vulgar_grouped = vulgar_by_source.count()
df_non_vulgar_grouped = non_vulgar_by_source.count()

In [16]:
# One bar trace per class; the x values are the source names of each subset.
vulgar_bars = go.Bar(
    x=df_vulgar_grouped.index,
    y=df_vulgar_grouped["text"],
    name="vulgar",
)
non_vulgar_bars = go.Bar(
    x=df_non_vulgar_grouped.index,
    y=df_non_vulgar_grouped["text"],
    name="non-vulgar",
)

fig = go.Figure(data=[vulgar_bars, non_vulgar_bars])
fig.update_layout(
    title="All data sources have more non-vulgar texts. Facebook dataset is the exception because only vulgar texts were chosen."
)
fig.show()

# Length of comments (in characters)

In [17]:
fig = go.Figure(data=[go.Histogram(x=df.length)])
fig.update_layout(title="We see a shard dropoff at length of 500 words.")
fig.show()

In [19]:
# Compare the length distributions of the two classes.
# - `name` gives each trace a readable legend entry (default is "trace 0/1").
# - `histnorm="probability"` plots the share of each class per bin; the classes
#   differ greatly in size, so raw counts cannot be compared by shape.
# - `bingroup` forces both traces onto identical bins, which overlay mode does
#   not do on its own.
# `length` is measured in characters, hence the wording of the title.
# NOTE(review): re-check the claim in the title against the normalised plot.
fig = go.Figure(
    data=[
        go.Histogram(
            x=df_vulgar["length"],
            name="vulgar",
            histnorm="probability",
            bingroup="length",
        ),
        go.Histogram(
            x=df_non_vulgar["length"],
            name="non-vulgar",
            histnorm="probability",
            bingroup="length",
        ),
    ]
)
fig.update_layout(
    title="Distribution of text length (characters) does not differ between vulgar and non-vulgar texts.",
    # Draw the traces on top of each other; the opacity below only makes
    # sense in overlay mode.
    barmode="overlay",
    xaxis_title="text length (characters)",
    yaxis_title="share of texts within class",
)
fig.update_traces(opacity=0.75)
fig.show()