### Download german spacy pipeline

In [6]:
!python -m spacy download de_core_news_lg

Collecting de-core-news-lg==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_lg-3.3.0/de_core_news_lg-3.3.0-py3-none-any.whl (567.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m567.8/567.8 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.1[0m[39;49m -> [0m[32;49m22.2.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_lg')


In [7]:
!pip install fasttext


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.1[0m[39;49m -> [0m[32;49m22.2.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [8]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
import spacy
import re
import umap
nlp = spacy.load('de_core_news_lg')

In [9]:
data = pd.read_csv("data/prepared_dataframe.csv")

In [10]:
# split data in training and test
data_train = data.sample(round(0.75 * len(data)), random_state = 1).reset_index()
data_test = data[~data.index.isin(data_train.index)].reset_index()


In [11]:
data_pre = data_train

In [12]:
# create spacy docs with a nlp pipeline
# https://spacy.io/usage/processing-pipelines
tweet_docs = [nlp(tweet) for tweet in data_pre["tweets_clean"]]

In [13]:
# extract the generated lemmas for words that are no stopwords and have a length of more than two
# Lemmatization is the process of reducing inflected forms of a word while still ensuring that the reduced
# form belongs to the language. This reduced form or root word is called a lemma.
tweet_words = [
    [
    word.lemma_ for word in doc if ((not word.is_stop) and (len(word) >= 3))
    ]
    for doc in tweet_docs]

In [14]:
tweet_words

[['läppisch',
  'Euro',
  'Afd',
  'Welt',
  'Demokratieuntergang',
  'herbei',
  'schreiben',
  '....',
  'süß'],
 ['finden', 'eklig', 'Body', 'Shamer', 'Schäm'],
 ['Umwelt',
  'Scheiße',
  'Mann',
  'hören',
  'grün',
  'Sumpf',
  'schwachköpfe',
  'steuergeldern',
  'finanzieren',
  'kosten',
  'Arbeitsplatz',
  'Stadt',
  'sterben',
  'Unsinn',
  'nahrung',
  'aussprechen'],
 ['gehören', 'mal', '26,5', 'Stunde'],
 ['Abschiebung',
  'kriminell',
  'Migrant',
  'Heimat',
  'eventuell',
  'Gefahr',
  'drohen',
  'setzen',
  'Bevölkerung',
  'Gefahr',
  'Opfer',
  'Verbrechen',
  'bevor',
  'ausländisch',
  'Straftäter',
  'abschieben',
  'linker',
  'gutmenschen-terror'],
 ['~faktor',
  'mensch~',
  'Banane',
  'Geländewag',
  '300',
  'funktionieren',
  'Kommunismus',
  'werft',
  'eur',
  'marx-werke',
  'Müll',
  'Freunde'],
 ['Sozialschmarotzer', 'vorher', 'fragen'],
 ['Diskussionsveranstaltung',
  '\\""die',
  'Fehler',
  'gsp-denkens',
  'verkehrt',
  'Kritik',
  'Peter',
  'che

In [15]:
import fasttext
import fasttext.util
from gensim.models.fasttext import *
# ft = fasttext.load_model("/Users/jannis/cc.de.300.bin")
ft = load_facebook_model("/Users/jannis/cc.de.300.bin")

In [16]:
len(tweet_words)

2273

In [17]:
# continue training pretrained model
ft.build_vocab(tweet_words, update=True)
ft.train(tweet_words, total_examples=len(tweet_words), epochs=10)

(98934, 214680)

In [18]:
ft.wv.most_similar(["Deutschland", "Merkel"], topn = 5)

[('Bundeskanzlerin', 0.9851183295249939),
 ('Kanzlerin', 0.9834180474281311),
 ('Merkel-Deutschland', 0.9822767376899719),
 ('Merkel-Regierung', 0.9808183908462524),
 ('Bundesregierung', 0.9793848991394043)]

In [19]:
ft.wv.most_similar(["Korruption", "Europa"], topn = 5)

[('Deutschland', 0.9799163341522217),
 ('Globalisierung', 0.9747747778892517),
 ('EU-Wirtschaft', 0.9738207459449768),
 ('europäischen', 0.9736407995223999),
 ('Wirtschaftsmächten', 0.9735116362571716)]

In [20]:
ft.wv.most_similar(["Polizei"], topn = 5)

[('Polizeit', 0.9683308005332947),
 ('Bundespolizei', 0.9553856253623962),
 ('Polizeidirektion', 0.9544669389724731),
 ('Polizisten', 0.9539645314216614),
 ('Verwaltungspolizei', 0.9527090787887573)]

In [21]:
# reduce dimensions
# fasttext.util.reduce_model(ft, 100)

In [22]:
# unique list of words
uni_tweet_words = list(ft.wv.key_to_index.keys())

In [23]:
# Remove tweets where there are no words in the fasttext model
def remove_nan_tweets(tweet_words, model):
    tweet_words_dict = {}
    for tweet in tweet_words:
        sum_occurr = 0
        for word in tweet:
            if word in model:
                sum_occurr +=1
        if sum_occurr > 0:
            tweet_words_dict[tweet_words.index(tweet)] = True

    return tweet_words_dict

tweet_words_dict = remove_nan_tweets(tweet_words, uni_tweet_words)
    
# subset data_pre and tweet_words, only tweets where at least one word is in the word2vec model
data_pre = data_pre.iloc[list(tweet_words_dict.keys()), :]
data_pre = data_pre.reset_index()
tweet_words = [tweet_words[i] for i in list(tweet_words_dict.keys())]

In [24]:
len(data_pre)

2257

## Classify new tweets

In [25]:
# calculate center of mass vector for list of words (used here for article as
# collection of words)
def get_com_vector(words : list) -> np.array:
    # list of words in the word2vec model
    words = [word for word in words if word in uni_tweet_words]
    # get the vectors
    vectors = np.array([ft.wv[word] for word in words])
    # return the sum of all vectors devided by the amount of words from words in the model
    vector = np.sum(vectors, axis=0)
    return vector / len(words)

# get vector for each tweet
tweet_vectors = []
for tweet in tweet_words:
    vec = get_com_vector(tweet)
    tweet_vectors.append(vec)


In [28]:
# function to preprocess and transform new tweets
def new_tweet_vector(tweet : str):
    prep_new = tweet
    doc_new = nlp(prep_new)

    words_new = [
                    word.lemma_ for word in doc_new
                    if (not word.is_stop) and (len(word)>2)
                 ]

    # calculate vector for new article
    new_tweet_v = get_com_vector(words_new)

    return new_tweet_v, words_new

In [29]:
# create tweet vectors of test tweets
new_tweet_v = []
words_new = []
for tweet in data_test.tweets_clean:
    new_tweet_v.append(new_tweet_vector(tweet)[0])
    words_new.append(new_tweet_vector(tweet)[1])

  return vector / len(words)


In [30]:
# Remove tweets where there are no words in the word2vec model  
tweet_words_dict = remove_nan_tweets(words_new, uni_tweet_words)   

# subset data_test and tweet_words, only tweets where at least one word is in the word2vec model
data_test = data_test.iloc[list(tweet_words_dict.keys()), :]
data_test = data_test.reset_index()
new_tweet_v = [new_tweet_v[i] for i in list(tweet_words_dict.keys())]

In [31]:
# Encode the classes
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y_transformed = encoder.fit_transform(data_pre["granulareKlassifikation"])
encoder.classes_

array(['ABUSE', 'INSULT', 'OTHER', 'PROFANITY'], dtype=object)

In [32]:
# Train a Random Forest
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(tweet_vectors, y_transformed)
rfc_predictions = rfc.predict(new_tweet_v)
# rfc_human_readeable = encoder.inverse_transform(rfc_predictions)

In [33]:
# transform predictions to classes
encoder_dict = dict(enumerate(encoder.classes_.flatten(), 0))
rfc_predictions = [encoder_dict[x] for x in rfc_predictions]

In [34]:
# evaluate predictions
acc = 0
for i in range(len(rfc_predictions)):
    if rfc_predictions[i] == data_test["granulareKlassifikation"][i]:
        acc += 1
accuracy = acc / len(rfc_predictions)        

In [35]:
accuracy

0.9165562913907285

In [36]:
# classification report
from sklearn.metrics import classification_report
print(classification_report(data_test["granulareKlassifikation"], rfc_predictions))

              precision    recall  f1-score   support

       ABUSE       1.00      0.80      0.89        86
      INSULT       1.00      0.77      0.87       170
       OTHER       0.88      1.00      0.94       472
   PROFANITY       1.00      0.74      0.85        27

    accuracy                           0.92       755
   macro avg       0.97      0.83      0.89       755
weighted avg       0.93      0.92      0.91       755



In [37]:
rfc_predictions[:10]

['ABUSE',
 'INSULT',
 'INSULT',
 'INSULT',
 'OTHER',
 'OTHER',
 'INSULT',
 'OTHER',
 'OTHER',
 'OTHER']

In [38]:
data_test["granulareKlassifikation"][:10]

0     ABUSE
1    INSULT
2    INSULT
3    INSULT
4    INSULT
5    INSULT
6    INSULT
7     OTHER
8     OTHER
9     OTHER
Name: granulareKlassifikation, dtype: object

In [None]:
# value counts of predictions
[cat_predictions.count(x) for x in ['ABUSE', 'PROFANITY','INSULT', 'OTHER']]

In [None]:
data_test["granulareKlassifikation"].value_counts()

In [None]:
data_test.head(10)