In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from joblib import dump, load

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

import spacy
import itertools
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, recall_score, precision_score
from sklearn.metrics import classification_report

In [None]:
# Evaluation helper: confusion matrices, class counts, accuracy and a
# classification report for a fitted classifier.

def plot_confusion_matrix(model, X_train, X_test, y_train, y_test, party_names=None):
    """Visualise and summarise a fitted classifier on the test and train splits.

    Draws two heatmaps side by side (test split on the left, train split on
    the right), then prints the number of samples per party in each split,
    the accuracy on both splits and a classification report for the test
    split.

    Parameters
    ----------
    model : fitted estimator or pipeline exposing ``predict``.
    X_train, X_test : inputs accepted by ``model.predict``.
    y_train, y_test : array-likes of integer class ids; class ``i`` is shown
        under the name ``party_names[i]``.
    party_names : sequence of str, optional
        Display names ordered by class id. Defaults to the six parties
        AfD, Union, FDP, Grüne, Linke, SPD (ids 0-5).

    Returns
    -------
    None
    """
    if party_names is None:
        party_names = ["AfD", "Union", "FDP", "Grüne", "Linke", "SPD"]
    party_names = list(party_names)
    # Fix the label set explicitly: without it confusion_matrix only covers
    # the classes that actually occur, and a split missing one party would
    # produce a matrix that no longer matches the name list.
    label_ids = list(range(len(party_names)))

    # np.asarray makes the `== label_id` comparisons below element-wise even
    # when the labels arrive as a plain list.
    y_label_test = np.asarray(y_test)
    y_label_train = np.asarray(y_train)

    _, axes = plt.subplots(1, 2, figsize=(16, 7))
    splits = (
        ("Test", X_test, y_label_test),
        ("Train", X_train, y_label_train),
    )
    predictions = {}
    for ax, (split_name, features, y_true) in zip(axes, splits):
        y_pred = model.predict(features)
        predictions[split_name] = y_pred

        mat = confusion_matrix(y_true, y_pred, labels=label_ids)
        df = pd.DataFrame(mat, index=party_names, columns=party_names)

        ax = sn.heatmap(df, annot=True, cmap='Blues', fmt='g', ax=ax)
        ax.set_title(split_name + ' Data')
        # confusion_matrix puts the true classes on rows and the predicted
        # classes on columns.
        ax.set_xlabel('Predicted party')
        ax.set_ylabel('True party')
    plt.show()

    # Number of samples per party in each split.
    print('Party: \t Test \t Train')
    for label_id, name in zip(label_ids, party_names):
        print(name + ':\t',
              np.count_nonzero(y_label_test == label_id), "\t",
              np.count_nonzero(y_label_train == label_id))

    # Accuracy for test and train data
    acc_test = accuracy_score(y_label_test, predictions["Test"]) * 100
    acc_train = accuracy_score(y_label_train, predictions["Train"]) * 100
    print('\nAcc:\t', "{:2.2f}%".format(acc_test), "{:2.2f}%".format(acc_train))

    # Per-class precision / recall / F1 on the held-out test split only.
    print("\n" + classification_report(y_label_test, predictions["Test"],
                                       labels=label_ids,
                                       target_names=party_names,
                                       digits=4))

In [None]:
# Factory for the two-step text-classification pipelines used in this notebook.
def make_pipeline(vectorizer, model, verbose=True):
    """Chain a vectorizer and a classifier into a single sklearn ``Pipeline``.

    The steps are registered under the names ``"vectorizer"`` and ``"model"``.
    ``verbose`` is forwarded unchanged to ``Pipeline``.
    """
    steps = [
        ("vectorizer", vectorizer),
        ("model", model),
    ]
    return Pipeline(steps, verbose=verbose)

In [None]:
# Build one data frame holding every tweet together with an integer party id:
# 0 = AfD, 1 = Union (CDU and CSU), 2 = FDP, 3 = Grüne, 4 = Linke, 5 = SPD.

def _load_party_tweets(csv_path, party_id):
    """Return a ('tweet', 'party') frame built from the 'text' column of csv_path.

    Every row is labelled with the same ``party_id``.
    """
    tweets = pd.read_csv(csv_path)['text']
    return tweets.rename('tweet').to_frame().assign(party=party_id)


afd = _load_party_tweets('../cleaned-data/AfD.csv', 0)

# CDU and CSU share class id 1.
cdu = _load_party_tweets('../cleaned-data/CDU.csv', 1)
csu = _load_party_tweets('../cleaned-data/CSU.csv', 1)

fdp = _load_party_tweets('../cleaned-data/FDP.csv', 2)
gru = _load_party_tweets('../cleaned-data/GRÜNE.csv', 3)
lin = _load_party_tweets('../cleaned-data/LINKE.csv', 4)
spd = _load_party_tweets('../cleaned-data/SPD.csv', 5)

# DataFrame.append was removed in pandas 2.0; a single concat also avoids
# copying the growing frame once per party.
data = pd.concat([afd, cdu, csu, fdp, gru, lin, spd], ignore_index=True)

# Drop rows without tweet text.
data = data.dropna()
data

In [None]:
# Large German spaCy model used for lemmatization.
# If it is not installed yet, run once:
#   python -m spacy download de_core_news_lg
SPACY_MODEL_NAME = 'de_core_news_lg'
nlp = spacy.load(SPACY_MODEL_NAME)

# Tokenizer callback for the vectorizers: turns a tweet into a list of lemmas.
def spacy_tokenizer(tweet):
    """Lemmatize ``tweet`` with the module-level spaCy pipeline ``nlp``.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        The stripped lemma of every token, in document order, excluding
        "-PRON-" lemmas and lemmas that are empty after stripping.
    """
    lemmas = []
    for token in nlp(tweet):
        # NOTE(review): "-PRON-" is presumably the spaCy v2 placeholder lemma
        # for pronouns; confirm whether the installed version still emits it.
        if token.lemma_ == "-PRON-":
            continue
        lemma = token.lemma_.strip()
        # A lemma consisting only of whitespace strips to "" and would
        # otherwise end up as an empty-string feature in the vocabulary.
        if lemma:
            lemmas.append(lemma)
    return lemmas

In [None]:
# Hold out 30% of the tweets for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data['tweet'],
    data['party'].to_numpy(dtype=np.int64),
    test_size=0.30,
    shuffle=True,
    random_state=42,
)

In [None]:
# LinearSVC on lemma counts: CountVectorizer with the spaCy lemmatizer as tokenizer.
pipe_svc_cnt_lem = make_pipeline(
    vectorizer=CountVectorizer(tokenizer=spacy_tokenizer),
    model=LinearSVC(dual=False, max_iter=10000),
    verbose=True,
)
# Pipeline.fit returns the fitted pipeline itself, so it can be passed straight
# to dump, which saves the fitted model to disk.
dump(pipe_svc_cnt_lem.fit(X_train, y_train), 'pipe_svc_cnt_lem.joblib')

In [None]:
# Reload the saved pipeline from disk and report its performance on both splits.
svc_model_cnt_lem = load('pipe_svc_cnt_lem.joblib')
print("Report for the LinearSVC model with CountVectorizer and lemmatization")
plot_confusion_matrix(
    model=svc_model_cnt_lem,
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)

In [None]:
# LinearSVC on TF-IDF weighted lemmas: TfidfVectorizer with the spaCy lemmatizer
# as tokenizer. (Despite the "cnt" in its name, this pipeline uses TF-IDF.)
pipe_tfidf_cnt_lem = make_pipeline(
    vectorizer=TfidfVectorizer(tokenizer=spacy_tokenizer),
    model=LinearSVC(dual=False, max_iter=10000),
    verbose=True,
)
# Pipeline.fit returns the fitted pipeline itself, so it can be passed straight
# to dump, which saves the fitted model to disk.
dump(pipe_tfidf_cnt_lem.fit(X_train, y_train), 'pipe_tfidf_cnt_lem.joblib')

In [None]:
# Reload the saved TF-IDF + lemmatization pipeline and report on both splits.
pipe_svc_tfidf_lem = load('pipe_tfidf_cnt_lem.joblib')
print("Report for the LinearSVC model with TFidfVectorizer and lemmatization")
plot_confusion_matrix(
    model=pipe_svc_tfidf_lem,
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)

In [None]:
# Three classifiers on plain token counts (default CountVectorizer tokenization).
pipe_svc_cnt = make_pipeline(
    vectorizer=CountVectorizer(),
    model=LinearSVC(dual=False, max_iter=10000),
    verbose=True,
)
pipe_lg_cnt = make_pipeline(
    vectorizer=CountVectorizer(),
    model=LogisticRegression(solver='lbfgs', dual=False, max_iter=10000, random_state=0),
    verbose=True,
)
pipe_nb_cnt = make_pipeline(
    vectorizer=CountVectorizer(),
    model=MultinomialNB(),
    verbose=True,
)

In [None]:
# Fit each count-based pipeline and save it to disk right away.
# Pipeline.fit returns the fitted pipeline itself, so it is passed directly to dump.
dump(pipe_svc_cnt.fit(X_train, y_train), 'pipe_svc_cnt.joblib')
dump(pipe_lg_cnt.fit(X_train, y_train), 'pipe_lg_cnt.joblib')
dump(pipe_nb_cnt.fit(X_train, y_train), 'pipe_nb_cnt.joblib')

In [None]:
# The same three classifiers on TF-IDF weighted features (default tokenization).
pipe_svc_tfidf = make_pipeline(
    vectorizer=TfidfVectorizer(),
    model=LinearSVC(dual=False, max_iter=10000),
    verbose=True,
)
pipe_lg_tfidf = make_pipeline(
    vectorizer=TfidfVectorizer(),
    model=LogisticRegression(solver='lbfgs', dual=False, max_iter=10000, random_state=0),
    verbose=True,
)
pipe_nb_tfidf = make_pipeline(
    vectorizer=TfidfVectorizer(),
    model=MultinomialNB(),
    verbose=True,
)

In [None]:
# Fit each TF-IDF pipeline and save it to disk right away.
# Pipeline.fit returns the fitted pipeline itself, so it is passed directly to dump.
dump(pipe_svc_tfidf.fit(X_train, y_train), 'pipe_svc_tfidf.joblib')
dump(pipe_lg_tfidf.fit(X_train, y_train), 'pipe_lg_tfidf.joblib')
dump(pipe_nb_tfidf.fit(X_train, y_train), 'pipe_nb_tfidf.joblib')

In [None]:
# LinearSVC: token counts vs. TF-IDF weighting, evaluated on the same split.
svc_model_cnt = load('pipe_svc_cnt.joblib')
print("Report for the LinearSVC model with CountVectorizer")
plot_confusion_matrix(
    model=svc_model_cnt,
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)

svc_model_tfidf = load('pipe_svc_tfidf.joblib')
print("Report for the LinearSVC model with TfIdfVectorizer")
plot_confusion_matrix(
    model=svc_model_tfidf,
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)

In [None]:
# LogisticRegression: token counts vs. TF-IDF weighting, evaluated on the same split.
lg_model_cnt = load('pipe_lg_cnt.joblib')
print("Report for the LogisticRegression model with CountVectorizer")
plot_confusion_matrix(
    model=lg_model_cnt,
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)

lg_model_tfidf = load('pipe_lg_tfidf.joblib')
print("Report for the LogisticRegression model with TfIdfVectorizer")
plot_confusion_matrix(
    model=lg_model_tfidf,
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)

In [None]:
# MultinomialNB: token counts vs. TF-IDF weighting, evaluated on the same split.
nb_model_cnt = load('pipe_nb_cnt.joblib')
print("Report for the MultinomialNB model with CountVectorizer")
plot_confusion_matrix(
    model=nb_model_cnt,
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)

nb_model_tfidf = load('pipe_nb_tfidf.joblib')
print("Report for the MultinomialNB model with TfIdfVectorizer")
plot_confusion_matrix(
    model=nb_model_tfidf,
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)