This notebook uses the combined "GermanFakeNC" and "Fake News Dataset German" corpora to predict whether a given article is fake news or not. The label is "True" for fake news and "False" for real news.

There are 9 "results" in total:
Three classifiers are used (Complement Naive Bayes, Logistic Regression and Random Forest), each one using three different representations of the input: 

A pure BOW representation (binary CountVectorizer)

Word frequencies (CountVectorizer)

tf-idf representations (TfidfVectorizer)

In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from pathlib import Path
from pprint import pformat
from typing import Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import sparse as sp
from spacy.lang.de.stop_words import STOP_WORDS
from typing_extensions import Literal

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.metrics import classification_report,accuracy_score,f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import ComplementNB #good for imbalanced data
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Seed handed to the train/test splits and to the classifiers that take a
# random_state, so that repeated runs use the same split and initialisation.
RAND = 42 #random state
# Forwarded as n_jobs to the classifiers that accept that argument.
N_CORES = 3 #number of CPU cores to use

In [3]:
# Load the combined corpus; later cells read its "Body", "Title" and "Fake" columns.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# if it comes from a trusted source.
raw_data = pd.read_pickle(r"datasets/combined.pkl")

In [4]:
def save_cmPlot(confusion_matrix,colour:Literal["Blues","Greens","Reds"],title:str,filename:str):
    """Plot a two-class confusion matrix and save it as an image under ``plots/``.

    Parameters
    ----------
    confusion_matrix
        Confusion matrix whose rows/columns are ordered (Fake, Legitimate),
        matching the hard-coded display labels below.
    colour
        Name of the matplotlib colormap used to shade the cells.
    title
        Title written above the plot.
    filename
        File name (including extension) of the image inside ``plots/``.
    """
    display = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix,
                                     display_labels=["Fake","Legitimate"])
    # The colormap name is passed as a string: ConfusionMatrixDisplay.plot
    # accepts it directly, and plt.cm.get_cmap is deprecated/removed in newer
    # matplotlib releases.
    display.plot(include_values=True,
                 cmap=colour,
                 ax=None, xticks_rotation="horizontal")
    display.ax_.set_title(title)

    # Create the output directory on first use instead of failing in savefig.
    out_path = Path("plots") / filename
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Save and close exactly the figure created above rather than whatever
    # pyplot currently considers the "current" figure.
    display.figure_.savefig(str(out_path),dpi=350)
    plt.close(display.figure_)

In [5]:
# Hold out about one third of the article bodies (and their Fake labels) for
# testing; the fixed seed keeps the split identical between runs.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    raw_data["Body"], raw_data["Fake"],
    random_state=RAND,
    test_size=0.333,
)

In [6]:
# For the "default" scenario set vec_args = dict():
# spaCy ships STOP_WORDS as a set, but scikit-learn documents stop_words as
# 'english', a list or None (newer releases reject other types), so a sorted
# list is passed. The resulting stop-word vocabulary is unchanged.
vec_args={"stop_words":sorted(STOP_WORDS),"encoding":"utf-8"}
# One vectorizer per input representation compared in this notebook.
vectorizers = [
        CountVectorizer(**vec_args,binary=True),    #pure BOW
        CountVectorizer(**vec_args),                #word frequencies
        TfidfVectorizer(**vec_args)                 #tf-idf weights
    ]
# For the "default" scenario comment out the next two lines, but uncomment the third one:
weights = {True:0.921,False:0.079} #inverse proportion of Fake News to Real News
cls_args = {"class_weight":weights,"random_state":RAND,"n_jobs":N_CORES}
# cls_args = {"random_state":RAND,"n_jobs":N_CORES}
# ComplementNB is built with its defaults; the other two receive cls_args.
classifiers = [
        ComplementNB(),
        # NOTE(review): multi_class is deprecated in recent scikit-learn
        # releases - confirm whether it is still needed for n_jobs here.
        LogisticRegression(**cls_args,multi_class = "ovr"), #ovr used just so I can use
                                                            #more than one core
        RandomForestClassifier(**cls_args)
    ]

In [7]:
# Averaging mode handed to the f1_score calls in this notebook.
# NOTE(review): for a single-label task, micro-averaged F1 coincides with plain
# accuracy; with strongly imbalanced classes "binary" or "macro" may be more
# informative - confirm which metric is intended.
AVG_METHOD = "micro"

In [8]:
# Result tables, all keyed by the (classifier, vectorizer) pair of an experiment.
f1s  = {} # (classifier,vectorizer) -> f1
accs = {} # ((classifier,vectorizer),"class") -> per-class accuracy
cms  = {} # (classifier,vectorizer) -> confusion matrix

for vectorizer in vectorizers:
    # Fit the vocabulary on the training bodies only, then reuse it for the test bodies.
    Xtrain_vectors = vectorizer.fit_transform(X_train_raw)
    Xtest_vectors  = vectorizer.transform(X_test_raw)

    for classifier in classifiers:
        experiment = (classifier,vectorizer)

        # Train on the current representation and predict the held-out bodies.
        y_predictions = classifier.fit(Xtrain_vectors,y_train).predict(Xtest_vectors)

        f1 = f1_score(y_test,y_predictions,average=AVG_METHOD)
        f1s[experiment] = f1

        # Rows/columns ordered (True, False), i.e. (Fake, Legitimate).
        cm = confusion_matrix(y_test,y_predictions,labels=[True,False])
        cms[experiment] = cm

        # Correct predictions per true class divided by that class's row total.
        accuracies = cm.diagonal()/cm.sum(axis=1)
        accs[experiment,"Fake"] = accuracies[0] #True
        accs[experiment,"Legitimate"] = accuracies[1] #False

In [9]:
# Keep the three experiments with the highest F1, best first.
ranked_by_f1 = sorted(f1s.items(),key=lambda entry: entry[1],reverse=True)
top3 = dict(ranked_by_f1[:3])
top3

{(LogisticRegression(class_weight={False: 0.079, True: 0.921}, multi_class='ovr',
                     n_jobs=3, random_state=42), CountVectorizer(binary=True,
                  stop_words={'a', 'ab', 'aber', 'ach', 'acht', 'achte', 'achten',
                              'achter', 'achtes', 'ag', 'alle', 'allein', 'allem',
                              'allen', 'aller', 'allerdings', 'alles',
                              'allgemeinen', 'als', 'also', 'am', 'an', 'andere',
                              'anderem', 'anderen', 'andern', 'anders', 'auch',
                              'auf', 'aus', ...})): 0.9900205319199733,
 (LogisticRegression(class_weight={False: 0.079, True: 0.921}, multi_class='ovr',
                     n_jobs=3, random_state=42),
  CountVectorizer(stop_words={'a', 'ab', 'aber', 'ach', 'acht', 'achte', 'achten',
                              'achter', 'achtes', 'ag', 'alle', 'allein', 'allem',
                              'allen', 'aller', 'allerdings', 'alles',
 

In [10]:
# Save one red confusion-matrix plot for each of the three best experiments.
for experiment,f1 in top3.items():
    classifier,vectorizer = experiment
    clsifiername = type(classifier).__name__
    vecrizername = type(vectorizer).__name__
    # Marks the binary (pure BOW) variant in both the title and the file name.
    binary_suffix = "(binary)" if vectorizer.binary else ""
    title = " with ".join((clsifiername, vecrizername + binary_suffix))
    save_cmPlot(
        cms[experiment],
        colour="Reds",
        title=title,
        filename="_".join((clsifiername, vecrizername + binary_suffix)) + ".png",
    )

In the following, the classification will be performed on the titles. Only the best performing experiment will be used.

In [11]:
# top3 maps (classifier, vectorizer) -> F1; unpack the pair with the highest score.
ranked_top3 = sorted(top3.items(),key=lambda entry: entry[1],reverse=True)
(best_clsifier,best_vecrizer),_ = ranked_top3[0]
best_vecrizer

CountVectorizer(binary=True,
                stop_words={'a', 'ab', 'aber', 'ach', 'acht', 'achte', 'achten',
                            'achter', 'achtes', 'ag', 'alle', 'allein', 'allem',
                            'allen', 'aller', 'allerdings', 'alles',
                            'allgemeinen', 'als', 'also', 'am', 'an', 'andere',
                            'anderem', 'anderen', 'andern', 'anders', 'auch',
                            'auf', 'aus', ...})

In [12]:
# Same split settings as for the bodies, applied to the article titles.
# Label column "Fake": True means fake news, False means it is not.
titles_X_train, titles_X_test, titles_y_train, titles_y_test = train_test_split(
    raw_data["Title"], raw_data["Fake"],
    random_state=RAND,
    test_size=0.333,
)

In [13]:
#calling these methods again will overwrite all previous progress (exactly what is needed):
titles_Xtrain_vecs = best_vecrizer.fit_transform(titles_X_train)
titles_Xtest_vecs  = best_vecrizer.transform(titles_X_test)
#training
best_clsifier.fit(titles_Xtrain_vecs,titles_y_train)
#testing
titles_y_predictions = best_clsifier.predict(titles_Xtest_vecs)

In [14]:
# Score the title experiment on its own held-out labels and predictions.
# (y_test / y_predictions belong to the last article-body experiment and must
# not be used here.)
title_f1 = f1_score(titles_y_test,titles_y_predictions,average=AVG_METHOD)
# Rows/columns ordered (True, False), i.e. (Fake, Legitimate).
title_cm = confusion_matrix(titles_y_test,titles_y_predictions,labels=[True,False])
# Correct predictions per true class divided by that class's row total.
title_accuracies = title_cm.diagonal()/title_cm.sum(axis=1)

In [17]:
# Save the green confusion-matrix plot of the title experiment.
clsifiername = best_clsifier.__class__.__name__
vecrizername = best_vecrizer.__class__.__name__
# Derive the "(binary)" tag from the vectorizer actually used for the titles,
# not from the loop variable left over from an earlier cell, so that the plot
# title and the file name always agree.
title_binary_tag = "(binary)" if best_vecrizer.binary else ""
title =     clsifiername + \
            " with " + \
            vecrizername + \
            title_binary_tag
save_cmPlot(title_cm,
            colour="Greens",
            title=title,
            filename="TITLE"+clsifiername+"_"+vecrizername+\
                title_binary_tag+".png"
            )

In [18]:
# Summary of the title experiment that goes into the report.
titledict = {
    "Title Classifier": best_clsifier,
    "Title Vectorizer": best_vecrizer,
    "Title F1":         title_f1,
    "Acc Fake":         title_accuracies[0],
    "Acc Legitimate":   title_accuracies[1]
}
# Report layout: body metrics first, a separator line, then the title metrics.
# pformat is imported in the notebook's import cell.
report_sections = [
    "Article Bodies:",
    "F1s:",
    pformat(f1s),
    "Accuracies:",
    pformat(accs),
    "-------------",
    "Titles:",
    pformat(titledict),
]
# Overwrites any existing report; one section per line.
with open(r"detection_metrics.txt","w",encoding="utf-8") as f:
    print(*report_sections,sep="\n",file=f)