In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from typing import Tuple,Union,List
from os.path import splitext
from itertools import chain

import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
# Global default for the padding of major x-axis ticks.
# Note: save_plot() passes its own pad value to plt.tick_params for the figures it draws.
plt.rcParams['xtick.major.pad']='8'

In [3]:
import spacy 
# Shared spaCy pipeline used by tokenize(), n_grams() and sentencize().
# The "ner" component is switched off at load time; none of the visible cells read entities.
nlp = spacy.load("de_core_news_sm",disable=["ner"])

In [4]:
#Options for spacy nlp.pipe (shared by tokenize, n_grams and sentencize):
# N_CORES is passed as n_process, BATCH_SIZE as batch_size.
N_CORES = 3 
BATCH_SIZE = 300

In [5]:
def save_to_file(df : Union[pd.DataFrame,pd.Series], filepath : str):
    '''
    Saves a DataFrame or Series to ``filepath``, either as UTF-8 encoded CSV
    or as a pickle. The format is picked from the file extension
    (case-insensitive): '.csv' or '.pkl'.

    Parameters:
    - df : the DataFrame or Series to store
    - filepath : destination path; must end in '.csv' or '.pkl'

    Raises:
    - ValueError if the extension is neither '.csv' nor '.pkl'.'''
    _, ext = splitext(filepath)
    ext = ext.lower()
    # The path is handed to pandas unchanged. Round-tripping it through
    # 'unicode-escape' would turn every non-ASCII character (e.g. "ä") into a
    # literal "\xe4" sequence and write to a differently named file.
    if ext == ".csv":
        df.to_csv(filepath,header=True,encoding="utf-8")
    elif ext == ".pkl":
        df.to_pickle(filepath)
    else:
        # ValueError is a subclass of Exception, so existing handlers still match.
        raise ValueError("File extension must be either 'csv' or 'pkl'")

In [6]:
def prepare_FNDatasetGer() -> pd.DataFrame:
    '''
    Reads "datasets/FNDatasetGer.csv" and reduces it to the columns used later.
    The metadata columns are dropped, dtypes are converted (with "Fake" cast
    to bool) and the "Titel" column is renamed to "Title".'''
    unused_columns = ["id","url","Kategorie","Datum","Quelle","Art"]
    return (
        pd.read_csv(r"datasets/FNDatasetGer.csv",encoding="utf-8")
        .drop(columns=unused_columns)
        .convert_dtypes()
        .astype({"Fake":bool})
        .rename(columns={"Titel":"Title"})
    )

def prepare_germanFakeNC() -> pd.DataFrame:
    '''
    Reads "datasets/texts_GermanFakeNC.json", renames "title"/"text" to
    "Title"/"Body" and marks every row with Fake = True.'''
    raw = pd.read_json(r"datasets/texts_GermanFakeNC.json",encoding="utf-8")
    renamed = raw.rename(columns={"title":"Title","text":"Body"})
    return renamed.assign(Fake=True)

In [7]:
def prepare_data() -> pd.DataFrame:
    '''
    Loads both data sets, stacks germanFakeNC below FNDatasetGerman and removes
    rows whose "Body" already occurred. The combined frame is pickled to
    "datasets/combined.pkl" (kept for prediction later on) and returned.'''
    frames = [prepare_FNDatasetGer(), prepare_germanFakeNC()]
    combined_data = pd.concat(frames).drop_duplicates(subset="Body")
    save_to_file(combined_data,"datasets/combined.pkl")
    return combined_data

In [8]:
def seperate(data : pd.DataFrame) -> Tuple[pd.DataFrame,pd.DataFrame]:
    '''
    Separates the combined dataset into Real News and Fake News and drops the
    "Fake" column from both parts. The input frame is not modified.

    Parameters:
    - data : frame with a boolean "Fake" column

    Returns:
    - (rn_data, fn_data): rows with Fake == False and rows with Fake == True.
      Each frame carries a ``name`` attribute ("RealNews" / "FakeNews").
      If one class does not occur, its frame is empty instead of raising KeyError.'''
    fake_flag = data["Fake"]
    # Boolean selection followed by a non-inplace drop yields new frames, so
    # nothing is written into a slice of `data` (no SettingWithCopyWarning).
    fn_data = data.loc[fake_flag == True].drop(columns="Fake")   # noqa: E712
    rn_data = data.loc[fake_flag == False].drop(columns="Fake")  # noqa: E712
    # Plain instance attributes are not carried over by pandas operations,
    # so they are attached last, on the frames that are actually returned.
    fn_data.name = "FakeNews"
    rn_data.name = "RealNews"
    return rn_data,fn_data

In [9]:
def tokenize(texts : pd.Series) -> pd.DataFrame:
    '''
    Runs every text through the spaCy pipeline and lists all word-tokens,
    one row per token. The index "i" is the position of the text within
    the input, so all tokens of one text share the same index value.

    Columns:
    - Token (token text; punctuation tokens are cut to their first 3 characters)
    - Lemma
    - POS (universal dependencies schema)
    - Tag (finegrained tag schema)
    - is_stopword (spaCy's is_stop flag)
    - is_punctuation (spaCy's is_punct flag)
    - is_space (spaCy's is_space flag, e.g. for line breaks)'''
    column_names = ["i","Token","Lemma","POS","Tag","is_stopword","is_punctuation","is_space"]
    docs = nlp.pipe(texts,batch_size=BATCH_SIZE,n_process=N_CORES)
    rows = []
    for i,doc in enumerate(docs):
        for token in doc:
            shown_text = token.text[:3] if token.is_punct else token.text
            rows.append((
                i,
                shown_text,
                token.lemma_,
                token.pos_,
                token.tag_,
                token.is_stop,
                token.is_punct,
                token.is_space,
            ))
    # set_index is used without inplace=True so that the new frame is returned.
    return pd.DataFrame(rows,columns=column_names).set_index("i")

In [10]:
def n_grams(texts : pd.Series) -> List[pd.Series]:
    '''
    Takes a series of texts and returns bigrams and trigrams, for tokens and lemmas each.
    Instead of a data frame, the four columns are returned as a list of series:
    [Token Bigrams, Lemma Bigrams, Token Trigrams, Lemma Trigrams].
    Each n-gram has its own row. N-grams never span two texts, and an n-gram
    is skipped if any of its tokens is white-space, a stop word or punctuation.

    The texts are sent through the spaCy pipeline only once; bigrams and
    trigrams are both collected from the same parsed document.'''
    def generate_ngrams(doc, n :int):
        # Helper. Yields one (token n-gram, lemma n-gram) pair of strings per
        # window of n consecutive tokens, e.g. for n=2:
        # ("tok1 tok2", "lem1 lem2"), ("tok2 tok3", "lem2 lem3"), ...
        # zip stops at the shortest slice, so only complete windows are produced.
        for ngram in zip(*[doc[i:] for i in range(n)]):
            if not any(token.is_space or token.is_stop or token.is_punct for token in ngram):
                yield (" ".join(token.text   for token in ngram),
                       " ".join(token.lemma_ for token in ngram))

    bigrams = []
    trigrams = []
    # The parser is not needed here (tagger still enabled for accurate lemmatization).
    for doc in nlp.pipe(texts,batch_size=BATCH_SIZE,n_process=N_CORES,disable=["parser"]):
        bigrams.extend(generate_ngrams(doc,2))
        trigrams.extend(generate_ngrams(doc,3))

    bigram_df = pd.DataFrame(
        bigrams,
        columns=["Token Bigrams","Lemma Bigrams"]
    )
    trigram_df = pd.DataFrame(
        trigrams,
        columns=["Token Trigrams","Lemma Trigrams"]
    )
    return [bigram_df["Token Bigrams"],
            bigram_df["Lemma Bigrams"],
            trigram_df["Token Trigrams"],
            trigram_df["Lemma Trigrams"]]

In [11]:
def sentencize(texts : pd.Series) -> pd.Series:
    '''
    Splits every text into sentences with the spaCy pipeline (tagger disabled).
    Returns a new series with one row per text, in input order, where each row
    holds the list of that text's sentences. The result has a fresh default index.
    Note that unlike n_grams() and tokenize(), the sentences don't get a row each.'''
    docs = nlp.pipe(texts,batch_size=BATCH_SIZE,n_process=N_CORES,disable=["tagger"])
    sentences_per_text = [list(doc.sents) for doc in docs]
    return pd.Series(sentences_per_text)

In [12]:
def get_ColumnStats(col : pd.Series, new_name:Union[str,None]=None) -> pd.Series:
    '''
    Takes a data frame column / series and returns the descriptive statistics
    (count, mean, std, min, quartiles, max) of the length of each row.

    Parameters:
    - col : series whose elements have a length (strings, lists, ...);
            str.len works for anything the len function is defined for
    - new_name : optional name for the returned series; None or "" keeps
                 the name produced by describe()

    Returns:
    - the statistics as a series, renamed if new_name is given'''
    length_stats = col.str.len().describe()
    if not new_name:
        return length_stats
    # rename() without inplace: Series.rename is documented to return None
    # when inplace=True, so its result must not be used as the return value.
    return length_stats.rename(new_name)

In [13]:
def create_freqdists(series_list:List[pd.Series],n:int,normalize:bool)->List[pd.Series]:
    '''
    Takes a list of series and returns, for each one, the sorted counts of its
    n most common values.

    Parameters:
    - series_list : the series to count
    - n : number of most common values to keep
    - normalize : if True, relative frequencies instead of absolute counts

    Returns:
    - one frequency series per input series, in the same order. Each result
      keeps the name of the series it was computed from.'''
    freqdists = []
    for series in series_list:
        top_counts = series.value_counts(normalize=normalize)[:n].copy()
        # pandas >= 2.0 names the value_counts result "count"/"proportion" and
        # moves the original name to the index. Callers build file names and
        # plot settings from `.name`, so the pre-2.0 layout is restored here.
        top_counts.name = series.name
        top_counts.index.name = None
        freqdists.append(top_counts)
    return freqdists

In [14]:
def get_tokendata_freqdists(df :pd.DataFrame,n:int, normalize:bool=True)-> List[pd.Series]:
    '''
    Takes a tokenized data frame (columns "Token", "Lemma", "POS", "Tag",
    "is_stopword", "is_punctuation", "is_space") and returns the frequencies of
    word-tokens, lemmas, POS-tags, finegrained tags and punctuation-tokens.

    - Tokens and lemmas are counted over rows that are neither stop word,
      punctuation nor white-space.
    - POS-tags and finegrained tags are counted over rows that are neither
      punctuation nor white-space.
    - Punctuation tokens are counted over punctuation rows that are not white-space.

    The frequency distributions are returned as a list of series, each limited
    to the n most common values. A subset without any rows yields an empty
    distribution instead of raising KeyError.'''
    # astype(bool) keeps the masks usable with "~" even if the flag columns
    # arrive with object dtype (e.g. for an empty frame).
    is_stop  = df["is_stopword"].astype(bool)
    is_punct = df["is_punctuation"].astype(bool)
    is_space = df["is_space"].astype(bool)

    not_PunctStopSpace = df[~is_stop & ~is_punct & ~is_space]
    notPuncts          = df[~is_punct & ~is_space]
    puncts             = df[is_punct & ~is_space]

    return create_freqdists([
        not_PunctStopSpace["Token"].rename("Token Frequencies"),
        not_PunctStopSpace["Lemma"].rename("Lemma Frequencies"),
        notPuncts["POS"]           .rename("POS-Tag Frequencies"),
        notPuncts["Tag"]           .rename("Finegrained Tag Frequencies"),
        puncts["Token"]            .rename("Punct-Token Frequencies"),
    ],n=n,normalize=normalize)

In [15]:
def token_analysis(df_col :pd.Series,df_name:str,n:int,normalize:bool):
    '''
    Takes a data frame column / series of texts and analyzes each row's tokens.

    Written files (names are built from df_name, df_col.name and the name of
    the respective distribution):
    - "stats/..._Tokens_per_Text.csv": statistics of the number of tokens per text
    - one plot (via save_plot) and one "freqdists/....csv" per frequency
      distribution of tokens, lemmas, POS-tags, finegrained tags and punctuation
    - "stats/..._Tokenlength.csv": statistics of the number of letters per token

    Parameters:
    - df_col : the texts to analyze
    - df_name : name of the data set, used in titles and file names
    - n : number of most common values kept per distribution
    - normalize : relative instead of absolute frequencies'''
    tokendata = tokenize(df_col)

    #Number of tokens per article:
    # tokenize() gives all tokens of one text the same index value, so the
    # group sizes on the index are the token counts per text.
    tokens_per_text_stats = (
        tokendata.groupby(level=0,sort=False)
        .size()
        .describe()
        .rename("Tokens per Text")
    )
    save_to_file(tokens_per_text_stats,"stats/%s_%s_Tokens_per_Text.csv"%(df_name,df_col.name))

    #Calculate distributions of tokens, lemmas, POS-tags, fine tags, punctuation:
    for freqdist in get_tokendata_freqdists(tokendata,n=n,normalize=normalize):
        unique_name=(df_name,df_col.name,freqdist.name)
        save_plot(
            freqdist,
            title   = "%s %s %s"    %unique_name,
            filepath= "%s_%s_%s.png"%unique_name
        )
        save_to_file(freqdist,"freqdists/%s_%s_%s.csv"%unique_name)

    #Calculate number of letters per token:
    tokenlen_stats = get_ColumnStats(tokendata["Token"],new_name="Letters Per Token")
    save_to_file(tokenlen_stats,"stats/%s_%s_Tokenlength.csv"%(df_name,df_col.name))

In [16]:
def ngram_analysis(df_col :pd.Series,df_name:str,n:int,normalize:bool):
    '''
    Computes the bigram and trigram frequency distributions (tokens and lemmas)
    for a data frame column / series of texts. Every distribution is saved
    as a plot and as "freqdists/<df_name>_<column name>_<distribution name>.csv".'''
    for freqdist in create_freqdists(n_grams(df_col),n=n,normalize=normalize):
        name_parts = (df_name,df_col.name,freqdist.name)
        plot_title = "%s %s %s" % name_parts
        plot_file  = "%s_%s_%s.png" % name_parts
        save_plot(freqdist,title=plot_title,filepath=plot_file)
        csv_file = "freqdists/%s_%s_%s.csv" % name_parts
        save_to_file(freqdist,csv_file)

In [17]:
def sent_analysis(df_col :pd.Series,df_name:str):
    '''
    Sentence-level statistics for a data frame column / series of texts:
    the number of sentences per article/title and the length of each sentence
    (tokens per sentence). Both sets of descriptive statistics are saved in
    .csv format below "stats/", named after df_name and the column name.'''
    name_parts = (df_name,df_col.name)
    sentdata = sentencize(df_col)

    #Number of sentences per article/title (each row holds a list of sentences):
    sentence_counts = sentdata.str.len()
    sents_per_article_stats = sentence_counts.describe().rename("Sents_per_Article")
    save_to_file(sents_per_article_stats,"stats/%s_%s_Sents_per_Article.csv"%name_parts)

    #Length of each sentence, after giving every sentence its own row:
    single_sentences = sentdata.explode()
    sentlen_stats = get_ColumnStats(single_sentences,new_name="Tokens_per_Sentence")
    save_to_file(sentlen_stats,"stats/%s_%s_Tokens_per_Sentence.csv"%name_parts)

In [18]:
def save_plot(df_col :pd.Series, title:str, filepath:str):
    '''
    Draws a frequency distribution as a line plot and saves it to "plots/" + filepath.
    The index of df_col provides the x values and tick labels, its values the
    frequencies. The tick label rotation depends on df_col.name:
    0 if it contains "Punct", 90 if it contains "Token" or "Lemma", else 80.
    The figure is closed after saving.'''
    plt.ylabel("Frequency")
    labels = df_col.index.to_list()
    frequencies = df_col.to_list()
    plt.plot(labels,frequencies)
    plt.title(title)

    series_name = df_col.name
    if "Punct" in series_name:
        angle = 0
    elif "Token" in series_name or "Lemma" in series_name:
        angle = 90
    else:
        angle = 80

    plt.xticks(labels,rotation=angle)
    plt.tick_params(axis='x', which='major', labelsize=7.3, pad=4.1)
    plt.tight_layout()
    plt.margins(0.03)
    target = r"plots/"+filepath
    plt.savefig(target,dpi=350,bbox_inches = "tight")
    plt.close()

The main program follows:

In [19]:
# Build the combined data set and split it; seperate() returns (real news, fake news).
realN_data, fakeN_data = seperate(prepare_data())
# Display the real-news frame as the cell output.
realN_data

Unnamed: 0,Title,Body
2040,Erdogan bleibt hart und treibt Offensive trotz...,Die Türkei will die Offensive gegen die Kurden...
2041,Frankreich und Deutschland wollen EU neue Impu...,Die Parlamente von Deutschland und Frankreich ...
2042,Puigdemont als Kataloniens Regierungschef zur ...,Die katalanischen Separatisten wollen den ins ...
2043,UBS lockt Aktionäre mit höheren Dividenden und...,Die UBS will ihre Aktionäre angesichts trübere...
2044,Ein Deutscher unter Toten nach Anschlag auf Ho...,Bei dem Anschlag auf das Hotel Intercontinenta...
...,...,...
63790,Chrissy Teigen hat Angst vor Wochenbettdepression,Das schwangere Model fühlt sich diesmal aber f...
63863,Lehrer entging durch Hochzeit mit Schülerin Ve...,55-Jähriger muss nach Sex mit damals 15-Jährig...
63864,Warum die Taiwaner Toilettenpapier bunkern,Aus Angst vor Preiserhöhungen bei Klopapier ka...
63866,\r\nDie neue Premium-Klasse von Samsung\r\n ...,Am Vorabend der Eröffnung des Mobile World Con...


In [20]:
# Check the name attribute that seperate() attached to the fake-news frame.
fakeN_data.name

'FakeNews'

The next cell is effectively the main routine. Because the data set is large, it needs a lot of computation and time, even with the optimizations above.

In [None]:
# Settings shared by all analyses: relative frequencies, 50 most common values.
normalize = True
n_most_common = 50
# Run all three analyses for every column of both data sets.
for data in (fakeN_data, realN_data):
    for colname in data.columns:
        text_column = data[colname]
        token_analysis(text_column, data.name, n=n_most_common, normalize=normalize)
        ngram_analysis(text_column, data.name, n=n_most_common, normalize=normalize)
        sent_analysis(text_column, data.name)