# Data preprocessing

In [None]:
! pip install polyglot
! pip install pyicu     # polyglot requirement
! pip install Morfessor # polyglot requirement
! pip install pycld2    # polyglot requirement
! pip install morfeusz2

In [None]:
# Imports
import pandas as pd
from polyglot.detect import Detector
from polyglot.downloader import downloader
import re
import morfeusz2
from typing import List

downloader.download('LANG:pl')

In [None]:
def drop_title_and_url(df):
    """Removes the 'Title' and 'Url' columns from the dataset.

    The input frame is left untouched; a new frame is returned.

    Args:
        df (pd.DataFrame): News dataset with 'Title' and 'Url' columns.

    Returns:
        pd.DataFrame: Copy of the dataset without 'Title' and 'Url'.
    """
    unwanted = ['Title', 'Url']
    return df.drop(labels=unwanted, axis='columns')


def drop_empty(df):
    """Removes rows with missing values and renumbers the index.

    The frame is modified in place and the same object is returned.

    Args:
        df (pd.DataFrame): News dataset.

    Returns:
        df (pd.DataFrame): The same dataset, without rows that contain
            any missing value and with a fresh 0..n-1 index.
    """
    # A row is discarded as soon as one of its cells is missing.
    df.dropna(axis=0, how='any', inplace=True)
    # Discard the old labels so the index is contiguous again.
    df.reset_index(inplace=True, drop=True)
    return df


def drop_non_polish(df):
    """Detects and drops non-polish articles.

    An article is kept only when the detector reports 'Polish' as its
    language with a confidence of at least 70. The frame is modified in
    place and its index is renumbered afterwards.

    Args:
        df (pd.DataFrame): News dataset containing non-polish articles.

    Returns:
        df (pd.DataFrame): The same dataset containing only polish articles.
    """
    def _is_polish(text):
        # quiet=True is passed so the detector does not raise on
        # unreliable detections (see polyglot docs).
        language = Detector(text, quiet=True).language
        return language.name == 'Polish' and language.confidence >= 70

    # Collect the labels first and drop them in a single call: dropping
    # row by row while iterating the frame is quadratic and mutates the
    # object that is being iterated.
    rejected = [label for label, text in df['Text'].items()
                if not _is_polish(text)]
    df.drop(index=rejected, inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df


def drop_unidentified(df):
    """Filters out news whose verdict is 'unidentified'.

    Args:
        df (pd.DataFrame): News dataset that may contain 'unidentified' news.

    Returns:
        pd.DataFrame: Rows whose 'Verdict' differs from 'unidentified';
            original index labels are kept.
    """
    has_verdict = df['Verdict'].ne('unidentified')
    return df[has_verdict]


def drop_twitter(df):
    """Filters out twitter news polluted with website elements.

    Such news are recognised by the 'Nowy na Twitterze' phrase in their text.
    Rows whose text is missing are filtered out as well.

    Args:
        df (pd.DataFrame): News dataset containing news with some website elements.

    Returns:
        pd.DataFrame: News dataset without noisy twitter news.
    """
    # na=True counts a missing text as "noisy", so it is removed too.
    is_noisy = df['Text'].str.contains('Nowy na Twitterze', regex=False, na=True)
    return df[~is_noisy]


def change_verdict_dtype(df):
    """Changes data type of 'Verdict' column to boolean.

    The strings 'false' and 'true' become False and True. Any other value
    is converted by its truthiness. The column is replaced on the given
    frame, which is also returned.

    Args:
        df (pd.DataFrame): News dataset.

    Returns:
        df (pd.DataFrame): News dataset with boolean 'Verdict' column.
    """
    labels = {'false': False, 'true': True}
    # Assign the converted column back instead of calling an in-place method
    # on df['Verdict']: under pandas Copy-on-Write that would only update a
    # temporary, and the string 'false' would then be cast to True.
    converted = df['Verdict'].map(lambda value: labels.get(value, value))
    df['Verdict'] = converted.astype(bool)
    return df


def drop_short(df):
    """Filters out articles whose text is shorter than 30 characters.

    Args:
        df (pd.DataFrame): News dataset.

    Returns:
        pd.DataFrame: Rows whose 'Text' has a length of at least 30;
            original index labels are kept.
    """
    lengths = df['Text'].apply(len)
    return df[lengths.ge(30)]


In [None]:
# Load the fake news dataset and run it through the cleaning steps in order
df_fake = (
    pd.read_excel('fakehunter_dataset.xlsx')
    .pipe(drop_title_and_url)
    .pipe(drop_empty)
    .pipe(drop_unidentified)
    .pipe(drop_non_polish)
    .pipe(drop_twitter)
)

In [None]:
# Load the reliable news dataset, then drop unused columns and empty rows
df_real = (
    pd.read_excel('termedia_dataset.xlsx')
    .pipe(drop_title_and_url)
    .pipe(drop_empty)
)

In [None]:
# Stack fake and real news into one dataset with a fresh 0..n-1 index
df = pd.concat([df_fake, df_real], ignore_index=True)

# Turn the 'Verdict' labels into booleans
df = change_verdict_dtype(df)

display(df.head())

# Text preprocessing

In [None]:
def delete_escape_chars(text: str) -> str:
    """Swaps newline, tab and carriage return characters for a whitespace.

    The literal two-character sequence backslash + 'n' is replaced as well.

    Args:
        text (str): Input article.

    Returns:
        str: Article with every such sequence replaced by a single space.
    """
    for sequence in ('\\n', '\n', '\t', '\r'):
        text = text.replace(sequence, ' ')
    return text


def strip_non_polish(text: str) -> str:
    """Swaps every character outside the polish alphabet for a whitespace.

    Only the latin letters a-z/A-Z and the polish diacritics are kept, so
    digits, punctuation and whitespace each become a single space.

    Args:
        text (str): Input article.

    Returns:
        str: Article of the same length containing only letters and spaces.
    """
    return re.sub('[^a-zA-ZĄąĆćĘęŁłŃńÓóŚśŹźŻż]', ' ', text)


def replace_whitespace(text: str) -> str:
    """Replaces multiple whitespaces with single whitespace.

    Every run of one or more whitespace characters (spaces, tabs, newlines)
    is collapsed into exactly one space.

    Args:
        text (str): Input article.

    Returns:
        str: Processed article.
    """
    # Raw string: '\s' in a regular literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    return re.sub(r'\s+', ' ', text)


def lowercase_all(text: str) -> str:
    """Converts the whole article to lowercase.

    Args:
        text (str): Input article.

    Returns:
        str: Lowercased article.
    """
    return text.lower()


def tokenize(text: str) -> List[str]:
    """Splits an article into words on whitespace.

    Args:
        text (str): Input article.

    Returns:
        List[str]: Words of the article, in order; empty list for a blank
            article.
    """
    return text.split()


# Stopword sets already read from disk, keyed by file path.
_STOP_WORDS_CACHE = {}


def _load_stop_words(path):
    """Reads the stopword file at `path` once and returns it as a frozenset."""
    if path not in _STOP_WORDS_CACHE:
        # The context manager closes the file even if reading fails. The
        # encoding is fixed so polish diacritics do not depend on the platform
        # default (assumes the file is UTF-8 - TODO confirm).
        with open(path, encoding='utf-8') as stop_words_file:
            _STOP_WORDS_CACHE[path] = frozenset(
                line.strip() for line in stop_words_file if line.strip()
            )
    return _STOP_WORDS_CACHE[path]


def delete_stop_words(text: List[str]) -> List[str]:
    """Removes stopwords.

    Stopwords are read from 'stopwords.txt' (one word per line) in the
    current working directory. The file is read only on the first call.

    Args:
        text (List[str]): Tokenized article possibly containing stopwords.

    Returns:
        List[str]: Tokens of the article that are not stopwords, in order.

    Raises:
        OSError: If 'stopwords.txt' cannot be opened.
    """
    stop_words = _load_stop_words('stopwords.txt')
    return [word for word in text if word not in stop_words]


def lemmatize(df):
    """Performs lemmatization of articles in dataset.

    Every entry of the 'Text' column is expected to be an iterable of words.
    Each word is replaced by the lemma of its first Morfeusz interpretation
    (anything after the first ':' is cut off, then lowercased), and the
    lemmas are joined into one space-separated string. The 'Text' column is
    replaced on the given frame, which is also returned.

    Args:
        df (pd.DataFrame): News dataset with tokenized 'Text' column.

    Returns:
        df (pd.DataFrame): Lemmatized news dataset.
    """
    morf = morfeusz2.Morfeusz()

    def _lemma(word):
        # Each analysis is a (start, end, interpretation) triple; the lemma
        # sits at position 1 of the interpretation.
        _, _, interpretation = morf.analyse(word)[0]
        return interpretation[1].split(':', 1)[0].lower()

    # Build the new column in one pass instead of writing into the frame via
    # df.loc while iterating it with iterrows(). This is also safe when index
    # labels repeat.
    df['Text'] = df['Text'].apply(
        lambda words: ' '.join(_lemma(word) for word in words)
    )
    return df


In [None]:
# Clean the raw text, then tokenize it and remove stopwords
df['Text'] = (
    df['Text']
    .apply(delete_escape_chars)
    .apply(strip_non_polish)
    .apply(replace_whitespace)
    .apply(lowercase_all)
    .apply(tokenize)
    .apply(delete_stop_words)
)

df = lemmatize(df)

In [None]:
display(df.head())

# Drop rows with missing values and articles that are too short
df = drop_empty(df)
df = drop_short(df)

# Display number of fake and real news
real_count = len(df[df['Verdict'] == True])
fake_count = len(df[df['Verdict'] == False])
print(f'Number of fake news: \t{fake_count}')
print(f'Number of real news: \t{real_count}')
print(f'Total number of news: \t{len(df)}')

# Export complete dataset
# The 'encoding' argument of to_excel was removed in pandas 2.0 and raises a
# TypeError there, so it is not passed.
df.to_excel('complete_dataset.xlsx', index=False)

Unnamed: 0,Verdict,Text
0,False,nieoczywisty rzeczywistość fsychologia pozytyw...
1,False,bęcwalstwo polski nauka odkryty rpa nowy wirus...
2,False,magmag zaczęlo wyłączać serce sportowiec cały ...
3,False,omikron atakować kłamstwo kowidowych jednyn sp...
4,False,głowny portal zdrowie jeszu wiara yeshu ang fi...


Number of fake news: 	753
Number of real news: 	1487
Total number of news: 	2240
