# Data preprocessing and lemmatisation

In [None]:
# Install polyglot together with the extra packages this notebook installs
# alongside it (presumably polyglot's native/optional dependencies — TODO confirm).
! pip install polyglot
! pip install pyicu
! pip install Morfessor       
! pip install pycld2   
# Download polyglot's resources for Polish (LANG:pl).
! polyglot download LANG:pl

Collecting polyglot
  Downloading polyglot-16.7.4.tar.gz (126 kB)
[?25l[K     |██▋                             | 10 kB 15.8 MB/s eta 0:00:01[K     |█████▏                          | 20 kB 16.7 MB/s eta 0:00:01[K     |███████▉                        | 30 kB 18.7 MB/s eta 0:00:01[K     |██████████▍                     | 40 kB 14.2 MB/s eta 0:00:01[K     |█████████████                   | 51 kB 12.1 MB/s eta 0:00:01[K     |███████████████▋                | 61 kB 13.6 MB/s eta 0:00:01[K     |██████████████████▏             | 71 kB 10.7 MB/s eta 0:00:01[K     |████████████████████▊           | 81 kB 11.4 MB/s eta 0:00:01[K     |███████████████████████▍        | 92 kB 12.2 MB/s eta 0:00:01[K     |██████████████████████████      | 102 kB 11.8 MB/s eta 0:00:01[K     |████████████████████████████▌   | 112 kB 11.8 MB/s eta 0:00:01[K     |███████████████████████████████▏| 122 kB 11.8 MB/s eta 0:00:01[K     |████████████████████████████████| 126 kB 11.8 MB/s 
[?25hBuild

In [None]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from polyglot.text import Text
from polyglot.detect import Detector
import string
# Load the Drive helper and mount
from google.colab import drive

# Mount Google Drive at /content/drive; this will prompt for authorization.
# NOTE(review): the dataset paths used later are relative ('drive/MyDrive/...'),
# so they presumably rely on the working directory being /content — TODO confirm.
drive.mount('/content/drive') 

Mounted at /content/drive


In [None]:
def drop_title_and_url(df):
    """Return a new frame without the 'Title' and 'Url' columns.

    The input frame is not modified.  pandas raises ``KeyError`` when either
    column is absent.
    """
    unwanted_columns = ['Title', 'Url']
    return df.drop(labels=unwanted_columns, axis=1)

def drop_empty(df):
    """Return a new frame with every row that contains a missing value removed.

    The result gets a fresh 0..n-1 index.  The caller's frame is left
    untouched (no ``inplace=True``), so the function can be re-run on the same
    input and is safe to call on a slice of another frame.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that have no NaN/None in any column.
    """
    return df.dropna().reset_index(drop=True)

def drop_non_polish(df, min_confidence=70):
    """Return a new frame keeping only rows whose 'Text' is detected as Polish.

    Each text is passed to ``Detector(text, quiet=True)``; a row is kept when
    the detected language name is 'Polish' and the reported confidence is at
    least ``min_confidence``.

    The keep/drop decision is collected into one boolean mask and applied
    once, instead of dropping rows one by one from the frame being iterated.
    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Text' column of strings.
    min_confidence : int or float, default 70
        Lowest detector confidence accepted for a Polish verdict.

    Returns
    -------
    pandas.DataFrame
        The kept rows with a fresh 0..n-1 index.
    """
    def is_polish(text):
        language = Detector(text, quiet=True).language
        return language.name == 'Polish' and language.confidence >= min_confidence

    # A positional NumPy mask avoids any index alignment (and works for an
    # empty frame, where a plain empty list would be read as "no labels").
    keep = pd.Series([is_polish(text) for text in df['Text']], dtype=bool).to_numpy()
    return df.loc[keep].reset_index(drop=True)

def drop_unidentified(df):
    """Keep only the rows whose 'Verdict' differs from 'unidentified'.

    The original index labels are preserved (no reset).
    """
    has_verdict = df['Verdict'].ne('unidentified')
    return df.loc[has_verdict]

def drop_twitter(df):
    """Keep only the rows whose 'Text' does not contain 'Nowy na Twitterze'.

    The original index labels are preserved (no reset).
    """
    mentions_twitter = df['Text'].str.contains('Nowy na Twitterze')
    # Compared against False explicitly (rather than negated with ~) so that
    # entries for which `contains` yields a missing value are handled exactly
    # as an `== False` comparison handles them.
    return df.loc[mentions_twitter.eq(False)]

In [None]:
# Load the fake news dataset and run it through the cleaning steps in order
df_fake = (
    pd.read_excel('drive/MyDrive/Fake News Detection/data/fakehunter_dataset.xlsx')
    .pipe(drop_title_and_url)   # drop irrelevant columns
    .pipe(drop_empty)           # drop rows containing NaNs
    .pipe(drop_unidentified)    # drop news with "unidentified" verdict
    .pipe(drop_non_polish)      # drop non-Polish news
    .pipe(drop_twitter)         # drop noisy Twitter news
)

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

In [None]:
# Load the reliable news dataset and run it through the cleaning steps in order
df_real = (
    pd.read_excel('drive/MyDrive/Fake News Detection/data/termedia_dataset.xlsx')
    .pipe(drop_title_and_url)   # drop irrelevant columns
    .pipe(drop_empty)           # drop rows containing NaNs
)

In [None]:
# Create the complete dataset (fake and real news together) with a fresh 0..n-1 index
df = pd.concat([df_fake, df_real], ignore_index=True)

# Transform the "Verdict" column to boolean.
# The mapping is assigned back to the column in one step: calling
# `df['Verdict'].replace(..., inplace=True)` is chained in-place assignment,
# which does not update `df` under pandas Copy-on-Write — and then
# `astype(bool)` would silently turn the string 'false' into True.
# Note: any other non-empty string left in the column also becomes True.
df['Verdict'] = df['Verdict'].replace({'false': False, 'true': True}).astype(bool)

In [None]:
# Install morfeusz2, which lemmatise() below uses via morfeusz2.Morfeusz().
! pip install morfeusz2

Collecting morfeusz2
  Downloading morfeusz2-1.99.3-20211121-cp35.cp36.cp37.cp38.cp39-abi3-manylinux2014_x86_64.whl (8.5 MB)
[K     |████████████████████████████████| 8.5 MB 9.0 MB/s 
[?25hInstalling collected packages: morfeusz2
Successfully installed morfeusz2-1.99.3


In [None]:
# Imports
import re
import string
import morfeusz2

In [None]:
# def strip_punctuation(text):
#     return text.translate(str.maketrans('', '', string.punctuation))

def strip_non_alphanumeric(text):
    """Replace every character that is not an ASCII or Polish letter with a space.

    Despite the name, digits are replaced too: only the letters listed in the
    character class survive.  The string length is unchanged.
    """
    return re.sub('[^a-zA-ZĄąĆćĘęŁłŃńÓóŚśŹźŻż]', ' ', text)

def replace_whitespace(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    The pattern is a raw string: ``'\\s'`` written in a normal string literal
    is an invalid escape sequence, which newer Python versions warn about.
    """
    return re.sub(r'\s+', ' ', text)

def delete_escape_chars(text):
    """Replace newline/tab/carriage-return characters with a space.

    The literal two-character sequence backslash + 'n' is replaced as well,
    and it is handled first.
    """
    for sequence in ('\\n', '\n', '\t', '\r'):
        text = text.replace(sequence, ' ')
    return text

# Default location of the Polish stop-word list (one word per line).
_STOP_WORDS_PATH = 'drive/MyDrive/Fake News Detection/data/polish.stopwords.txt'

# Stop-word sets already read from disk, keyed by file path.
_STOP_WORDS_CACHE = {}


def _load_stop_words(path):
    """Read the stop-word file at ``path`` once and return its words as a frozenset."""
    if path not in _STOP_WORDS_CACHE:
        # Explicit UTF-8 so Polish diacritics decode the same way regardless
        # of the platform's default encoding.
        with open(path, encoding='utf-8') as handle:
            _STOP_WORDS_CACHE[path] = frozenset(
                line.strip() for line in handle if line.strip()
            )
    return _STOP_WORDS_CACHE[path]


def delete_stop_words(text, stop_words_path=_STOP_WORDS_PATH):
    """Return the tokens of ``text`` that are not stop words.

    The stop-word file is read only on the first call for a given path and
    then served from an in-memory cache, so applying this function to every
    row of a frame does not reopen the file each time.  Edits to the file
    made later in the same session are therefore not picked up.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter (``preprocess`` passes the result of ``str.split``).
    stop_words_path : str, optional
        File with one stop word per line; defaults to the project's Polish list.

    Returns
    -------
    list of str
        The tokens of ``text``, in order, that do not appear in the file.
    """
    stop_words = _load_stop_words(stop_words_path)
    return [word for word in text if word not in stop_words]

def preprocess(df):
    """Return a copy of ``df`` whose 'Text' column holds cleaned token lists.

    Each text goes through, in this order:

    1. ``delete_escape_chars``    - escape sequences become spaces
    2. ``strip_non_alphanumeric`` - non-letter characters become spaces
    3. ``replace_whitespace``     - whitespace runs collapse to one space
    4. lower-casing and ``str.split`` into tokens
    5. ``delete_stop_words``      - stop words are removed

    Escape sequences are handled *first*: if the non-letter stripping runs
    before it, the backslash of a literal ``\\n`` is already gone and the
    leftover ``n`` stays glued to the following word.

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Text' column of strings.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with 'Text' replaced by lists of lower-case tokens.
    """
    def clean(text):
        text = delete_escape_chars(text)
        text = strip_non_alphanumeric(text)
        text = replace_whitespace(text)
        return delete_stop_words(text.lower().split())

    result = df.copy()
    # Assigned as a plain list so values are written positionally, with no
    # index alignment involved.
    result['Text'] = [clean(text) for text in df['Text']]
    return result

def lemmatise(df):
    """Return a copy of ``df`` whose 'Text' holds space-joined, lower-case lemmas.

    For every token, the first result of ``Morfeusz.analyse`` is used; the
    second field of its interpretation is taken as the lemma, anything from
    the first ':' onwards is cut off, and the remainder is lower-cased.

    Values are written back positionally rather than through per-row
    ``df.loc[label, 'Text']`` assignments, so rows that share an index label
    no longer overwrite one another.  The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Text' column of token lists.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with 'Text' replaced by one string per row.
    """
    morf = morfeusz2.Morfeusz()

    def lemma_of(word):
        _, _, interpretation = morf.analyse(word)[0]
        return interpretation[1].split(':', 1)[0].lower()

    result = df.copy()
    result['Text'] = [
        ' '.join(lemma_of(word) for word in tokens)
        for tokens in df['Text']
    ]
    return result

In [None]:
# Clean and tokenise the text, then reduce every token to its lemma
df = df.pipe(preprocess).pipe(lemmatise)

In [None]:
# Minimum length (in characters) a lemmatised text must have to be kept
MIN_TEXT_LENGTH = 30

# Drop rows not containing text
df = df[df['Text'].astype(bool)]

# Drop rows with text shorter than MIN_TEXT_LENGTH chars and reset index.
# reset_index returns a new frame here instead of being applied in place to
# a filtered slice, and it runs after the last filter so the index is contiguous.
df = df[df['Text'].apply(len) >= MIN_TEXT_LENGTH].reset_index(drop=True)

# Display number of fake and real news ("Verdict" is boolean: True = real)
n_real = int(df['Verdict'].sum())
n_fake = int((~df['Verdict']).sum())
print(f'Number of fake news: \t{n_fake}')
print(f'Number of real news: \t{n_real}')
print(f'Total number of news: \t{len(df)}')

# Export complete dataset.
# No `encoding` argument: DataFrame.to_excel no longer accepts it
# (removed in pandas 2.0), so passing it raises a TypeError.
df.to_excel('complete_dataset.xlsx', index=False)

Number of fake news: 	753
Number of real news: 	1487
Total number of news: 	2240
