In [48]:
import os
import pandas as pd
import re
import nltk
import contractions
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from langdetect import detect

[nltk_data] Downloading package stopwords to /home/void-
[nltk_data]     keishi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/void-
[nltk_data]     keishi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/void-
[nltk_data]     keishi/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [10]:
# Load the raw UTK training split (the relative path is resolved against the
# current working directory) and preview the first rows.
train = pd.read_csv('../Dataset_Original/UTK/train.csv')
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


Remove the author column

In [11]:
# The author column is not used downstream, so discard it and preview the result.
train = train.drop(columns=['author'])
train.head()

Unnamed: 0,id,title,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...,1


# Cleaning

## Missing values & duplicates

In [12]:
# Per-column count of missing values.
train.isna().sum()

id         0
title    558
text      39
label      0
dtype: int64

In [13]:
# Discard every row that has a missing value in any column.
train = train.dropna()

In [14]:
# De-duplicate first on the article body, then on the headline,
# keeping the first occurrence each time.
train = (
    train
    .drop_duplicates(subset='text', keep='first')
    .drop_duplicates(subset='title', keep='first')
)

In [15]:
# Sanity check: distinct `text` values versus total rows (should be equal).
num_unique_values = train['text'].nunique()
total_rows = len(train)
print(f'{num_unique_values}/{total_rows}')

19509/19509


## Remove outliers

### Examples with only spaces (text length <=2)

In [18]:
# Preview rows whose text is at most two characters long.
train.loc[train['text'].str.len().le(2)].head()

Unnamed: 0,id,title,text,label
82,82,Huma’s Weiner Dogs Hillary,,1
901,901,Internet Flasher,,1
4902,4902,Why Hillary Clinton's Campaign Is Collapsing |...,\n,1


In [19]:
# Keep only rows whose text is longer than two characters.
train = train.loc[train['text'].str.len().gt(2)]

### Examples whose text is only the boilerplate "source Add To The Conversation Using Facebook Comments" (unrelated to any article)

In [23]:
# Show the `text` of rows that consist solely of the Facebook-comments boilerplate.
train.loc[train['text'].eq("source Add To The Conversation Using Facebook Comments"), 'text']

519    source Add To The Conversation Using Facebook ...
Name: text, dtype: object

In [25]:
# Remove rows that consist solely of the Facebook-comments boilerplate.
train = train.loc[train['text'].ne("source Add To The Conversation Using Facebook Comments")]

### Examples with non-English text

In [41]:
def not_english(text):
    """Return True when `text` is detected as a language other than English.

    Best-effort helper for filtering a DataFrame column: any input that
    cannot be classified (non-string values, blank strings, or text on which
    the detector fails) yields False, so such rows are kept rather than
    dropped.

    NOTE: langdetect is non-deterministic on short or ambiguous text unless
    `langdetect.DetectorFactory.seed` is set before the first detection, so
    repeated calls on the same borderline text may disagree.
    """
    # Values that cannot carry a language (NaN, None, whitespace-only) are
    # treated as "not provably non-English" without calling the detector.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) != 'en'
    except Exception:
        # Detection failures are deliberately ignored (best effort), but unlike
        # a bare `except:` this lets KeyboardInterrupt/SystemExit propagate so
        # a long-running `apply` can still be interrupted.
        return False

# Flag every row whose article body is detected as non-English, then show them.
non_english_text_mask = train['text'].apply(not_english)
not_english_examples = train.loc[non_english_text_mask]
print(not_english_examples)

          id                                              title  \
47        47  СМИ Сербии приписали россиянам "подготовку тер...   
173      173  Trump family already ‘sworn to secrecy’ about ...   
229      229  США—КНР: на кого возлагать ответственность за ...   
233      233  La expresión “no, lo siguiente” ya es la más u...   
257      257                                   Newsticker (974)   
...      ...                                                ...   
20653  20653  Rosneft restaurera une ancienne résidence des ...   
20690  20690  Способен ли кто-нибудь помешать Соросу стать «...   
20704  20704  Kunst am Werk: Wie UralMasch zur Bilder-Galeri...   
20714  20714  ¿Quién es quién en el nuevo Consejo de Ministros?   
20740  20740  Kleiner Vorgeschmack: Erdogan lässt Warnflücht...   

                                                    text  label  text_length  
47     0 комментариев 0 поделились Фото: AP \nОднако ...      1         3585  
173                                  

In [42]:
# Drop exactly the rows inspected above instead of re-running language
# detection: langdetect is non-deterministic unless seeded, so a second pass
# could remove a different set of rows than the one that was displayed (and
# it doubles the runtime). `errors='ignore'` keeps the cell safe to re-run.
train = train.drop(index=not_english_examples.index, errors='ignore')

In [43]:
# Also remove rows whose headline is detected as non-English.
non_english_title_mask = train['title'].apply(not_english)
train = train.loc[~non_english_title_mask]

## Preprocessing
- Combine `text` and `title` columns into one column
- Lowercase
- Expanding contractions
- Removing the URLs
- Removing the HTML tags
- Removing the numbers
- Removing leading and trailing whitespace
- Removing the punctuation
- Removing the stopwords
- Lemmatization
- Tokenization

In [44]:
# Patterns are compiled once instead of on every row.
# `http\S+` already covers `https\S+`, and the patterns contain no `^`/`$`
# anchors, so the original extra alternative and re.MULTILINE flag had no effect.
_URL_RE = re.compile(r'http\S+|www\S+')
_HTML_TAG_RE = re.compile(r'<.*?>')
_DIGITS_RE = re.compile(r'\d+')
_PUNCT_RE = re.compile(r'[^\w\s]')


def _clean_text(text, stop_words, lemmatizer):
    """Normalise one document and return it as a list of tokens."""
    # Lowercase first, then expand contractions.
    text = contractions.fix(text.lower())

    # Strip URLs, HTML tags and digits.
    text = _URL_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)
    text = _DIGITS_RE.sub('', text)

    # Trim the ends, then drop every character that is neither a word
    # character nor whitespace.
    text = _PUNCT_RE.sub('', text.strip())

    # Stopword removal followed by lemmatization; splitting on whitespace
    # also collapses any runs of spaces/newlines left by the steps above.
    kept_words = (word for word in text.split() if word not in stop_words)
    lemmatized = ' '.join(lemmatizer.lemmatize(word) for word in kept_words)

    # Tokenization
    return word_tokenize(lemmatized)


def preprocess(df):
    """Add a tokenised `combined` column built from `title` and `text`.

    Each row's title and text are joined with a single space and passed
    through: lowercasing, contraction expansion, URL / HTML-tag / digit
    removal, trimming, punctuation removal, English stopword removal,
    lemmatization and tokenization.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `title` and `text` columns.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with a new `combined` column holding a list of
        tokens per row. The column is added in place.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Missing titles/texts would otherwise turn the concatenation into NaN
    # (a float) and break the string-only steps; treat them as empty strings.
    title = df['title'].fillna('').astype(str)
    body = df['text'].fillna('').astype(str)
    combined = title + ' ' + body

    # One pass per document instead of nine separate `apply` passes.
    df['combined'] = combined.apply(
        lambda doc: _clean_text(doc, stop_words, lemmatizer)
    )

    return df

In [49]:
# Build the tokenised `combined` column; preprocess returns the frame it was given.
train = preprocess(train)

In [50]:
# Remove rows whose combined text is empty after processing.
# `train['combined'].dropna(inplace=True)` only acted on the extracted column
# and never removed rows from `train`; besides, an empty token list is not NaN.
# `.str.len()` counts the tokens in each list and yields NaN for missing
# values, so both empty and missing entries are filtered out here.
train = train[train['combined'].str.len() > 0]

In [51]:
# Write the cleaned training set to disk, creating the target folder if needed.
output_dir = '../Dataset_Cleaned'
output_file = os.path.join(output_dir, 'clean_train_UTK.csv')
os.makedirs(output_dir, exist_ok=True)
train.to_csv(output_file, index=False)