# Data cleaning FINAL (Michael)

## Setup

In [None]:
# import the usual suspects / basics
import pandas as pd
import numpy as np
import re
import pickle
import os

# tqdm
from tqdm import tqdm
tqdm.pandas()

# spaCy
import spacy
#!python -m spacy download en_core_web_sm # must be run just once

# fastText
import fasttext

# never truncate the column list when displaying a DataFrame (default is 20)
pd.set_option('display.max_columns', None)

# never truncate cell contents, so that the full comment is visible
pd.set_option('display.max_colwidth', None)

## Load data

In [None]:
# read the undersampled dataset; later cells use its 'comment_text' and 'toxic' columns
df = pd.read_csv('data/undersampled_data_60_40_ft.csv')

In [None]:
# print a summary of the loaded DataFrame (columns, dtypes, non-null counts)
df.info()

## Optional: Create smaller sample from data to speed up things while experimenting

In [None]:
# Number of rows to keep while experimenting; None means "use the full data".
sample_size = None

# uncomment to create sample of desired size
#sample_size = 10_000

# `is not None` is the correct identity test (PEP 8); `!= None` goes through
# __ne__ and can be overridden by the compared object.
if sample_size is not None:
    # ratio toxic/nontoxic
    # NOTE(review): presumably mirrors the 60/40 split in the input file name -- confirm
    tox_perc = 0.4
    nontox_perc = 0.6

    # number of toxic/nontoxic rows (int() truncates, so the total can be
    # slightly below sample_size for sizes that do not divide evenly)
    sample_size_tox = int(sample_size * tox_perc)
    sample_size_nontox = int(sample_size * nontox_perc)

    # draw each class separately with a fixed seed so the sample is reproducible
    sample_tox = df[df['toxic'] == 1].sample(sample_size_tox,
                                             random_state=42)
    sample_nontox = df[df['toxic'] == 0].sample(sample_size_nontox,
                                                random_state=42)

    # the sampled rows keep their original index labels
    df = pd.concat([sample_tox, sample_nontox])
    print(f'Using sample ({df.shape[0]} rows).')

else:
    print(f'Using full data ({df.shape[0]} rows).')

## Create corpus

In [None]:
# Work on a copy of the comment column: df['comment_text'] is reused later as
# the untouched 'raw' column, so no cleaning step may modify it through `corp`
# (e.g. an in-place replace run before `corp` has been reassigned).
corp = df['comment_text'].copy()

## Data cleaning

### Show data size before cleaning

In [None]:
# rough word count: every run of non-whitespace characters counts as one 'word'
num_words_before = corp.str.count(pat=r'\S+', flags=re.IGNORECASE).sum()

print(f'Number of words in corpus before cleaning: {num_words_before:,}')

### Remove anchor HTML tags (\<a\>)

TODO: Do this with an HTML parser like Beautiful Soup.

In [None]:
# opening or closing anchor tag; *? keeps the repetition non-greedy
regex = r'<a .*?>|</a>'

# total number of matches in the corpus
print(corp.str.count(pat=regex, flags=re.IGNORECASE).sum())

# preview a few comments that contain the pattern
corp.loc[corp.str.contains(pat=regex, case=False, na=False)].head()

In [None]:
# delete every match of the pattern (case-insensitive)
corp = corp.str.replace(pat=regex, repl='', case=False, regex=True)

# re-count as a sanity check; the printed value should be 0
print(corp.str.count(pat=regex, flags=re.IGNORECASE).sum())

### Remove URLs

In [None]:
# http/https URL: scheme followed by everything up to the next whitespace
regex = r'https?://\S+'

# number of matches, then a preview of comments containing one
print(corp.str.count(pat=regex, flags=re.IGNORECASE).sum())
corp.loc[corp.str.contains(pat=regex, case=False, na=False)].head()

In [None]:
# delete every URL (case-insensitive), then re-count the remaining matches
corp = corp.str.replace(pat=regex, repl='', case=False, regex=True)
print(corp.str.count(pat=regex, flags=re.IGNORECASE).sum())

### Remove whitespace except for spaces

\r actually causes an error when loading the saved csv file with read_csv() (just C engine, Python engine works).  
\u2028 --> Unicode line separator.

In [None]:
# tab, newline, carriage return, form feed, vertical tab, Unicode line separator
regex = r'[\t\n\r\f\v\u2028]'

# number of matches, then a preview of comments containing one
print(corp.str.count(pat=regex, flags=re.IGNORECASE).sum())
corp.loc[corp.str.contains(pat=regex, case=False, na=False)].head()

In [None]:
# swap each matched whitespace character for a plain space, then re-count
corp = corp.str.replace(pat=regex, repl=' ', case=False, regex=True)
print(corp.str.count(pat=regex, flags=re.IGNORECASE).sum())

### Remove numbers

In [None]:
# one or more consecutive digits
regex = r'\d+'

# number of matches, then a preview of comments containing one
print(corp.str.count(pat=regex, flags=re.IGNORECASE).sum())
corp.loc[corp.str.contains(pat=regex, case=False, na=False)].head()

In [None]:
# swap each run of digits for a single space, then re-count
corp = corp.str.replace(pat=regex, repl=' ', case=False, regex=True)
print(corp.str.count(pat=regex, flags=re.IGNORECASE).sum())

### Manually "unmask" most frequent swearwords, insults etc. (e.g. f*ck, cr@p)

Also correct some (on-purpose) misspellings that reflect pronunciation, e.g. "huuuge", "stooopid".

TODO: Implement autocorrection.

In [None]:
# search patterns used to create list of replacements (see next cell)

regex = r'\S*\*\S+'
#regex = r'\S*@\S+'
#regex = r'\S*#\S+'
#regex = r'\S*a{3,}\S*'
#regex = r'\S*e{3,}\S*'
#regex = r'\S*i{3,}\S*'
#regex = r'\S*o{3,}\S*'
#regex = r'\S*u{3,}\S*'

print(corp.str.count(regex, flags=re.I).sum())

# findall() returns one list of matches per comment. explode() flattens these
# lists to one match per row, so value_counts() tallies individual words rather
# than whole per-comment lists (lists are also unhashable). Comments without a
# match become NaN and are ignored by value_counts().
all_matches = corp.str.findall(regex, flags=re.I).explode().value_counts()

# show only the matches that occur more than 5 times
all_matches[all_matches > 5]

In [None]:
# (pattern, replacement) pairs, listed in the order they are handed to replace().
# Keeping each regex next to its replacement prevents the two lists from drifting
# out of sync. '*' is escaped because it is meant literally; '+' is a quantifier.
replacement_pairs = [
    # masked swearwords / insults
    (r'f\*ck', 'fuck'),
    (r'sh\*t', 'shit'),
    (r's\*\*t', 'shit'),
    (r'f\*\*\*', 'fuck'),
    (r'p\*\*\*y', 'pussy'),
    (r'b\*tch', 'bitch'),
    (r'f\*\*k', 'fuck'),
    (r'p\*ssy', 'pussy'),
    (r'p\*\*\*\*', 'pussy'),
    (r's\*\*\*', 'shit'),
    (r'a\*\*', 'ass'),
    (r'h\*ll', 'hell'),
    (r'h\*\*\*', 'hell'),
    (r'pu\*\*y', 'pussy'),
    (r'sh\*\*', 'shit'),
    (r'cr\*p', 'crap'),
    (r'@ss', 'ass'),
    (r'cr@p', 'crap'),
    (r'b@lls', 'balls'),
    (r'f@ck', 'fuck'),
    # elongated spellings that reflect pronunciation
    (r'waaay', 'way'),
    (r'waaaay', 'way'),
    (r'riiiight', 'right'),
    # word boundaries are required here: a bare 'soo+' also matches inside
    # ordinary words and would turn e.g. 'soon' into 'son'
    (r'\bsoo+\b', 'so'),
    (r'stooooopid', 'stupid'),
    (r'huu+ge', 'huge'),
    (r'yuu+ge', 'huge'),
    (r'suu+re', 'sure'),
]

# (?i) makes every pattern case-insensitive
match_list = [f'(?i){pattern}' for pattern, _ in replacement_pairs]
replace_list = [replacement for _, replacement in replacement_pairs]

# assign the result instead of mutating in place, like the other cleaning cells
corp = corp.replace(match_list, replace_list, regex=True)

### Remove multiple spaces

In [None]:
# two or more consecutive spaces
regex = r' {2,}'

# number of matches, then a preview of comments containing one
print(corp.str.count(pat=regex, flags=re.IGNORECASE).sum())
corp.loc[corp.str.contains(pat=regex, case=False, na=False)].head()

In [None]:
# collapse each run of spaces into a single space, then re-count
corp = corp.str.replace(pat=regex, repl=' ', case=False, regex=True)
print(corp.str.count(pat=regex, flags=re.IGNORECASE).sum())

### Show data size after cleaning

In [None]:
# same rough word count as before cleaning: runs of non-whitespace characters
num_words_after = corp.str.count(pat=r'\S+', flags=re.IGNORECASE).sum()

print(f'Number of words in corpus after cleaning: {num_words_after:,} (before: {num_words_before:,})')

## Preprocess data with spaCy (based on Eric's pipeline)

See: https://realpython.com/natural-language-processing-spacy-python/

TODO: Check if NLTK is faster.

In [None]:
# load English language model
# NOTE(review): preprocess() only reads token text, lemma_, is_punct and is_stop;
# if nothing else needs the parser/NER components, disabling them at load time
# may speed up processing -- TODO confirm before changing
nlp = spacy.load('en_core_web_sm')

### Tokenize, remove punctuation, make lower case, lemmatize, remove stop words

In [None]:
def preprocess(s):
    """Tokenize a comment with spaCy and return three normalized variants.

    Parameters
    ----------
    s : str
        Comment text. Any non-string value (e.g. NaN for a missing comment)
        is treated as an empty comment instead of being passed to spaCy.

    Returns
    -------
    pd.Series
        Space-separated, lower-cased strings indexed by
        'clean_pp' (token text, punctuation removed),
        'clean_pp_lemma' (lemmas, punctuation removed) and
        'clean_pp_lemma_stop' (lemmas, punctuation and stop words removed).
    """
    tokens = []
    tokens_lemma = []
    tokens_lemma_stop = []

    # non-strings yield three empty strings, which the notebook later turns
    # into NaN and drops, rather than crashing the whole apply()
    if isinstance(s, str):
        # single pass over the parsed document instead of three
        for token in nlp(s):
            if token.is_punct:
                continue

            lemma = token.lemma_.lower()
            tokens.append(token.text.lower())
            tokens_lemma.append(lemma)

            if not token.is_stop:
                tokens_lemma_stop.append(lemma)

    # convert lists to space-separated strings and return as Series
    return pd.Series([' '.join(tokens),
                      ' '.join(tokens_lemma),
                      ' '.join(tokens_lemma_stop)],
                     index=['clean_pp',
                            'clean_pp_lemma',
                            'clean_pp_lemma_stop'])

In [None]:
# run preprocess() on every cleaned comment, with a tqdm progress bar;
# preprocess() returns a labelled Series per comment, and the columns
# 'clean_pp', 'clean_pp_lemma' and 'clean_pp_lemma_stop' are read from the result below
corp_pp = corp.progress_apply(preprocess)
corp_pp.head()

## Create new df with raw + cleaned + preprocessed comments + target

In [None]:
# put the raw text, the cleaned text, the three preprocessed variants and the
# target side by side; `keys` supplies the column names in the same order
df_new = pd.concat(
    [
        df['comment_text'],
        corp,
        corp_pp['clean_pp'],
        corp_pp['clean_pp_lemma'],
        corp_pp['clean_pp_lemma_stop'],
        df['toxic'],
    ],
    axis=1,
    keys=[
        'raw',
        'clean',
        'clean_pp',
        'clean_pp_lemma',
        'clean_pp_lemma_stop',
        'toxic',
    ],
)

df_new.head()

## Drop rows with NaNs

In [None]:
# convert empty strings to NaN so that dropna() treats them as missing
# (np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0)
df_new.replace('', np.nan, inplace=True)

In [None]:
# NaN count per column; printed explicitly because a bare expression that is
# not the last line of a cell is never displayed
print(df_new.isna().sum())

rows_before = df_new.shape[0]
print("Rows before dropping:", rows_before)

# drop every row with at least one NaN, then renumber the index from 0
df_new.dropna(inplace=True)
df_new.reset_index(drop=True, inplace=True)

rows_after = df_new.shape[0]
print('Rows after dropping:', rows_after)
print('Rows dropped:', rows_before - rows_after)

## Create fastText vectors

In [None]:
# NOTE(review): this cell is disabled. It refers to a `comment_clean_preproc`
# column, but the df_new columns assigned in this notebook are 'raw', 'clean',
# 'clean_pp', 'clean_pp_lemma', 'clean_pp_lemma_stop' and 'toxic' -- choose one
# of those before re-enabling.

# # create temp file for fastText
# df_new.comment_clean_preproc.to_csv('data/fasttext_training_data_tmp.csv',
#                                     index=False, header=False)

# # run unsupervised learning to get embeddings
# ft = fasttext.train_unsupervised('data/fasttext_training_data_tmp.csv')

# # delete temp file
# os.remove('data/fasttext_training_data_tmp.csv')

In [None]:
# NOTE(review): this cell is disabled. It needs the `ft` model from the disabled
# training cell and refers to a 'comment_clean_preproc' column that is not among
# the df_new columns assigned in this notebook.

# # add fastText vectors to df
# df_new['ft_vector'] = df_new['comment_clean_preproc']\
#     .map(ft.get_sentence_vector)

In [None]:
# print a summary of the final DataFrame before saving
df_new.info()

## Save CSV file

In [None]:
# write the final table to disk without the index column
df_new.to_csv('data/data_usampl_60_40_FINAL_test.csv', index=False)

In [None]:
# reload the file that was just written, to check that it can be read back
df_check = pd.read_csv('data/data_usampl_60_40_FINAL_test.csv')
df_check.head()

In [None]:
# NaN count per column after the CSV round trip
# NOTE(review): presumably expected to be all zeros; read_csv's default NA
# parsing can turn cells such as 'NA' or 'null' into NaN -- verify if non-zero
df_check.isna().sum()