# Data cleaning (Michael)

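Clean the raw comments: remove HTML anchor tags and URLs, replace line breaks with spaces, replace numbers with a placeholder, "unmask" obfuscated swearwords, and collapse multiple spaces. Afterwards remove stopwords and punctuation, lemmatize, and save the result as CSV.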

## Setup

In [1]:
# import the usual suspects / basics
import pandas as pd
import numpy as np
import re
import pickle

# spaCy
import spacy

# Show every column when a DataFrame is displayed (pandas' default cap is 20).
pd.set_option('display.max_columns', None)

# Never truncate cell contents, so each comment is visible in full.
pd.set_option('display.max_colwidth', None)

## Load/unpickle data

In [2]:
# The pickle read below was created once from the CSV export with:
#   df = pd.read_csv('data/undersampled_data_60_40_ft.csv')
#   with open('pickle/undersampled_data_60_40.pkl', mode='wb') as f:
#       pickle.dump(df, f)

# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# use it on trusted, locally created pickles.
with open('pickle/undersampled_data_60_40.pkl', 'rb') as pkl_file:
    df = pickle.load(pkl_file)

# (rows, columns) of the loaded data
df.shape

(360835, 6)

## Optional: Create smaller sample from data to speed up things while experimenting

In [3]:
# Number of rows to work with. Leave as None to use the full data set, or set
# an integer to draw a smaller sample with a fixed toxic/nontoxic ratio.
sample_size = None

# uncomment to create sample of desired size
#sample_size = 25_000

# `is not None` is the idiomatic identity check (PEP 8) and, unlike `!= None`,
# cannot be affected by overloaded comparison operators.
if sample_size is not None:
    # ratio toxic/nontoxic
    tox_perc = 0.4
    nontox_perc = 0.6

    # number of toxic/nontoxic rows: round the toxic share and give the
    # remainder (the nontox_perc share) to the nontoxic class. Truncating
    # both products with int() can lose rows to floating-point error
    # (e.g. int(3 * 0.6) == 1), so the sample would be smaller than requested.
    sample_size_tox = round(sample_size * tox_perc)
    sample_size_nontox = sample_size - sample_size_tox

    # fixed random_state keeps the sample reproducible across runs
    sample_tox = df[df['toxic'] == 1].sample(sample_size_tox,
                                             random_state=42)
    sample_nontox = df[df['toxic'] == 0].sample(sample_size_nontox,
                                                random_state=42)

    df = pd.concat([sample_tox, sample_nontox])
    print(f'Using sample ({df.shape[0]} rows).')

else:
    print(f'Using full data ({df.shape[0]} rows).')

Using full data (360835 rows).


In [4]:
# Summarise the columns, their dtypes and non-null counts of the working data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360835 entries, 0 to 360834
Data columns (total 6 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   comment_text           360835 non-null  object
 1   toxic                  360835 non-null  int64 
 2   stopwords_punct_lemma  360301 non-null  object
 3   vector_spacy           360835 non-null  object
 4   pos_tags               360835 non-null  object
 5   pos_tags_str           360835 non-null  object
dtypes: int64(1), object(5)
memory usage: 16.5+ MB


## Create corpus

In [5]:
# The corpus is the raw comment text column; the cleaning steps in the
# following cells each rebind `corp` to a cleaned copy.
corp = df['comment_text']

## Data cleaning

### Remove anchor HTML tags (\<a\>)

In [6]:
# Opening anchor tag `<a ...>` (the lazy `.*?` stops at the first `>`) or
# closing tag `</a>`.
regex = r'<a .*?>|</a>'

# Total number of tag occurrences over all comments (case-insensitive).
print(corp.str.count(regex, flags=re.IGNORECASE).sum())

# Preview the first ten comments that contain at least one tag.
corp.loc[corp.str.contains(regex, case=False, na=False)].head(10)

77


8294                                                                                                                                             You can buy from our large and diverse collection of salwar kameez, party wear suits, bollywood collection, cotton kurtis, Anarrkali suits,Bollywood saree and many other products.....\nWe Have Some For You In Your Budget For more…\nPlz visit:- <a href= "http://www.dooiitt.com/">Designer Salwar Kameez</a>
28676                                                                                                                                                                <a href="http://www.newfitnessbooster.com/dermessence/">Dermessence</a> has most essential nutrients that this formula has and that act directly and indirectly in combating again signs from the inside out. for more information please visit http://www.newfitnessbooster.com/dermessence/
32307                                                                                             

In [7]:
# Strip the anchor tags (keeping the link text), then re-count as a check.
corp = corp.str.replace(pat=regex, repl='', case=False, regex=True)
print(corp.str.count(regex, flags=re.IGNORECASE).sum())

0


### Remove URLs

In [8]:
# A URL here is `http://` or `https://` followed by everything up to the next
# whitespace character.
regex = r'https?://\S+'

url_total = corp.str.count(regex, flags=re.IGNORECASE).sum()
print('Total matches:', url_total)

# Preview the first ten comments that contain a URL.
corp.loc[corp.str.contains(regex, case=False, na=False)].head(10)

Total matches: 9725


3                                    We are already owed $488 M plus interest($2Billion) from 2006 audits the state has not collected.\nhttps://www.adn.com/energy/article/oil-audit-draft/2014/11/20/\n\nThis amount of interest doesn't seem correct...\n\n'$416 million in taxes, plus another $368 million in interest between 2007 and 2009'\n\nWhen oil companies sued the state they wanted $100 M plus $400 M interest from 2006.\nhttps://www.adn.com/business-economy/energy/2016/12/16/state-wins-case-against-oil-companies-worth-an-estimated-500-million/\n\nIs the state interest rate is much lower than the one oil companies set for us, or the legislature is letting them off with only 3 years of interest?\n\n "The new law includes the unbelievable provision that after three years the companies will pay zero additional interest on delinquent taxes."\nhttps://www.adn.com/opinions/2016/11/29/with-pfd-cut-on-the-line-oil-company-arguments-about-fine-points-of-tax-regs-will-backfire/
65              

In [9]:
# Delete every URL, then re-count as a check.
corp = corp.str.replace(pat=regex, repl='', case=False, regex=True)
url_total_after = corp.str.count(regex, flags=re.IGNORECASE).sum()
print('Total matches after:', url_total_after)

Total matches after: 0


### Remove newlines (\n), carriage returns (\r), unicode line separators (U+2028)

\r actually causes an error when trying to load the saved csv file with read_csv()

In [10]:
# Line feed characters.
regex = r'\n'

newline_total = corp.str.count(regex, flags=re.IGNORECASE).sum()
print('Total matches:', newline_total)

# Preview the first ten comments that contain a line feed.
corp.loc[corp.str.contains(regex, case=False, na=False)].head(10)

Total matches: 392196


1                                                                                                                                                                                                                                                                                                                                                                            The moment of critical mass is approaching when the deeds of Gupta & Co, like huge turbine engines slow down, halt and the reverse direction of the wheels of justice are set in motion leaving no hiding room.\n\n‘...unintended consequences’…. uneasy sleep ahead for many.
2                                                                                                                                                                                                                                                                                                                                                                           

In [11]:
# Replace each line feed with a space (so adjacent words stay separated),
# then re-count as a check.
corp = corp.str.replace(pat=regex, repl=' ', case=False, regex=True)
newline_total_after = corp.str.count(regex, flags=re.IGNORECASE).sum()
print('Total matches after:', newline_total_after)

Total matches after: 0


In [None]:
# Carriage returns: count, replace each with a space, count again.
regex = r'\r'

cr_before = corp.str.count(regex, flags=re.IGNORECASE).sum()
print('Before:', cr_before)

corp = corp.str.replace(pat=regex, repl=' ', case=False, regex=True)

cr_after = corp.str.count(regex, flags=re.IGNORECASE).sum()
print('After:', cr_after)

In [None]:
# Unicode line separator (U+2028): count, replace each with a space, count
# again.
regex = r'\u2028'

ls_before = corp.str.count(regex, flags=re.IGNORECASE).sum()
print('Before:', ls_before)

corp = corp.str.replace(pat=regex, repl=' ', case=False, regex=True)

ls_after = corp.str.count(regex, flags=re.IGNORECASE).sum()
print('After:', ls_after)

### Remove numbers

In [12]:
# One or more consecutive digits.
regex = r'\d+'

# Total number of digit runs over all comments.
print(corp.str.count(regex, flags=re.IGNORECASE).sum())

# Preview the first ten comments that contain a number.
corp.loc[corp.str.contains(regex, case=False, na=False)].head(10)

168643


3                                                                                                                                                                                                                                                                                                                                                                                      We are already owed $488 M plus interest($2Billion) from 2006 audits the state has not collected.   This amount of interest doesn't seem correct...  '$416 million in taxes, plus another $368 million in interest between 2007 and 2009'  When oil companies sued the state they wanted $100 M plus $400 M interest from 2006.   Is the state interest rate is much lower than the one oil companies set for us, or the legislature is letting them off with only 3 years of interest?   "The new law includes the unbelievable provision that after three years the companies will pay zero additional interest on delinquent taxes." 
9       

In [13]:
# Substitute every digit run with the placeholder token `_number_`, then
# re-count as a check.
corp = corp.str.replace(pat=regex, repl='_number_', case=False, regex=True)
print(corp.str.count(regex, flags=re.IGNORECASE).sum())

0


### "Unmask" most frequent swearwords, insults etc. (e.g. f*ck, cr@p)

Also correct some (on-purpose) misspellings that reflect pronunciation, e.g. "huuuge", "stooopid".

In [14]:
# Search patterns used to build the replacement list in the next cell.
# Exactly one pattern is active at a time; swap the comment marker to explore
# another one.

# whitespace-delimited token containing an asterisk followed by more characters
regex = r'\S*\*\S+'
# same idea for '@' and '#'
#regex = r'\S*@\S+'
#regex = r'\S*#\S+'
# tokens with a vowel repeated three or more times
#regex = r'\S*a{3,}\S*'
#regex = r'\S*e{3,}\S*'
#regex = r'\S*i{3,}\S*'
#regex = r'\S*o{3,}\S*'
#regex = r'\S*u{3,}\S*'

# total number of matches over all comments
print(corp.str.count(regex, flags=re.IGNORECASE).sum())

# list of matches per comment, then the 50 most frequent of those lists
matches_per_comment = corp.str.findall(regex, flags=re.IGNORECASE)
matches_per_comment.value_counts().head(50)

3961


comment_text
[]              357614
[sh*t]              77
[***]               49
[a**]               44
[****]              36
[s**t]              32
[*****]             31
[f***]              27
[p***y]             25
[f**k]              23
[p*ssy]             19
[p****]             19
[a**.]              17
[s***]              17
[h*ll]              16
[F***]              16
[*is*]              14
[h***]              12
[*any*]             12
[sh*t.]             12
[pu**y]             11
[sh**]              11
[cr*p]              11
[*not*]             11
[F*ck]              10
[*sigh*]            10
[f***ing]           10
[***, ***]           9
[*are*]              9
[s**t.]              9
[****, ****]         9
[**]                 9
[*&^%]               9
[*some*]             8
[*could*]            8
[b*tch]              8
[*lol*]              8
[*you*]              8
[*ss]                8
[a**es]              8
[*only*]             8
[f*ck]               8
[a**hole]            

In [15]:
# Parallel lists: match_list[i] is a case-insensitive regex and
# replace_list[i] is the plain word it is replaced with.
#
# The patterns are written with a literal `*` for readability; every `*` is
# escaped to `\*` below so it matches an asterisk instead of acting as a
# quantifier. `+` is left untouched and keeps its regex meaning
# ("one or more").
#
# `soo+` is wrapped in word boundaries (`\b`): unanchored it also matched
# inside ordinary words and turned e.g. "soon" into "son" and "sooner" into
# "soner".
match_list = (
    r'(?i)f*ck, (?i)sh*t, (?i)s**t, (?i)f***, (?i)p***y, (?i)b*tch, '
    r'(?i)f**k, (?i)p*ssy, (?i)p****, (?i)s***, (?i)a**, (?i)h*ll, '
    r'(?i)h***, (?i)sh*t, (?i)pu**y, (?i)sh**, (?i)cr*p, '
    r'(?i)@ss, (?i)cr@p, (?i)b@lls, (?i)f@ck, '
    r'(?i)waaay, (?i)waaaay, (?i)riiiight, (?i)\bsoo+\b, (?i)stooooopid, '
    r'(?i)huu+ge, (?i)yuu+ge, (?i)suu+re'
).replace('*', r'\*').split(', ')
replace_list = (
    'fuck, shit, shit, fuck, pussy, bitch, '
    'fuck, pussy, pussy, shit, ass, hell, '
    'hell, shit, pussy, shit, crap, '
    'ass, crap, balls, fuck, '
    'way, way, right, so, stupid, '
    'huge, huge, sure'
).split(', ')

# Guard against the two lists drifting out of sync when entries are added.
assert len(match_list) == len(replace_list), \
    'match_list and replace_list must have the same number of entries'

# The patterns are applied one after another in list order. Reassigning
# instead of inplace=True keeps this step consistent with the other cleaning
# cells.
corp = corp.replace(match_list, replace_list, regex=True)

### Remove multiple spaces

In [16]:
# Runs of two or more consecutive space characters.
regex = r' {2,}'

# Total number of such runs over all comments.
print(corp.str.count(regex, flags=re.IGNORECASE).sum())

# Preview the first comments that contain a run of spaces.
corp.loc[corp.str.contains(regex, case=False, na=False)].head()

496937


1                                                                                                                                                                                                                                                                                                                                                                                                                       The moment of critical mass is approaching when the deeds of Gupta & Co, like huge turbine engines slow down, halt and the reverse direction of the wheels of justice are set in motion leaving no hiding room.  ‘...unintended consequences’…. uneasy sleep ahead for many.
2                                                                                                                                                                                                                                                                                                                                  

In [17]:
# Collapse each run of spaces into a single space, then re-count as a check.
corp = corp.str.replace(pat=regex, repl=' ', case=False, regex=True)
print(corp.str.count(regex, flags=re.IGNORECASE).sum())

0


## Preprocess data

Remove stopwords and punctuation, then lemmatize. Code copied from Data_preprocess_Eric.ipynb.

In [18]:
# load English language model (spaCy's 'en_core_web_sm' pipeline); `nlp` is
# used by preprocess() to tokenize and lemmatize the comments
nlp = spacy.load('en_core_web_sm')

def preprocess(text):
    """Return the lemmas of `text` as one space-separated string.

    The text is run through the spaCy pipeline `nlp`. Tokens flagged as stop
    words or punctuation are dropped; every remaining token is represented
    by its lemma.
    """
    lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    ]
    return " ".join(lemmas)

In [19]:
# Run every cleaned comment through preprocess() (one spaCy call per comment).
corp_pp = corp.apply(preprocess)
corp_pp

0                                                                                                                                                                                                                                                                                                                                                                                        chance turn active proponent slavery
1                                                                                                                                                                                                                                       moment critical mass approach deed Gupta Co like huge turbine engine slow halt reverse direction wheel justice set motion leave hiding room unintended consequence uneasy sleep ahead
2                                                                                                                                                                           

## Create new df with cleaned + preprocessed comments + target

In [47]:
# Combine the cleaned text, the preprocessed text and the target column
# side by side (aligned on the shared row index).
df_new = pd.concat(
    [
        corp.rename('comment_clean'),
        corp_pp.rename('comment_clean_preproc'),
        df['toxic'].rename('toxic'),
    ],
    axis=1,
)
df_new

Unnamed: 0,comment_clean,comment_clean_preproc,toxic
0,"Well, what are the chances he will turn out to have been an active proponent of slavery?",chance turn active proponent slavery,0
1,"The moment of critical mass is approaching when the deeds of Gupta & Co, like huge turbine engines slow down, halt and the reverse direction of the wheels of justice are set in motion leaving no hiding room. ‘...unintended consequences’…. uneasy sleep ahead for many.",moment critical mass approach deed Gupta Co like huge turbine engine slow halt reverse direction wheel justice set motion leave hiding room unintended consequence uneasy sleep ahead,0
2,"""Hey listen to me,"" he said. ""I'm not going to put up with your crap about all this."" He shouldn't have to prove himself to a reporter, he said. Uh, actually Ben, you do. And you didn't. Buh-bye.",hey listen say go crap prove reporter say uh actually Ben Buh bye,1
3,"We are already owed $_number_ M plus interest($_number_Billion) from _number_ audits the state has not collected. This amount of interest doesn't seem correct... '$_number_ million in taxes, plus another $_number_ million in interest between _number_ and _number_' When oil companies sued the state they wanted $_number_ M plus $_number_ M interest from _number_. Is the state interest rate is much lower than the one oil companies set for us, or the legislature is letting them off with only _number_ years of interest? ""The new law includes the unbelievable provision that after three years the companies will pay zero additional interest on delinquent taxes.""",owe $ number M plus interest($_number_Billion number audits state collect interest correct $ number million taxis plus $ number million interest number number oil company sue state want $ number m plus $ number m interest number state interest rate low oil company set legislature let number year interest new law include unbelievable provision year company pay zero additional interest delinquent taxis,0
4,"There is a reason there are no teeth to the law. It is an unlawful law. There is no way anyone can be forced to give someone else free electricity. Not yet at least. You want to be green , pay for it yourself like every body else must.",reason tooth law unlawful law way force free electricity want green pay like body,0
...,...,...,...
360830,Do you still beat your wife? Simple question.,beat wife simple question,0
360831,"The fascist dictator continues the insanity against all human and civil rights by the National Security State formerly the purview of Hitler, Stalin, Mao, Pol Pot, ad nauseum.",fascist dictator continue insanity human civil right National Security State purview Hitler Stalin Mao Pol Pot ad nauseum,1
360832,Sean Hannity is a lightweight foolish commentator on Fox News. He is in over his head in trying to act tough with the big boys.,Sean Hannity lightweight foolish commentator Fox News head try act tough big boy,0
360833,There are a number of countries which make it impossible for their nationals to give up citizenship. Even if a new Cdn citizen wanted to give up their citizenship in their country of origin they may not be able to.,number country impossible national citizenship new Cdn citizen want citizenship country origin able,0


## Save CSV file with cleaned and preprocessed comments

In [48]:
# Write the cleaned data to CSV; index=False leaves the row index out of the
# file.
df_new.to_csv('data/data_usampl_60_40_comments_cleaned_preproc.csv', index=False)