### Importing dependencies

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import seaborn as sns
import nltk
import contractions
import unicodedata
import spacy

In [3]:
text = pd.read_csv('train.csv')

In [6]:
text.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,sum,offensive
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0,1,1
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0,0,0


### Classifying comments in two classes of offensive and non-offensive

In [5]:
text['sum'] = text[['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']].sum(axis = 1)
text['offensive'] = text['sum'].apply(lambda x: 1 if x != 0 else x)
text['comment_text'] = text['comment_text'].astype(str)

### Build Train and Test Datasets

In [13]:
train = text[:20000]
test = text[20000:40000]

train_Comment = train.comment_text.values
test_comment = test.comment_text.values

train_Comment = train.comment_text[:20000]
train_Offense = train.offensive[:20000]
test_comment = test.comment_text[20000:40000]
test_Offense = test.offensive[20000:40000]


### Text Wrangling & Normalization

In [15]:
nlp = spacy.load('en', parse = False, tag=False, entity=False)
stop_words = nltk.corpus.stopwords.words('english')
stop_words.remove('no')
stop_words.remove('not')

def strip_tags(text):
    text = text.lower()
    text = re.sub(r"http\s?:\s?\S+", "", text)
    text = re.sub(r"(^[@|#]\[w|d]+)", "" , text)
    text = re.sub(r"e\s?-\s?mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"j/k", "jk", text)
    text = re.sub(r"http\s?;\s?\S+", "", text)
    return text

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD' , text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

def expand_contractions(text):
    return contractions.fix(text)

def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

def remove_special_characters(text , remove_digits = False):
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern , "" , text)
    return text

def remove_stopwords(text, is_lower_case=False, stopwords=None):
    if not stopwords:
        stopwords = nltk.corpus.stopwords.words('english')
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
    
    filtered_text = " ".join(filtered_tokens)
    return filtered_text


In [16]:
def pre_process_document(document):
    
    # lower case
    document = document.lower()
    
    # remove extra newlines
    document = document.translate(document.maketrans("\n\t\r", "   "))
    
    # strip tags
    document = strip_tags(document)
    
    # remove accented characters
    document = remove_accented_chars(document)
    
    # expand contractions
    document = expand_contractions(document)
    
    # text lemmatizatin
    document = lemmatize_text(document)
    
    # remove special characters
    document = remove_special_characters(document , remove_digits= True)
    
    # remove stopwords
    document = remove_stopwords(document , is_lower_case= True , stopwords= stop_words)
    
    #remove extra whitespace
    document = re.sub(' +', ' ' , document)
    
    document = document.strip()
    
    return document


pre_process_corpus = np.vectorize(pre_process_document)

In [21]:
%%time

norm_train_reviews = pre_process_corpus(train_Comment)
norm_test_reviews = pre_process_corpus(test_comment)

CPU times: user 58min 7s, sys: 1min 57s, total: 1h 5s
Wall time: 10min 9s


In [22]:
train_data = {'comment' :  norm_train_reviews , 'offensive' : train_Offense ,'toxic' : train.toxic, 
              'severe_toxic' : train.severe_toxic, 'obscene' : train.obscene, 'threat': train.threat
              ,'insult': train.insult, 'identity_hate': train.identity_hate }

df_train = pd.DataFrame(data= train_data , columns= ['comment' , 'offensive' ,'toxic',
                                                     'severe_toxic', 'obscene', 'threat','insult', 'identity_hate'])

test_data = {'comment' :  norm_test_reviews , 'offensive' : test.offensive ,'toxic' : test.toxic, 
              'severe_toxic' : test.severe_toxic, 'obscene' : test.obscene, 'threat': test.threat
              ,'insult': test.insult, 'identity_hate': test.identity_hate }

df_test = pd.DataFrame(data= test_data , columns= ['comment' , 'offensive' ,'toxic',
                                                     'severe_toxic', 'obscene', 'threat','insult', 'identity_hate'])

In [36]:
df_train.to_csv('train-dataset.csv')
df_test.to_csv('hold-out.csv')