In [6]:
import numpy as np
import pandas as pd
import re

df = pd.read_csv('data/train.csv').drop(['id'], axis=1)
df.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [11]:
#we have 160k datapoints in the training set
print(f'The shape of the data is: {df.shape}')

The shape of the data is: (159571, 7)


In [12]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

In [8]:
df['comment_text'] = df['comment_text'].map(lambda x : clean_text(x))

In [14]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, random_state=42, test_size=0.33, shuffle=True)

X_train = train.comment_text
X_test = test.comment_text
print(f'Shape of the training data: {X_train.shape}')
print(f'Shape of the testing data: {X_test.shape}')

Shape of the training data: (106912,)
Shape of the testing data: (52659,)


In [15]:
outcome_list = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult','identity_hate']

In [21]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

SVC_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
            ])
for category in outcome_list:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    SVC_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = SVC_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matthuntebrinker/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


... Processing toxic
Test accuracy is 0.9599688562259063
... Processing severe_toxic
Test accuracy is 0.9906378776657362
... Processing obscene
Test accuracy is 0.9789209821682903
... Processing threat
Test accuracy is 0.9974363356691164
... Processing insult
Test accuracy is 0.9713629199187224
... Processing identity_hate
Test accuracy is 0.9919861752027194
