In [1]:
import numpy as np
import pandas as pd
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

## Data

In [3]:
# nltk utils
# WordNetLemmatizer reads the WordNet corpus lazily on its first lemmatize()
# call and raises LookupError if the corpus is not installed. Make sure it is
# available up front so the notebook survives a run in a fresh environment.
try:
    nltk.data.find("corpora/wordnet")
except LookupError:
    nltk.download("wordnet", quiet=True)

lemmatizer = nltk.stem.WordNetLemmatizer()
# r'\w+' keeps runs of word characters only, so punctuation and whitespace
# are dropped during tokenization.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

In [2]:
# Load the training CSV; the first column of the file becomes the index.
df = pd.read_csv("../data/jigsaw-toxic-comment-train.csv", index_col=0)

# Work on a random subset of at most 10,000 rows to keep the notebook fast.
# random_state pins the subset so downstream numbers are reproducible, and
# min() avoids the ValueError sample() raises when n exceeds the row count.
df = df.sample(n=min(10000, len(df)), random_state=42)
df.head()

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
68b0139915d46a29,Eurovision is a high key international event? ...,0,0,0,0,0,0
b0410de746c4738f,""" \n A tag has been placed on Emily Avila, req...",0,0,0,0,0,0
86d6493aafedbc71,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
57524024cd781b40,""" \n\n \n """"Hey why now are only 58 dates? th...",0,0,0,0,0,0
cb2adffa16d4ac2e,If those two articles that I took to AfD survi...,0,0,0,0,0,0


## Preprocessing

In [4]:
%%time
# preprocess text
df["comment_text"] = df["comment_text"].apply(lambda x: x.lower().replace("\n", " "))
df["comment_text"] = df["comment_text"].apply(lambda x: tokenizer.tokenize(x))
df["comment_text"] = df["comment_text"].apply(lambda x: " ".join([lemmatizer.lemmatize(w) for w in x]))

# split
train_data, test_data = train_test_split(df)

# TF-IDF
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_data["comment_text"])
X_test = vectorizer.transform(test_data["comment_text"])

CPU times: user 2.84 s, sys: 39.7 ms, total: 2.88 s
Wall time: 2.88 s


In [5]:
# preprocess target
# Extract the six label columns from each split as NumPy arrays
# (one row per comment, one column per label).
label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y_train, y_test = (split[label_cols].values for split in (train_data, test_data))

## Train Model

In [6]:
%%time
# train model
model = RandomForestClassifier()
model.fit(X_train, y_train)

CPU times: user 6.06 s, sys: 11.2 ms, total: 6.07 s
Wall time: 6.07 s


RandomForestClassifier()

In [7]:
# metrics
# Predict every label column for the held-out comments.
y_pred = model.predict(X_test)

# With a 2-D label matrix, accuracy_score is the exact-match (subset) accuracy:
# a row counts as correct only if all of its labels are predicted correctly.
acc = accuracy_score(y_test, y_pred)
# Micro-averaged F1 pools true/false positives and negatives over all labels.
f1 = f1_score(y_test, y_pred, average="micro")

print(acc, f1)

0.9004 0.418230563002681


## Save preds

In [10]:
# Save preds
# Write the predictions into a copy of the test split instead of overwriting
# test_data itself. Overwriting would replace the ground-truth labels, so
# re-running the target/metrics cells afterwards would silently score the
# predictions against themselves.
label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
preds_df = test_data.copy()
preds_df[label_cols] = y_pred

# NOTE(review): index=False omits the frame index from the file — confirm the
# row ids are not needed by whoever consumes Preds.csv.
preds_df.to_csv("Preds.csv", index=False)