In [113]:
import pandas as pd
import re
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import fasttext
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix, classification_report

In [99]:
# Load the undersampled data set, keep only rows that have lemmatised text,
# and discard the spaCy vector / POS-tag columns that are not needed here.
unused_columns = ['vector_spacy', 'pos_tags', 'pos_tags_str']
df = (
    pd.read_csv('data/undersampled_data_60_40.csv')
    .dropna(subset=['stopwords_punct_lemma'])
    .drop(columns=unused_columns)
)

In [100]:
# Comment text column and toxicity label column of the cleaned frame.
X, y = df['comment_text'], df['toxic']

### Create fastText labels following the `__label__` prefix convention

In [101]:
# Prefix the class value with "__label__" so each row carries its label token.
label_as_text = df['toxic'].astype(str)
df['toxic_label_ft'] = "__label__" + label_as_text
df.head(3)

Unnamed: 0,comment_text,toxic,stopwords_punct_lemma,toxic_label_ft
0,"Well, what are the chances he will turn out to...",0,chance turn active proponent slavery,__label__0
1,The moment of critical mass is approaching whe...,0,moment critical mass approach deed Gupta Co li...,__label__0
2,"""Hey listen to me,"" he said. ""I'm not going to...",1,hey listen say go crap prove reporter say \n\n...,__label__1


In [103]:
# One training line per row: label token, a single space, then the lemmatised text.
df['toxic_label_comment_text'] = df['toxic_label_ft'].str.cat(df['stopwords_punct_lemma'], sep=" ")
df.head(3)

Unnamed: 0,comment_text,toxic,stopwords_punct_lemma,toxic_label_ft,toxic_label_comment_text
0,"Well, what are the chances he will turn out to...",0,chance turn active proponent slavery,__label__0,__label__0 chance turn active proponent slavery
1,The moment of critical mass is approaching whe...,0,moment critical mass approach deed Gupta Co li...,__label__0,__label__0 moment critical mass approach deed ...
2,"""Hey listen to me,"" he said. ""I'm not going to...",1,hey listen say go crap prove reporter say \n\n...,__label__1,__label__1 hey listen say go crap prove report...


In [104]:
def preprocess(text):
    """Normalise a text line for fastText training or inference.

    Every character that is not a word character, whitespace or an
    apostrophe is replaced by a space; then every run of whitespace
    (spaces, tabs, newlines, carriage returns) is collapsed into a single
    space; finally the result is stripped and lower-cased.

    Args:
        text: The string to clean. Underscores count as word characters, so
            a leading ``__label__<class>`` token is preserved.

    Returns:
        The cleaned, single-line, lower-case string.
    """
    text = re.sub(r"[^\w\s']", ' ', text)
    # Collapse whitespace only after punctuation has been turned into spaces,
    # and include newlines in the same pass: collapsing spaces first and
    # converting newlines afterwards would leave runs of spaces behind.
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

In [105]:
# Register progress_apply on pandas objects, then clean both text columns
# (the labelled training line first, then the plain lemma text).
tqdm.pandas()
for text_column in ('toxic_label_comment_text', 'stopwords_punct_lemma'):
    df[text_column] = df[text_column].progress_apply(preprocess)

100%|██████████| 360301/360301 [00:02<00:00, 150723.79it/s]
100%|██████████| 360301/360301 [00:02<00:00, 161129.22it/s]


In [106]:
# Hold out 20% of the rows for evaluation with a fixed seed. Stratify on the
# label column read directly from `df`, so the split cannot be misaligned by a
# stale `y` left over from an earlier cell execution.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['toxic'],
)

In [107]:
# Write each split as a headerless, index-free, single-column file so every
# line is just the labelled training text.
for split_frame, split_path in ((train, "data/fasttext_train"), (test, "data/fasttext_test")):
    split_frame.to_csv(split_path, columns=["toxic_label_comment_text"], index=False, header=False)

In [108]:
# Hyperparameters for the supervised fastText classifier, gathered in one place.
supervised_options = {"lr": 0.5, "epoch": 25, "wordNgrams": 2}
model = fasttext.train_supervised(input="data/fasttext_train", **supervised_options)
# Evaluate on the held-out file; the result is shown as the cell output.
model.test("data/fasttext_test")

Read 7M words
Number of words:  111543
Number of labels: 2
Progress: 100.0% words/sec/thread: 2179224 lr:  0.000000 avg.loss:  0.078711 ETA:   0h 0m 0s ETA:   0h 0m 0s


(72061, 0.8612286812561579, 0.8612286812561579)

In [109]:
# Embed the cleaned lemma text of every row with the trained model.
df['vector_fast_text'] = df['stopwords_punct_lemma'].progress_apply(model.get_sentence_vector)

100%|██████████| 360301/360301 [00:06<00:00, 53176.92it/s]


In [110]:
# Display the frame, including the vector_fast_text column, as the cell output.
df

Unnamed: 0,comment_text,toxic,stopwords_punct_lemma,toxic_label_ft,toxic_label_comment_text,vector_fast_text
0,"Well, what are the chances he will turn out to...",0,chance turn active proponent slavery,__label__0,__label__0 chance turn active proponent slavery,"[-0.057783302, 0.045883816, -0.04878547, -0.01..."
1,The moment of critical mass is approaching whe...,0,moment critical mass approach deed gupta co li...,__label__0,__label__0 moment critical mass approach deed ...,"[-0.03851747, 0.0294842, -0.03536485, -0.00215..."
2,"""Hey listen to me,"" he said. ""I'm not going to...",1,hey listen say go crap prove reporter say u...,__label__1,__label__1 hey listen say go crap prove report...,"[0.08621803, -0.06944817, 0.083605714, 0.00305..."
3,We are already owed $488 M plus interest($2Bil...,0,owe 488 m plus interest 2billion 2006 audits s...,__label__0,__label__0 owe 488 m plus interest 2billion 20...,"[-0.021724382, 0.018108185, -0.022645112, -0.0..."
4,There is a reason there are no teeth to the la...,0,reason tooth law unlawful law way force free e...,__label__0,__label__0 reason tooth law unlawful law way f...,"[-0.04083619, 0.032266207, -0.039522655, -0.00..."
...,...,...,...,...,...,...
360830,Do you still beat your wife? Simple question.,0,beat wife simple question,__label__0,__label__0 beat wife simple question,"[-0.116752625, 0.09984541, -0.11092692, 0.0111..."
360831,The fascist dictator continues the insanity ag...,1,fascist dictator continue insanity human civil...,__label__1,__label__1 fascist dictator continue insanity ...,"[0.032213554, -0.021368029, 0.024845082, -0.00..."
360832,Sean Hannity is a lightweight foolish commenta...,0,sean hannity lightweight foolish commentator f...,__label__0,__label__0 sean hannity lightweight foolish co...,"[-0.02342204, 0.014474757, -0.01676093, -0.005..."
360833,There are a number of countries which make it ...,0,number country impossible national citizenship...,__label__0,__label__0 number country impossible national ...,"[-0.014181848, 0.013809968, -0.008141792, -0.0..."


In [114]:
# Persist the enriched frame (index included) for later use.
# NOTE(review): vector_fast_text holds one vector object per row; CSV can only
# store its text form, so any reader has to parse the vectors back from
# strings. Consider a binary format if downstream code allows — TODO confirm.
df.to_csv('data/undersampled_data_60_40_ft.csv')