In [None]:
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import fasttext
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix, classification_report

## Get data

The data was already cleaned and pre-processed in the `data-cleaning-2-michael` file.

In [None]:
# Load the cleaned, pre-processed comments and discard rows whose processed
# text is missing (the original author noted that this removes 759 rows).
df = pd.read_csv(
    'data/data_usampl_60_40_comments_cleaned_preproc.csv'
).dropna(subset=['comment_clean_preproc'])
df

In [None]:
# Pre-processed text and the target column, both taken straight from df so
# they keep df's index. y is what the train/test split stratifies on.
X, y = df['comment_clean_preproc'], df['toxic']

### Create fastText labels as per convention (`__label__<class>`)

In [None]:
# Turn each class value into text and prepend the "__label__" prefix that
# marks a token as a label in fastText input.
df['toxic_label_ft'] = df['toxic'].astype(str).radd("__label__")
df.head(3)

In [None]:
# One training line per row: the label token, a single space, then the
# pre-processed comment text.
df['toxic_label_comment_text'] = df['toxic_label_ft'].str.cat(
    df['comment_clean_preproc'], sep=" "
)
df.head(3)

## Split and train

In [None]:
# Hold out 20% of the rows. Stratifying on y keeps the class proportions the
# same in both parts, and the fixed random_state makes the split repeatable.
train, test = train_test_split(
    df,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [None]:
def _write_fasttext_lines(frame, path, column="toxic_label_comment_text"):
    """Write one fastText example per line from ``frame[column]`` to ``path``.

    The lines are written as plain text rather than through ``to_csv``: the CSV
    writer wraps any value containing a comma, a double quote or a line break
    in quotes, so the line would start with ``"`` instead of the ``__label__``
    prefix. Whitespace runs (including embedded line breaks) are collapsed to a
    single space so that every example stays on exactly one line.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the prepared "<label> <text>" strings.
    path : str
        Destination file; overwritten if it already exists.
    column : str
        Name of the column that holds the prepared lines.
    """
    with open(path, "w", encoding="utf-8", newline="\n") as handle:
        for line in frame[column]:
            handle.write(" ".join(str(line).split()) + "\n")


_write_fasttext_lines(train, "data/fasttext_train")
_write_fasttext_lines(test, "data/fasttext_test")

In [None]:
# Fit a supervised fastText classifier on the exported training file.
# wordNgrams=2 adds word bigrams on top of single words.
model = fasttext.train_supervised(
    input="data/fasttext_train",
    wordNgrams=2,
    epoch=25,
    lr=0.5,
)
# Score the model on the held-out file; the result is shown as the cell output.
model.test("data/fasttext_test")

### Get vectors

In [None]:
# Enable tqdm's progress_apply on pandas objects, then ask the trained model
# for a sentence vector for every pre-processed comment (one call per row).
# NOTE(review): vectors are computed for all rows of df, including the rows
# that were held out as `test` above - confirm this is what later steps expect.
tqdm.pandas()
df['vector_fast_text'] = df['comment_clean_preproc'].progress_apply(model.get_sentence_vector)

In [None]:
# Quick look at the first rows now that the vector column has been added.
df.head(n=3)

In [None]:
# The two helper columns were only needed to build the fastText input files.
# errors="ignore" keeps this cell safe to re-run: without it, a second
# execution raises KeyError because the columns are already gone.
df.drop(
    columns=["toxic_label_ft", "toxic_label_comment_text"],
    inplace=True,
    errors="ignore",
)

In [None]:
# Confirm the helper columns are gone before saving.
df.head(n=3)

In [None]:
# Save the frame, including the vector_fast_text column, as CSV.
# NOTE(review): each vector is presumably a NumPy array, which CSV can only
# store as its text representation - confirm how readers of this file parse
# that column back. The row index is also written as an unnamed first column.
df.to_csv(path_or_buf='data/undersampled_data_60_40_fast_text_vectors.csv')