In [7]:
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import fasttext
from imblearn.over_sampling import SMOTE

## Get data

The data was cleaned and pre-processed already in data-cleaning-2-michael file. 

In [2]:
df = pd.read_csv('data/merged_pp_df.csv')
df = df.dropna(subset=['stopwords_punct_lemma']) # Will remove  759 NAs lines 
df.head()

Unnamed: 0,comment_text,toxic,stopwords_punct_lemma,vector_spacy,pos_tags,pos_tags_str
0,"This is so cool. It's like, 'would you want yo...",0,cool like want mother read great idea,[ 0.57358134 0.40742856 -2.652657 -2.634505...,"[('This', 'DT'), ('is', 'VBZ'), ('so', 'RB'), ...","DT VBZ RB JJ . PRP VBZ IN , FW PRP VBP PRP$ NN..."
1,Thank you!! This would make my life a lot less...,0,thank life lot anxiety inducing let way,[ 2.39850569e+00 8.94715786e-02 -3.68755722e+...,"[('Thank', 'NNP'), ('you', 'PRP'), ('!', '.'),...",NNP PRP . . DT MD VB PRP$ NN DT NN JJR JJ . VB...
2,This is such an urgent design problem; kudos t...,0,urgent design problem kudo take impressive,[ 0.9049366 1.0650175 -1.8506068 -0.667853...,"[('This', 'DT'), ('is', 'VBZ'), ('such', 'JJ')...",DT VBZ JJ DT JJ NN NN : VB TO PRP IN VBG PRP I...
3,Is this something I'll be able to install on m...,0,able install site release,[ 2.15365 0.84712 -1.303075 -1.185065...,"[('Is', 'VBZ'), ('this', 'DT'), ('something', ...",VBZ DT NN PRP MD VB JJ TO VB IN PRP$ NN . WRB ...
4,haha you guys are a bunch of losers.,1,haha guy bunch loser,[-1.30565000e+00 -1.20353746e+00 -1.54419506e+...,"[('haha', 'NN'), ('you', 'PRP'), ('guys', 'NNS...",NN PRP NNS VBP DT NN IN NNS .


In [3]:
X = df['stopwords_punct_lemma']
y = df['toxic']

### Create labels fasttext as per convention

In [4]:
df['toxic_label_ft'] = "__label__" + df['toxic'].astype(str)
df.head(3)

Unnamed: 0,comment_text,toxic,stopwords_punct_lemma,vector_spacy,pos_tags,pos_tags_str,toxic_label_ft
0,"This is so cool. It's like, 'would you want yo...",0,cool like want mother read great idea,[ 0.57358134 0.40742856 -2.652657 -2.634505...,"[('This', 'DT'), ('is', 'VBZ'), ('so', 'RB'), ...","DT VBZ RB JJ . PRP VBZ IN , FW PRP VBP PRP$ NN...",__label__0
1,Thank you!! This would make my life a lot less...,0,thank life lot anxiety inducing let way,[ 2.39850569e+00 8.94715786e-02 -3.68755722e+...,"[('Thank', 'NNP'), ('you', 'PRP'), ('!', '.'),...",NNP PRP . . DT MD VB PRP$ NN DT NN JJR JJ . VB...,__label__0
2,This is such an urgent design problem; kudos t...,0,urgent design problem kudo take impressive,[ 0.9049366 1.0650175 -1.8506068 -0.667853...,"[('This', 'DT'), ('is', 'VBZ'), ('such', 'JJ')...",DT VBZ JJ DT JJ NN NN : VB TO PRP IN VBG PRP I...,__label__0


In [5]:
df['toxic_label_comment_text'] = df['toxic_label_ft'] + " " + df['stopwords_punct_lemma']
df.head(3)

Unnamed: 0,comment_text,toxic,stopwords_punct_lemma,vector_spacy,pos_tags,pos_tags_str,toxic_label_ft,toxic_label_comment_text
0,"This is so cool. It's like, 'would you want yo...",0,cool like want mother read great idea,[ 0.57358134 0.40742856 -2.652657 -2.634505...,"[('This', 'DT'), ('is', 'VBZ'), ('so', 'RB'), ...","DT VBZ RB JJ . PRP VBZ IN , FW PRP VBP PRP$ NN...",__label__0,__label__0 cool like want mother read great idea
1,Thank you!! This would make my life a lot less...,0,thank life lot anxiety inducing let way,[ 2.39850569e+00 8.94715786e-02 -3.68755722e+...,"[('Thank', 'NNP'), ('you', 'PRP'), ('!', '.'),...",NNP PRP . . DT MD VB PRP$ NN DT NN JJR JJ . VB...,__label__0,__label__0 thank life lot anxiety inducing let...
2,This is such an urgent design problem; kudos t...,0,urgent design problem kudo take impressive,[ 0.9049366 1.0650175 -1.8506068 -0.667853...,"[('This', 'DT'), ('is', 'VBZ'), ('such', 'JJ')...",DT VBZ JJ DT JJ NN NN : VB TO PRP IN VBG PRP I...,__label__0,__label__0 urgent design problem kudo take imp...


## Split and train

In [6]:
train,test = train_test_split(df,test_size=0.2, random_state=42, stratify=y) 

In [11]:
train.to_csv("data/fasttext_train", columns=["toxic_label_comment_text"], index=False, header=False)
test.to_csv("data/fasttext_test", columns=["toxic_label_comment_text"], index=False, header=False)

In [12]:
model = fasttext.train_supervised(input="data/fasttext_train", lr=0.5, epoch=25, wordNgrams=2)
model.test("data/fasttext_test")

Read 38M words
Number of words:  446476
Number of labels: 2
Progress: 100.0% words/sec/thread: 3233757 lr:  0.000000 avg.loss:  0.042235 ETA:   0h 0m 0s 82.6% words/sec/thread: 3234805 lr:  0.086905 avg.loss:  0.049179 ETA:   0h 0m 7s


(242400, 0.9423556105610561, 0.9423556105610561)

### Get vectors

In [17]:
def clean_text(text):
    """
    Function to remove newline characters from text.
    """
    # Replace newline characters with spaces
    cleaned_text = text.replace('\n', ' ')
    return cleaned_text

In [18]:
tqdm.pandas()
df['stopwords_punct_lemma'] = df['stopwords_punct_lemma'].progress_apply(clean_text)
df['vector_fast_text'] = df['stopwords_punct_lemma'].progress_apply(lambda text: model.get_sentence_vector(text))

100%|██████████| 1800402/1800402 [00:00<00:00, 2149941.58it/s]
100%|██████████| 1800402/1800402 [00:26<00:00, 67276.44it/s]


In [19]:
df.head(3)

Unnamed: 0,comment_text,toxic,stopwords_punct_lemma,vector_spacy,pos_tags,pos_tags_str,toxic_label_ft,toxic_label_comment_text,vector_fast_text
0,"This is so cool. It's like, 'would you want yo...",0,cool like want mother read great idea,[ 0.57358134 0.40742856 -2.652657 -2.634505...,"[('This', 'DT'), ('is', 'VBZ'), ('so', 'RB'), ...","DT VBZ RB JJ . PRP VBZ IN , FW PRP VBP PRP$ NN...",__label__0,__label__0 cool like want mother read great idea,"[-0.045399435, 0.07104978, -0.047424514, -0.01..."
1,Thank you!! This would make my life a lot less...,0,thank life lot anxiety inducing let way,[ 2.39850569e+00 8.94715786e-02 -3.68755722e+...,"[('Thank', 'NNP'), ('you', 'PRP'), ('!', '.'),...",NNP PRP . . DT MD VB PRP$ NN DT NN JJR JJ . VB...,__label__0,__label__0 thank life lot anxiety inducing let...,"[-0.06339683, 0.09824624, -0.066878095, -0.039..."
2,This is such an urgent design problem; kudos t...,0,urgent design problem kudo take impressive,[ 0.9049366 1.0650175 -1.8506068 -0.667853...,"[('This', 'DT'), ('is', 'VBZ'), ('such', 'JJ')...",DT VBZ JJ DT JJ NN NN : VB TO PRP IN VBG PRP I...,__label__0,__label__0 urgent design problem kudo take imp...,"[-0.059046045, 0.11193881, -0.052347053, -0.04..."


In [20]:
df.drop(["pos_tags","pos_tags_str","toxic_label_ft","toxic_label_comment_text"],axis=1,inplace=True)

In [21]:
df.head(3)

Unnamed: 0,comment_text,toxic,stopwords_punct_lemma,vector_spacy,vector_fast_text
0,"This is so cool. It's like, 'would you want yo...",0,cool like want mother read great idea,[ 0.57358134 0.40742856 -2.652657 -2.634505...,"[-0.045399435, 0.07104978, -0.047424514, -0.01..."
1,Thank you!! This would make my life a lot less...,0,thank life lot anxiety inducing let way,[ 2.39850569e+00 8.94715786e-02 -3.68755722e+...,"[-0.06339683, 0.09824624, -0.066878095, -0.039..."
2,This is such an urgent design problem; kudos t...,0,urgent design problem kudo take impressive,[ 0.9049366 1.0650175 -1.8506068 -0.667853...,"[-0.059046045, 0.11193881, -0.052347053, -0.04..."


In [22]:
df.drop(["vector_spacy"],axis=1,inplace=True)

In [23]:
df.head(3)

Unnamed: 0,comment_text,toxic,stopwords_punct_lemma,vector_fast_text
0,"This is so cool. It's like, 'would you want yo...",0,cool like want mother read great idea,"[-0.045399435, 0.07104978, -0.047424514, -0.01..."
1,Thank you!! This would make my life a lot less...,0,thank life lot anxiety inducing let way,"[-0.06339683, 0.09824624, -0.066878095, -0.039..."
2,This is such an urgent design problem; kudos t...,0,urgent design problem kudo take impressive,"[-0.059046045, 0.11193881, -0.052347053, -0.04..."


In [24]:
df.to_csv('data/alldata_fast_text_vectors.csv')