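This notebook develops the MLP classifier used to detect toxic comments: each comment is tokenized, embedded by averaging its Word2Vec word vectors, and classified with scikit-learn's `MLPClassifier`.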



In [19]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score



Read the training dataset.


In [20]:
# Load the training comments (path is relative to the working directory)
# and preview the first five rows.
data_frame = pd.read_csv(filepath_or_buffer="datasets/train.csv")
data_frame.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [21]:
# Keep it simple: drop the columns this model does not use.
# errors="ignore" keeps the cell re-runnable: the result is assigned back to
# `data_frame`, so a second run would otherwise raise KeyError because the
# columns are already gone.

data_frame = data_frame.drop(columns=["id", "severe_toxic"], errors="ignore")
data_frame.head()

Unnamed: 0,comment_text,toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0


In [22]:

def preprocess(text):
    """Tokenize a raw comment with gensim's ``simple_preprocess``.

    Accent removal is enabled (``deacc=True``) and only tokens whose length
    is between 2 and 15 characters are kept.

    Args:
        text: The comment string to tokenize.

    Returns:
        The list of tokens produced by ``simple_preprocess``.
    """
    tokens = simple_preprocess(text, min_len=2, max_len=15, deacc=True)
    return tokens

# Tokenize every comment and store the token lists in a new column.
comment_tokens = data_frame['comment_text'].apply(preprocess)
data_frame['preprocessed_text'] = comment_tokens
data_frame.head(n=5)



Unnamed: 0,comment_text,toxic,obscene,threat,insult,identity_hate,preprocessed_text
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,"[explanation, why, the, edits, made, under, my..."
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,"[aww, he, matches, this, background, colour, s..."
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,"[hey, man, really, not, trying, to, edit, war,..."
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,"[more, can, make, any, real, suggestions, on, ..."
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,"[you, sir, are, my, hero, any, chance, you, re..."


In [23]:
# Fit Word2Vec embeddings on the tokenized comments.
# Words seen fewer than `min_count` times are left out of the vocabulary.
embedding_size = 100
word2vec_model = Word2Vec(
    sentences=data_frame['preprocessed_text'],
    vector_size=embedding_size,
    window=5,
    min_count=2,
    workers=4,
)
word2vec_model


<gensim.models.word2vec.Word2Vec at 0x7fd320905ee0>

In [24]:
# Convert text to embeddings
def text_to_embedding(text, model=None, size=None):
    """Average the word vectors of a tokenized text into one fixed-size vector.

    Args:
        text: Iterable of tokens (e.g. the output of ``preprocess``).
        model: Object exposing a ``wv`` lookup that supports ``word in wv`` and
            ``wv[list_of_words]``. Defaults to the notebook-level
            ``word2vec_model``.
        size: Length of the zero vector returned when no token is in the
            vocabulary. Defaults to the notebook-level ``embedding_size``.

    Returns:
        A 1-D array: the element-wise mean of the vectors of the
        in-vocabulary tokens, or a float32 zero vector of length ``size``
        when none of the tokens is in the vocabulary.
    """
    if model is None:
        model = word2vec_model
    if size is None:
        size = embedding_size

    keyed_vectors = model.wv
    # Out-of-vocabulary tokens are skipped.
    known_words = [word for word in text if word in keyed_vectors]
    if not known_words:
        # Use float32 (gensim's vector dtype) instead of NumPy's float64
        # default: a single float64 row would make np.stack upcast the whole
        # feature matrix and double its memory footprint.
        return np.zeros(size, dtype=np.float32)
    return np.mean(keyed_vectors[known_words], axis=0)

# Turn every token list into a single averaged embedding vector.
comment_embeddings = data_frame['preprocessed_text'].apply(text_to_embedding)
data_frame['embedding'] = comment_embeddings


In [25]:
# Preview the frame now that it carries the `embedding` column.
data_frame.head(n=5)


Unnamed: 0,comment_text,toxic,obscene,threat,insult,identity_hate,preprocessed_text,embedding
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,"[explanation, why, the, edits, made, under, my...","[1.0434402, -0.1309201, -0.24954712, -0.394823..."
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,"[aww, he, matches, this, background, colour, s...","[0.6622115, -0.49376237, -0.121675886, -0.1719..."
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,"[hey, man, really, not, trying, to, edit, war,...","[0.8088037, -0.50269395, -0.928083, -0.4291520..."
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,"[more, can, make, any, real, suggestions, on, ...","[0.9010477, 0.030773602, -0.7089923, -0.733294..."
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,"[you, sir, are, my, hero, any, chance, you, re...","[1.4950477, -0.61495525, -0.9125719, -1.188175..."


In [26]:
# Inspect the embedding column on its own (pandas truncates the display).
data_frame.loc[:, "embedding"]

0         [1.0434402, -0.1309201, -0.24954712, -0.394823...
1         [0.6622115, -0.49376237, -0.121675886, -0.1719...
2         [0.8088037, -0.50269395, -0.928083, -0.4291520...
3         [0.9010477, 0.030773602, -0.7089923, -0.733294...
4         [1.4950477, -0.61495525, -0.9125719, -1.188175...
                                ...                        
159566    [0.6632037, -0.4890748, -1.1266145, -0.7530023...
159567    [1.2571803, -0.55618274, -0.89554954, -1.50168...
159568    [0.00072255457, -0.6336445, -0.678324, -0.1280...
159569    [1.0233685, -0.42017877, -0.48933327, -0.41772...
159570    [0.89718443, -0.7670429, -1.0692644, -0.913098...
Name: embedding, Length: 159571, dtype: object

In [27]:
# Stack the per-comment embeddings into a 2-D feature matrix, take the
# `toxic` column as the target, and hold out 20% of the rows for testing.
X = np.stack(data_frame['embedding'].to_numpy())
y = data_frame['toxic'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [28]:
# Train an MLP with two hidden layers (128 and 64 units) on the training split.
# NOTE: the original author flagged memory usage as a concern for this cell.
mlp = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=42,
)
mlp.fit(X_train, y_train)
mlp


MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=1000, random_state=42)

In [29]:
# Hard class predictions for the held-out comments.
y_pred = mlp.predict(X_test)
# predict_proba requires the feature matrix; calling it without arguments
# raises TypeError. Its columns follow `mlp.classes_` (sorted labels), so
# column 0 is the probability of label 0, i.e. "not toxic".
y_non_toxic = mlp.predict_proba(X_test)[:, 0]

In [30]:
# Peek at the predicted labels for the test split.
y_pred

array([1, 0, 1, ..., 0, 1, 0])

In [31]:
# Per-class precision, recall and F1 on the test split.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.96      0.97      0.97     28859
           1       0.69      0.63      0.66      3056

    accuracy                           0.94     31915
   macro avg       0.83      0.80      0.81     31915
weighted avg       0.94      0.94      0.94     31915



In [32]:
# Overall accuracy on the test split (the leftover debugging print was removed).
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.9375528748237506
