In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd
import re

import torch
import torch.nn as nn
from torch.optim import Adam

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
import random
from tqdm import tqdm
import numpy as np

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Load the comments dataset and discard its "Unnamed: 0" column.
data = pd.read_csv("/datasets/toxic_comments.csv")
data = data.drop(columns="Unnamed: 0")

In [None]:
# Shared lemmatizer instance used by the text-cleaning function.
lemmatizer = WordNetLemmatizer()

# English stop words, normalized: punctuation is replaced by spaces, the text
# is lower-cased, and the remaining pieces are glued back together without
# whitespace.
stop_words = [
    "".join(re.sub(r'[^\w\s]', ' ', word).lower().split())
    for word in stopwords.words("english")
]

In [None]:
def tokenizer(string:str) -> str:
    """Clean a raw comment and return it as a space-separated string of lemmas.

    Steps, in order:
      1. every non-letter character becomes a space and the text is lower-cased;
      2. runs of an immediately repeated word are deleted entirely;
      3. words containing two identical adjacent characters are deleted, as are
         repeated-word runs that became adjacent after step 2;
      4. each remaining word is passed through the lemmatizer once per
         part-of-speech tag ("n", "v", "a", "r", "s"), each pass feeding the next;
      5. lemmas present in ``stop_words`` are dropped.
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', string).lower()
    without_repeats = re.sub(r'\b(\w+)(?:\s+\1\b)+', '', letters_only)
    words = re.sub(r'\b(\w*(\w)\2\w*)\b|\b(\w+)\b(?:\s+\3\b)+', '', without_repeats).split()

    kept = []
    for word in words:
        # Chain the lemmatizer through every POS tag for this word.
        for pos in ("n", "v", "a", "r", "s"):
            word = lemmatizer.lemmatize(word, pos=pos)
        if word not in stop_words:
            kept.append(word)
    return " ".join(kept)

In [None]:
tqdm.pandas()
# Clean every comment; `tokenizer` returns the cleaned text as a single string.
data["text"] = data["text"].progress_apply(tokenizer)

# Comments that are empty after cleaning are removed. A boolean mask is used so
# the filter does not rely on the index labels being equal to row positions.
is_empty = data["text"].map(len) == 0
# Index labels of the removed rows, kept for inspection.
nans = data.index[is_empty].tolist()

data = data.loc[~is_empty].reset_index(drop=True)

features = data["text"]
target = data["toxic"]

In [None]:
# Show the first cleaned comment as a quick sanity check of the preprocessing.
features.loc[0]

In [None]:
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random generators with ``seed``.

    The CUDA generators are seeded as well when a CUDA device is available.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

set_seed(12345)

In [None]:
# First hold out 33% of the data, stratified on the label.
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.33,
    stratify=target,
    random_state=12345,
)
# Then split the held-out part again: 33% of it becomes the validation set,
# the remainder stays as the test set.
features_test, features_val, target_test, target_val = train_test_split(
    features_test,
    target_test,
    test_size=0.33,
    stratify=target_test,
    random_state=12345,
)

In [None]:
# Fit a bag-of-words vectorizer on the training comments only, so the
# vocabulary is learned without looking at the validation/test splits.
vectorizer = CountVectorizer()
vecs = vectorizer.fit_transform(features_train)
# Vocabulary terms learned by the vectorizer; used below to build the word -> id map.
unique_words = vectorizer.get_feature_names_out()

In [None]:
# Display the learned vocabulary.
unique_words 

In [None]:
# Give every split a fresh 0..n-1 index and wrap it in a DataFrame so the
# columns can be replaced by tensors later on.
features_train = pd.DataFrame(features_train.reset_index(drop=True))
features_test = pd.DataFrame(features_test.reset_index(drop=True))
features_val = pd.DataFrame(features_val.reset_index(drop=True))

target_train = pd.DataFrame(target_train.reset_index(drop=True))
target_test = pd.DataFrame(target_test.reset_index(drop=True))
target_val = pd.DataFrame(target_val.reset_index(drop=True))

In [None]:
# Map every vocabulary word to an integer id (its position in `unique_words`).
dict_tokenizer = dict(zip(unique_words, range(len(unique_words))))
# Id used for words that are not in the vocabulary.
# NOTE(review): if the literal word "unknown" is already part of the vocabulary,
# its id is overwritten here and that word shares the out-of-vocabulary id.
dict_tokenizer["unknown"] = len(dict_tokenizer)
# One more than the number of dictionary entries; passed to the models as the
# embedding table size.
vocab_size = len(dict_tokenizer) + 1

In [None]:
def tokenization(string:str) -> torch.Tensor:
    """Encode a cleaned, space-separated comment as a tensor of vocabulary ids.

    Words missing from ``dict_tokenizer`` are mapped to the id stored under its
    ``"unknown"`` key.

    Args:
        string: whitespace-separated words.

    Returns:
        A 1-D ``torch.int32`` tensor on ``device`` with one id per word
        (empty when ``string`` contains no words).
    """
    unknown_id = dict_tokenizer["unknown"]
    ids = [dict_tokenizer.get(word, unknown_id) for word in string.split()]
    # Create the tensor directly on the target device instead of copying it over.
    return torch.tensor(ids, dtype=torch.int32, device=device)

In [None]:
# Replace each cleaned comment with its tensor of vocabulary ids.
features_train["text"] = features_train["text"].apply(tokenization)
features_test["text"] = features_test["text"].apply(tokenization)
features_val["text"] = features_val["text"].apply(tokenization)
# Only the training labels are converted to tensors (shape (1,), float32);
# validation/test labels stay plain values for the sklearn metrics.
target_train["toxic"] = target_train["toxic"].apply(
    lambda label: torch.tensor([label], dtype=torch.float32, device=device)
)

In [None]:
class RNN_LSTM(nn.Module):
    """Binary text classifier: embedding -> 5-layer LSTM -> linear -> sigmoid.

    Args:
        vocab_size: number of rows in the embedding table.
        embeddings_len: embedding dimension; the LSTM hidden size is half of it.
    """

    def __init__(self, vocab_size, embeddings_len):
        super(RNN_LSTM, self).__init__()

        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embeddings_len)
        self.lstm = nn.LSTM(input_size=embeddings_len, hidden_size=embeddings_len//2, num_layers=5, batch_first=True)
        self.fc_in = nn.Linear(embeddings_len//2, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the predicted probability for each input sequence.

        Args:
            x: integer token ids, either a single sequence of shape
               ``(seq_len,)`` or a batch of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(1,)`` for a single sequence or ``(batch, 1)``
            for a batch, with values in ``[0, 1]``.
        """
        out = self.embeddings(x)
        out, _ = self.lstm(out)
        # Take the last time step along the sequence axis. Indexing with a
        # plain [-1] is only correct for an unbatched input: with
        # batch_first=True it would select the last sample of a batch.
        out = self.fc_in(out[..., -1, :])
        out = self.sigmoid(out)
        return out

In [None]:
# LSTM classifier with 256-dimensional embeddings, trained with binary
# cross-entropy on the sigmoid output and optimized with Adam.
model_lstm = RNN_LSTM(vocab_size=vocab_size, embeddings_len=256)
model_lstm = model_lstm.to(device)
citeration = nn.BCELoss().to(device)
optimazer = Adam(params=model_lstm.parameters(), lr=1e-4)

In [None]:
num_epochs = 3

for epoch in range(num_epochs):
    model_lstm.train()
    total_loss = 0
    # One optimizer step per comment (batch size 1).
    # The loop variables have their own names so the notebook-level `features`
    # and `target` Series are not overwritten, and `total=` lets tqdm render a
    # real progress bar for the zip iterator.
    for token_ids, label in tqdm(
        zip(features_train["text"], target_train["toxic"]),
        total=len(features_train),
    ):
        optimazer.zero_grad()
        output = model_lstm(token_ids)
        loss = citeration(output, label)
        loss.backward()
        optimazer.step()
        total_loss += loss.item()

    # Mean per-comment loss over the epoch.
    average_loss = total_loss / len(features_train)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss:.4f}')

# Switch to inference mode for the evaluation cells that follow.
model_lstm.eval()

In [None]:
# Score the validation set once: the predicted probabilities do not depend on
# the threshold, so only the cheap thresholding step is repeated in the sweep.
model_lstm.eval()
with torch.no_grad():  # inference only, no autograd graph is needed
    val_probs_lstm = torch.cat([model_lstm(i) for i in tqdm(features_val["text"])]).cpu()

for g in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    treash_hold = g
    # 1 when the predicted probability reaches the threshold, else 0.
    predictions = (val_probs_lstm >= treash_hold).int().tolist()

    acc_lstm = accuracy_score(target_val["toxic"], predictions)
    f1_lstm = f1_score(target_val["toxic"], predictions)
    precision_lstm = precision_score(target_val["toxic"], predictions)
    recall_lstm = recall_score(target_val["toxic"], predictions)

    # The threshold is printed with the metrics so each report is self-describing.
    print(f"""Accuracy: {acc_lstm:.4f}
    F1: {f1_lstm:.4f}
    Precision: {precision_lstm:.4f}
    Recall: {recall_lstm:.4f}
    Treash_hold: {treash_hold:.4f}""")

In [None]:
class RNN_GRU(nn.Module):
    """Binary text classifier: embedding -> 5-layer GRU -> linear -> sigmoid.

    Args:
        vocab_size: number of rows in the embedding table.
        embeddings_len: embedding dimension; the GRU hidden size is half of it.
    """

    def __init__(self, vocab_size, embeddings_len):
        super(RNN_GRU, self).__init__()

        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embeddings_len)
        # The attribute is called `lstm` although it holds a GRU; the name is
        # kept because it is part of the module's parameter names.
        self.lstm = nn.GRU(input_size=embeddings_len, hidden_size=embeddings_len//2, num_layers=5, batch_first=True)
        self.fc_in = nn.Linear(embeddings_len//2, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the predicted probability for each input sequence.

        Args:
            x: integer token ids, either a single sequence of shape
               ``(seq_len,)`` or a batch of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(1,)`` for a single sequence or ``(batch, 1)``
            for a batch, with values in ``[0, 1]``.
        """
        out = self.embeddings(x)
        out, _ = self.lstm(out)
        # Take the last time step along the sequence axis. Indexing with a
        # plain [-1] is only correct for an unbatched input: with
        # batch_first=True it would select the last sample of a batch.
        out = self.fc_in(out[..., -1, :])
        out = self.sigmoid(out)
        return out

In [None]:
# GRU classifier with 256-dimensional embeddings, trained with binary
# cross-entropy on the sigmoid output and optimized with Adam.
# `citeration` and `optimazer` are rebound here for the GRU model.
model_gru = RNN_GRU(vocab_size=vocab_size, embeddings_len=256)
model_gru = model_gru.to(device)
citeration = nn.BCELoss().to(device)
optimazer = Adam(params=model_gru.parameters(), lr=1e-4)

In [None]:
num_epochs = 3

for epoch in range(num_epochs):
    model_gru.train()
    total_loss = 0
    # One optimizer step per comment (batch size 1).
    # The loop variables have their own names so the notebook-level `features`
    # and `target` Series are not overwritten, and `total=` lets tqdm render a
    # real progress bar for the zip iterator.
    for token_ids, label in tqdm(
        zip(features_train["text"], target_train["toxic"]),
        total=len(features_train),
    ):
        optimazer.zero_grad()
        output = model_gru(token_ids)
        loss = citeration(output, label)
        loss.backward()
        optimazer.step()
        total_loss += loss.item()

    # Mean per-comment loss over the epoch.
    average_loss = total_loss / len(features_train)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss:.4f}')
# Switch to inference mode for the evaluation cells that follow.
model_gru.eval()

In [None]:
# Score the validation set once: the predicted probabilities do not depend on
# the threshold, so only the cheap thresholding step is repeated in the sweep.
model_gru.eval()
with torch.no_grad():  # inference only, no autograd graph is needed
    val_probs_gru = torch.cat([model_gru(i) for i in tqdm(features_val["text"])]).cpu()

for g in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    treash_hold = g
    # 1 when the predicted probability reaches the threshold, else 0.
    predictions = (val_probs_gru >= treash_hold).int().tolist()

    acc_gru = accuracy_score(target_val["toxic"], predictions)
    f1_gru = f1_score(target_val["toxic"], predictions)
    precision_gru = precision_score(target_val["toxic"], predictions)
    recall_gru = recall_score(target_val["toxic"], predictions)

    print(f"""Accuracy: {acc_gru:.4f}
    F1: {f1_gru:.4f}
    Precision: {precision_gru:.4f}
    Recall: {recall_gru:.4f}
    Treash_hold: {treash_hold:.4f}""")

In [None]:
# Final evaluation of the LSTM model on the held-out test split.
# NOTE(review): 0.48 is not one of the thresholds in the validation sweep
# above — confirm how it was chosen.
treash_hold = 0.48

model_lstm.eval()
with torch.no_grad():  # inference only, no autograd graph is needed
    test_probs_lstm = torch.cat([model_lstm(i) for i in tqdm(features_test["text"])]).cpu()
# 1 when the predicted probability reaches the threshold, else 0.
predictions = (test_probs_lstm >= treash_hold).int().tolist()

acc_lstm = accuracy_score(target_test["toxic"], predictions)
f1_lstm = f1_score(target_test["toxic"], predictions)
precision_lstm = precision_score(target_test["toxic"], predictions)
recall_lstm = recall_score(target_test["toxic"], predictions)

print(f"""Accuracy: {acc_lstm:.2f}
F1: {f1_lstm:.2f}
Precision: {precision_lstm:.2f}
Recall: {recall_lstm:.2f}""")