In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np

In [2]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Calls the seeding function of Python's ``random`` module, NumPy's global
    generator and PyTorch (``torch.manual_seed``); when CUDA is available
    all CUDA devices are seeded as well.

    Args:
        seed: Value handed unchanged to each library's seeding function.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


set_seed(12345)

In [3]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [4]:
class CustomRNN(nn.Module):
    """Encode a sequence of token ids into one fixed-size vector.

    Token ids are embedded, run through a 3-layer LSTM whose hidden size is
    half the embedding width, and the LSTM output at the last time step is
    passed through a LeakyReLU.

    Args:
        vocab_size: Number of rows in the embedding table.
        embeddings_len: Embedding width; the returned vectors have
            ``embeddings_len // 2`` features.
    """

    def __init__(self, vocab_size, embeddings_len):
        super().__init__()

        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embeddings_len)
        self.lstm = nn.LSTM(
            input_size=embeddings_len,
            hidden_size=embeddings_len // 2,
            num_layers=3,
            batch_first=True,
        )
        self.relu = nn.LeakyReLU()

    def forward(self, x):
        """Return the activated last-time-step LSTM output for ``x``.

        Args:
            x: Integer token ids, either unbatched ``(seq_len,)`` or batched
                ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(hidden,)`` for unbatched input, or
            ``(batch, hidden)`` for batched input.
        """
        out = self.embeddings(x)
        out, _ = self.lstm(out)
        # Index the time axis explicitly. With batch_first=True the LSTM
        # output is (batch, seq, hidden), so a plain out[-1] would pick the
        # last *sample* of a batch rather than the last time step. For
        # unbatched (seq, hidden) output this is identical to out[-1].
        out = self.relu(out[..., -1, :])
        return out

In [5]:
import json
import re
import pandas as pd
from stemmer_lib.stemmer import Stemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from tqdm import tqdm

In [6]:
# Load the tokenizer vocabulary from tokenizer.json. The context manager
# closes the file on exit, so no explicit close() call is needed.
with open("tokenizer.json", "r", encoding="utf-8") as file:
    tokenizer_dict = json.load(file)

In [7]:
# Keep only entries of length >= 3 (first occurrence wins, original order
# preserved) and number the survivors 0..N-1 so the ids are contiguous.
tokenizer_dict = {
    token: index
    for index, token in enumerate(
        dict.fromkeys(token for token in tokenizer_dict if len(token) >= 3)
    )
}

In [8]:
# Number of entries in the vocabulary; used as the embedding table's row count when the model is built.
vocab_size = len(tokenizer_dict)

In [9]:
# Project-local Stemmer instance; tokenizer() calls its stem_words() on each list of words.
stemmer = Stemmer()

In [10]:
def tokenizer(string: str) -> torch.Tensor:
    """Convert raw text into a 1-D tensor of vocabulary ids.

    Digits and every character that is neither a word character nor
    whitespace are removed; the text is then lower-cased, split on
    whitespace and passed to the module-level ``stemmer``. Each resulting
    stem is looked up in ``tokenizer_dict``; stems missing from it get the
    id stored under the key ``"unknown"``.

    Args:
        string: Text to tokenize.

    Returns:
        ``torch.long`` tensor with one id per stem. Text that yields no
        stems produces an empty ``torch.long`` tensor.
    """
    string = re.sub(r'\d', '', string)
    string = re.sub(r'[^\w\s]', '', string)
    string = re.sub(r'\s+', ' ', string)
    stems = stemmer.stem_words(string.lower().split())
    # NOTE(review): if the vocabulary has no "unknown" entry this is None and
    # building the tensor fails for any out-of-vocabulary stem.
    unknown_id = tokenizer_dict.get("unknown")
    token_ids = [tokenizer_dict.get(stem, unknown_id) for stem in stems]
    # Pin the dtype: torch.tensor([]) would otherwise default to float32,
    # which nn.Embedding does not accept as indices.
    return torch.tensor(token_ids, dtype=torch.long)

In [11]:
# Read the labelled pairs and discard the "Unnamed: 0" column
# (presumably an index written out by an earlier to_csv — verify).
data = pd.read_csv("data.csv")
data = data.drop(columns="Unnamed: 0")

In [13]:
# Replace both text columns with tensors of token ids; str() coerces any
# non-string cell before tokenizing.
for text_column in ("query", "context"):
    data[text_column] = data[text_column].apply(lambda text: tokenizer(str(text)))
# Relabel 0 as -1 (nn.CosineEmbeddingLoss takes targets of 1 or -1).
data["probability"] = data["probability"].apply(lambda label: label if label != 0 else -1)

In [14]:
# Separate the label column from the remaining (input) columns.
target = data["probability"]
features = data.drop(columns="probability")

In [15]:
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the split reproducible.
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=12345,
)

In [16]:
# Encoder applied to both the query and the context of every pair.
model = CustomRNN(vocab_size=vocab_size, embeddings_len=1024)
model = model.to(device)

# Adam with its default hyper-parameters.
optimazer = optim.Adam(model.parameters())

# Cosine-embedding loss between the two encoded sequences.
citeration = nn.CosineEmbeddingLoss()
citeration = citeration.to(device)

In [17]:
num_epochs = 10

# Train one pair at a time: encode query and context with the same model and
# optimise the cosine-embedding loss against the 1 / -1 label.
# The epoch counter has a real name: `_` is conventionally a throwaway and,
# in IPython, assigning to it hides the "last output" variable.
for epoch in range(1, num_epochs + 1):
    model.train()
    total_loss = 0
    pairs = zip(features_train["query"], features_train["context"], target_train)
    # zip() has no length, so pass the total explicitly to get a progress bar with an ETA.
    for query_ids, context_ids, label in tqdm(pairs, total=len(features_train)):
        optimazer.zero_grad()

        # Keep the id tensors and their encodings under separate names.
        query_vec = model(query_ids.to(device))
        context_vec = model(context_ids.to(device))
        label = torch.tensor(label, device=device)

        loss = citeration(query_vec, context_vec, label)
        loss.backward()
        optimazer.step()
        total_loss += loss.item()

    # Mean per-pair loss over the epoch.
    average_loss = total_loss / len(features_train)
    print(f'Epoch [{epoch}/{num_epochs}], Loss: {average_loss:.4f}')
model.eval()

7182it [03:10, 37.64it/s]


Epoch [1/10], Loss: 0.4743


7182it [03:16, 36.62it/s]


Epoch [2/10], Loss: 0.4221


7182it [03:13, 37.19it/s]


Epoch [3/10], Loss: 0.3739


7182it [03:07, 38.34it/s]


Epoch [4/10], Loss: 0.3441


7182it [03:12, 37.36it/s]


Epoch [5/10], Loss: 0.3192


7182it [03:12, 37.28it/s]


Epoch [6/10], Loss: 0.2914


7182it [03:10, 37.72it/s]


Epoch [7/10], Loss: 0.2699


7182it [03:13, 37.05it/s]


Epoch [8/10], Loss: 0.2438


7182it [03:13, 37.07it/s]


Epoch [9/10], Loss: 0.2317


7182it [03:14, 36.84it/s]

Epoch [10/10], Loss: 0.2235





CustomRNN(
  (embeddings): Embedding(15794, 1024)
  (lstm): LSTM(1024, 512, num_layers=3, batch_first=True)
  (relu): LeakyReLU(negative_slope=0.01)
)

In [18]:
# One predicted label per test pair is appended to this list by the evaluation loop.
predictions = []

In [19]:
# Map the -1 training labels back to 0 so the targets are 0/1 for the metric functions.
target_test = [0 if label == -1 else label for label in target_test]

In [20]:
# Start from an empty list so that re-running this cell cannot append a
# second copy of the predictions.
predictions = []

# Inference only: no_grad() avoids building autograd graphs for every pair.
with torch.no_grad():
    pairs = zip(features_test["query"], features_test["context"])
    for query_ids, context_ids in tqdm(pairs, total=len(features_test)):
        query_vec = model(query_ids.to(device)).cpu().numpy().reshape(1, -1)
        context_vec = model(context_ids.to(device)).cpu().numpy().reshape(1, -1)
        similarity = cosine_similarity(query_vec, context_vec)[0, 0]
        # Predict 1 when the similarity exceeds 0.5. This matches round() for
        # every similarity >= -0.5, but never yields -1 (round() did for
        # similarities below -0.5), which is not a valid 0/1 label.
        predictions.append(int(similarity > 0.5))

2394it [00:18, 129.93it/s]


In [21]:
accuracy_score = accuracy_score(target_test, predictions)
recall = recall_score(target_test, predictions)
f1 = f1_score(target_test, predictions)
precision = precision_score(target_test, predictions)

In [24]:
print(f"""Accuracy: {accuracy_score:.4f}
Precision: {precision:.4f}
Recall: {recall:.4f}
F1: {f1:.4f}""")

Accuracy: 0.6876
Precision: 0.6755
Recall: 0.7119
F1: 0.6932


In [23]:
# import pickle
# with open('model.pkl', 'wb') as f:
#     pickle.dump(model, f)
#     f.close()