In [23]:
from pathlib import Path

import pandas as pd
from tqdm import tqdm
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, random_split

from src.models import Tokenizer, Vocabulary, VectorizerFactory
from src.data.datasets import SparseDatasetFactory, DenseDatasetFactory

In [3]:
# The project root is the parent of the current working directory; the
# converted CSV splits are read from data/converted underneath it.
BASE_DIR = Path().resolve().parent
CONVERTED = "data/converted"

converted_dir = BASE_DIR / CONVERTED
df_train = pd.read_csv(converted_dir / "train.csv")
df_test = pd.read_csv(converted_dir / "test.csv")

In [4]:
# Tunable text-preprocessing settings, kept together so they are easy to find.
MAX_DF = 0.8         # forwarded to Vocabulary as max_doc_freq
MIN_COUNT = 5        # forwarded to Vocabulary as min_count
MIN_TOKEN_SIZE = 4   # forwarded to Tokenizer as min_token_size

vocabulary = Vocabulary(
    max_doc_freq=MAX_DF,
    min_count=MIN_COUNT,
)
tokenizer = Tokenizer(
    min_token_size=MIN_TOKEN_SIZE,
)

In [5]:
# Tokenize both splits; the vocabulary is built from the training split only.
train_texts = list(df_train["text"])
test_texts = list(df_test["text"])

tokenized_texts_train = tokenizer.tokenize_corpus(train_texts)
tokenized_texts_test = tokenizer.tokenize_corpus(test_texts)

vocabulary.build(tokenized_texts_train)

In [6]:
# Single switch shared by the vectorizer factory and the dataset factory:
# True selects SparseDatasetFactory, False selects DenseDatasetFactory.
use_sparse: bool = True

In [7]:
# Collect the vectorizer settings in one place, then let the factory build
# the vectorizer that matches them.
vectorizer_options = {
    "mode": "tfidf",
    "scale": "minmax",
    "use_sparse": use_sparse,
}
vectorizer_factory = VectorizerFactory(vocabulary, **vectorizer_options)
vectorizer = vectorizer_factory.get_vectorizer()

In [8]:
# Vectorize the tokenized texts (train first, then test) with the same vectorizer.
train_vectors, test_vectors = [
    vectorizer.vectorize(tokenized_split)
    for tokenized_split in (tokenized_texts_train, tokenized_texts_test)
]

# Targets are the "label_index" column of each frame as NumPy arrays.
train_targets, test_targets = [
    frame["label_index"].to_numpy() for frame in (df_train, df_test)
]

In [10]:
# Pick the factory class that matches the vectorizer output, then instantiate it.
if use_sparse:
    dataset_factory_cls = SparseDatasetFactory
else:
    dataset_factory_cls = DenseDatasetFactory
dataset_factory = dataset_factory_cls()

# Wrap (vectors, targets) pairs into datasets; `main_dataset` is split into
# train/validation later, `test_dataset` is used as-is.
main_dataset = dataset_factory.create_dataset(train_vectors, train_targets)
test_dataset = dataset_factory.create_dataset(test_vectors, test_targets)

In [12]:
# Hold out 20% of the training data for validation. The explicitly seeded
# generator makes the split identical on every run, so validation scores stay
# comparable across kernel restarts instead of depending on global RNG state.
SPLIT_SEED = 42
train_dataset, val_dataset = random_split(
    main_dataset,
    [0.8, 0.2],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [38]:
# Model / optimisation hyperparameters.
input_size = len(vocabulary)  # one input feature per vocabulary entry
# Size the output layer from the largest label index rather than from the
# number of distinct labels: nn.CrossEntropyLoss requires every target to be
# in [0, num_classes - 1], which the distinct-label count only guarantees
# when the label indices are contiguous and start at 0.
num_classes = int(train_targets.max()) + 1
learning_rate = 3e-4
batch_size = 32
num_epochs = 30

In [39]:
# Only the training loader shuffles; the validation and test loaders iterate
# their datasets in order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = [
    DataLoader(held_out, batch_size=batch_size, shuffle=False)
    for held_out in (val_dataset, test_dataset)
]

In [40]:
# Always build a torch.device (previously the CPU branch produced a plain
# string), so `device` has the same type and attributes on every machine.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# A single linear layer mapping the vectorized text to one score per class.
model = nn.Linear(input_size, num_classes).to(device)

In [41]:
# Adam over all model parameters, trained against a cross-entropy objective.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [42]:
# Mini-batch training loop: `num_epochs` full passes over `train_loader`.
# Explicitly enter training mode so the loop behaves the same even if an
# evaluation cell was executed earlier in the session.
model.train()

for epoch in range(num_epochs):
    # Label each progress bar with its epoch so the output is readable.
    epoch_batches = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}")

    for data, targets in epoch_batches:
        # Move the batch to the same device as the model.
        data = data.to(device)
        targets = targets.to(device)

        # Forward pass and loss.
        scores = model(data)
        loss = criterion(scores, targets)

        # Clear stale gradients, backpropagate, and update the parameters.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

100%|██████████| 283/283 [00:00<00:00, 349.85it/s]
100%|██████████| 283/283 [00:00<00:00, 328.76it/s]
100%|██████████| 283/283 [00:00<00:00, 335.78it/s]
100%|██████████| 283/283 [00:00<00:00, 325.67it/s]
100%|██████████| 283/283 [00:00<00:00, 333.49it/s]
100%|██████████| 283/283 [00:00<00:00, 326.13it/s]
100%|██████████| 283/283 [00:00<00:00, 306.96it/s]
100%|██████████| 283/283 [00:00<00:00, 293.15it/s]
100%|██████████| 283/283 [00:00<00:00, 343.00it/s]
100%|██████████| 283/283 [00:00<00:00, 342.03it/s]
100%|██████████| 283/283 [00:00<00:00, 339.43it/s]
100%|██████████| 283/283 [00:00<00:00, 335.18it/s]
100%|██████████| 283/283 [00:00<00:00, 352.16it/s]
100%|██████████| 283/283 [00:00<00:00, 315.96it/s]
100%|██████████| 283/283 [00:00<00:00, 328.39it/s]
100%|██████████| 283/283 [00:00<00:00, 334.91it/s]
100%|██████████| 283/283 [00:00<00:00, 325.67it/s]
100%|██████████| 283/283 [00:00<00:00, 315.12it/s]
100%|██████████| 283/283 [00:01<00:00, 273.23it/s]
100%|██████████| 283/283 [00:00

In [27]:
def check_accuracy(loader, model):
    """Compute the classification accuracy of ``model`` over ``loader``.

    The model is put into evaluation mode for the duration of the call and
    is returned to whichever mode (train/eval) it was in beforehand.

    Args:
        loader: Iterable yielding ``(inputs, targets)`` batches of tensors.
        model: ``torch.nn.Module`` producing per-class scores with the class
            axis at dimension 1.

    Returns:
        A 0-dim tensor with the fraction of samples whose highest-scoring
        class equals the target.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    # Evaluate on the device the model actually lives on instead of relying
    # on a notebook-level `device` variable. A model without parameters
    # leaves the batches where they are.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    num_correct = 0
    num_samples = 0

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for x, y in loader:
                if model_device is not None:
                    x = x.to(model_device)
                    y = y.to(model_device)

                scores = model(x)
                predictions = scores.argmax(dim=1)

                num_correct += (predictions == y).sum()
                num_samples += predictions.size(0)
    finally:
        # Restore the caller's mode even if a batch raised.
        model.train(was_training)

    if num_samples == 0:
        raise ValueError("Cannot compute accuracy: loader yielded no samples.")

    return num_correct / num_samples

In [43]:
# Report accuracy as a percentage for each split, in train / val / test order.
model.to(device)

train_accuracy = check_accuracy(train_loader, model)
print(f"Accuracy on training set: {train_accuracy*100:.2f}")

val_accuracy = check_accuracy(val_loader, model)
print(f"Accuracy on val set: {val_accuracy*100:.2f}")

test_accuracy = check_accuracy(test_loader, model)
print(f"Accuracy on test set: {test_accuracy*100:.2f}")

Accuracy on training set: 85.37
Accuracy on val set: 67.90
Accuracy on test set: 50.52
