In [1]:
from pathlib import Path

import pandas as pd
from tqdm import tqdm
import torch
from torch import nn, optim
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader, random_split

from src.models import Tokenizer, Vocabulary, VectorizerFactory
from src.data.datasets import SparseDatasetFactory, DenseDatasetFactory

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# TensorBoard event writer used by the training loop below. No log_dir is
# passed, so the library's default location is used.
writer = SummaryWriter()

In [3]:
# Project root: the parent of the resolved current working directory.
BASE_DIR = Path().resolve().parent
# Folder holding the converted CSV splits, relative to BASE_DIR.
CONVERTED = "data/converted"

# read_csv accepts path objects directly, so the paths are passed as-is.
df_train = pd.read_csv(BASE_DIR / CONVERTED / "train.csv")
df_test = pd.read_csv(BASE_DIR / CONVERTED / "test.csv")

In [4]:
# Tokenizer setting, forwarded as `min_token_size`.
# NOTE(review): presumably the minimum token length kept - confirm in src.models.
MIN_TOKEN_SIZE = 4
tokenizer = Tokenizer(min_token_size=MIN_TOKEN_SIZE)

# Vocabulary settings, forwarded as `max_doc_freq` and `min_count`.
# NOTE(review): presumably document-frequency / occurrence cut-offs - confirm in src.models.
MAX_DF = 0.8
MIN_COUNT = 5
vocabulary = Vocabulary(
    max_doc_freq=MAX_DF,
    min_count=MIN_COUNT,
)

In [5]:
# Tokenize the "text" column of both splits with the same tokenizer
# (train first, then test).
tokenized_texts_train, tokenized_texts_test = (
    tokenizer.tokenize_corpus(list(frame["text"])) for frame in (df_train, df_test)
)

# The vocabulary is built from the training split only.
vocabulary.build(tokenized_texts_train)

In [6]:
# Single switch read by the vectorizer factory and by the dataset-factory
# selection in the following cells.
use_sparse = True

In [7]:
# Configure the factory once, then ask it for the concrete vectorizer.
vectorizer_factory = VectorizerFactory(
    vocabulary,
    mode="tfidf",
    scale="minmax",
    use_sparse=use_sparse,
)
vectorizer = vectorizer_factory.get_vectorizer()

In [8]:
# Labels: the "label_index" column of each split as a NumPy array.
train_targets = df_train["label_index"].to_numpy()
test_targets = df_test["label_index"].to_numpy()

# Features: both splits go through the same vectorizer.
train_vectors = vectorizer.vectorize(tokenized_texts_train)
test_vectors = vectorizer.vectorize(tokenized_texts_test)

In [9]:
# Pick the dataset factory that matches the vector representation chosen above.
if use_sparse:
    dataset_factory = SparseDatasetFactory()
else:
    dataset_factory = DenseDatasetFactory()

# `main_dataset` is the full training data; it is split into train/val later.
main_dataset = dataset_factory.create_dataset(train_vectors, train_targets)
test_dataset = dataset_factory.create_dataset(test_vectors, test_targets)

In [10]:
# Seed the split so train/validation membership is the same after every
# "Restart Kernel -> Run All"; without a generator the 80/20 split is redrawn
# on each run and validation metrics are not comparable between runs.
SPLIT_SEED = 42
train_dataset, val_dataset = random_split(
    main_dataset,
    [0.8, 0.2],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [11]:
# Tunable training hyperparameters.
num_epochs = 50
batch_size = 32
learning_rate = 3e-4

# Sizes derived from the data: one input feature per vocabulary entry and one
# output per distinct label value seen in the training targets.
# NOTE(review): this assumes every class occurs in train_targets - confirm.
input_size = len(vocabulary)
num_classes = len(set(train_targets))

In [12]:
# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [13]:
# Stage name -> loader. The keys become the TensorBoard tag suffixes used when
# logging per-stage metrics.
loaders = dict(train=train_loader, val=val_loader, test=test_loader)

In [14]:
# Always build a torch.device so `device` has the same type on GPU and CPU
# machines (previously it was a torch.device on CUDA but a plain str on CPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Single linear layer mapping the vectorized text to one score per class.
model = nn.Linear(input_size, num_classes).to(device)

In [15]:
# Adam over all model parameters, with the learning rate set above.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Classification loss applied to the raw model scores.
criterion = nn.CrossEntropyLoss()

In [16]:
def check_loss_accuracy(loader, model, loss_fn=None, target_device=None):
    """Evaluate ``model`` on every batch of ``loader`` without tracking gradients.

    Args:
        loader: Iterable yielding ``(inputs, targets)`` batches.
        model: Module to evaluate. It is put in eval mode for the pass and
            returned to whatever mode it was in on entry, even if a batch raises.
        loss_fn: Callable invoked as ``loss_fn(scores, targets)``. Defaults to
            the notebook-level ``criterion``.
        target_device: Device each batch is moved to. Defaults to the
            notebook-level ``device``.

    Returns:
        ``(accuracy, mean_loss)`` as 0-dim tensors: the fraction of samples
        whose highest-scoring class equals the target, and the average of the
        per-batch loss values (every batch counts equally, whatever its size).

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    # Fall back to the notebook globals only when no override is supplied.
    loss_fn = criterion if loss_fn is None else loss_fn
    target_device = device if target_device is None else target_device

    num_correct = 0
    num_samples = 0
    num_batches = 0
    loss = 0.0

    # Remember the caller's mode instead of unconditionally forcing train mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for x, y in loader:
                x = x.to(target_device)
                y = y.to(target_device)

                scores = model(x)
                _, predictions = scores.max(1)

                loss += loss_fn(scores, y)
                num_correct += (predictions == y).sum()
                num_samples += predictions.size(0)
                num_batches += 1
    finally:
        model.train(was_training)

    if num_samples == 0:
        raise ValueError("loader yielded no samples; cannot compute accuracy or loss")

    return num_correct / num_samples, loss / num_batches

In [17]:
def train_eval():
    """Train the notebook-level ``model`` and log metrics after every epoch.

    Runs ``num_epochs`` passes over ``train_loader``, taking one optimizer
    step per batch. After each epoch every loader in ``loaders`` is scored
    with ``check_loss_accuracy`` and the results are written to ``writer``
    under ``Acc/<stage>`` and ``Loss/<stage>``, with the epoch index as step.
    """
    for epoch in tqdm(range(num_epochs)):
        # Enter train mode explicitly rather than relying on the evaluation
        # helper to have left the model in it.
        model.train()

        for data, targets in train_loader:
            data = data.to(device)
            targets = targets.to(device)

            scores = model(data)
            loss = criterion(scores, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        for stage, loader in loaders.items():
            # Separate name so the evaluation loss does not shadow the
            # training-batch loss above.
            acc, stage_loss = check_loss_accuracy(loader, model)
            writer.add_scalar(f"Acc/{stage}", acc, epoch)
            writer.add_scalar(f"Loss/{stage}", stage_loss, epoch)

In [None]:
# Run the full training/evaluation loop, then flush the TensorBoard writer so
# pending events are written out.
train_eval()
writer.flush()