In [25]:
import sys
from pathlib import Path

# Project root, taken as the parent of the current working directory
# (i.e. the notebook is expected to be started from a sub-folder of the repo).
BASE_DIR = Path().resolve().parent
SRC_PATH = BASE_DIR / "src"

# Make the project package importable. The membership check keeps the cell
# idempotent: re-running it does not append duplicate entries to sys.path.
if str(SRC_PATH) not in sys.path:
    sys.path.append(str(SRC_PATH))

In [None]:
from clearml import Task
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter

from text_classifier.data.text_processing import (
    Tokenizer,
    Vocabulary,
    VectorizerFactory,
)
from text_classifier.data.datasets import SparseDatasetFactory, DenseDatasetFactory
from text_classifier.utils import load_params

In [None]:
# Path components used below to build config, data, log and model locations.
PACKAGE = "text_classifier"  # package folder under SRC_PATH
DATA = "data"  # data folder under BASE_DIR
MODELS = "models"  # sub-folder for saved model weights inside an experiment
INTERIM = "interim"  # sub-folder of DATA holding train.csv / test.csv
EXPS = "experiments"  # folder name used for both experiment configs and outputs
CONFIG = "config"  # config folder name; also the suffix of the config file name
LOGS = "logs"  # sub-folder for TensorBoard logs inside an experiment

In [None]:
# Experiment identifier: selects the config file and the experiment output folder.
EXP = "exp01"
# Reads SRC_PATH/text_classifier/config/experiments/exp01_config.yaml.
params = load_params(str(SRC_PATH / PACKAGE / CONFIG / EXPS / f"{EXP}_{CONFIG}.yaml"))

In [2]:
# Register this run as a ClearML task.
task = Task.init(
    project_name="News Classification",
    task_name="torch nn.Linear simple example",
    output_uri=True,  # NOTE(review): presumably uploads outputs to the default ClearML storage — confirm
)
# Attach the parameters loaded from the experiment config to the task.
task.set_parameters(params)

ClearML Task: overwriting (reusing) task id=c9685da79129424e9ccca6f63fc69efa
2023-07-16 01:58:57,237 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/85843605c9c34843adfa28f6f4b428bd/experiments/c9685da79129424e9ccca6f63fc69efa/output/log


In [3]:
# TensorBoard writer; logs go to BASE_DIR/experiments/<EXP>/logs.
writer = SummaryWriter(log_dir=str(BASE_DIR / EXPS / EXP / LOGS))

In [7]:
# Text-processing settings forwarded to the project's Vocabulary / Tokenizer.
# NOTE(review): presumably MAX_DF is a maximum document-frequency ratio,
# MIN_COUNT a minimum occurrence count and MIN_TOKEN_SIZE a minimum token
# length — confirm against text_classifier.data.text_processing.
MAX_DF = 0.8
MIN_COUNT = 5
MIN_TOKEN_SIZE = 4

vocabulary = Vocabulary(max_doc_freq=MAX_DF, min_count=MIN_COUNT)
tokenizer = Tokenizer(min_token_size=MIN_TOKEN_SIZE)

In [8]:
# Load the interim train / test tables.
df_train = pd.read_csv(BASE_DIR / DATA / INTERIM / "train.csv")
df_test = pd.read_csv(BASE_DIR / DATA / INTERIM / "test.csv")

# Tokenize the "text" column of each table.
tokenized_texts_train = tokenizer.tokenize_corpus(df_train["text"].tolist())
tokenized_texts_test = tokenizer.tokenize_corpus(df_test["text"].tolist())

# The vocabulary is built from the training corpus only.
vocabulary.build(tokenized_texts_train)

In [9]:
# Switch between sparse and dense features; read by the vectorizer factory
# and by the dataset-factory selection further down.
use_sparse = True

In [10]:
# Build the vectorizer from the fitted vocabulary.
# NOTE(review): "tfidf" / "minmax" presumably select TF-IDF weighting with
# min-max scaling — confirm against VectorizerFactory.
vectorizer_factory = VectorizerFactory(
    vocabulary, mode="tfidf", scale="minmax", use_sparse=use_sparse
)
vectorizer = vectorizer_factory.get_vectorizer()

In [11]:
# Turn the tokenized texts into feature vectors.
train_vectors = vectorizer.vectorize(tokenized_texts_train)
test_vectors = vectorizer.vectorize(tokenized_texts_test)

# Class targets come from the "label_index" column, as NumPy arrays.
train_targets = df_train["label_index"].to_numpy()
test_targets = df_test["label_index"].to_numpy()

In [12]:
# Pick the dataset factory that matches the feature representation chosen above.
dataset_factory = SparseDatasetFactory() if use_sparse else DenseDatasetFactory()

# main_dataset is split into train / validation later; test_dataset is kept as is.
main_dataset = dataset_factory.create_dataset(train_vectors, train_targets)
test_dataset = dataset_factory.create_dataset(test_vectors, test_targets)

In [13]:
# Hold out 20% of the training data for validation. A dedicated, seeded
# generator makes the split identical across kernel restarts, so runs are
# comparable and "Restart & Run All" reproduces the same partition.
SPLIT_SEED = 42
train_dataset, val_dataset = random_split(
    main_dataset,
    [0.8, 0.2],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [14]:
# Model input width: one feature per vocabulary entry.
input_size = len(vocabulary)
# Number of distinct labels present in the training targets.
# NOTE(review): this matches the required number of output units only if
# label_index values are contiguous 0..K-1 and all occur in train — confirm.
num_classes = len(set(train_targets))

In [15]:
# Training hyper-parameters, read from the experiment config loaded above.
learning_rate = params["learning_rate"]
batch_size = params["batch_size"]
num_epochs = params["num_epochs"]

In [17]:
# One DataLoader per stage. Only the training loader is shuffled.
# train_eval_model trains on "train" and evaluates every entry of this dict
# after each epoch, using the key as the metric tag suffix.
loaders = {
    "train": DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True),
    "val": DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False),
    "test": DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False),
}

In [18]:
# Always a torch.device (previously a plain str on CPU-only machines), so
# downstream code can rely on attributes such as `device.type`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Single linear layer mapping the feature vector to one score per class,
# placed on the selected device.
model = nn.Linear(input_size, num_classes).to(device)

In [19]:
# Cross-entropy over the raw class scores produced by the model.
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters, with the learning rate from the config.
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [20]:
def check_loss_accuracy(loader, model, criterion, device=None):
    """Evaluate ``model`` on every batch of ``loader``.

    Args:
        loader: Iterable yielding ``(inputs, targets)`` batches.
        model: ``torch.nn.Module`` producing per-class scores with classes
            along dimension 1.
        criterion: Callable ``criterion(scores, targets)`` returning a scalar
            loss tensor.
        device: Device the batches are moved to. When ``None`` (default) the
            device of the model's first parameter is used (CPU if the model
            has no parameters), so the function no longer depends on a
            notebook-global ``device`` variable.

    Returns:
        Tuple ``(accuracy, mean_loss)`` of tensors: the fraction of correctly
        predicted samples, and the per-batch loss averaged over the number of
        batches.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    num_correct = 0
    num_samples = 0
    num_batches = 0
    loss = 0.0

    # Remember the current mode so evaluation does not silently flip a model
    # that was deliberately left in eval mode back to train mode.
    was_training = model.training
    model.eval()

    try:
        with torch.no_grad():
            for x, y in loader:
                x = x.to(device)
                y = y.to(device)

                scores = model(x)
                _, predictions = scores.max(1)

                loss += criterion(scores, y)
                num_correct += (predictions == y).sum()
                num_samples += predictions.size(0)
                num_batches += 1
    finally:
        # Restore the original mode even if a batch raised.
        model.train(was_training)

    if num_samples == 0:
        raise ValueError("check_loss_accuracy: loader yielded no samples")

    return num_correct / num_samples, loss / num_batches

In [21]:
def train_eval_model(
    model,
    loaders,
    num_epochs,
    criterion,
    optimizer,
    device="cpu",
    save_model_path=None,
    writer=None,
):
    """Train ``model`` and report loss / accuracy for every stage after each epoch.

    Args:
        model: ``torch.nn.Module`` to optimise. It must already live on
            ``device``; this function only moves the batches.
        loaders: Mapping from stage name to data loader. The ``"train"`` entry
            is used for optimisation; every entry (including ``"train"``) is
            evaluated with ``check_loss_accuracy`` at the end of each epoch.
        num_epochs: Number of passes over ``loaders["train"]``.
        criterion: Loss callable ``criterion(scores, targets)``.
        optimizer: Optimizer bound to the model's parameters.
        device: Device the training batches are moved to.
        save_model_path: Where to store ``model.state_dict()`` after training;
            nothing is saved when ``None``. For ``str`` / ``Path`` values the
            parent directory is created if missing.
        writer: Object with an ``add_scalar(tag, value, step)`` method (e.g. a
            TensorBoard ``SummaryWriter``). When ``None`` the metrics are
            printed instead.
    """
    for epoch in tqdm(range(num_epochs)):
        # Make sure layers such as dropout / batch-norm are in training mode,
        # regardless of the state the model was passed in.
        model.train()

        for data, targets in loaders["train"]:
            data = data.to(device)
            targets = targets.to(device)

            scores = model(data)
            loss = criterion(scores, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Per-epoch metrics for every stage, tagged "Acc/<stage>" and "Loss/<stage>".
        for stage, loader in loaders.items():
            acc, stage_loss = check_loss_accuracy(loader, model, criterion)
            if writer is not None:
                writer.add_scalar(f"Acc/{stage}", acc, epoch)
                writer.add_scalar(f"Loss/{stage}", stage_loss, epoch)
            else:
                print(f"Acc/{stage}", acc.item(), epoch)
                print(f"Loss/{stage}", stage_loss.item(), epoch)

    if save_model_path is not None:
        # Create the target folder up front so a missing directory does not
        # discard the whole training run at the very last step. File-like
        # objects accepted by torch.save are passed through untouched.
        if isinstance(save_model_path, (str, Path)):
            Path(save_model_path).parent.mkdir(parents=True, exist_ok=True)
        torch.save(model.state_dict(), save_model_path)

In [22]:
# Run training; weights are written to BASE_DIR/experiments/<EXP>/models/linear.pt.
# The keyword was previously misspelled as `laoders`, which raises
# TypeError (unexpected keyword argument) on a fresh kernel.
train_eval_model(
    model=model,
    loaders=loaders,
    num_epochs=num_epochs,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    save_model_path=str(BASE_DIR / EXPS / EXP / MODELS / "linear.pt"),
    writer=writer,
)

100%|██████████| 50/50 [02:10<00:00,  2.62s/it]


In [None]:
# Flush pending TensorBoard events to disk.
writer.flush()