In [None]:
! pip install torcheval

Collecting torcheval
  Downloading torcheval-0.0.7-py3-none-any.whl (179 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.2/179.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torcheval
Successfully installed torcheval-0.0.7


### Import libraries

In [None]:
import os
import pickle
import shutil

import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.optim import Adam, AdamW
from torcheval.metrics.functional import multiclass_accuracy, multiclass_auroc, multiclass_recall

In [None]:
# Attach Google Drive to the Colab VM; the project paths used below live under this mount.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Locations on the Colab VM's own disk.
data_path = "/content/data/"
nltk_folder_path = "/content/lmtools/"
file_path = os.path.join(data_path, "reviews_train.csv")

# Locations on the mounted Google Drive.
root_path = "/content/drive/MyDrive/Deep_learning_course/Text_processing/"
project_path = os.path.join(root_path, "Classify_Reviews")

In [None]:
external_lmtool_path = os.path.join(root_path, "nltk_data.zip")
local_lmtool_path = nltk_folder_path
!unzip -q $external_lmtool_path -d $local_lmtool_path

In [None]:
import nltk

# Put the locally unpacked nltk_data directory at the front of NLTK's resource search list.
nltk_path = os.path.join(nltk_folder_path, "nltk_data")
nltk.data.path[:0] = [nltk_path]  # in-place prepend, same effect as insert(0, ...)

In [None]:
import sys

# The project's helper modules sit in <project>/src on Drive; that folder has to be on
# sys.path before any of them can be imported.
lib_path = os.path.join(project_path, "src")
sys.path.append(lib_path)

from data_loader import dataloader_reviews_train_eval
from embedding_dictionary import EmbeddingDictionary
from embedding_sequence import SequenceFromText
from fit_func import FitTrainEval
from sequencial_model import SequencialModel
# from data_set import DatasetReviews

### Create dictionary

In [None]:
# Mirror the project's data folder from Drive into the local data directory.
external_data_path = os.path.join(project_path, "data/")
local_data_path = data_path
# shutil.copytree replaces `cp -r <src> -d <dst>`: cp's -d flag is not a destination switch
# (it only affects symlink handling), and once <dst> exists a second cp run nests the copy
# as <dst>/data/. With dirs_exist_ok=True (Python 3.8+) the files always land directly in
# local_data_path, so the cell can be re-run safely.
shutil.copytree(external_data_path, local_data_path, dirs_exist_ok=True)

In [None]:
# Dictionary builder over the training reviews file.
# NOTE(review): unk_cutoff=3 is presumably the frequency threshold below which words are
# treated as unknown — confirm against EmbeddingDictionary.
embedding_maker = EmbeddingDictionary(text_path=file_path, unk_cutoff=3)

In [None]:
# Build the word dictionary. Later cells use its length as the vocabulary size and look up
# its "unk" entry for the model's padding index.
w2v_dict = embedding_maker.create_embedding_dict()

In [None]:
# Save the dictionary as a pickle inside the project's data folder on Drive.
w2v_dict_path = os.path.join(external_data_path, "w2v_dict.pkl")
with open(w2v_dict_path, mode="wb") as dict_file:
    pickle.dump(w2v_dict, dict_file)

### Create sequences

In [None]:
# Passed to SequenceFromText as series_length.
# NOTE(review): presumably the fixed number of tokens kept per review — confirm.
sequence_length = 52

In [None]:
# Sequence builder over the same reviews file, using the dictionary and length set above.
sequence_maker = SequenceFromText(
    text_path=file_path, word_dictionary=w2v_dict, series_length=sequence_length
)

In [None]:
# Build the sequences from the reviews file; they are the input to the data-loader helper below.
sequences = sequence_maker.create_sequence()

### Load data

In [None]:
# --- reproducibility ---
# Seed torch's global RNG so that torch-driven randomness (e.g. weight initialisation)
# repeats across Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

# --- data parameters ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocab_size = len(w2v_dict)
batch_size = 5_000
# Do not request more loader workers than the host has CPU cores (small cloud VMs may
# expose only a couple); on machines with 8 or more cores this is still 8.
num_workers = min(8, os.cpu_count() or 1)

# --- model parameters ---
models_dir = os.path.join(project_path, "models/")
# File names are kept apart from full paths; model_load used to hold first one, then the other.
model_load_name = "sequential_multiple_layers4_1.pkl"
model_save = "sequential_multiple_layers4_2.pkl"
model_load = os.path.join(models_dir, model_load_name)  # checkpoint for the optional warm start
model_path = os.path.join(models_dir, model_save)  # handed to FitTrainEval as path_save
metrics = [multiclass_accuracy]
EPOCHS = 500

In [None]:
# Collect the loader settings in one place, then unpack the helper's two results into the
# training and evaluation data objects.
loader_kwargs = dict(
    sequences=sequences,
    vocab_size=vocab_size,
    batch_size=batch_size,
    num_workers=num_workers,
    device=device,
)
data_train, data_eval = dataloader_reviews_train_eval(**loader_kwargs)

### Train models

In [None]:
# Instantiate the classifier: 300-wide embeddings and hidden state, 4 layers, 5 output classes.
# NOTE(review): the "unk" entry of the dictionary is also used as padding_idx — confirm that
# unknown words and padding are really meant to share one index.
model = SequencialModel(
    vocab_size=vocab_size,
    embedding_dim=300,
    hidden_size=300,
    num_layers=4,
    num_classes=5,
    padding_idx=w2v_dict["unk"],
)
# Optional warm start from an earlier checkpoint (disabled):
# model.load_state_dict(torch.load(model_load))
model = model.to(device)

In [None]:
# Cross-entropy loss, and AdamW over all model parameters with a fixed learning rate of 1e-5.
loss_fn = nn.CrossEntropyLoss()
optimizer = AdamW(params=model.parameters(), lr=1e-5)

In [None]:
# Bundle model, loss, optimizer and metrics into the project's trainer; path_save is the
# checkpoint location chosen in the configuration cell.
optimization = FitTrainEval(
    model=model, loss=loss_fn, optimizer=optimizer,
    metrics=metrics, path_save=model_path, device=device,
)

In [None]:
# Run the trainer for EPOCHS epochs on the train/eval data and keep whatever it returns.
trained = optimization.fit_train_eval(data_train=data_train, data_eval=data_eval, epochs=EPOCHS)