# Read Data

In [521]:
import numpy as np
import pandas as pd

In [476]:
X = pd.read_pickle('data.pickle')
y = pd.read_pickle('labels.pickle')
with open('vocab.txt', 'r') as f:
	vocab = f.read().split(" ")
	vocab.pop()

In [477]:
len(X), len(y), len(vocab)

(2784, 2784, 9210)

# Split Data

In [478]:
import torch
from sklearn.model_selection import train_test_split

In [479]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=777)
train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, test_size=0.15, random_state=777)

In [480]:
y

array([[1, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 1, 0, 0],
       ...,
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1]], dtype=int32)

In [481]:
train_X.shape, val_X.shape, test_X.shape, 


((1892,), (335,), (557,))

In [482]:
train_y.shape, val_y.shape, test_y.shape

((1892, 4), (335, 4), (557, 4))

In [483]:
from torch.utils.data import Dataset, DataLoader
class ClassificationDataset(Dataset):
	def __init__(self, X, y):
		self.X = X
		self.y = y
	def __len__(self):
		return len(self.X)
	def __getitem__(self, idx):
		return torch.tensor(self.X[idx], dtype=torch.int32), torch.tensor(self.y[idx], dtype=torch.float)


In [484]:
train_dataset = ClassificationDataset(train_X, train_y)
val_dataset = ClassificationDataset(val_X, val_y)
test_dataset = ClassificationDataset(test_X, test_y)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# RNN

In [485]:
from torch import nn
from torch.optim import Adam
from torch.functional import F

In [639]:
class LSTM(nn.Module):
	def __init__(self,
			  vocab_size,
			  embedding_dim,
			  hidden_dim,
			  num_layers,
			  num_classes,
			  max_len,
			  bidirectional,
	):
		super().__init__()
		# Embeddings, which can be pretrained or normally trained
		self.embeddings = nn.Embedding(
			num_embeddings=vocab_size,
			embedding_dim=embedding_dim
		)
		# LSTM Layer
		self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, bidirectional=bidirectional)
		# Linear Layer
		if bidirectional:
			self.linear = nn.Linear(max_len * hidden_dim * 2, num_classes)
		else:
			self.linear = nn.Linear(max_len * hidden_dim, num_classes)
		self.softmax = nn.Softmax()
	def forward(self, x):
		# print(x.shape)
		embeds = self.embeddings(x)
		lstm_out, _ = self.lstm(embeds)
		linear = self.linear(lstm_out.reshape(lstm_out.shape[0], -1))
		return linear

# Train & Validate 

In [640]:
from tqdm import tqdm

In [641]:
def train_val(
		model: LSTM,
		optim: Adam,
		criterion: nn.CrossEntropyLoss,
		epochs: int,
		train_dataloader: DataLoader,
		val_dataloader: DataLoader,
		device
	):
	model.to(device)
	for epoch in tqdm(range(epochs)):
		model.train()
		train_loss = 0
		train_correct = 0
		train_total = 0
		for train_X, train_y in train_dataloader:
			train_X, train_y = train_X.to(device), train_y.to(device)

			y_preds = model(train_X)
			loss = criterion(y_preds, train_y)  # FIXED

			optim.zero_grad()
			loss.backward()
			optim.step()

			train_loss += loss.item()
			predicted = torch.argmax(y_preds, dim=1)
			train_correct += (predicted == np.argmax(train_y, axis=1)).sum().item()  # FIXED
			train_total += train_y.size(0)

		# Validation
		model.eval()
		val_loss = 0
		val_correct = 0
		val_total = 0
		with torch.no_grad():
			for val_X, val_y in val_dataloader:
				val_X, val_y = val_X.to(device), val_y.to(device)

				y_preds = model(val_X)
				loss = criterion(y_preds, val_y)
				val_loss += loss.item()
				predicted = torch.argmax(y_preds, dim=1)
				val_correct += (predicted == np.argmax(val_y, axis=1)).sum().item()
				val_total += val_y.size(0)

		print(
			f"Epoch {epoch+1}/{epochs}, "
			f"Train Loss: {train_loss/len(train_dataloader):.4f}, "
			f"Val Loss: {val_loss/len(val_dataloader):.4f}, "
			f"Train Acc: {100 * train_correct/train_total:.2f}%, "
			f"Val Acc: {100 * val_correct/val_total:.2f}%"
		)

In [667]:
import itertools
from torcheval.metrics.functional import multiclass_f1_score

def grid_search(train_loader, val_loader, criterion, param_grid, vocab_size, num_classes, max_len, device='cpu'):
	results = []

	keys = list(param_grid.keys())
	for values in itertools.product(*param_grid.values()):
		params = dict(zip(keys, values))
		print(f"Training with params: {params}")

		# Model
		model = LSTM(
			vocab_size=vocab_size,
			embedding_dim=params['embedding_dim'],
			hidden_dim=params['hidden_dim'],
			num_layers=params['num_layers'],
			num_classes=num_classes,
			max_len=max_len,
			bidirectional=params['bidirectional']
		).to(device)
		optimizer = Adam(params=model.parameters())
		# Training
		model.train()
		for epoch in range(params['epochs']):
			for X_batch, y_batch in train_loader:
				X_batch, y_batch = X_batch.to(device), y_batch.to(device)
				optimizer.zero_grad()
				y_preds = model(X_batch)
				loss = criterion(y_preds, y_batch)
				loss.backward()
				optimizer.step()

		# Validation
		model.eval()
		y_preds_list = []
		y_true_list = []
		with torch.no_grad():
			for X_batch, y_batch in val_loader:
				X_batch, y_batch = X_batch.to(device), y_batch.to(device)
				y_preds = model(X_batch)
				y_preds_list.extend(y_preds.numpy())
				y_true_list.extend(torch.argmax(y_batch, dim=1))

		f1 = multiclass_f1_score(torch.Tensor(y_preds_list), torch.Tensor(y_true_list), num_classes=4)
		print(f"Validation accuracy: {f1:.4f}")
		results.append((params, f1))

	return sorted(results, key=lambda x: x[1], reverse=True)


In [668]:
param_grid = {
	'embedding_dim': [64, 128],
	'hidden_dim': [64, 128],
	'num_layers': [1, 2],
	'bidirectional': [True, False],
	'lr': [0.001],
	'batch_size': [32],
	'epochs': [3]
}

best = grid_search(train_dataloader, val_dataloader, nn.CrossEntropyLoss(), param_grid,
				   vocab_size=len(vocab),
				   num_classes=4,
				   max_len=37,
				   device='cuda' if torch.cuda.is_available() else 'cpu')

print("Best config:", best[0])


Training with params: {'embedding_dim': 64, 'hidden_dim': 64, 'num_layers': 1, 'bidirectional': True, 'lr': 0.001, 'batch_size': 32, 'epochs': 3}
Validation accuracy: 0.5642
Training with params: {'embedding_dim': 64, 'hidden_dim': 64, 'num_layers': 1, 'bidirectional': False, 'lr': 0.001, 'batch_size': 32, 'epochs': 3}
Validation accuracy: 0.5433
Training with params: {'embedding_dim': 64, 'hidden_dim': 64, 'num_layers': 2, 'bidirectional': True, 'lr': 0.001, 'batch_size': 32, 'epochs': 3}
Validation accuracy: 0.5522
Training with params: {'embedding_dim': 64, 'hidden_dim': 64, 'num_layers': 2, 'bidirectional': False, 'lr': 0.001, 'batch_size': 32, 'epochs': 3}
Validation accuracy: 0.5433
Training with params: {'embedding_dim': 64, 'hidden_dim': 128, 'num_layers': 1, 'bidirectional': True, 'lr': 0.001, 'batch_size': 32, 'epochs': 3}
Validation accuracy: 0.5075
Training with params: {'embedding_dim': 64, 'hidden_dim': 128, 'num_layers': 1, 'bidirectional': False, 'lr': 0.001, 'batch_siz

# Test

In [539]:
from sklearn.metrics import classification_report

In [550]:
def test(
		model: LSTM,
		optim: Adam,
		criterion: nn.CrossEntropyLoss,
		test_dataloader: DataLoader,
		device
	):
	model.to(device)
	model.eval()
	test_loss = 0
	test_correct = 0
	test_total = 0
	y_preds_list = []
	with torch.no_grad():
		for test_X, test_y in test_dataloader:
			test_X, test_y = test_X.to(device), test_y.to(device)

			y_preds = model(test_X)
			y_preds_list.extend(y_preds.numpy())
			loss = criterion(test_y, y_preds)
			test_loss += loss.item()
			predicted = torch.argmax(y_preds, dim=1)
			test_correct += (predicted == np.argmax(test_y, axis=1)).sum().item()
			test_total += test_y.size(0)

	print(
		f"test Loss: {test_loss/len(test_dataloader):.4f}, "
		f"test Acc: {100 * test_correct/test_total:.2f}%"
	)
	return y_preds_list

In [551]:
y_preds=test(model, optim, criterion, test_dataloader, 'cpu')

test Loss: 1.1790, test Acc: 56.55%


In [554]:
print(classification_report(np.argmax(test_y, axis=1), np.argmax(y_preds, axis=1)))

              precision    recall  f1-score   support

           0       0.58      0.60      0.59       131
           1       0.32      0.32      0.32        87
           2       0.00      0.00      0.00        82
           3       0.63      0.81      0.71       257

    accuracy                           0.57       557
   macro avg       0.38      0.43      0.40       557
weighted avg       0.47      0.57      0.51       557

