In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import confusion_matrix
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score
from google.colab import files
from sklearn.model_selection import ParameterGrid
import dagshub
import os
import mlflow
import mlflow.pytorch
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, f1_score, recall_score
from early_stopper import EarlyStopper
from train_eval import train, evaluate

# Connect this notebook to the DagsHub repository and enable its MLflow integration.
dagshub.init(repo_owner='MateaLukiccc', repo_name='MLOps-For-NLP', mlflow=True)

# Only fill in tracking settings that are missing from the environment, so
# credentials supplied outside the notebook (or possibly exported by
# dagshub.init itself -- TODO confirm) are never overwritten by the literals
# below. Keep real usernames/tokens out of the notebook source.
os.environ.setdefault("MLFLOW_TRACKING_USERNAME", "placeholder")
os.environ.setdefault("MLFLOW_TRACKING_PASSWORD", 'placeholder')
os.environ.setdefault("MLFLOW_TRACKING_URI", "https://dagshub.com/MateaLukiccc/MLOps-For-NLP.mlflow")

In [None]:
# Prompt for the preprocessed CSV files (Colab upload dialog), then read them.
files.upload()
df_train = pd.read_csv('preprocessed_train.csv')
df_test = pd.read_csv('preprocessed_test.csv')

# 'Text' holds the model input, 'Class' the target label.
X_train, y_train = df_train['Text'], df_train['Class']
X_test, y_test = df_test['Text'], df_test['Class']

# Quick sanity check of the split sizes.
for caption, series in (
    ("The shape of X_train is ", X_train),
    ("The shape of X_test is ", X_test),
    ("The shape of y_train is", y_train),
    ("The shape of y_test is", y_test),
):
    print(caption, series.shape)

# Plain Python lists are what the tokenizer is fed below.
train_data_texts = X_train.to_list()
test_data_texts = X_test.to_list()

# Fit the vocabulary on the training texts only; the test texts are merely
# mapped through it afterwards.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data_texts)

word_index = tokenizer.word_index
# One extra slot on top of the number of known words.
vocab_size = 1 + len(word_index)
print("Vocabulary Size:", vocab_size)

# Every sequence is padded/truncated to this fixed length.
max_length_preprocessed = 177  # example value

train_sequences = tokenizer.texts_to_sequences(train_data_texts)
test_sequences = tokenizer.texts_to_sequences(test_data_texts)
x_train = pad_sequences(train_sequences, maxlen=max_length_preprocessed)
x_test = pad_sequences(test_sequences, maxlen=max_length_preprocessed)

print("Training X Shape:", x_train.shape)
print("Testing X Shape:", x_test.shape)

# Labels as (n_samples, 1) numpy column vectors.
y_train = np.array(y_train).reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)
print("Training Y Shape:", y_train.shape)
print("Testing Y Shape:", y_test.shape)

# One-hot encode the labels against the fixed class set 0 .. num_classes - 1.
num_classes = 4
encoder = LabelBinarizer()
encoder.fit(range(num_classes))

# A label outside the fitted class set can be encoded as an all-zero row,
# which the later argmax would silently read as class 0. Fail loudly instead.
for split_name, split_labels in (("train", y_train), ("test", y_test)):
    unknown_labels = np.setdiff1d(split_labels, encoder.classes_)
    if unknown_labels.size:
        raise ValueError(
            f"Unexpected labels in the {split_name} split: {unknown_labels.tolist()}; "
            f"expected only {encoder.classes_.tolist()}"
        )

y_train_encoded = encoder.transform(y_train)
y_test_encoded = encoder.transform(y_test)

print("Training Y Shape (One-Hot Encoded):", y_train_encoded.shape)
print("Testing Y Shape (One-Hot Encoded):", y_test_encoded.shape)

# Padded token ids become int64 tensors; one-hot targets become float32 tensors.
x_train_tensor, x_test_tensor = (
    torch.tensor(token_ids, dtype=torch.long) for token_ids in (x_train, x_test)
)
y_train_tensor, y_test_tensor = (
    torch.tensor(one_hot, dtype=torch.float) for one_hot in (y_train_encoded, y_test_encoded)
)

# Pair inputs with targets so the loaders yield (text, label) batches.
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
test_dataset = TensorDataset(x_test_tensor, y_test_tensor)

# Potential batch sizes -- small: 32, 64, 128, 256; large: 512, 1024, 2048.
batch_size = 32
# Only the training data is reshuffled; the test order stays fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
class LSTMModel(nn.Module):
    """Sequence classifier: embedding -> (stacked) LSTM -> linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_size: size of the LSTM hidden state.
        output_size: number of values produced per input sequence.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return one output vector per sequence, computed from the last time step.

        The LSTM is batch-first, so ``x`` is indexed as (batch, time).
        """
        token_vectors = self.embedding(x)
        hidden_seq, _state = self.rnn(token_vectors)
        # Only the final time step of the top layer feeds the classifier head.
        last_step = hidden_seq[:, -1]
        return self.fc(last_step)
        
# Hyper-parameter search space: two candidate values for each of five settings.
# ParameterGrid expands it into every combination.
param_grid = dict(
    learning_rate=[0.01, 0.001],
    optim=[optim.Adam, optim.NAdam],
    hidden_size=[32, 64],
    num_layers=[1, 2],
    embedding_dim=[128, 256],
)
grid = ParameterGrid(param_grid)

# The device and the epoch budget do not depend on the hyper-parameters,
# so they are resolved once for the whole sweep.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
N_EPOCHS = 100  # upper bound; early stopping may end a run sooner

# Every run of the sweep is recorded under the same MLflow experiment.
mlflow.set_experiment("RNN")

for params in grid:
    # Fresh model, optimizer and early stopper for each hyper-parameter combination.
    model = LSTMModel(vocab_size, params['embedding_dim'], params['hidden_size'], num_classes, params['num_layers'])
    model.to(device)

    optimizer = params['optim'](model.parameters(), lr=params['learning_rate'])
    criterion = nn.CrossEntropyLoss()
    early_stopper = EarlyStopper(patience=3, min_delta=0)

    with mlflow.start_run():
        mlflow.log_param("embedding_dim", params['embedding_dim'])
        mlflow.log_param("hidden_size", params['hidden_size'])
        # The optimizer class name ("Adam" / "NAdam") identifies the choice.
        mlflow.log_param("optimizer", params['optim'].__name__)
        mlflow.log_param("learning_rate", params['learning_rate'])
        mlflow.log_param("num_layers", params['num_layers'])
        mlflow.log_param("epochs", N_EPOCHS)

        # NOTE(review): early stopping is driven by the loss on test_loader, so the
        # metrics reported below are not from data unseen during model selection.
        for epoch in range(N_EPOCHS):
            train_loss = train(model, train_loader, optimizer, criterion, device)
            test_loss = evaluate(model, test_loader, criterion, device)

            mlflow.log_metric("train_loss", train_loss, step=epoch)
            mlflow.log_metric("test_loss", test_loss, step=epoch)
            if early_stopper.early_stop(test_loss):
                print(f"Early stopping at epoch {epoch + 1}")
                break
            print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Test Loss: {test_loss:.3f}')

        # Collect class-index predictions and targets over the whole test set.
        model.eval()
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for text, labels in test_loader:
                text, labels = text.to(device), labels.to(device)
                predictions = model(text)
                preds = predictions.argmax(dim=1)
                all_preds.extend(preds.cpu().numpy())
                # Targets are one-hot rows; argmax recovers the class index.
                all_labels.extend(labels.argmax(dim=1).cpu().numpy())

        # Calculate metrics. Passing the full label set keeps the matrix
        # num_classes x num_classes even when a class is absent from this run.
        conf_matrix = confusion_matrix(all_labels, all_preds, labels=list(range(num_classes)))
        accuracy = accuracy_score(all_labels, all_preds)
        f1 = f1_score(all_labels, all_preds, average='weighted')
        recall = recall_score(all_labels, all_preds, average='weighted')

        print("Confusion Matrix:\n", conf_matrix)
        print("Accuracy: ", accuracy)
        print("F1 Score: ", f1)
        print("Recall: ", recall)

        # Log metrics to MLflow
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("f1_score", f1)
        mlflow.log_metric("recall", recall)

        # Plot the confusion matrix, attach it to the run, then release the
        # figure so the sweep does not accumulate one open figure per run.
        disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix)
        disp.plot(cmap='viridis')
        disp.figure_.savefig("confusion_matrix.png")
        mlflow.log_artifact("confusion_matrix.png")
        plt.show()  # render inline next to this run's printed metrics
        plt.close(disp.figure_)