In [1]:
from google.colab import files
files.upload()

Saving preprocessed_train.csv to preprocessed_train.csv


In [5]:
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import confusion_matrix
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score

In [31]:
df_train = pd.read_csv('preprocessed_train.csv')
df_test = pd.read_csv('preprocessed_test.csv')

X_train = df_train['Text']
X_test = df_test['Text']

y_train = df_train['Class']
y_test = df_test['Class']



# def remove_numbers(text):
#   for i in range(10):
#     try:
#       text = text.replace(str(i), "")
#     except:
#       text = str(text)
#   return text

# X = X.apply(remove_numbers)

# # Train Test Split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the training and testing sets
print("The shape of X_train is ", X_train.shape)
print("The shape of X_test is ", X_test.shape)
print("The shape of y_train is", y_train.shape)
print("The shape of y_test is", y_test.shape)



The shape of X_train is  (95816,)
The shape of X_test is  (23955,)
The shape of y_train is (95816,)
The shape of y_test is (23955,)


In [32]:
train_data_texts = X_train.tolist()
test_data_texts = X_test.tolist()

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data_texts)

word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
print("Vocabulary Size:", vocab_size)

# Convert texts to sequences and pad them
max_length_preprocessed = 177  # example value
x_train = pad_sequences(tokenizer.texts_to_sequences(train_data_texts), maxlen=max_length_preprocessed)
x_test = pad_sequences(tokenizer.texts_to_sequences(test_data_texts), maxlen=max_length_preprocessed)

print("Training X Shape:", x_train.shape)
print("Testing X Shape:", x_test.shape)

# Labels as numpy arrays
y_train = np.array(y_train).reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)
print("Training Y Shape:", y_train.shape)
print("Testing Y Shape:", y_test.shape)

# One-hot encode the labels
num_classes = 4
encoder = LabelBinarizer()
encoder.fit(range(num_classes))
y_train_encoded = encoder.transform(y_train)
y_test_encoded = encoder.transform(y_test)

print("Training Y Shape (One-Hot Encoded):", y_train_encoded.shape)
print("Testing Y Shape (One-Hot Encoded):", y_test_encoded.shape)

Vocabulary Size: 60955
Training X Shape: (95816, 100)
Testing X Shape: (23955, 100)
Training Y Shape: (95816, 1)
Testing Y Shape: (23955, 1)
Training Y Shape (One-Hot Encoded): (95816, 4)
Testing Y Shape (One-Hot Encoded): (23955, 4)


array([[1, 0, 0, 0],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       ...,
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 1, 0, 0]])

In [33]:
class EarlyStopper:
    def __init__(self, patience=1, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = float('inf')

    def early_stop(self, validation_loss):
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
        elif validation_loss >= (self.min_validation_loss + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

In [34]:
# Convert data to PyTorch tensors
x_train_tensor = torch.tensor(x_train, dtype=torch.long)
y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.float)
x_test_tensor = torch.tensor(x_test, dtype=torch.long)
y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.float)

# Create DataLoader
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
test_dataset = TensorDataset(x_test_tensor, y_test_tensor)

# potencial batch sizes small batches 32 64 128 256      big batches 512 1024 2048
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [35]:
# Training function
def train(model, iterator, optimizer, criterion, device):
    epoch_loss = 0
    model.train()

    for batch in iterator:
        optimizer.zero_grad()
        text, labels = batch
        text, labels = text.to(device), labels.to(device)
        predictions = model(text)
        loss = criterion(predictions, labels.argmax(dim=1))
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

# Evaluation function
def evaluate(model, iterator, criterion, device):
    epoch_loss = 0
    model.eval()

    with torch.no_grad():
        for batch in iterator:
            text, labels = batch
            text, labels = text.to(device), labels.to(device)
            predictions = model(text)
            loss = criterion(predictions, labels.argmax(dim=1))
            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [36]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.rnn(embedded)
        output = self.fc(output[:, -1, :])  # Get the output of the last time step
        return output


# Model parameters potencial 50 100 200 300
embedding_dim = 128


# Instantiate the model
model = RNNModel(vocab_size, embedding_dim, 32, num_classes)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

# Training loop
N_EPOCHS = 100
early_stopper = EarlyStopper(patience=3, min_delta=0)
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    test_loss = evaluate(model, test_loader, criterion, device)
    if early_stopper.early_stop(test_loss):
        break
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Test Loss: {test_loss:.3f}')


# Predict and evaluate
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        text, labels = batch
        text, labels = text.to(device), labels.to(device)
        predictions = model(text)
        preds = predictions.argmax(dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.argmax(dim=1).cpu().numpy())

# Calculate confusion matrix
conf_matrix = confusion_matrix(all_labels, all_preds)
print("Confusion Matrix:")
print(conf_matrix)

Epoch: 01, Train Loss: 1.414, Test Loss: 1.358
Epoch: 02, Train Loss: 1.324, Test Loss: 1.298
Epoch: 03, Train Loss: 1.245, Test Loss: 1.212
Epoch: 04, Train Loss: 1.094, Test Loss: 0.978
Epoch: 05, Train Loss: 0.876, Test Loss: 0.852
Epoch: 06, Train Loss: 0.759, Test Loss: 0.795
Epoch: 07, Train Loss: 0.686, Test Loss: 0.751
Epoch: 08, Train Loss: 0.632, Test Loss: 0.723
Epoch: 09, Train Loss: 0.584, Test Loss: 0.702
Epoch: 10, Train Loss: 0.547, Test Loss: 0.695
Confusion Matrix:
[[4602  635  391  375]
 [ 435 5437   43   49]
 [ 357  112 3465 2067]
 [ 363   92 1989 3543]]


In [37]:
# Calculate confusion matrix
conf_matrix = confusion_matrix(all_labels, all_preds)
print("Confusion Matrix:")
print(conf_matrix)


accuracy = accuracy_score(all_labels, all_preds)

# Calculate F1 score
f1 = f1_score(all_labels, all_preds, average='weighted')

# Calculate recall
recall = recall_score(all_labels, all_preds, average='weighted')

print("Confusion Matrix:\n", conf_matrix)
print("Accuracy: ", accuracy)
print("F1 Score: ", f1)
print("Recall: ", recall)


Confusion Matrix:
[[4602  635  391  375]
 [ 435 5437   43   49]
 [ 357  112 3465 2067]
 [ 363   92 1989 3543]]
Confusion Matrix:
 [[4602  635  391  375]
 [ 435 5437   43   49]
 [ 357  112 3465 2067]
 [ 363   92 1989 3543]]
Accuracy:  0.7116259653517011
F1 Score:  0.7106556880042383
Recall:  0.7116259653517011
