In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from torch.utils.data import TensorDataset, DataLoader

In [None]:
# Load the GloVe embedding table and the vocabulary object from ../processed_data.
# text_to_sequence looks a word up with `vocab[word]` and uses the result to index `glove`.
# NOTE(review): pd.read_pickle and torch.load both rely on pickle, which can execute
# arbitrary code while loading — only point these at files from a trusted source.
glove = pd.read_pickle('../processed_data/glove.pickle')
vocab = torch.load('../processed_data/remove-stopwords-punct-25000.vocab')

In [None]:
# Read the training CSV into a two-column frame named ('label', 'text').
# NOTE(review): because `names=` is given without `header=`, the first row of the file
# is read as data — presumably the CSV has no header row; confirm.
data = pd.read_csv('../raw_data/fulltrain.csv', names=['label', 'text'])

In [None]:
# Hyperparameters
# Number of leading tokens considered per document and number of rows each document is
# padded to; CNNModel also sizes its final linear layer as 128 * sequence_length.
sequence_length = 100
# Channel count given to the first Conv1d; text_to_sequence pads with np.zeros(300),
# so this is expected to equal the embedding width.
input_size = 300
# Number of values the model's final linear layer emits per example
# (presumably one per label class — TODO confirm against the data).
output_size = 4
# Mini-batch size for the DataLoader; also reused as the chunk size when embedding text.
batch_size = 100
# Number of passes train() makes over the training loader.
num_epochs = 10


In [None]:
# Convert text to sequences of GloVe embeddings
def text_to_sequence(text):
    """Turn one document into a fixed-size matrix of GloVe embeddings.

    The text is split on whitespace and only its first ``sequence_length``
    tokens are considered. Tokens that are not in ``vocab`` are dropped; the
    embeddings of the remaining tokens are packed, in order, into the leading
    rows of the result and every other row is left as zeros (padding).

    Args:
        text: The raw document. A non-string value (for example the float NaN
            pandas produces for an empty CSV cell) is treated as an empty
            document instead of raising ``AttributeError``.

    Returns:
        A float32 ``np.ndarray`` of shape ``(sequence_length, input_size)``.
    """
    # Preallocating ties the padding width to `input_size` (previously a
    # hard-coded 300) and gives every document the same dtype. float32 is the
    # dtype the training tensors are cast to anyway, at half the memory.
    sequence = np.zeros((sequence_length, input_size), dtype=np.float32)
    if not isinstance(text, str):
        return sequence

    # NOTE(review): truncation happens BEFORE the vocabulary filter, so
    # out-of-vocabulary tokens among the first `sequence_length` words shrink
    # the number of embedded rows. Kept as-is to preserve existing behavior;
    # confirm this is the intended order.
    filled = 0
    for word in text.split()[:sequence_length]:
        if word in vocab:
            sequence[filled] = glove[vocab[word]]
            filled += 1
    return sequence

In [None]:
# Embed every document, walking the text column in chunks of `batch_size` rows
text_column = data['text']
sequences = []
for start in range(0, len(data), batch_size):
    chunk = text_column[start:start + batch_size]
    # Stack the chunk into a single array first; extend() then appends it row by
    # row, so `sequences` ends up holding one array per document.
    embedded_chunk = np.array([text_to_sequence(doc) for doc in chunk])
    sequences.extend(embedded_chunk)


In [None]:
# Convert labels to one-hot encoding: one indicator column per distinct value in
# data['label'], returned as a plain NumPy array. The training targets are later
# derived with np.argmax over these columns, so each row's class id is the position
# of its indicator column.
labels = pd.get_dummies(data['label']).values


In [None]:
# Define the CNN model
class CNNModel(nn.Module):
    """Two stacked 1-D convolutions over an embedding sequence plus a linear head.

    Both convolutions use ``kernel_size=3`` with ``padding=1``, so the sequence
    length is unchanged and the flattened feature size is ``128 * seq_len``.

    Args:
        input_size: Number of input channels of the first convolution, i.e. the
            size of the last axis of the tensor passed to ``forward``.
        output_size: Number of values the linear head produces per example.
        seq_len: Sequence length the linear head is sized for. When omitted it
            falls back to the module-level ``sequence_length``, which keeps the
            original two-argument construction working.
    """

    def __init__(self, input_size, output_size, seq_len=None):
        super().__init__()
        if seq_len is None:
            # Backward-compatible default: the original always read the global.
            seq_len = sequence_length
        self.conv1 = nn.Conv1d(input_size, 256, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(256, 128, kernel_size=3, padding=1)
        self.fc = nn.Linear(128 * seq_len, output_size)
        # Not applied in forward(): the notebook trains with nn.CrossEntropyLoss,
        # which expects raw logits. The attribute is kept so existing code that
        # calls `model.softmax(logits)` at inference time keeps working.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Compute per-class logits.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, output_size)`` holding unnormalized logits.
        """
        x = x.transpose(1, 2)  # Conv1d expects (batch, channels, length)
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = x.view(x.size(0), -1)  # Flatten to (batch, 128 * seq_len)
        return self.fc(x)

In [None]:
# Instantiate the CNN model
model = CNNModel(input_size, output_size)

# Define loss and optimizer (CrossEntropyLoss consumes the model's raw logits
# together with integer class ids)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Prepare dataset and data loader
# Collapse the per-document arrays into ONE float32 ndarray before handing them to
# torch: torch.tensor() on a Python list of ndarrays takes a very slow element-wise
# path, and the previous `.float()` cast made a second full copy of the data.
features = torch.from_numpy(np.asarray(sequences, dtype=np.float32))
# One-hot rows -> integer class ids, the target format CrossEntropyLoss expects here.
targets = torch.tensor(np.argmax(labels, axis=1)).long()
train_dataset = TensorDataset(features, targets)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


In [None]:
# Function for training the CNN model
def train(model, criterion, optimizer, train_loader, num_epochs):
    """Run the optimization loop and report the loss every tenth step.

    Args:
        model: Module to optimize; it is switched to training mode first.
        criterion: Callable computing the loss from ``(model output, targets)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        train_loader: Sized iterable yielding ``(inputs, targets)`` pairs.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        None. Progress is printed to stdout.
    """
    model.train()
    for epoch in range(num_epochs):
        for i, batch in enumerate(train_loader):
            inputs, targets = batch

            # Standard step: clear old gradients, forward, backward, update.
            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)
            loss.backward()
            optimizer.step()

            # Only every 10th step is logged.
            if (i+1) % 10 != 0:
                continue
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item()}')

# Kick off training with the model, loss, optimizer and loader configured above
train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=train_loader,
    num_epochs=num_epochs,
)
