In [1]:
import pandas as pd
import re
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.optim import SGD



In [2]:

# 1. Load the dataset
# Read the CSV as latin1, keep only the two columns used below and discard
# any row where either of them is missing.
df = pd.read_csv('../dataset/train.csv', encoding='latin1')[['text', 'sentiment']].dropna()

# 2. Preprocess the data
# Lower-case every text first, then strip each character that is neither a
# lower-case ASCII letter nor whitespace (digits, punctuation, accents, ...).
lowercased_text = df['text'].str.lower()
df['text'] = lowercased_text.apply(lambda raw_text: re.sub(r'[^a-z\s]', '', raw_text))

In [3]:

# Restrict the frame to the two columns the rest of the notebook uses.
df = df.loc[:, ['text', 'sentiment']]

# Report how many missing values each column still holds.
missing_per_column = df.isna().sum()
print("Missing values:\n", missing_per_column)

# Discard incomplete rows by rebinding the name rather than mutating in place.
df = df.dropna()


Missing values:
 text         0
sentiment    0
dtype: int64


In [4]:

import pandas as pd
from collections import Counter
# Replace the sentiment labels with the integer codes produced by a fitted
# LabelEncoder. `le` is kept around so the codes can be mapped back later
# (e.g. via `le.classes_` / `le.inverse_transform`).
le = LabelEncoder()
encoded_sentiment = le.fit_transform(df['sentiment'])
df['sentiment'] = encoded_sentiment


def tokenize(text):
    """Split ``text`` on runs of whitespace and return the list of tokens.

    Leading/trailing whitespace is ignored, so an empty or all-whitespace
    string yields an empty list.
    """
    tokens = text.split()
    return tokens

def yield_tokens(data_iter):
    """Lazily yield the token list of every text in ``data_iter``.

    Each item of ``data_iter`` is passed to ``tokenize`` and the resulting
    list is yielded, one list per input item, in input order.
    """
    yield from map(tokenize, data_iter)

# Materialise one token list per row of the cleaned 'text' column.
tokenized_text = [row_tokens for row_tokens in yield_tokens(df['text'])]


In [5]:
# Collect every token of every row into one flat list, then count occurrences.
all_tokens = []
for row_tokens in tokenized_text:
    all_tokens.extend(row_tokens)
token_counts = Counter(all_tokens)

# Build the token -> id mapping. Ids 0 and 1 are reserved for the padding and
# unknown-token markers; every distinct token then gets the next free id in
# the order it was first counted.
vocab = {"<pad>": 0, "<unk>": 1}
vocab.update({token: token_id for token_id, token in enumerate(token_counts, start=2)})


In [6]:
# Fixed number of token ids every encoded text is truncated / padded to.
max_length = 50


def text_to_sequence(text, vocab, max_length):
    """Encode ``text`` as a list of exactly ``max_length`` vocabulary ids.

    The text is split with ``tokenize`` and each token is looked up in
    ``vocab``. Tokens that are missing from ``vocab`` are mapped to the id
    stored under ``"<unk>"`` instead of raising ``KeyError``, so text that was
    not part of the vocabulary-building data can still be encoded.

    Args:
        text: The string to encode.
        vocab: Mapping from token to integer id. Must contain ``"<pad>"``;
            must contain ``"<unk>"`` if ``text`` may hold unknown tokens.
        max_length: Length of the returned list.

    Returns:
        A list of ids: the first ``max_length`` token ids, right-padded with
        ``vocab["<pad>"]`` when the text has fewer tokens than ``max_length``.

    Raises:
        KeyError: If ``"<pad>"`` is missing, or an unknown token is met and
            ``"<unk>"`` is missing.
    """
    tokens = tokenize(text)
    # `<unk>` is only looked up when an out-of-vocabulary token actually
    # occurs, so vocabularies without that entry keep working for known text.
    sequence = [vocab[token] if token in vocab else vocab["<unk>"] for token in tokens]
    sequence = sequence[:max_length]
    # A non-positive repeat count yields an empty list, i.e. no padding.
    return sequence + [vocab["<pad>"]] * (max_length - len(sequence))

# Encode every cleaned text as a fixed-length list of vocabulary ids.
df['sequence'] = df['text'].map(lambda cleaned_text: text_to_sequence(cleaned_text, vocab, max_length))


In [7]:
import pickle

# Persist the token -> id mapping so the exact same encoding can be reused
# outside this notebook.
with open('vocab.pkl', 'wb') as vocab_file:
    pickle.dump(vocab, vocab_file)
print("Vocabulary saved to vocab.pkl")


Vocabulary saved to vocab.pkl


In [18]:
class SentimentDataset(Dataset):
    """Map-style dataset pairing encoded token sequences with class labels.

    Args:
        sequences: Indexable collection of integer id sequences.
        labels: Indexable collection of integer class labels, indexed in
            parallel with ``sequences``.
    """

    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(sequence, label)`` for item ``idx`` as int64 tensors.

        The dtype is forced to ``torch.long`` instead of being inferred from
        the input: embedding indices and ``CrossEntropyLoss`` class-index
        targets must be integer tensors of that type, and inference would
        otherwise follow whatever the inputs happen to be (e.g. NumPy int32
        scalars, or float32 for an empty sequence).
        """
        sequence = torch.tensor(self.sequences[idx], dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return sequence, label


In [19]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    df['sequence'],
    df['sentiment'],
    test_size=0.2,
    random_state=32,
)

# Wrap each split in a dataset (plain lists, so items are indexed by position).
train_dataset = SentimentDataset(list(X_train), list(y_train))
val_dataset = SentimentDataset(list(X_val), list(y_val))

# Only the training batches are shuffled; validation keeps its order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [20]:
class SentimentModel(nn.Module):
    """Feed-forward classifier over flattened token embeddings.

    Each input id is embedded, the per-token embeddings of a sequence are
    flattened into one vector, and that vector is passed through three hidden
    linear layers (128, 64 and 32 units, ReLU activations, dropout after the
    first two) followed by a linear output layer producing one logit per class.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        dropout_rate: Dropout probability used after the first two hidden layers.
        l2_lambda: Accepted for backward compatibility; not used by this class.
        num_classes: Number of output logits.
        seq_length: Number of token ids per input sequence. When ``None``
            (the default) the module-level ``max_length`` is used, which is
            the value the original implementation always read.
    """

    def __init__(self, vocab_size, embedding_dim=128, dropout_rate=0.1, l2_lambda=0.001,
                 num_classes=3, seq_length=None):
        super(SentimentModel, self).__init__()
        if seq_length is None:
            # Backward-compatible default: fall back to the notebook-level constant.
            seq_length = max_length
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.flatten = nn.Flatten()
        # The first linear layer sees every token embedding concatenated, so its
        # input width fixes the sequence length the model can accept.
        self.fc1 = nn.Linear(embedding_dim * seq_length, 128)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(128, 64)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, num_classes)

    def forward(self, x):
        """Return the raw class logits (no softmax applied) for id tensor ``x``."""
        x = self.embedding(x)
        x = self.flatten(x)
        # Functional ReLU: avoids constructing a new nn.ReLU module per call.
        x = torch.relu(self.fc1(x))
        x = self.dropout1(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout2(x)
        x = torch.relu(self.fc3(x))
        x = self.fc4(x)
        return x

# Instantiate the Model
# Seed PyTorch's global RNG first: the layers are randomly initialised on
# construction, so without a seed every kernel restart starts from different
# weights and the reported metrics cannot be reproduced. The value mirrors the
# random_state used for the train/validation split.
torch.manual_seed(32)
model = SentimentModel(vocab_size=len(vocab))

In [21]:
# Training setup
# Run on the GPU when CUDA is available, otherwise on the CPU, and move the
# model there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Cross-entropy over the class logits, optimised with SGD using momentum and
# weight decay.
criterion = nn.CrossEntropyLoss()
optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=0.001)

In [22]:
def train(model, loader, optimizer, criterion):
    """Run one optimisation pass over ``loader`` and report loss and accuracy.

    Args:
        model: Module mapping a batch of sequences to class logits.
        loader: Iterable yielding ``(sequences, labels)`` batches.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss called as ``criterion(outputs, labels)``. The reported
            loss assumes it returns the mean over the batch (the default
            reduction of ``nn.CrossEntropyLoss``).

    Returns:
        ``(mean_loss, accuracy)``, both averaged over the samples actually
        processed; ``(0.0, 0.0)`` when ``loader`` yields no samples.
    """
    model.train()
    # Use the device the model actually lives on rather than a notebook-level
    # global, so the function cannot disagree with the model's placement.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    loss_sum, correct, seen = 0.0, 0, 0
    for sequences, labels in loader:
        sequences, labels = sequences.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(sequences)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_size = labels.size(0)
        # Weight each batch's mean loss by its size so a short final batch
        # does not count as much as a full one.
        loss_sum += loss.item() * batch_size
        correct += (outputs.argmax(1) == labels).sum().item()
        seen += batch_size

    if seen == 0:
        # Nothing was processed; avoid dividing by zero.
        return 0.0, 0.0
    return loss_sum / seen, correct / seen

def evaluate(model, loader, criterion):
    """Score ``model`` on ``loader`` without updating it.

    The model is switched to evaluation mode and gradients are disabled for
    the whole pass.

    Args:
        model: Module mapping a batch of sequences to class logits.
        loader: Iterable yielding ``(sequences, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``. The reported
            loss assumes it returns the mean over the batch (the default
            reduction of ``nn.CrossEntropyLoss``).

    Returns:
        ``(mean_loss, accuracy)``, both averaged over the samples actually
        processed; ``(0.0, 0.0)`` when ``loader`` yields no samples.
    """
    model.eval()
    # Use the device the model actually lives on rather than a notebook-level
    # global, so the function cannot disagree with the model's placement.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    loss_sum, correct, seen = 0.0, 0, 0
    with torch.no_grad():
        for sequences, labels in loader:
            sequences, labels = sequences.to(device), labels.to(device)
            outputs = model(sequences)
            loss = criterion(outputs, labels)

            batch_size = labels.size(0)
            # Weight each batch's mean loss by its size so a short final
            # batch does not count as much as a full one.
            loss_sum += loss.item() * batch_size
            correct += (outputs.argmax(1) == labels).sum().item()
            seen += batch_size

    if seen == 0:
        # Nothing was processed; avoid dividing by zero.
        return 0.0, 0.0
    return loss_sum / seen, correct / seen

In [23]:
# Train for a fixed number of epochs, scoring the validation split after each
# one and printing both sets of metrics.
num_epochs = 20
for epoch in range(num_epochs):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    val_loss, val_acc = evaluate(model, val_loader, criterion)
    print(
        f"Epoch {epoch+1}/{num_epochs}",
        f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}",
        f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}",
        sep="\n",
    )

Epoch 1/20
Train Loss: 1.0866, Train Accuracy: 0.4005
Validation Loss: 1.0816, Validation Accuracy: 0.4065
Epoch 2/20
Train Loss: 1.0823, Train Accuracy: 0.4038
Validation Loss: 1.0848, Validation Accuracy: 0.4065
Epoch 3/20
Train Loss: 1.0745, Train Accuracy: 0.4178
Validation Loss: 1.0718, Validation Accuracy: 0.4041
Epoch 4/20
Train Loss: 1.0655, Train Accuracy: 0.4254
Validation Loss: 1.0667, Validation Accuracy: 0.4347
Epoch 5/20
Train Loss: 1.0615, Train Accuracy: 0.4327
Validation Loss: 1.0539, Validation Accuracy: 0.4367
Epoch 6/20
Train Loss: 1.0416, Train Accuracy: 0.4535
Validation Loss: 1.0182, Validation Accuracy: 0.4667
Epoch 7/20
Train Loss: 1.0237, Train Accuracy: 0.4672
Validation Loss: 1.0250, Validation Accuracy: 0.4643
Epoch 8/20
Train Loss: 1.0320, Train Accuracy: 0.4590
Validation Loss: 1.0276, Validation Accuracy: 0.4591
Epoch 9/20
Train Loss: 0.9969, Train Accuracy: 0.4927
Validation Loss: 1.0379, Validation Accuracy: 0.4640
Epoch 10/20
Train Loss: 0.9698, Train

In [24]:
# Persist only the learned parameters (the state dict), not the module object;
# loading them back requires constructing a SentimentModel first.
state_dict = model.state_dict()
torch.save(state_dict, 'sentiment_model.pth')
print("Model saved as 'sentiment_model.pth'")

Model saved as 'sentiment_model.pth'
