# In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score
import os
import numpy as np
import pandas as pd

class SentimentRNN(nn.Module):
    """LSTM text classifier: embedding -> LSTM -> dropout -> linear layer.

    Args:
        input_dim: Vocabulary size. Only used when ``pretrained_embeddings``
            is ``None``.
        embedding_dim: Embedding width. Only used when
            ``pretrained_embeddings`` is ``None``; otherwise the width is
            taken from the pretrained matrix itself.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output logits.
        pretrained_embeddings: Optional 2-D matrix (tensor or array-like) of
            initial embedding weights. The weights stay trainable
            (``freeze=False``).
        dropout: Dropout probability applied to the final hidden state and,
            when ``num_layers > 1``, between stacked LSTM layers.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim,
                 pretrained_embeddings=None, dropout=0.5, num_layers=1):
        super().__init__()

        if pretrained_embeddings is not None:
            # Accept NumPy arrays / float64 tensors and bring them to the
            # dtype the rest of the model's parameters are created with.
            weights = torch.as_tensor(
                pretrained_embeddings, dtype=torch.get_default_dtype()
            )
            self.embedding = nn.Embedding.from_pretrained(weights, freeze=False)
        else:
            self.embedding = nn.Embedding(input_dim, embedding_dim)

        self.lstm = nn.LSTM(
            # Use the embedding layer's real width so a pretrained matrix
            # whose width differs from `embedding_dim` still lines up.
            self.embedding.embedding_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            # nn.LSTM only applies dropout *between* layers; with a single
            # layer a non-zero value does nothing except emit a warning.
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return logits for a batch of token-index sequences.

        ``x`` is fed straight to the embedding layer, so it must hold integer
        indices; with ``batch_first=True`` it is presumably shaped
        (batch, seq_len) -- confirm against the caller.
        """
        embedded = self.embedding(x)
        _, (h_n, _) = self.lstm(embedded)
        # h_n[-1] is the final hidden state of the last LSTM layer.
        out = self.dropout(h_n[-1])
        return self.fc(out)

# --- Data / batching ---
MAX_LEN = 128
BATCH_SIZE = 64

# --- Optimisation ---
EPOCHS = 10
LEARNING_RATE = 1e-3

# --- Model ---
NUM_CLASSES = 3  # negative, neutral, positive sentiment
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
DROPOUT = 0.5

# Read GloVe embeddings from a local text file
def load_glove_embeddings(glove_file_path, embedding_dim=100):
    """Parse a GloVe-format text file into a ``{word: vector}`` dict.

    Each line is expected to be ``<token> <v1> <v2> ...`` separated by
    whitespace. Vectors are returned as ``float32`` NumPy arrays.

    Args:
        glove_file_path: Path to the UTF-8 encoded embeddings file.
        embedding_dim: Expected vector width. Only consulted for lines whose
            token itself contains whitespace, to decide where the token ends
            and the vector begins.

    Returns:
        dict mapping each token to its embedding vector.

    Raises:
        OSError: If the file cannot be opened.
    """
    embeddings = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if len(fields) < 2:
                # Blank line or a token with no vector: nothing usable.
                continue
            word = fields[0]
            try:
                vector = np.asarray(fields[1:], dtype='float32')
            except ValueError:
                # The token contains whitespace, so fields[1:] is not purely
                # numeric. Treat the trailing `embedding_dim` fields as the
                # vector and everything before them as the token.
                if len(fields) <= embedding_dim:
                    continue
                word = ' '.join(fields[:-embedding_dim])
                try:
                    vector = np.asarray(fields[-embedding_dim:], dtype='float32')
                except ValueError:
                    continue  # genuinely malformed line
            embeddings[word] = vector
    return embeddings

# Load GloVe embeddings
# The file name and the expected vector width are both derived from
# EMBEDDING_DIM so the loaded vectors cannot drift from the model width.
glove_path = f'glove/glove.6B.{EMBEDDING_DIM}d.txt' # download this and place in the repo.
glove_embeddings = load_glove_embeddings(glove_path, embedding_dim=EMBEDDING_DIM)

# Tokenization and indexing
def tokenize_and_create_embeddings(texts, glove_embeddings, max_features=1000, vocabulary=None):
    """Vectorize ``texts`` into bag-of-words counts and build a matching embedding matrix.

    Args:
        texts: Iterable of raw text documents.
        glove_embeddings: Mapping from word to embedding vector.
        max_features: Maximum vocabulary size learned from ``texts``.
        vocabulary: Optional word -> column-index mapping from an earlier
            call. Pass the ``word_to_index`` obtained on the training split
            when vectorizing validation/test data so the feature columns
            (and embedding rows) line up; with the default ``None`` a new
            vocabulary is learned from ``texts``.

    Returns:
        Tuple ``(X, word_to_index, embedding_matrix)``:
        ``X`` is the dense count matrix (memory grows as
        n_texts x vocabulary size), ``word_to_index`` maps each vocabulary
        word to its column in ``X``, and ``embedding_matrix`` holds, at row
        ``i``, the vector of the word with index ``i`` (all zeros when the
        word has no entry in ``glove_embeddings``).
    """
    vectorizer = CountVectorizer(
        max_features=max_features,
        stop_words='english',
        vocabulary=vocabulary,
    )
    X = vectorizer.fit_transform(texts).toarray()

    word_to_index = {word: idx for idx, word in enumerate(vectorizer.get_feature_names_out())}

    # Size the matrix from the vectors actually supplied; fall back to the
    # module-level EMBEDDING_DIM only when there is nothing to inspect.
    if glove_embeddings:
        dim = len(next(iter(glove_embeddings.values())))
    else:
        dim = EMBEDDING_DIM
    embedding_matrix = np.zeros((len(word_to_index), dim))

    for word, idx in word_to_index.items():
        if word in glove_embeddings:
            embedding_matrix[idx] = glove_embeddings[word]

    return X, word_to_index, embedding_matrix

import kagglehub

# Download Sentiment140 and keep only the label and tweet text.
path = kagglehub.dataset_download("kazanova/sentiment140")
print("Path to dataset files:", path)
file_path = os.path.join(path, "training.1600000.processed.noemoticon.csv")
columns = ['target', 'id', 'date', 'flag', 'user', 'text']
full_data = pd.read_csv(file_path, encoding='latin-1', names=columns)
data = full_data[['target', 'text']].copy()
data.rename(columns={"target": "label"}, inplace=True)
# NOTE(review): Sentiment140 documents its targets as 0/2/4 -- confirm they
# are remapped to 0..NUM_CLASSES-1 before being fed to a classification loss.


train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
X_train, word_to_index, embedding_matrix = tokenize_and_create_embeddings(train_data['text'], glove_embeddings)

# Vectorize the test split with the vocabulary learned on the training split.
# Fitting a second vectorizer on the test texts would pick a different
# vocabulary, so X_test columns would not correspond to X_train columns or to
# the rows of embedding_matrix.
test_vectorizer = CountVectorizer(vocabulary=word_to_index, stop_words='english')
X_test = test_vectorizer.transform(test_data['text']).toarray()