In [2]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("all-MiniLM-L6-v2").to('cuda')


  from tqdm.autonotebook import tqdm, trange
2025-04-15 20:05:06.068975: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744736706.236828     871 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744736706.284532     871 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-15 20:05:06.844698: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [115]:
import torch
import torch.nn as nn

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers=1, dropout=0.1):
        super(LSTMClassifier, self).__init__()
        
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=num_layers,
                            batch_first=True,
                            dropout=dropout if num_layers > 1 else 0)
        self.hidden_size = hidden_dim
        self.num_layers = num_layers
        self.layer_norm = nn.LayerNorm(hidden_dim)
        self.dropout = nn.Dropout(dropout)

        # Improved classifier head
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, output_dim)
        )

        self.output = nn.Softmax(dim=-1)

    def forward(self, x):
        #h0 = torch.zeros(x.size(0), self.num_layers, self.hidden_size).to(x.device)
        #c0 = torch.zeros(x.size(0), self.num_layers,  self.hidden_size).to(x.device)
        #print("h0:", h0.shape)
        lstm_out, (h_n, c_n) = self.lstm(x)
        #print("lstm_out:", lstm_out.shape)
        final_hidden_state = lstm_out.reshape(lstm_out.size(0), lstm_out.size(1))
        print("final_hidden_state:", final_hidden_state.shape)
        print("layer_norm:", self.hidden_size)
        #print("final_hidden_state:", final_hidden_state.shape)
        pooled = self.layer_norm(final_hidden_state)
        pooled = self.dropout(pooled)

        # Classifier head
        out = self.classifier(pooled)
        #print("out:", pooled.shape)
        return self.output(out)

In [117]:
# Example setup
vocab_size = 10000
embedding_dim = 384
hidden_dim = 64
output_dim = 3  # For binary classification
sequence_length = 100

model = LSTMClassifier(vocab_size=vocab_size, embedding_dim=embedding_dim,hidden_dim = hidden_dim, output_dim= output_dim)


In [3]:
import pandas as pd

df = pd.read_csv('train.csv', encoding='iso-8859-1')

In [4]:
df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [5]:
from tqdm import tqdm

# Make sure tqdm is used for pandas apply
tqdm.pandas()

df['embeddings'] = df['text'].progress_apply(lambda x: embedding_model.encode(str(x), convert_to_tensor=True).tolist())


100%|██████████| 27481/27481 [03:52<00:00, 118.35it/s]


In [6]:
from sklearn.model_selection import train_test_split

# Assuming you have a DataFrame `df` and your features are X and target variable is y
X = df.drop(columns=['sentiment'])  # Features (remove the target column)
y = df['sentiment']  # Target variable (e.g., sentiment)

# Split the data: 80% for training and 20% for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = X_train['embeddings']
y_train = y_train.apply(lambda x: 2 if x == 'positive' else 1 if x == 'neutral' else 0)

X_val = X_val['embeddings']
y_val = y_val.apply(lambda x: 2 if x == 'positive' else 1 if x == 'neutral' else 0)

In [None]:
type(X_train)

pandas.core.series.Series

In [7]:
X_train = torch.tensor(X_train, dtype=torch.float32).type(torch.FloatTensor)

In [8]:
import numpy as np


X_val_array = np.array(X_val.tolist())
X_val = torch.tensor(X_val_array, dtype=torch.float32).type(torch.FloatTensor)

In [9]:
y_train = torch.tensor(y_train, dtype=torch.float32).type(torch.LongTensor)
y_val_array = np.array(y_val.tolist())

y_val = torch.tensor(y_val_array, dtype=torch.float32).type(torch.LongTensor)

In [11]:
df['sentiment'].unique()

array(['neutral', 'negative', 'positive'], dtype=object)

In [10]:
def labels_to_one_hot(labels, num_classes):
    one_hot_vectors = np.zeros((len(labels), num_classes))
    for i, label in enumerate(labels):
        one_hot_vectors[i, label] = 1
    return one_hot_vectors

In [113]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score


# Dataset & DataLoader
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset((X_val), (y_val))

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# Model, loss, optimizer
model = LSTMClassifier(vocab_size=vocab_size, embedding_dim=embedding_dim,hidden_dim = 32, output_dim = output_dim, num_layers=1, dropout=0.3)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Training loop
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    train_losses = []
    
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_x)
        #print(f"Outputs: {outputs}")
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())
    
    # Validation loop
    val_losses = []
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for val_x, val_y in val_loader:
            val_x, val_y = val_x.to(device), val_y.to(device)
            outputs = model(val_x).float()
            loss = criterion(outputs, val_y)
            val_losses.append(loss.item())
            #val_labels.extend(val_y.cpu().numpy())

    one_hot = np.zeros(3)
    val_labels = labels_to_one_hot(val_labels, 3)
    #val_acc = accuracy_score(val_labels, (val_preds))

    print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {sum(train_losses)/len(train_losses):.4f} - Val loss: {sum(val_losses)/len(val_losses):.4f}")


final_hidden_state: torch.Size([32, 64])
layer_norm: 32


RuntimeError: Given normalized_shape=[32], expected input with shape [*, 32], but got input of size[32, 64]

In [47]:
import pandas as pd

df_test = pd.read_csv('test.csv', encoding='iso-8859-1')

In [48]:
from tqdm import tqdm

# Make sure tqdm is used for pandas apply
tqdm.pandas()

df_test['embeddings'] = df_test['text'].progress_apply(lambda x: embedding_model.encode(str(x), convert_to_tensor=True).tolist())


100%|██████████| 3534/3534 [00:28<00:00, 122.34it/s]


In [49]:
from sklearn.model_selection import train_test_split

# Assuming you have a DataFrame `df` and your features are X and target variable is y
X_test = df_test.drop(columns=['sentiment'])  # Features (remove the target column)
y_test = df_test['sentiment']  # Target variable (e.g., sentiment)

X_test = X_test['embeddings']


In [50]:
y_test

0        neutral
1       positive
2       negative
3       positive
4       positive
          ...   
3529    negative
3530    positive
3531    negative
3532    positive
3533    positive
Name: sentiment, Length: 3534, dtype: object

In [51]:
list_lengths = len(X_test.values)*[384]

# Create a tensor with an extra dimension
data = np.array([row for row in X_test.values])
X_tensor = torch.tensor(data, dtype=torch.float32)

In [52]:
y_test = y_test.apply(lambda x: 2 if x == 'positive' else 1 if x == 'neutral' else 0)


In [53]:
y_tensor_test = torch.tensor(y_test, dtype=torch.long)

In [91]:
test_dataset = TensorDataset((X_tensor), (y_tensor_test))

test_loader = DataLoader(val_dataset, batch_size=16)

model.eval()
test_preds, test_labels = [], []
with torch.no_grad():
    for val_x, val_y in test_loader:
        val_x, val_y = val_x.to(device), val_y.to(device)
        outputs = model(val_x).float()
        #print(f"Outputs: {outputs}")
        preds = []
        for output in outputs:
            pred = torch.argmax(output).cpu().numpy()
            pred_array = np.zeros(3)
            pred_array[pred] = 1
            preds.append(pred_array)
            #print(f"Preds: {preds}")
        preds = torch.tensor(preds)
        #print(f"Preds: {preds}")
        test_preds.extend(preds.cpu().numpy())
        test_labels.extend(val_y.cpu().numpy())

one_hot = np.zeros(3)
val_labels = labels_to_one_hot(test_labels, 3)
val_acc = accuracy_score(val_labels, (test_preds))

In [92]:
print(val_acc)

0.6949245042750591


In [None]:
model = LSTMClassifier(vocab_size=vocab_size, embedding_dim=embedding_dim,hidden_dim = 32, output_dim = output_dim, num_layers=1, dropout=0.3)

model.load_state_dict(torch.load("lstm.pth", map_location='cuda'))


  model.load_state_dict(torch.load("lstm.pth", map_location='cuda'))


RuntimeError: Error(s) in loading state_dict for LSTMClassifier:
	size mismatch for lstm.weight_ih_l0: copying a param with shape torch.Size([128, 384]) from checkpoint, the shape in current model is torch.Size([256, 384]).
	size mismatch for lstm.weight_hh_l0: copying a param with shape torch.Size([128, 32]) from checkpoint, the shape in current model is torch.Size([256, 64]).
	size mismatch for lstm.bias_ih_l0: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([256]).
	size mismatch for lstm.bias_hh_l0: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([256]).
	size mismatch for layer_norm.weight: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).
	size mismatch for layer_norm.bias: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).
	size mismatch for classifier.0.weight: copying a param with shape torch.Size([16, 32]) from checkpoint, the shape in current model is torch.Size([32, 64]).
	size mismatch for classifier.0.bias: copying a param with shape torch.Size([16]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for classifier.3.weight: copying a param with shape torch.Size([3, 16]) from checkpoint, the shape in current model is torch.Size([3, 32]).

In [120]:
s = "I bought this car yesterday"
embeddings   = embedding_model.encode(s, convert_to_tensor=True).unsqueeze(0).to('cpu')
model.eval()
with torch.no_grad():
    output = model(embeddings)
    pred = torch.argmax(output, dim=1).cpu().numpy()
    print(f"Predicted class: {pred}")

final_hidden_state: torch.Size([1, 64])
layer_norm: 64
Predicted class: [2]


In [96]:
torch.save(model.state_dict(), 'lstm.pth')