In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
import torch.nn.functional as F

In [13]:
# Load dataset
df = pd.read_excel(r"E:\Bangla-Sentiment-Analysis\Bangla_Dataset\final_preprocessed_dataset.xlsx")

# Tokenize sentences
df['tokens'] = df['clean_sentence'].apply(word_tokenize)

# Encode labels (Assuming 'Sentiment' column has 'Positive', 'Negative', 'Neutral')
label_dict = {'Positive': 2, 'Neutral': 1, 'Negative': 0}
df['label'] = df['Sentiment'].map(label_dict)

In [14]:
# Load GloVe embeddings
def load_glove_embeddings(glove_file, embedding_dim):
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Path to GloVe embeddings file (300d)
glove_file_path = r'E:\Bangla-Sentiment-Analysis\Word Embeddings\bn_glove.39M.300d.txt'
embedding_dim = 300
embeddings_index = load_glove_embeddings(glove_file_path, embedding_dim)

In [15]:
# word-to-index dictionary and embedding matrix
word_to_idx = {}
embedding_matrix = []
vocab = set()

for tokens in df['tokens']:
    vocab.update(tokens)

for idx, word in enumerate(vocab):
    word_to_idx[word] = idx + 1  
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix.append(embedding_vector)
    else:
        embedding_matrix.append(np.random.normal(size=(embedding_dim,)))

embedding_matrix = np.array(embedding_matrix)
embedding_matrix = np.vstack([np.zeros((embedding_dim,)), embedding_matrix])  # Add padding (index 0) embedding

In [16]:
# Custom dataset class
class SentimentDataset(Dataset):
    def __init__(self, df, word_to_idx, max_len):
        self.df = df
        self.word_to_idx = word_to_idx
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        tokens = self.df.iloc[idx]['tokens']
        label = self.df.iloc[idx]['label']

        # Convert tokens to indices
        indices = [self.word_to_idx.get(token, 0) for token in tokens]
        
        # Pad or truncate sequence
        if len(indices) < self.max_len:
            indices += [0] * (self.max_len - len(indices))  # Pad with 0 (padding token index)
        else:
            indices = indices[:self.max_len]

        return torch.tensor(indices, dtype=torch.long), torch.tensor(label, dtype=torch.long)

In [17]:
# Model definition (BiLSTM)
class BiLSTMModel(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim):
        super(BiLSTMModel, self).__init__()
        num_embeddings, embedding_dim = embedding_matrix.shape
        
        # Embedding layer initialized with pre-trained GloVe embeddings
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), freeze=False)  # Set freeze=False to fine-tune
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # BiLSTM -> 2 * hidden_dim

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        output = self.fc(lstm_out[:, -1, :])  # Take the last hidden state
        return output

# Hyperparameters
max_len = 100  # Max length of input sequence
hidden_dim = 128  # LSTM hidden state size
output_dim = 3  # Three sentiment classes (Positive, Negative, Neutral)
batch_size = 32
epochs = 10
lr = 0.001

# Split dataset into train and validation sets
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42)

# Create DataLoader objects
train_dataset = SentimentDataset(train_df, word_to_idx, max_len)
valid_dataset = SentimentDataset(valid_df, word_to_idx, max_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size)

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [18]:
# Initialize model, loss function, optimizer
model = BiLSTMModel(embedding_matrix, hidden_dim, output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# Training loop with GPU support
def train_model(model, train_loader, valid_loader, criterion, optimizer, epochs):
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        
        for inputs, labels in train_loader:
            # Move inputs and labels to GPU
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        # Validation step
        model.eval()
        valid_loss = 0
        with torch.no_grad():
            for inputs, labels in valid_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                valid_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {epoch_loss/len(train_loader):.4f} | Valid Loss: {valid_loss/len(valid_loader):.4f}")

# Train the model with GPU
train_model(model, train_loader, valid_loader, criterion, optimizer, epochs)

Epoch 1/10 | Train Loss: 1.0572 | Valid Loss: 1.0549
Epoch 2/10 | Train Loss: 1.0561 | Valid Loss: 1.0525
Epoch 3/10 | Train Loss: 1.0553 | Valid Loss: 1.0539
Epoch 4/10 | Train Loss: 1.0544 | Valid Loss: 1.0539
Epoch 5/10 | Train Loss: 1.0539 | Valid Loss: 1.0546
Epoch 6/10 | Train Loss: 0.8812 | Valid Loss: 0.7992
Epoch 7/10 | Train Loss: 0.7230 | Valid Loss: 0.7738
Epoch 8/10 | Train Loss: 0.6257 | Valid Loss: 0.7765
Epoch 9/10 | Train Loss: 0.5424 | Valid Loss: 0.8116
Epoch 10/10 | Train Loss: 0.4757 | Valid Loss: 0.8470


In [19]:
# Saving the fine-tuned GloVe embeddings after training
fine_tuned_embeddings = model.embedding.weight.data.cpu().numpy()

# Create a reverse mapping from index to word
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# Function to save the fine-tuned GloVe embeddings
def save_fine_tuned_glove(fine_tuned_embeddings, idx_to_word, output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        for idx, embedding in enumerate(fine_tuned_embeddings):
            word = idx_to_word.get(idx)
            if word:  # Ensure it's a valid word in the vocabulary
                embedding_str = ' '.join(map(str, embedding))
                f.write(f"{word} {embedding_str}\n")

# Define the output file path
output_file_path = r'E:\Bangla-Sentiment-Analysis\Word Embeddings\fine_tuned_glove_vectors_1.txt'

# Save the fine-tuned embeddings
save_fine_tuned_glove(fine_tuned_embeddings, idx_to_word, output_file_path)

print(f"Fine-tuned GloVe vectors saved to {output_file_path}")

Fine-tuned GloVe vectors saved to E:\Bangla Sentiment Analysis Thesis\Word Embeddings\fine_tuned_glove_vectors_1.txt


In [20]:
# Prediction code stays the same
def preprocess_sentence(sentence, word_to_idx, max_len):
    tokens = word_tokenize(sentence)
    indices = [word_to_idx.get(token, 0) for token in tokens]  # Use 0 for unknown words
    if len(indices) < max_len:
        indices += [0] * (max_len - len(indices))  # Pad with 0
    else:
        indices = indices[:max_len]
    return torch.tensor(indices, dtype=torch.long).unsqueeze(0).to(device)

def predict_sentiment(model, sentence, word_to_idx, max_len):
    model.eval()
    with torch.no_grad():
        sentence_tensor = preprocess_sentence(sentence, word_to_idx, max_len)
        output = model(sentence_tensor)
        probabilities = F.softmax(output, dim=1)
        predicted_label = torch.argmax(probabilities, dim=1).item()
        return predicted_label, probabilities

# No need to load the saved best model if you're using the final model directly
model = model.to(device)  # Use the final trained model as it is


In [21]:
# Example sentence for prediction
test_sentence = "আমি আজ খুবই আনন্দিত"
predicted_label, probabilities = predict_sentiment(model, test_sentence, word_to_idx, max_len)

# Map label index back to sentiment
idx_to_label = {v: k for k, v in label_dict.items()}
predicted_sentiment = idx_to_label[predicted_label]

print(f"Predicted Sentiment: {predicted_sentiment}")
print(f"Probabilities: {probabilities}")

Predicted Sentiment: Positive
Probabilities: tensor([[0.0090, 0.0203, 0.9707]], device='cuda:0')


In [22]:
# Example sentence for prediction
test_sentence = " বাজে মেশিন   ব্যবহার করার পর রিভিউ দিলাম"
predicted_label, probabilities = predict_sentiment(model, test_sentence, word_to_idx, max_len)

# Map label index back to sentiment
idx_to_label = {v: k for k, v in label_dict.items()}
predicted_sentiment = idx_to_label[predicted_label]

print(f"Predicted Sentiment: {predicted_sentiment}")
print(f"Probabilities: {probabilities}")

Predicted Sentiment: Negative
Probabilities: tensor([[0.9259, 0.0489, 0.0252]], device='cuda:0')
