<a href="https://colab.research.google.com/github/Hanzlazafar1/DIP_assignment_02/blob/main/nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:


# Step 2: Import libraries
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Step 3: Load and prepare dataset
def load_data(path):
    data = pd.read_csv(path, sep=';', names=['text', 'label'])
    return data

train_data = load_data('/content/train.txt')
test_data = load_data('/content/test.txt')

# Step 4: Encode labels
label_encoder = LabelEncoder()
train_data['label_encoded'] = label_encoder.fit_transform(train_data['label'])
test_data['label_encoded'] = label_encoder.transform(test_data['label'])

# Step 5: Use sentence embeddings
model_name = 'all-MiniLM-L6-v2'
embedder = SentenceTransformer(model_name)

train_embeddings = embedder.encode(train_data['text'].tolist(), convert_to_tensor=True)
test_embeddings = embedder.encode(test_data['text'].tolist(), convert_to_tensor=True)

# Step 6: Dataset and Dataloader
class EmotionDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

train_dataset = EmotionDataset(train_embeddings, torch.tensor(train_data['label_encoded'].values))
test_dataset = EmotionDataset(test_embeddings, torch.tensor(test_data['label_encoded'].values))

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# Step 7: Define LSTM Classifier
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = x.unsqueeze(1)  # (batch, seq=1, features)
        _, (hn, _) = self.lstm(x)
        out = self.fc(hn[-1])
        return out

input_dim = train_embeddings.shape[1]
hidden_dim = 128
output_dim = len(label_encoder.classes_)

model = LSTMClassifier(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Step 8: Train the model
for epoch in range(10):
    model.train()
    total_loss = 0
    for batch in train_loader:
        inputs, labels = batch
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# Step 9: Evaluate the model
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        inputs, labels = batch
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.tolist())
        all_labels.extend(labels.tolist())

accuracy = accuracy_score(all_labels, all_preds)
print(f"\nTest Accuracy: {accuracy:.4f}")


In [3]:
# Step 1: Install dependencies
! pip install sentence-transformers sklearn torch pandas

Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [4]:


# Step 2: Import libraries
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Step 3: Load and prepare dataset
def load_data(path):
    data = pd.read_csv(path, sep=';', names=['text', 'label'])
    return data

train_data = load_data('/content/train.txt')
test_data = load_data('/content/test.txt')

# Step 4: Encode labels
label_encoder = LabelEncoder()
train_data['label_encoded'] = label_encoder.fit_transform(train_data['label'])
test_data['label_encoded'] = label_encoder.transform(test_data['label'])

# Step 5: Use sentence embeddings
model_name = 'all-MiniLM-L6-v2'
embedder = SentenceTransformer(model_name)

train_embeddings = embedder.encode(train_data['text'].tolist(), convert_to_tensor=True)
test_embeddings = embedder.encode(test_data['text'].tolist(), convert_to_tensor=True)

# Step 6: Dataset and Dataloader
class EmotionDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

train_dataset = EmotionDataset(train_embeddings, torch.tensor(train_data['label_encoded'].values))
test_dataset = EmotionDataset(test_embeddings, torch.tensor(test_data['label_encoded'].values))

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# Step 7: Define LSTM Classifier
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = x.unsqueeze(1)  # (batch, seq=1, features)
        _, (hn, _) = self.lstm(x)
        out = self.fc(hn[-1])
        return out

input_dim = train_embeddings.shape[1]
hidden_dim = 128
output_dim = len(label_encoder.classes_)

model = LSTMClassifier(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Step 8: Train the model
for epoch in range(10):
    model.train()
    total_loss = 0
    for batch in train_loader:
        inputs, labels = batch
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# Step 9: Evaluate the model
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        inputs, labels = batch
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.tolist())
        all_labels.extend(labels.tolist())

accuracy = accuracy_score(all_labels, all_preds)
print(f"\nTest Accuracy: {accuracy:.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Epoch 1, Loss: 600.3547
Epoch 2, Loss: 455.0518
Epoch 3, Loss: 423.7938
Epoch 4, Loss: 409.6992
Epoch 5, Loss: 399.5584
Epoch 6, Loss: 391.8527
Epoch 7, Loss: 384.1385
Epoch 8, Loss: 376.3318
Epoch 9, Loss: 368.5786
Epoch 10, Loss: 359.2661

Test Accuracy: 0.7005


In [5]:
# Step 10: Real-world prediction function
def predict_emotion(text):
    # Step 1: Embed the input sentence
    with torch.no_grad():
        embedding = embedder.encode([text], convert_to_tensor=True)

    # Step 2: Prepare input for LSTM (reshape for seq input)
    embedding = embedding.unsqueeze(1)  # Shape: (1, 1, embedding_dim)

    # Step 3: Model prediction
    model.eval()
    with torch.no_grad():
        output = model(embedding)
        prediction = torch.argmax(output, dim=1).item()

    # Step 4: Decode predicted label
    predicted_emotion = label_encoder.inverse_transform([prediction])[0]
    return predicted_emotion

sentence = "I feel really sad and disappointed today."
predicted_emotion = predict_emotion(sentence)
print(f"🧠 Predicted Emotion: {predicted_emotion}")


ValueError: LSTM: Expected input to be 2D or 3D, got 4D instead

In [7]:
# Install required packages
!pip install pandas

# Import required library
import pandas as pd

# Define a function to load dataset
def load_data(filepath):
    try:
        data = pd.read_csv(filepath, sep=';', names=["text", "label"])
        print(f"✅ Dataset loaded successfully from: {filepath}")
        print("📊 Sample data:")
        print(data.head())
        return data
    except Exception as e:
        print("❌ Failed to load dataset:", str(e))
        return None

# Load train and test datasets
train_path = '/content/train.txt'
test_path = '/content/test.txt'

train_df = load_data(train_path)
test_df = load_data(test_path)


✅ Dataset loaded successfully from: /content/train.txt
📊 Sample data:
                                                text    label
0                            i didnt feel humiliated  sadness
1  i can go from feeling so hopeless to so damned...  sadness
2   im grabbing a minute to post i feel greedy wrong    anger
3  i am ever feeling nostalgic about the fireplac...     love
4                               i am feeling grouchy    anger
✅ Dataset loaded successfully from: /content/test.txt
📊 Sample data:
                                                text    label
0  im feeling rather rotten so im not very ambiti...  sadness
1          im updating my blog because i feel shitty  sadness
2  i never make her separate from me because i do...  sadness
3  i left with my bouquet of red and yellow tulip...      joy
4    i was feeling a little vain when i did this one  sadness


In [8]:
# Install SentenceTransformer if not installed

# Import the library
from sentence_transformers import SentenceTransformer

# Load the latest embedding model
embedder = SentenceTransformer('all-MiniLM-L6-v2')
print("✅ Loaded SentenceTransformer model: all-MiniLM-L6-v2")

# Generate embeddings for train and test data
print("🔄 Generating embeddings for train data...")
X_train = embedder.encode(train_df['text'].tolist(), convert_to_numpy=True)
print("✅ Train embeddings generated.")

print("🔄 Generating embeddings for test data...")
X_test = embedder.encode(test_df['text'].tolist(), convert_to_numpy=True)
print("✅ Test embeddings generated.")


✅ Loaded SentenceTransformer model: all-MiniLM-L6-v2
🔄 Generating embeddings for train data...
✅ Train embeddings generated.
🔄 Generating embeddings for test data...
✅ Test embeddings generated.


In [9]:
# Install required libraries if needed
!pip install tensorflow scikit-learn

# Imports
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.utils import to_categorical

# Encode the emotion labels
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_df['label'])
test_labels_encoded = label_encoder.transform(test_df['label'])

# Convert labels to one-hot encoded format
y_train = to_categorical(train_labels_encoded)
y_test = to_categorical(test_labels_encoded)

# Reshape embeddings for LSTM: (samples, timesteps, features)
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Define the LSTM model
model = Sequential([
    Input(shape=(1, X_train.shape[2])),
    LSTM(128, return_sequences=False),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(y_train.shape[1], activation='softmax')  # number of emotion classes
])

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Summary
model.summary()

# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.1)




Epoch 1/20
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - accuracy: 0.4847 - loss: 1.4069 - val_accuracy: 0.6575 - val_loss: 0.9720
Epoch 2/20
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.6565 - loss: 0.9521 - val_accuracy: 0.6712 - val_loss: 0.9008
Epoch 3/20
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - accuracy: 0.6808 - loss: 0.8663 - val_accuracy: 0.6913 - val_loss: 0.8617
Epoch 4/20
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.6923 - loss: 0.8260 - val_accuracy: 0.6900 - val_loss: 0.8452
Epoch 5/20
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.6950 - loss: 0.8186 - val_accuracy: 0.6931 - val_loss: 0.8307
Epoch 6/20
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.7034 - loss: 0.7999 - val_accuracy: 0.6931 - val_loss: 0.8205
Epoch 7/20
[1m450/450[0m

In [10]:
# Save the trained model
model.save("emotion_lstm_model.h5")
print("✅ Model saved as 'emotion_lstm_model.h5'")

# Save the label encoder using joblib or pickle
import joblib
joblib.dump(label_encoder, "label_encoder.pkl")
print("✅ LabelEncoder saved as 'label_encoder.pkl'")




✅ Model saved as 'emotion_lstm_model.h5'
✅ LabelEncoder saved as 'label_encoder.pkl'


In [12]:
# Load necessary libraries
import numpy as np
import joblib
from tensorflow.keras.models import load_model
from sentence_transformers import SentenceTransformer

# Load saved model and label encoder
model = load_model("/content/emotion_lstm_model.h5")
label_encoder = joblib.load("/content/label_encoder.pkl")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Predict function
def predict_emotion(text):
    # Get embedding
    embedding = embedder.encode([text])
    embedding = embedding.reshape((1, 1, embedding.shape[1]))  # (batch, timestep, features)

    # Predict
    prediction = model.predict(embedding)
    predicted_class = np.argmax(prediction)
    emotion = label_encoder.inverse_transform([predicted_class])[0]
    return emotion




In [19]:
sentence = "I am not speaking today"
emotion = predict_emotion(sentence)
print(f"🧠 Predicted Emotion: {emotion}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
🧠 Predicted Emotion: anger
