<a href="https://colab.research.google.com/github/GangitiNeeraj4120/databytes_task1/blob/main/DataByte_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from collections import Counter
import pandas as pd
import numpy as np

In [None]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
file_path='/content/drive/MyDrive/db/IMDB Dataset.csv'

In [None]:
df = pd.read_csv(file_path)
df['sentiment'] = df['sentiment'].map({'positive':1, 'negative':0})
print(df.head(5))
df.shape

                                              review  sentiment
0  One of the other reviewers has mentioned that ...          1
1  A wonderful little production. <br /><br />The...          1
2  I thought this was a wonderful way to spend ti...          1
3  Basically there's a family where a little boy ...          0
4  Petter Mattei's "Love in the Time of Money" is...          1


(50000, 2)

In [None]:
#Tokenizing and vocabulary building
df['tokens']=df['review'].apply(lambda x: word_tokenize(x.lower()))

In [None]:
df.head(5)

Unnamed: 0,review,sentiment,tokens
0,One of the other reviewers has mentioned that ...,1,"[one, of, the, other, reviewers, has, mentione..."
1,A wonderful little production. <br /><br />The...,1,"[a, wonderful, little, production, ., <, br, /..."
2,I thought this was a wonderful way to spend ti...,1,"[i, thought, this, was, a, wonderful, way, to,..."
3,Basically there's a family where a little boy ...,0,"[basically, there, 's, a, family, where, a, li..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,"[petter, mattei, 's, ``, love, in, the, time, ..."


In [None]:
#Building vocabulary
all_words = []
for tokens in df['tokens']:
  for word in tokens:
    all_words.append(word)
word_counts = Counter(all_words)
vocab = {word: idx+2 for idx, (word, _) in enumerate(word_counts.items())}
vocab['<PAD>']=0
vocab['<UNK>']=1

def encode_review(tokens):
  return [vocab.get(word, vocab['<UNK>']) for word in tokens]

df['encoded']=df['tokens'].apply(encode_review)

In [None]:
df.head(5)

Unnamed: 0,review,sentiment,tokens,encoded
0,One of the other reviewers has mentioned that ...,1,"[one, of, the, other, reviewers, has, mentione...","[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1..."
1,A wonderful little production. <br /><br />The...,1,"[a, wonderful, little, production, ., <, br, /...","[57, 204, 205, 206, 20, 33, 34, 35, 36, 33, 34..."
2,I thought this was a wonderful way to spend ti...,1,"[i, thought, this, was, a, wonderful, way, to,...","[128, 289, 26, 42, 57, 204, 290, 68, 291, 292,..."
3,Basically there's a family where a little boy ...,0,"[basically, there, 's, a, family, where, a, li...","[366, 367, 254, 57, 368, 93, 57, 205, 369, 166..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,"[petter, mattei, 's, ``, love, in, the, time, ...","[417, 418, 254, 227, 419, 51, 4, 292, 3, 420, ..."


In [None]:
#Padding
MAX_LEN=100
def pad_sequence(seq):
  if len(seq) < MAX_LEN:
    return seq + [0]*(MAX_LEN - len(seq))
  else:
    return seq[:MAX_LEN]

df['padded']=df['encoded'].apply(pad_sequence)
df['padded']

Unnamed: 0,padded
0,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1..."
1,"[57, 204, 205, 206, 20, 33, 34, 35, 36, 33, 34..."
2,"[128, 289, 26, 42, 57, 204, 290, 68, 291, 292,..."
3,"[366, 367, 254, 57, 368, 93, 57, 205, 369, 166..."
4,"[417, 418, 254, 227, 419, 51, 4, 292, 3, 420, ..."
...,...
49995,"[128, 289, 26, 378, 729, 57, 232, 23, 488, 144..."
49996,"[638, 301, 24, 638, 303, 24, 638, 487, 24, 634..."
49997,"[128, 1332, 57, 12022, 17323, 51, 65127, 14422..."
49998,"[128, 963, 388, 68, 96, 68, 10830, 31, 4, 460,..."


In [None]:
#Train and Test split
X = list(df['padded'])
y = list(df['sentiment'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
class IMDBDataset(Dataset):
  def __init__(self, reviews, labels):
    self.reviews = torch.tensor(reviews, dtype=torch.long)
    self.labels = torch.tensor(labels, dtype=torch.float32)

  def __len__(self):
    return len(self.reviews)

  def __getitem__(self, idx):
    return self.reviews[idx], self.labels[idx]

train_ds = IMDBDataset(X_train, y_train)
test_ds = IMDBDataset(X_test, y_test)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=64)

In [None]:
#Training the model
class SentimentRNN(nn.Module):
  def __init__(self, vocab_size):
    super(SentimentRNN, self).__init__()
    self.embedding=nn.Embedding(vocab_size, 100, padding_idx=0)
    self.lstm=nn.LSTM(100, 128, batch_first=True, bidirectional=True, num_layers=3)
    self.dropout = nn.Dropout(0.2)

    self.attain = nn.Linear(128*2, 1)

    self.fc = nn.Linear(128*2, 512)
    self.output = nn.Linear(512, 1)

    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    emb = self.embedding(x)
    out, _ = self.lstm(emb)

    attn_weights = self.attain(out)
    attn_weights = torch.softmax(attn_weights, dim=1)

    context_vector = torch.sum(attn_weights*out, dim=1)

    out = self.dropout(context_vector)
    out = F.relu(self.fc(out))
    out = self.output(out)

    return self.sigmoid(out).squeeze()

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = SentimentRNN(len(vocab)).to(device)

In [None]:
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)

In [None]:
all_preds = []
all_labels = []

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

def train_nn(model, train_loader, test_loader, criterion, optimizer, num_epochs):
  best_loss=float('inf')
  no_improve_epochs=0

  for epoch in range(num_epochs):
    model.train()

    train_correct=0
    train_loss=0
    train_total=0
    for inputs, targets in train_loader:
      inputs = inputs.to(device)
      targets = targets.to(device)
      train_total += targets.size(0)

      optimizer.zero_grad()
      outputs = model(inputs)
      loss = criterion(outputs, targets)
      loss.backward()
      optimizer.step()

      predicted=(outputs>0.5).float()
      train_correct += (predicted == targets).sum().item()
      train_loss += loss.item()

    print(f"Epoch: {epoch + 1}\n Loss: {train_loss/len(train_loader):.4f}, training accuracy: {100*train_correct/train_total:.2f}%")

    # evaluating the model
    model.eval()
    test_correct=0
    test_loss=0
    test_total=0
    with torch.no_grad():
      for inputs, targets in test_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        test_total+=targets.size(0)

        outputs=model(inputs)

        loss=criterion(outputs, targets)
        test_loss+=loss.item()

        preds=(outputs>0.5).float()
        test_correct+=(preds==targets).sum().item()
        loss = test_loss/len(test_loader)
        all_preds.extend(preds.cpu().numpy().tolist())
        all_labels.extend(targets.cpu().numpy().tolist())

    print(f"Test accuracy: {100*test_correct/test_total:.2f}%, Loss: {loss:.4f}")

    if loss < best_loss:
      best_loss = loss
      torch.save(model.state_dict(), "best_model.pt")
    else:
      no_improve_epochs += 1
      if no_improve_epochs >3:
        break


In [None]:
train_nn(model, train_loader, test_loader, criterion, optimizer, 20)

Epoch: 1
 Loss: 0.5714, training accuracy: 68.69%
Test accuracy: 77.25%, Loss: 0.4672
Epoch: 2
 Loss: 0.4368, training accuracy: 79.23%
Test accuracy: 79.45%, Loss: 0.4262
Epoch: 3
 Loss: 0.3762, training accuracy: 83.05%
Test accuracy: 82.64%, Loss: 0.3764
Epoch: 4
 Loss: 0.3344, training accuracy: 85.43%
Test accuracy: 83.06%, Loss: 0.3755
Epoch: 5
 Loss: 0.2967, training accuracy: 87.49%
Test accuracy: 83.39%, Loss: 0.3871
Epoch: 6
 Loss: 0.2624, training accuracy: 89.17%
Test accuracy: 83.27%, Loss: 0.3722
Epoch: 7
 Loss: 0.2270, training accuracy: 90.98%
Test accuracy: 83.29%, Loss: 0.4029
Epoch: 8
 Loss: 0.1978, training accuracy: 92.23%
Test accuracy: 82.48%, Loss: 0.4371
Epoch: 9
 Loss: 0.1687, training accuracy: 93.63%
Test accuracy: 83.40%, Loss: 0.4274


In [None]:
def get_predictions_and_labels(model, dataloader, device):
    model.eval()

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            threshold = 0.52
            preds = (outputs > threshold).float()

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return all_labels, all_preds

In [None]:
from sklearn.metrics import classification_report
print(classification_report(all_labels, all_preds, digits=2, zero_division=0))

              precision    recall  f1-score   support

         0.0       0.84      0.78      0.81     44649
         1.0       0.80      0.86      0.83     45351

    accuracy                           0.82     90000
   macro avg       0.82      0.82      0.82     90000
weighted avg       0.82      0.82      0.82     90000

