In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import time

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import precision_recall_curve, auc, accuracy_score, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns # plotting problem

!pip install contractions
import contractions # expanding contractions in text can't -> cannot

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl 

In [2]:
torch.backends.cudnn.deterministic = True
random.seed(21)
np.random.seed(21)
torch.manual_seed(21)
torch.cuda.manual_seed_all(21)

In [3]:
import wandb
wandb.login(key = '02c8923278a3dc82932fafb9959cd6d7587dacc7')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [4]:
df = pd.read_json('/kaggle/input/Toys_and_Games/Toys_and_Games.json', lines=True).head(100000)
df = df.loc[:, ['reviewText', 'class']]

In [5]:
df['reviewText'] = df['reviewText'].apply(lambda x: contractions.fix(x))
df.loc[:, 'reviewText'] = df['reviewText'].str.lower()

# \W represents Special characters like "$" and "!!!"
df.loc[:, 'reviewText'] = df['reviewText'].str.replace('\W', ' ')

# \d represents Numeric digits like "19.99"
df.loc[:, 'reviewText'] = df['reviewText'].str.replace('\d', ' ')

df

Unnamed: 0,reviewText,class
0,i love these felt nursery rhyme characters and...,1
1,i see no directions for its use. therefore i h...,0
2,this is a great tool for any teacher using the...,1
3,"great product, thank you! our son loved the pu...",1
4,although not as streamlined as the algebra i m...,1
...,...,...
99995,received this product in a timely fashion. i m...,0
99996,mcfarlane sports series are fantastic and life...,1
99997,fortune is a good figure. she has a very attra...,1
99998,i just thought that i would jot a few words to...,0


In [6]:
def pr_auc_score(y_test, y_pred):
    precision, recall, _ = precision_recall_curve(y_test, y_pred)
    pr_auc = auc(recall, precision)
    return pr_auc

In [7]:
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(df['reviewText'], y, test_size=0.33, random_state=21)

In [8]:
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [9]:
# Define the LSTM model
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super(LSTMClassifier, self).__init__()

        # Load pretrained BERT for feature extraction
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state

        _, (hidden, _) = self.lstm(embeddings)

        if self.lstm.bidirectional:
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            hidden = hidden[-1, :, :]

        hidden = self.dropout(hidden)
        return self.fc(hidden)

In [10]:
class BiLSTMClassifier(nn.Module):
    def __init__(self, bert, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super(BiLSTMClassifier, self).__init__()
        self.bert = bert
        self.lstm = nn.LSTM(
            bert.config.hidden_size,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            batch_first=True,
            dropout=0 if n_layers < 2 else dropout
        )
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():
            embedded = self.bert(input_ids=input_ids, attention_mask=attention_mask)[0]
        
        lstm_out, (hidden, cell) = self.lstm(embedded)

        if self.lstm.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        else:
            hidden = self.dropout(hidden[-1,:,:])

        output = self.fc(hidden)
        return output

In [11]:
# Training loop
def train_model(model, data_loader, optimizer, criterion, device, epoch):
    model.train()

    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        acc = (outputs.argmax(1) == labels).sum().item() / len(labels)

        loss.backward()
        optimizer.step()
        
    print(f'Epoch [{epoch+1}], Loss: {loss.item():.4f}')
    wandb.log({"epoch": epoch,"loss": loss})

In [12]:
# Evaluation loop
def eval_model(model, data_loader, device):
    model.eval()
    test_preds = []
    test_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            preds = outputs.argmax(1)

            test_preds.extend(preds.cpu().numpy())
            test_labels.extend(labels.cpu().numpy())

    return accuracy_score(test_labels, test_preds), pr_auc_score(test_labels, test_preds), classification_report(test_labels, test_preds)

In [13]:
def train_eval(model, train_data_loader, test_data_loader, criterion, optimizer, EPOCHS, device):
    wandb.watch(model, criterion, log="all", log_freq=10)

    start_time = time.time()
    # Train and evaluate the model
    EPOCHS = EPOCHS

    for epoch in range(EPOCHS):
        train_model(model, train_data_loader, optimizer, criterion, device, epoch)

    accuracy, pr_auc, report = eval_model(model, test_data_loader, device)

    wandb.log({"test_accuracy": accuracy})
    wandb.log({"test_pr_auc": pr_auc})

    print(f"Accuracy: {accuracy:.4f}")
    print(f"PR_AUC: {pr_auc:.4f}")
    print(f"Classification Report:\n{report}")

    elapsed_time = time.time() - start_time
    wandb.log({"run_time_sec": elapsed_time})
    wandb.finish()

In [14]:
class SpamDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [15]:
wandb.finish()

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertModel


# Set up the BERT 
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

MAX_LEN = 128
BATCH_SIZE = 64
HIDDEN_DIM = 256
OUTPUT_DIM = 2
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.3
LR = 2e-5

train_dataset = SpamDataset(X_train.values, y_train.values, tokenizer, MAX_LEN)
test_dataset = SpamDataset(X_test.values, y_test.values, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

wandb.init(project='RNN Spam Detection', name='biLSTM BERT WEmbedding')

model = BiLSTMClassifier(bert_model, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)
model = torch.nn.DataParallel(model)
model = model.to(device)

# Set the optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss().to(device)

EPOCHS = 15
train_eval(model, train_loader, test_loader, criterion, optimizer, EPOCHS, device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

[34m[1mwandb[0m: Currently logged in as: [33mmint21[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.17.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240520_085920-g862lfq0[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mbiLSTM BERT WEmbedding[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/mint21/RNN%20Spam%20Detection[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/mint21/RNN%20Spam%20Detection/runs/g862lfq0[0m


Epoch [1], Loss: 0.4468
Epoch [2], Loss: 0.1849
Epoch [3], Loss: 0.2513
Epoch [4], Loss: 0.2648
Epoch [5], Loss: 0.1205
Epoch [6], Loss: 0.2758
Epoch [7], Loss: 0.1313
Epoch [8], Loss: 0.1697
Epoch [9], Loss: 0.2507
Epoch [10], Loss: 0.0712
Epoch [11], Loss: 0.1316
Epoch [12], Loss: 0.1743
Epoch [13], Loss: 0.0942
Epoch [14], Loss: 0.1109
Epoch [15], Loss: 0.1174
Accuracy: 0.9260
PR_AUC: 0.9706
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.63      0.70      4420
           1       0.95      0.97      0.96     28580

    accuracy                           0.93     33000
   macro avg       0.86      0.80      0.83     33000
weighted avg       0.92      0.93      0.92     33000



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:         epoch ▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
[34m[1mwandb[0m:          loss █▃▄▅▂▅▂▃▄▁▂▃▁▂▂
[34m[1mwandb[0m:  run_time_sec ▁
[34m[1mwandb[0m: test_accuracy ▁
[34m[1mwandb[0m:   test_pr_auc ▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:         epoch 14
[34m[1mwandb[0m:          loss 0.11741
[34m[1mwandb[0m:  run_time_sec 11945.19893
[34m[1mwandb[0m: test_accuracy 0.926
[34m[1mwandb[0m:   test_pr_auc 0.97057
[34m[1mwandb[0m: 
[34m[1mwandb[0m: 🚀 View run [33mbiLSTM BERT WEmbedding[0m at: [34m[4mhttps://wandb.ai/mint21/RNN%20Spam%20Detection/runs/g862lfq0[0m
[34m[1mwandb[0m: ⭐️ View project at: [34m[4mhttps://wandb.ai/mint21/RNN%20Spam%20Detection[0m
[34m[1mwandb[0m: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
[34m[1mwandb[0