In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import time

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import precision_recall_curve, auc, accuracy_score, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns # plotting problem

!pip install contractions
import contractions # expanding contractions in text can't -> cannot

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (

In [2]:
torch.backends.cudnn.deterministic = True
random.seed(21)
np.random.seed(21)
torch.manual_seed(21)
torch.cuda.manual_seed_all(21)

In [3]:
import wandb
wandb.login(key = '02c8923278a3dc82932fafb9959cd6d7587dacc7')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [4]:
df = pd.read_json('/kaggle/input/amazon-product-review-spam-and-non-spam/Toys_and_Games/Toys_and_Games.json', lines=True).head(100000)
df = df.loc[:, ['reviewText', 'class']]

In [5]:
df['reviewText'] = df['reviewText'].apply(lambda x: contractions.fix(x))
df.loc[:, 'reviewText'] = df['reviewText'].str.lower()

# \W represents Special characters like "$" and "!!!"
df.loc[:, 'reviewText'] = df['reviewText'].str.replace('\W', ' ')

# \d represents Numeric digits like "19.99"
df.loc[:, 'reviewText'] = df['reviewText'].str.replace('\d', ' ')

df

Unnamed: 0,reviewText,class
0,i love these felt nursery rhyme characters and...,1
1,i see no directions for its use. therefore i h...,0
2,this is a great tool for any teacher using the...,1
3,"great product, thank you! our son loved the pu...",1
4,although not as streamlined as the algebra i m...,1
...,...,...
99995,received this product in a timely fashion. i m...,0
99996,mcfarlane sports series are fantastic and life...,1
99997,fortune is a good figure. she has a very attra...,1
99998,i just thought that i would jot a few words to...,0


In [6]:
def pr_auc_score(y_test, y_pred):
    precision, recall, _ = precision_recall_curve(y_test, y_pred)
    pr_auc = auc(recall, precision)
    return pr_auc

In [7]:
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(df['reviewText'], y, test_size=0.33, random_state=21)

In [8]:
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [9]:
# Define the LSTM model
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super(LSTMClassifier, self).__init__()

        # Load pretrained BERT for feature extraction
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state

        _, (hidden, _) = self.lstm(embeddings)

        if self.lstm.bidirectional:
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            hidden = hidden[-1, :, :]

        hidden = self.dropout(hidden)
        return self.fc(hidden)

In [10]:
# GRU classifier definition
class GRUClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super(GRUClassifier, self).__init__()

        # Load BERT for feature extraction
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state

        _, hidden = self.gru(embeddings)

        if self.gru.bidirectional:
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            hidden = hidden[-1, :, :]

        hidden = self.dropout(hidden)
        return self.fc(hidden)

In [11]:
# Training loop
def train_model(model, data_loader, optimizer, criterion, device, epoch):
    model.train()
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}], Loss: {loss.item():.4f}')
    wandb.log({"epoch": epoch,"loss": loss})

In [12]:
# Evaluation loop
def eval_model(model, data_loader, device):
    model.eval()
    predictions = []
    labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            label = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, pred = torch.max(outputs, dim=1)

            predictions.extend(pred.cpu().numpy())
            labels.extend(label.cpu().numpy())

    return accuracy_score(labels, predictions), pr_auc_score(labels, predictions), classification_report(labels, predictions)

In [13]:
def train_eval(model, train_data_loader, test_data_loader, criterion, optimizer, EPOCHS, device):
    wandb.watch(model, criterion, log="all", log_freq=10)

    start_time = time.time()
    # Train and evaluate the model
    EPOCHS = EPOCHS

    for epoch in range(EPOCHS):
        train_model(model, train_data_loader, optimizer, criterion, device, epoch)

    accuracy, pr_auc, report = eval_model(model, test_data_loader, device)

    wandb.log({"test_accuracy": accuracy})
    wandb.log({"test_pr_auc": pr_auc})

    print(f"Accuracy: {accuracy:.4f}")
    print(f"PR_AUC: {pr_auc:.4f}")
    print(f"Classification Report:\n{report}")

    elapsed_time = time.time() - start_time
    wandb.log({"run_time_sec": elapsed_time})
    wandb.finish()

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertModel

# Define the dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Set up the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create the DataLoader instances
def create_data_loader(X, y, tokenizer, max_length, batch_size):
    dataset = TextDataset(X, y, tokenizer, max_length)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

BATCH_SIZE = 64
MAX_LENGTH = 128

train_data_loader = create_data_loader(X_train, y_train, tokenizer, MAX_LENGTH, BATCH_SIZE)
test_data_loader = create_data_loader(X_test, y_test, tokenizer, MAX_LENGTH, BATCH_SIZE)



wandb.init(project='RNN Spam Detection', name='LSTM BERT WEmbedding')

model = LSTMClassifier(embedding_dim=768, hidden_dim=256, output_dim=2, n_layers=2, bidirectional=True, dropout=0.3)
model = torch.nn.DataParallel(model)
model = model.to(device)

# Set the optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss().to(device)

EPOCHS = 15
train_eval(model, train_data_loader, test_data_loader, criterion, optimizer, EPOCHS, device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

[34m[1mwandb[0m: Currently logged in as: [33mmint21[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.17.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240510_043215-84mffxax[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mLSTM BERT WEmbedding[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/mint21/RNN%20Spam%20Detection[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/mint21/RNN%20Spam%20Detection/runs/84mffxax[0m


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch [1], Loss: 0.3701
Epoch [2], Loss: 0.1930
Epoch [3], Loss: 0.2330
Epoch [4], Loss: 0.1948
Epoch [5], Loss: 0.0997
Epoch [6], Loss: 0.1975
Epoch [7], Loss: 0.0864
Epoch [8], Loss: 0.1262
Epoch [9], Loss: 0.1545
Epoch [10], Loss: 0.0547
Epoch [11], Loss: 0.0541
Epoch [12], Loss: 0.0592
Epoch [13], Loss: 0.0475
Epoch [14], Loss: 0.0827
Epoch [15], Loss: 0.1039
Accuracy: 0.9223
PR_AUC: 0.9717
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.66      0.70      4420
           1       0.95      0.96      0.96     28580

    accuracy                           0.92     33000
   macro avg       0.84      0.81      0.83     33000
weighted avg       0.92      0.92      0.92     33000



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:         epoch ▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
[34m[1mwandb[0m:          loss █▄▅▄▂▄▂▃▃▁▁▁▁▂▂
[34m[1mwandb[0m:  run_time_sec ▁
[34m[1mwandb[0m: test_accuracy ▁
[34m[1mwandb[0m:   test_pr_auc ▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:         epoch 14
[34m[1mwandb[0m:          loss 0.10393
[34m[1mwandb[0m:  run_time_sec 9951.03523
[34m[1mwandb[0m: test_accuracy 0.92233
[34m[1mwandb[0m:   test_pr_auc 0.9717
[34m[1mwandb[0m: 
[34m[1mwandb[0m: 🚀 View run [33mLSTM BERT WEmbedding[0m at: [34m[4mhttps://wandb.ai/mint21/RNN%20Spam%20Detection/runs/84mffxax[0m
[34m[1mwandb[0m: ⭐️ View project at: [34m[4mhttps://wandb.ai/mint21/RNN%20Spam%20Detection[0m
[34m[1mwandb[0m: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
[34m[1mwandb[0m:

In [15]:
wandb.init(project='RNN Spam Detection', name='GRU BERT WEmbedding')

model = GRUClassifier(embedding_dim=768, hidden_dim=256, output_dim=2, n_layers=2, bidirectional=True, dropout=0.3)
model = torch.nn.DataParallel(model)
model = model.to(device)

# Set the optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss().to(device)

EPOCHS = 15
train_eval(model, train_data_loader, test_data_loader, criterion, optimizer, EPOCHS, device)

[34m[1mwandb[0m: wandb version 0.17.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240510_071831-xidj9mig[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mGRU BERT WEmbedding[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/mint21/RNN%20Spam%20Detection[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/mint21/RNN%20Spam%20Detection/runs/xidj9mig[0m


Epoch [1], Loss: 0.1741
Epoch [2], Loss: 0.2902
Epoch [3], Loss: 0.1715
Epoch [4], Loss: 0.2488
Epoch [5], Loss: 0.2523
Epoch [6], Loss: 0.0845
Epoch [7], Loss: 0.1462
Epoch [8], Loss: 0.0792
Epoch [9], Loss: 0.1014
Epoch [10], Loss: 0.0453
Epoch [11], Loss: 0.1247
Epoch [12], Loss: 0.0421
Epoch [13], Loss: 0.0460
Epoch [14], Loss: 0.0471
Epoch [15], Loss: 0.0324
Accuracy: 0.9260
PR_AUC: 0.9751
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.71      0.72      4420
           1       0.96      0.96      0.96     28580

    accuracy                           0.93     33000
   macro avg       0.84      0.84      0.84     33000
weighted avg       0.93      0.93      0.93     33000



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:         epoch ▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
[34m[1mwandb[0m:          loss ▅█▅▇▇▂▄▂▃▁▄▁▁▁▁
[34m[1mwandb[0m:  run_time_sec ▁
[34m[1mwandb[0m: test_accuracy ▁
[34m[1mwandb[0m:   test_pr_auc ▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:         epoch 14
[34m[1mwandb[0m:          loss 0.03245
[34m[1mwandb[0m:  run_time_sec 9817.28349
[34m[1mwandb[0m: test_accuracy 0.926
[34m[1mwandb[0m:   test_pr_auc 0.9751
[34m[1mwandb[0m: 
[34m[1mwandb[0m: 🚀 View run [33mGRU BERT WEmbedding[0m at: [34m[4mhttps://wandb.ai/mint21/RNN%20Spam%20Detection/runs/xidj9mig[0m
[34m[1mwandb[0m: ⭐️ View project at: [34m[4mhttps://wandb.ai/mint21/RNN%20Spam%20Detection[0m
[34m[1mwandb[0m: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
[34m[1mwandb[0m: Fi