<a href="https://colab.research.google.com/github/HamsWael/NLP_Proj/blob/main/NLP_Project_MS3_ClassificationModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import nltk

# import tensorflow as tf
# from tensorflow.keras.preprocessing import sequence
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.models import Model
# from tensorflow.keras.layers import Input, Dense, Embedding, SimpleRNN, Concatenate

import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# Read 'train.csv' (resolved against the working directory) into the DataFrame `df`.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Remove the ViewCount, Label and Score columns; every other column is kept.
df = df.drop(columns=['ViewCount', 'Label', 'Score'])
# Bare expression: the notebook renders the resulting frame.
df

Unnamed: 0,Id,Title,Body,LabelNum
0,197234,Drop\stop mobile data connection (non-wifi) by...,<p>Can I set Android 4.4.2 to drop mobile data...,0
1,114800,How to automatically crop text messages when S...,<p>Is there a way to prevent the Messages app ...,0
2,124532,Can't find text message that was to a group,<p>When John Doe texts to a group that include...,0
3,193875,Can't store contacts on my Android phone,<p>I was going through all of my installed app...,0
4,50332,Dropbox on Samsung Galaxy - where is the Setti...,"<p>On a Sony Xperia, the settings button in Dr...",0
...,...,...,...,...
51365,101944,How to securely root g2 phone (AT&T LGE LG-D800)?,"<p>I've been looking to root my phone, only al...",0
51366,194088,Is Fennec F-Droid an official product by Mozilla?,"<p>Is the <a href=""https://f-droid.org/package...",0
51367,57764,Whats the difference between cell phone and da...,<p>Is cell phone and data/WiFi/LTE radiation t...,0
51368,9150,“There are no android phones associated with t...,<p>I recently bought an eLocity A7 Internet Ta...,0


In [None]:
# Delete every literal '<p>' occurrence from the Body text. The pattern is matched
# as a plain substring; other markup in the text is left as it is.
df['Body'] = df['Body'].str.replace('<p>', '', regex=False)
# Bare expression: the notebook renders the resulting frame.
df

Unnamed: 0,Id,Title,Body,LabelNum
0,197234,Drop\stop mobile data connection (non-wifi) by...,Can I set Android 4.4.2 to drop mobile data co...,0
1,114800,How to automatically crop text messages when S...,Is there a way to prevent the Messages app fro...,0
2,124532,Can't find text message that was to a group,When John Doe texts to a group that includes m...,0
3,193875,Can't store contacts on my Android phone,I was going through all of my installed applic...,0
4,50332,Dropbox on Samsung Galaxy - where is the Setti...,"On a Sony Xperia, the settings button in Dropb...",0
...,...,...,...,...
51365,101944,How to securely root g2 phone (AT&T LGE LG-D800)?,"I've been looking to root my phone, only all m...",0
51366,194088,Is Fennec F-Droid an official product by Mozilla?,"Is the <a href=""https://f-droid.org/packages/o...",0
51367,57764,Whats the difference between cell phone and da...,Is cell phone and data/WiFi/LTE radiation the ...,0
51368,9150,“There are no android phones associated with t...,I recently bought an eLocity A7 Internet Table...,0


In [None]:
# Build the single text field fed to the model from the question title and body.
# A space is inserted between the two parts so that the last word of the title and
# the first word of the body remain separate words (plain `Title + Body` glued
# them together, e.g. "...to a groupWhen John...").
# Missing values are replaced by '' first: NaN + str yields NaN, and a NaN text
# would be turned into the literal string 'nan' once it is passed through str().
df['txt_Concatenated'] = df['Title'].fillna('') + ' ' + df['Body'].fillna('')

# Keep only the model input text and its numeric label, in this order.
desired_columns = ['txt_Concatenated', 'LabelNum']
df = df.reindex(columns=desired_columns)
# Bare expression: the notebook renders the resulting frame.
df


Unnamed: 0,txt_Concatenated,LabelNum
0,Drop\stop mobile data connection (non-wifi) by...,0
1,How to automatically crop text messages when S...,0
2,Can't find text message that was to a groupWhe...,0
3,Can't store contacts on my Android phoneI was ...,0
4,Dropbox on Samsung Galaxy - where is the Setti...,0
...,...,...
51365,How to securely root g2 phone (AT&T LGE LG-D80...,0
51366,Is Fennec F-Droid an official product by Mozil...,0
51367,Whats the difference between cell phone and da...,0
51368,“There are no android phones associated with t...,0


**Preparing the Dataset for the Model**

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text sample per lookup.

    Tokenization, padding and truncation happen inside ``__getitem__`` so that
    every sample handed to a ``DataLoader`` already has a fixed length.

    Args:
        texts: Collection of raw texts supporting ``len()`` and integer
            indexing (e.g. a list or NumPy array).
        labels: Collection of integer class labels, indexed like ``texts``.
        tokenizer: Callable tokenizer invoked as ``tokenizer(text, **kwargs)``
            that returns a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Sequence length every sample is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples, i.e. ``len(texts)``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded sample at position ``idx``.

        Returns:
            dict with the keys ``'text'`` (the text as ``str``), ``'input_ids'``
            and ``'attention_mask'`` (flattened tensors from the tokenizer) and
            ``'label'`` (``torch.long`` tensor).
        """
        # str() guards against non-string entries (numbers, NaN) in `texts`.
        text = str(self.texts[idx])
        label = self.labels[idx]

        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`;
        # together with `truncation=True` every sequence comes out with exactly
        # `max_len` positions.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # The tokenizer output carries a leading batch axis for the single
            # text; flatten() drops it so the DataLoader can stack samples.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long),
        }

**Loading the BERT tokenizer and model**

In [None]:
# The tokenizer and the classifier are both loaded from the 'prajjwal1/bert-tiny' checkpoint.
tokenizer = BertTokenizer.from_pretrained('prajjwal1/bert-tiny')

# num_labels=2 configures the sequence-classification model for two output classes.
model = BertForSequenceClassification.from_pretrained(
    'prajjwal1/bert-tiny',
    num_labels=2,
)

**Train-Test Split**

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    df['txt_Concatenated'],
    df['LabelNum'],
    test_size=0.2,
    random_state=42,
)

# Wrap each split in a CustomDataset. `.values` hands over plain arrays, so the
# dataset indexes samples by position 0..n-1.
train_dataset = CustomDataset(
    texts=X_train.values,
    labels=y_train.values,
    tokenizer=tokenizer,
    max_len=128,
)
test_dataset = CustomDataset(
    texts=X_test.values,
    labels=y_test.values,
    tokenizer=tokenizer,
    max_len=128,
)

# Training batches are reshuffled on every pass so the model does not see the
# samples in one fixed order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)

# Test batches keep their order, which keeps the evaluation pass consistent
# from run to run.
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

# **Training the Model**

In [None]:
# Training setup: run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# NOTE: the loop below reads the loss from the model output (labels are passed
# to the forward call); `criterion` is created here but not called in this cell.
criterion = torch.nn.CrossEntropyLoss()

# Fine-tune for a fixed number of passes over the training loader.
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}', unit='batch'):
        # Forward pass; tensors are moved to the target device as they are passed in.
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['label'].to(device),
        )
        loss = outputs.loss

        # Clear stale gradients, backpropagate, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate the batch loss as a Python float for the epoch average.
        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    average_train_loss = total_loss / len(train_loader)
    print(f"Average training loss: {average_train_loss}")




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████| 2569/2569 [14:35<00:00,  2.93batch/s]


Average training loss: 0.15394362207623935


Epoch 2/3: 100%|██████████| 2569/2569 [15:44<00:00,  2.72batch/s]


Average training loss: 0.0669902520298301


Epoch 3/3: 100%|██████████| 2569/2569 [13:59<00:00,  3.06batch/s]

Average training loss: 0.05311315695979223





# **Model Evaluation**

In [None]:
# Evaluation pass: switch the model to eval mode and disable gradient tracking.
model.eval()
total_correct, total_samples = 0, 0
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Evaluating', unit='batch'):
        labels = batch['label'].to(device)

        # No labels are passed here, so only the logits of the output are used.
        logits = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits

        # Predicted class = index of the largest logit in each row.
        predicted = logits.argmax(dim=1)

        total_samples += labels.size(0)
        total_correct += (predicted == labels).sum().item()

# Fraction of test samples whose predicted class equals the label.
accuracy = total_correct / total_samples
print(f"Accuracy: {accuracy}")



Evaluating: 100%|██████████| 643/643 [01:21<00:00,  7.91batch/s]

Accuracy: 0.980533385244306



