<a href="https://colab.research.google.com/github/HamsWael/NLP_Proj/blob/main/NLP_Project_MS3_RegressionModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

***Regression Model***

In [78]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertConfig, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [79]:
df = pd.read_csv('train.csv')

In [80]:
df.drop(['ViewCount','Label', 'LabelNum','Id'],axis=1)

Unnamed: 0,Title,Body,Score
0,Drop\stop mobile data connection (non-wifi) by...,<p>Can I set Android 4.4.2 to drop mobile data...,0
1,How to automatically crop text messages when S...,<p>Is there a way to prevent the Messages app ...,0
2,Can't find text message that was to a group,<p>When John Doe texts to a group that include...,1
3,Can't store contacts on my Android phone,<p>I was going through all of my installed app...,0
4,Dropbox on Samsung Galaxy - where is the Setti...,"<p>On a Sony Xperia, the settings button in Dr...",1
...,...,...,...
51365,How to securely root g2 phone (AT&T LGE LG-D800)?,"<p>I've been looking to root my phone, only al...",4
51366,Is Fennec F-Droid an official product by Mozilla?,"<p>Is the <a href=""https://f-droid.org/package...",2
51367,Whats the difference between cell phone and da...,<p>Is cell phone and data/WiFi/LTE radiation t...,1
51368,“There are no android phones associated with t...,<p>I recently bought an eLocity A7 Internet Ta...,4


In [81]:
#Some cleaning to the Dataset and dropping unnecessary columns
df['Body'] = df['Body'].str.replace('<p>', '')
df['txt_Concatenated'] = df['Title'] + df['Body']
desired_columns = ['txt_Concatenated','Score']
df = df.reindex(columns=desired_columns)
df


Unnamed: 0,txt_Concatenated,Score
0,Drop\stop mobile data connection (non-wifi) by...,0
1,How to automatically crop text messages when S...,0
2,Can't find text message that was to a groupWhe...,1
3,Can't store contacts on my Android phoneI was ...,0
4,Dropbox on Samsung Galaxy - where is the Setti...,1
...,...,...
51365,How to securely root g2 phone (AT&T LGE LG-D80...,4
51366,Is Fennec F-Droid an official product by Mozil...,2
51367,Whats the difference between cell phone and da...,1
51368,“There are no android phones associated with t...,4


In [82]:
df = df.reset_index(drop=True)

In [83]:
texts = df['txt_Concatenated'].tolist()
scores = df['Score'].tolist()

In [84]:
train_texts, test_texts, train_scores, test_scores = train_test_split(texts, scores, test_size=0.2, random_state=42)

In [85]:
# Defining Dataset class
class CustomDataset(Dataset):
    def __init__(self, texts, scores, tokenizer, max_len):
        self.texts = texts
        self.scores = scores
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        score = float(self.scores[idx])

        inputs = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_len, return_tensors='pt')

        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'score': torch.tensor(score, dtype=torch.float)
        }


Initializing tokenizer and model

In [86]:
tokenizer = BertTokenizer.from_pretrained('prajjwal1/bert-tiny')
model = BertModel.from_pretrained('prajjwal1/bert-tiny')

# Define model architecture for regression
class RegressionBert(torch.nn.Module):
    def __init__(self, bert):
        super(RegressionBert, self).__init__()
        self.bert = bert
        self.linear = torch.nn.Linear(128, 1)  # Assuming TinyBERT has hidden size of 128

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state.mean(dim=1)
        score = self.linear(pooled_output)
        return score


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

**Calling the Model and Training it **

In [87]:
model_regression = RegressionBert(model)
train_dataset = CustomDataset(train_texts, train_scores, tokenizer, max_len=128)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Training the model
optimizer = AdamW(model_regression.parameters(), lr=2e-5)
criterion = torch.nn.MSELoss()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_regression.to(device)



RegressionBert(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affi

In [88]:
for epoch in range(5):  # Train for 5 epochs
    model_regression.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        target = batch['score'].unsqueeze(1).to(device)

        optimizer.zero_grad()
        output = model_regression(input_ids, attention_mask)
        loss = criterion(output, target)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {total_loss}')

Epoch 1, Loss: 94529.64983081818
Epoch 2, Loss: 92121.84314161539
Epoch 3, Loss: 90424.09765303135
Epoch 4, Loss: 88517.39712738991
Epoch 5, Loss: 86728.92257028818


***Evaluate the Model***

In [90]:
model_regression.eval()
test_dataset = CustomDataset(test_texts, test_scores, tokenizer, max_len=128)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

predictions = []
targets = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        target = batch['score'].unsqueeze(1).to(device)

        output = model_regression(input_ids, attention_mask)

        predictions.extend(output.cpu().numpy())
        targets.extend(target.cpu().numpy())

predictions = np.array(predictions).flatten()
targets = np.array(targets).flatten()

mse = mean_squared_error(targets, predictions)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 25.6873836517334


# **Test real user inputs **

In [110]:
def approximate_score(x):
    # Round the average score to the nearest whole number
    rounded_score = round(predicted_score)
    return rounded_score

In [101]:
user_text = input("")

Android is better than iPhone 


In [111]:
user_inputs = tokenizer(user_text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
user_input_ids = user_inputs['input_ids'].to(device)
user_attention_mask = user_inputs['attention_mask'].to(device)

# Get the predicted score
with torch.no_grad():
    user_output = model_regression(user_input_ids, user_attention_mask)

predicted_score = approximate_score(user_output.item())
print(f'Predicted Score: {predicted_score}')

Predicted Score: 2
