# **Install required libraries:**

In [32]:
pip install transformers pandas openpyxl scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [9]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

# **Load the data from the Excel file**

In [4]:
# Load the labelled messages from the Excel workbook into a DataFrame.
data = pd.read_excel("/kaggle/input/samplespam1/sample12.xlsx")
# Bare last expression so the notebook renders the frame.
data

Unnamed: 0,body,label
0,Do you need only 1 neoprene game organiser for...,0
1,"Hello,\nI hope you are doing great?\n am here ...",1
2,"hello, \n\nI just recently learn about the eve...",0
3,"Hello,\nAm an experienced marketer specialized...",1
4,"hello Steve,how are you doing?? I discovered y...",1
...,...,...
497,Hi there! I trust this message finds you well....,1
498,"Hi everybody,\n\nI know I'm late but I would l...",0
499,Hello\nI need some information from you base o...,0
500,hello,0


In [5]:
# Pull the message bodies and their labels out of the frame as plain Python lists.
texts, labels = (data[column].tolist() for column in ("body", "label"))

# **Preprocess the data and tokenize it using the chosen tokenizer:**

In [6]:
from transformers import RobertaTokenizer

# Hold out 20% of the examples for testing; the fixed seed keeps the split reproducible.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Tokenizer that belongs to the pre-trained "roberta-base" checkpoint.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [7]:
def tokenize_data(texts, labels):
    """Encode texts with the notebook-level ``tokenizer`` and tensorize the labels.

    Args:
        texts: List of input strings.
        labels: List of labels, one per text.

    Returns:
        A ``(encoding, label_tensor)`` pair. ``encoding`` is the tokenizer output
        as PyTorch tensors, padded to the longest sequence of this call and
        truncated to at most 512 tokens.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": 512,
        "return_tensors": "pt",
    }
    batch_encoding = tokenizer(texts, **tokenizer_options)
    label_tensor = torch.tensor(labels)
    return batch_encoding, label_tensor

In [10]:
# Encode each split separately; `padding=True` pads to the longest sequence
# within a call, so the two splits may end up with different widths.
train_encoded_texts, train_encoded_labels = tokenize_data(train_texts, train_labels)
test_encoded_texts, test_encoded_labels = tokenize_data(test_texts, test_labels)


# **Create a PyTorch dataset and data loader:**

In [11]:
from torch.utils.data import TensorDataset, DataLoader

# Each dataset item is an (input_ids, attention_mask, label) triple.
train_dataset = TensorDataset(
    train_encoded_texts["input_ids"],
    train_encoded_texts["attention_mask"],
    train_encoded_labels,
)
test_dataset = TensorDataset(
    test_encoded_texts["input_ids"],
    test_encoded_texts["attention_mask"],
    test_encoded_labels,
)


In [12]:
# Mini-batches of 8. Only the training loader is shuffled; the test loader
# keeps the dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# **Load the pre-trained model and set it up for fine-tuning:**

In [13]:
from transformers import RobertaForSequenceClassification
import torch

# Pre-trained RoBERTa encoder with a 2-class classification head on top.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

# AdamW comes from PyTorch: the copy bundled with transformers is deprecated in
# favour of torch.optim.AdamW and is not importable from newer releases.
# NOTE: torch.optim.AdamW defaults to weight_decay=0.01 and eps=1e-8, whereas the
# transformers implementation defaulted to 0.0 and 1e-6; pass both explicitly when
# building the optimizer if earlier runs must be reproduced.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [14]:
# eps and weight_decay are spelled out because the defaults differ between
# AdamW implementations (transformers: 1e-6 / 0.0, torch.optim: 1e-8 / 0.01);
# the values below are the ones the transformers implementation uses by default.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Linear decay of the learning rate to zero, without warm-up. The factor 10 is
# the number of training epochs and must match the training loop.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) * 10,
)



In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# **Define the training loop and fine-tune the model:**

In [17]:
# Fine-tune for 10 epochs; this count must stay in sync with the
# num_training_steps given to the learning-rate scheduler.
# `epoch` is intentionally left in scope: later cells print `epoch + 1`.
for epoch in range(10):
    model.train()  # training mode (dropout active) at the start of every epoch
    for batch in train_dataloader:
        # Batch-local names: unpacking into `labels` would overwrite the
        # notebook-level `labels` list that the train/test split is built from.
        batch_input_ids, batch_attention_mask, batch_labels = (
            tensor.to(device) for tensor in batch
        )
        # Passing labels makes the model compute the classification loss itself.
        outputs = model(
            batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule advances once per optimizer step
        optimizer.zero_grad()


# **Evaluate the fine-tuned model on Training Data:**

In [20]:
# Switch to inference mode and collect predictions for the training split.
model.eval()
train_predictions, train_true_labels = [], []
with torch.no_grad():  # no gradients are needed for evaluation
    for batch in train_dataloader:
        input_ids, attention_mask, labels = (t.to(device) for t in batch)
        outputs = model(input_ids, attention_mask=attention_mask)
        # The predicted class is the index of the largest logit.
        predictions = outputs.logits.argmax(dim=1)
        train_predictions.extend(predictions.cpu().numpy())
        train_true_labels.extend(labels.cpu().numpy())

In [21]:
# Score the training-set predictions. average='macro' takes the unweighted
# mean of the per-class precision / recall.
train_accuracy = accuracy_score(train_true_labels, train_predictions)
train_precision = precision_score(train_true_labels, train_predictions, average='macro')
train_recall = recall_score(train_true_labels, train_predictions, average='macro')
train_confusion_matrix = confusion_matrix(train_true_labels, train_predictions)

# **Metrics on Training Data:**

In [22]:
    # `epoch` is the variable left over from the training loop, so the header
    # shows the number of the last completed epoch.
    print(f"Epoch {epoch + 1} - Training Metrics:")
    print(f"Accuracy: {train_accuracy:.2f}")
    print(f"Precision: {train_precision:.2f}")
    print(f"Recall: {train_recall:.2f}")
    print(f"Confusion Matrix:\n{train_confusion_matrix}\n")

Epoch 10 - Training Metrics:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
Confusion Matrix:
[[169   1]
 [  0 231]]



# **Evaluate the fine-tuned model on Testing Data:**

In [25]:
# Put the model in inference mode here as well, so this cell gives correct
# results (dropout disabled) even when it is run right after training.
model.eval()

# Collect predictions and ground-truth labels for the held-out test split.
test_predictions = []
test_true_labels = []
for batch in test_dataloader:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    with torch.no_grad():  # no gradients are needed for evaluation
        outputs = model(input_ids, attention_mask=attention_mask)
    # The predicted class is the index of the largest logit.
    predictions = torch.argmax(outputs.logits, dim=1)
    test_predictions.extend(predictions.cpu().numpy())
    test_true_labels.extend(labels.cpu().numpy())


In [26]:
    # Score the test-set predictions. average='macro' takes the unweighted
    # mean of the per-class precision / recall.
    test_accuracy = accuracy_score(test_true_labels, test_predictions)
    test_precision = precision_score(test_true_labels, test_predictions, average='macro')
    test_recall = recall_score(test_true_labels, test_predictions, average='macro')
    test_confusion_matrix = confusion_matrix(test_true_labels, test_predictions)


# **Metrics on Testing Data:**

In [27]:
    # `epoch` is the variable left over from the training loop, so the header
    # shows the number of the last completed epoch.
    print(f"Epoch {epoch + 1} - Testing Metrics:")
    print(f"Accuracy: {test_accuracy:.2f}")
    print(f"Precision: {test_precision:.2f}")
    print(f"Recall: {test_recall:.2f}")
    print(f"Confusion Matrix:\n{test_confusion_matrix}\n")

Epoch 10 - Testing Metrics:
Accuracy: 0.91
Precision: 0.91
Recall: 0.91
Confusion Matrix:
[[43  5]
 [ 4 49]]



# **Save the fine-tuned model to local disk**

In [28]:
# Write the fine-tuned weights and config to disk.
model.save_pretrained("./fine-tuned-model")
# Store the tokenizer files next to the model so the directory can be reloaded
# on its own with from_pretrained("./fine-tuned-model").
tokenizer.save_pretrained("./fine-tuned-model")

# **Create an inference function for future predictions**

In [30]:
def inference(model, tokenizer, text, max_length=512):
    """Predict the class index of a single text.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors when called with ``return_tensors="pt"``.
        text: The string to classify.
        max_length: Maximum number of tokens; longer inputs are truncated,
            mirroring the truncation applied to the training data.

    Returns:
        The index of the highest-scoring class as a Python ``int``.
    """
    # Run on whatever device the model lives on instead of relying on a global.
    target_device = next(model.parameters()).device

    # Let the tokenizer build the attention mask and truncate over-long inputs,
    # which would otherwise exceed the model's maximum sequence length.
    encoded = tokenizer(text, truncation=True, max_length=max_length, return_tensors="pt")
    input_ids = encoded["input_ids"].to(target_device)
    attention_mask = encoded["attention_mask"].to(target_device)

    # Predict in eval mode (dropout off), then restore the caller's mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(input_ids, attention_mask=attention_mask).logits
    finally:
        model.train(was_training)

    return torch.argmax(logits, dim=1).item()

In [31]:
# Smoke-test the inference helper on a single hand-written message.
new_text = "Hello,I hope you are doing great? Iam here to help you to promote your campaign to a real and targeted audience for you to have a high chance of getting support from them. Kindly let me know your thoughts to discuss better"
prediction = inference(model, tokenizer, new_text)
# The printed value is the class index returned by inference().
print(f"Prediction for the Given input is '{prediction}'")

Prediction for the Given input is '1'
