<a href="https://colab.research.google.com/github/JeanMusenga/PhD-Thesis_2024_Musenga/blob/main/Simple_BERT_ARPs_ClassificationWithoutPredictionMetrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import resources:
https://chatgpt.com/share/01303bfd-981b-448e-b51c-4ac8bad51dc5

In [None]:
#pip install datasets

In [30]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import DataLoader, Dataset

In [31]:
df=pd.read_excel('ARPs_and_ProgrammingPosts.xlsx')
df = df.dropna(subset=['Question_body', 'Label'])

# Step 2: Preprocess the Data

In [9]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 128  # Set the maximum length of the input text

In [11]:
# Split the data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(df['Question_body'], df['Label'], test_size=0.2)


In [12]:
# Create datasets
train_dataset = TextDataset(train_texts.tolist(), train_labels.tolist(), tokenizer, max_length)
val_dataset = TextDataset(val_texts.tolist(), val_labels.tolist(), tokenizer, max_length)


# Step 3: Initialize the BERT Model

In [14]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Step 4: Train the Model

In [None]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the Model
trainer.train()

# Evaluate the Model
trainer.evaluate()



Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


# Step 5: Make Predictions

In [37]:
def predict(text, tokenizer, model, max_length):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    outputs = model(input_ids, attention_mask=attention_mask)
    _, prediction = torch.max(outputs.logits, dim=1)

    return prediction.item()

# Prediction Results

In [36]:
# Example prediction
#example_text = "Your text to classify here"
dataset=pd.read_excel('DataSampePilot.xlsx')
dataset = dataset.dropna(subset=['Question_body'])
prediction = predict(dataset['Question_body'].tolist(), tokenizer, model, max_length)
print(f"Prediction: {prediction}")

Prediction: 1


# Save the model

In [None]:
# Save the model and tokenizer
saved_model_path = './saved_model'
model.save_pretrained(saved_model_path)
tokenizer.save_pretrained(saved_model_path)

In [35]:
# List the contents of the saved model directory
import os
print("Saved model files:", os.listdir(saved_model_path))

# Load the model and tokenizer
loaded_model = BertForSequenceClassification.from_pretrained(saved_model_path)
loaded_tokenizer = BertTokenizer.from_pretrained(saved_model_path)

Saved model files: ['model.safetensors', 'special_tokens_map.json', 'tokenizer_config.json', 'config.json', 'vocab.txt']
