<a href="https://colab.research.google.com/github/JeanMusenga/PhD-Thesis_2024_Musenga/blob/main/BERT_ARPs_Classification_With_PredictionMetrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://chatgpt.com/share/b8196226-48d6-49a4-b448-db1ac491d8a5

In [None]:
pip install datasets

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
import torch
from datasets import Dataset, load_metric

from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [5]:
data=pd.read_excel('ARPs_and_ProgrammingPosts.xlsx')
#df = df.dropna(subset=['Question_body', 'Label'])

# Preprocessing

In [6]:
# Preprocess the text data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocess_function(examples):
    return tokenizer(examples, truncation=True, padding='max_length', max_length=128)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
# Apply preprocessing
tokenized_inputs = data['Question_body'].apply(preprocess_function)


In [8]:
# Convert tokenized inputs to a DataFrame
tokenized_df = pd.DataFrame(tokenized_inputs.tolist())

In [9]:
# Add the labels to the tokenized DataFrame
tokenized_df['labels'] = data['Label']

In [10]:
# Split the data into training and testing sets
train_df, test_df = train_test_split(tokenized_df, test_size=0.2)


In [11]:
# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Load the BERT model

In [12]:
# Load the BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Model training

In [13]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01
)



In [14]:
# Data collator for padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [17]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Convert logits to PyTorch tensor
    logits = torch.tensor(logits)
    predictions = torch.argmax(logits, dim=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_score(labels, predictions)
    return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}

In [21]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


In [22]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0958,0.391147,0.935052,0.904607,0.967384,0.934943
2,0.0843,0.411029,0.932708,0.891909,0.979181,0.93351
3,0.0398,0.506363,0.931369,0.907115,0.955586,0.93072


TrainOutput(global_step=4482, training_loss=0.07459664334148661, metrics={'train_runtime': 1368.1251, 'train_samples_per_second': 26.193, 'train_steps_per_second': 3.276, 'total_flos': 2357146167206400.0, 'train_loss': 0.07459664334148661, 'epoch': 3.0})

# Evaluate the model

In [23]:

evaluation_results = trainer.evaluate()
print(evaluation_results)

{'eval_loss': 0.5063628554344177, 'eval_accuracy': 0.9313692668228992, 'eval_precision': 0.9071146245059288, 'eval_recall': 0.9555863983344899, 'eval_f1': 0.9307198377830348, 'eval_runtime': 21.1945, 'eval_samples_per_second': 140.933, 'eval_steps_per_second': 17.646, 'epoch': 3.0}


# Save the model and tokenizer

The model and tokenizer are saved in the directory you specify in the save_pretrained method. In the code example provided, the model and tokenizer are saved in the ./saved_model directory relative to the current working directory where the script is executed

In [24]:

saved_model_path = './saved_model'
model.save_pretrained(saved_model_path)
tokenizer.save_pretrained(saved_model_path)

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json')

# List the contents of the saved model directory
You can verify the saved files by checking the ./saved_model directory. This directory should contain the following files:
config.json: Configuration file for the model.
pytorch_model.bin: The model's weights.
tokenizer_config.json: Configuration file for the tokenizer.
vocab.txt: Vocabulary file used by the tokenizer.
special_tokens_map.json

In [None]:
import os
print("Saved model files:", os.listdir(saved_model_path))

# Load the model and tokenizer
loaded_model = BertForSequenceClassification.from_pretrained(saved_model_path)
loaded_tokenizer = BertTokenizer.from_pretrained(saved_model_path)