<a href="https://colab.research.google.com/github/JeanMusenga/PhD-Thesis_2024_Musenga/blob/main/GridSearch_CrossValidation_With_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://chatgpt.com/share/1e4d5790-5bf7-4c3e-b877-b343676ba074

https://chatgpt.com/share/40ddb624-ba98-47a5-a507-0190582bd184

In [None]:
# Install the Hugging Face `datasets` package that the next cell imports.
pip install datasets

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, ParameterGrid
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset
import evaluate

# Read the labelled posts from the Excel workbook.
file_path = 'posts.xlsx'
df = pd.read_excel(file_path)

# Show the available columns so the expected names can be checked by eye.
print(df.columns)

# Cast the label column to int; every other column keeps its dtype.
df = df.astype({'Label': int})

# The rest of the script expects the text in 'Question_body' and the target
# in 'Label'.
# Load the tokenizer that matches the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization step (used with Dataset.map in batched mode)
def preprocess_function(examples):
    """Tokenize the 'Question_body' entries of ``examples``.

    Sequences are truncated to, and padded up to, ``max_length=512``.
    Returns the tokenizer's output unchanged.
    """
    texts = examples['Question_body']
    encoding_options = dict(truncation=True, padding='max_length', max_length=512)
    return tokenizer(texts, **encoding_options)

# Build the Hugging Face dataset in one pass: wrap the DataFrame, tokenize it
# in batches, then expose the targets under the 'labels' column name.
dataset = (
    Dataset.from_pandas(df)
    .map(preprocess_function, batched=True)
    .rename_column('Label', 'labels')
)

# Search space: every learning rate is paired with every batch size.
param_grid = dict(
    learning_rate=[5e-5, 3e-5, 2e-5],
    batch_size=[8, 16, 32],
)

# Materialize all hyperparameter combinations up front.
grid = [*ParameterGrid(param_grid)]

# 10-fold stratified cross-validation and the accuracy metric used to score it.
kf = StratifiedKFold(n_splits=10)
metrics = evaluate.load('accuracy')

# Running record of the best configuration seen so far and its mean accuracy.
best_hyperparams, best_accuracy = None, 0

# Grid search: for each hyperparameter combination, fine-tune a fresh BERT
# classifier on every cross-validation fold and average the fold accuracies.

# The number of classes is the same for every fold, so compute it once.
num_labels = df['Label'].nunique()


def compute_metrics(eval_pred):
    """Return ``{'accuracy': <float>}`` for one evaluation pass.

    ``metrics.compute`` already returns a dict keyed by the metric name, so it
    is returned as-is. Wrapping it in a second ``{"accuracy": ...}`` dict makes
    ``eval_accuracy`` a dict, which the Trainer cannot log as a scalar and
    which breaks the ``np.mean`` over the fold scores below.
    """
    predicted_labels = np.argmax(eval_pred.predictions, axis=1)
    return metrics.compute(predictions=predicted_labels, references=eval_pred.label_ids)


for params in grid:
    accuracy_scores = []
    for train_index, test_index in kf.split(df, df['Label']):
        train_dataset = dataset.select(train_index)
        eval_dataset = dataset.select(test_index)

        # Start every fold from the pretrained checkpoint so that no weights
        # leak from one fold (or one configuration) into the next.
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

        # Training arguments. Checkpoints are written once per epoch
        # (save_strategy="epoch"), so no step-based save interval is set.
        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=3,
            per_device_train_batch_size=params['batch_size'],
            per_device_eval_batch_size=params['batch_size'],
            evaluation_strategy="epoch",
            save_strategy="epoch",
            logging_dir='./logs',
            learning_rate=params['learning_rate'],
            logging_steps=10,
            save_total_limit=2,
            load_best_model_at_end=True
        )

        # Define Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            data_collator=DataCollatorWithPadding(tokenizer),
            compute_metrics=compute_metrics
        )

        # Train, then score the held-out fold.
        # NOTE(review): with load_best_model_at_end=True and no
        # metric_for_best_model, the reloaded checkpoint appears to be the
        # one with the lowest validation loss - confirm this is intended.
        trainer.train()
        eval_result = trainer.evaluate()
        accuracy_scores.append(eval_result['eval_accuracy'])

    # Average accuracy across folds
    average_accuracy = np.mean(accuracy_scores)

    # Keep this configuration if it beats the best one seen so far.
    if average_accuracy > best_accuracy:
        best_accuracy = average_accuracy
        best_hyperparams = params

print(f'Best Hyperparameters: {best_hyperparams}')
print(f'Best Average Accuracy: {best_accuracy}')


Index(['Question_body', 'Label'], dtype='object')


Map:   0%|          | 0/14932 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0703,0.187252,{'accuracy': 0.9732262382864793}
2,0.1155,0.0716,{'accuracy': 0.9772423025435074}
3,0.177,0.216682,{'accuracy': 0.9605087014725568}


Trainer is attempting to log a value of "{'accuracy': 0.9732262382864793}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9772423025435074}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9605087014725568}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


Trainer is attempting to log a value of "{'accuracy': 0.9772423025435074}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1055,0.022714,{'accuracy': 0.9919678714859438}


Trainer is attempting to log a value of "{'accuracy': 0.9919678714859438}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
