#### Install the required libraries

In [3]:
!pip uninstall datasets
!pip install datasets

^C
Defaulting to user installation because normal site-packages is not writeable


In [1]:
!pip install transformers datasets pandas numpy jupyter tqdm ipywidgets widgetsnbextension pandas-profiling

Defaulting to user installation because normal site-packages is not writeable


In [2]:
from datasets import load_metric
import tqdm as notebook_tqdm
from ipywidgets import FloatProgress
# Load the SQuAD metric
metric = load_metric('squad_v2')

print("SQuAD metric loaded successfully.")

ImportError: cannot import name 'load_metric' from 'datasets' (C:\Users\user\AppData\Roaming\Python\Python312\site-packages\datasets\__init__.py)

#### Load and Preprocess the Data

In [11]:
import os
import json
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# Path to the data directory
data_dir = r'F:\Github\SLLIM\data\processed'

# Load the datasets
def load_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

train_data = load_json(os.path.join(data_dir, 'train.json'))
dev_data = load_json(os.path.join(data_dir, 'dev.json'))
test_data = load_json(os.path.join(data_dir, 'test.json'))

# Convert the data to pandas DataFrames
train_df = pd.DataFrame(train_data)
dev_df = pd.DataFrame(dev_data)
test_df = pd.DataFrame(test_data)

# Verify the structure of the datasets
print(f"Train data: {len(train_data)} samples")
print(f"Dev data: {len(dev_data)} samples")
print(f"Test data: {len(test_data)} samples")

# Function to preprocess the data
def preprocess_data(df, tokenizer):
    # Tokenize the data
    inputs = tokenizer(
        df['question'].tolist(),
        df['context'].tolist(),
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors='pt'
    )
    return inputs, df

# Load the tokenizer for the first model
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

# Preprocess the data for the first model
train_inputs, train_df = preprocess_data(train_df, tokenizer)
dev_inputs, dev_df = preprocess_data(dev_df, tokenizer)
test_inputs, test_df = preprocess_data(test_df, tokenizer)

Train data: 498 samples
Dev data: 42 samples
Test data: 253 samples


#### Create Datasets

In [12]:
def create_dataset(inputs, df):
    dataset = Dataset.from_dict({
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'start_positions': df['answer'].apply(lambda x: x['start'] if isinstance(x, dict) and 0 <= x['start'] < len(inputs['input_ids'][0]) else 0),
        'end_positions': df['answer'].apply(lambda x: x['end'] if isinstance(x, dict) and 0 <= x['end'] < len(inputs['input_ids'][0]) else 0)
    })
    return dataset

# Create datasets for the first model
train_dataset = create_dataset(train_inputs, train_df)
dev_dataset = create_dataset(dev_inputs, dev_df)
test_dataset = create_dataset(test_inputs, test_df)

# Create a DatasetDict for the Trainer
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': dev_dataset,
    'test': test_dataset
})

#### Fine-Tune the First Model (roberta-base)

In [13]:
from transformers import AutoModelForQuestionAnswering, Trainer, TrainingArguments

# Load the model
model = AutoModelForQuestionAnswering.from_pretrained('roberta-base')

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results_roberta',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs_roberta',
    logging_steps=10,
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['validation'],
)

# Train the model
trainer.train()

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 11%|█▏        | 11/96 [00:00<00:06, 12.28it/s]

{'loss': 3.5615, 'grad_norm': 17.213062286376953, 'learning_rate': 1.7916666666666667e-05, 'epoch': 0.31}


 24%|██▍       | 23/96 [00:01<00:05, 12.88it/s]

{'loss': 1.4014, 'grad_norm': 5.603587627410889, 'learning_rate': 1.5833333333333333e-05, 'epoch': 0.62}


 32%|███▏      | 31/96 [00:02<00:05, 12.88it/s]

{'loss': 0.7078, 'grad_norm': 33.05466842651367, 'learning_rate': 1.375e-05, 'epoch': 0.94}



 34%|███▍      | 33/96 [00:02<00:05, 12.01it/s]

{'eval_loss': 0.07345175743103027, 'eval_runtime': 0.0653, 'eval_samples_per_second': 643.343, 'eval_steps_per_second': 45.953, 'epoch': 1.0}


 43%|████▎     | 41/96 [00:03<00:04, 12.61it/s]

{'loss': 0.1168, 'grad_norm': 3.436894655227661, 'learning_rate': 1.1666666666666668e-05, 'epoch': 1.25}


 55%|█████▌    | 53/96 [00:04<00:03, 12.90it/s]

{'loss': 0.0172, 'grad_norm': 0.39245736598968506, 'learning_rate': 9.583333333333335e-06, 'epoch': 1.56}


 64%|██████▎   | 61/96 [00:04<00:02, 12.90it/s]

{'loss': 0.0041, 'grad_norm': 0.12557142972946167, 'learning_rate': 7.500000000000001e-06, 'epoch': 1.88}


 66%|██████▌   | 63/96 [00:05<00:02, 12.90it/s]
 68%|██████▊   | 65/96 [00:05<00:02, 12.04it/s]

{'eval_loss': 0.0005629835068248212, 'eval_runtime': 0.0662, 'eval_samples_per_second': 634.299, 'eval_steps_per_second': 45.307, 'epoch': 2.0}


 74%|███████▍  | 71/96 [00:05<00:02, 12.45it/s]

{'loss': 0.0018, 'grad_norm': 0.044567689299583435, 'learning_rate': 5.416666666666667e-06, 'epoch': 2.19}


 84%|████████▍ | 81/96 [00:06<00:01, 12.49it/s]

{'loss': 0.0014, 'grad_norm': 0.041223518550395966, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.5}


 95%|█████████▍| 91/96 [00:07<00:00, 12.71it/s]

{'loss': 0.0014, 'grad_norm': 0.03265051916241646, 'learning_rate': 1.25e-06, 'epoch': 2.81}


 99%|█████████▉| 95/96 [00:07<00:00, 12.82it/s]
100%|██████████| 96/96 [00:09<00:00,  9.66it/s]

{'eval_loss': 0.0002904820430558175, 'eval_runtime': 0.083, 'eval_samples_per_second': 506.042, 'eval_steps_per_second': 36.146, 'epoch': 3.0}
{'train_runtime': 9.9419, 'train_samples_per_second': 150.274, 'train_steps_per_second': 9.656, 'train_loss': 0.6056270146170087, 'epoch': 3.0}





TrainOutput(global_step=96, training_loss=0.6056270146170087, metrics={'train_runtime': 9.9419, 'train_samples_per_second': 150.274, 'train_steps_per_second': 9.656, 'total_flos': 57184182797400.0, 'train_loss': 0.6056270146170087, 'epoch': 3.0})

#### Evaluate the First Model


In [14]:
from datasets import load_metric

# Load the SQuAD metric
metric = load_metric('squad_v2')

# Define a custom evaluation function
def compute_metrics(p):
    start_logits, end_logits = p.predictions
    start_positions, end_positions = p.label_ids

    # Convert logits to predictions
    start_preds = start_logits.argmax(axis=-1)
    end_preds = end_logits.argmax(axis=-1)

    # Compute the exact match and F1 score
    result = metric.compute(predictions={'start_position': start_preds, 'end_position': end_preds},
                        references={'start_position': start_positions, 'end_position': end_positions})
    
    return result

# Evaluate the first model on the test set
test_results = trainer.evaluate(test_dataset, metric_key_prefix='eval', compute_metrics=compute_metrics)
print(f"Test results for roberta-base: {test_results}")

ImportError: cannot import name 'load_metric' from 'datasets' (C:\Users\user\AppData\Roaming\Python\Python312\site-packages\datasets\__init__.py)

###  Fine-Tune the Second Model (albert-base-v2)

In [None]:
# Load the tokenizer for the second model
tokenizer_2 = AutoTokenizer.from_pretrained('albert-base-v2')

# Preprocess the data for the second model
train_inputs_2, train_df_2 = preprocess_data(train_df, tokenizer_2)
dev_inputs_2, dev_df_2 = preprocess_data(dev_df, tokenizer_2)
test_inputs_2, test_df_2 = preprocess_data(test_df, tokenizer_2)

# Create datasets for the second model
train_dataset_2 = create_dataset(train_inputs_2, train_df_2)
dev_dataset_2 = create_dataset(dev_inputs_2, dev_df_2)
test_dataset_2 = create_dataset(test_inputs_2, test_df_2)

# Create a DatasetDict for the Trainer for the second model
dataset_dict_2 = DatasetDict({
    'train': train_dataset_2,
    'validation': dev_dataset_2,
    'test': test_dataset_2
})

# Load the second model
model_2 = AutoModelForQuestionAnswering.from_pretrained('albert-base-v2')

# Define the training arguments for the second model
training_args_2 = TrainingArguments(
    output_dir='./results_albert',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs_albert',
    logging_steps=10,
)

# Define the trainer for the second model
trainer_2 = Trainer(
    model=model_2,
    args=training_args_2,
    train_dataset=dataset_dict_2['train'],
    eval_dataset=dataset_dict_2['validation'],
)

# Train the second model
trainer_2.train()

### Evaluate the second model

In [None]:
# Evaluate the second model on the test set
test_results_2 = trainer_2.evaluate(test_dataset_2, metric_key_prefix='eval', compute_metrics=compute_metrics)
print(f"Test results for albert-base-v2: {test_results_2}")

### Fine-Tune the Resource-Intensive Model (bert-large-uncased)

In [None]:
# Load the tokenizer for the resource-intensive model
tokenizer_large = AutoTokenizer.from_pretrained('bert-large-uncased')

# Preprocess the data for the resource-intensive model
train_inputs_large, train_df_large = preprocess_data(train_df, tokenizer_large)
dev_inputs_large, dev_df_large = preprocess_data(dev_df, tokenizer_large)
test_inputs_large, test_df_large = preprocess_data(test_df, tokenizer_large)

# Create datasets for the resource-intensive model
train_dataset_large = create_dataset(train_inputs_large, train_df_large)
dev_dataset_large = create_dataset(dev_inputs_large, dev_df_large)
test_dataset_large = create_dataset(test_inputs_large, test_df_large)

# Create a DatasetDict for the Trainer for the resource-intensive model
dataset_dict_large = DatasetDict({
    'train': train_dataset_large,
    'validation': dev_dataset_large,
    'test': test_dataset_large
})

# Load the resource-intensive model
model_large = AutoModelForQuestionAnswering.from_pretrained('bert-large-uncased')

# Define the training arguments for the resource-intensive model
training_args_large = TrainingArguments(
    output_dir='./results_bert_large',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=4,  # Lower batch size due to larger model
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs_bert_large',
    logging_steps=10,
)

# Define the trainer for the resource-intensive model
trainer_large = Trainer(
    model=model_large,
    args=training_args_large,
    train_dataset=dataset_dict_large['train'],
    eval_dataset=dataset_dict_large['validation'],
)

# Train the resource-intensive model
trainer_large.train()

### Evaluate the Resource-Intensive Model

In [None]:
# Evaluate the resource-intensive model on the test set
test_results_large = trainer_large.evaluate(test_dataset_large, metric_key_prefix='eval', compute_metrics=compute_metrics)
print(f"Test results for bert-large-uncased: {test_results_large}")

### Analyze and Report Results

In [None]:
# Collect the results
results = {
    'roberta-base': test_results,
    'albert-base-v2': test_results_2,
    'bert-large-uncased': test_results_large
}

# Display the results
for model, result in results.items():
    print(f"Model: {model}")
    print(f"Exact Match: {result.get('eval_exact', 'N/A')}")
    print(f"F1 Score: {result.get('eval_f1', 'N/A')}")
    print(f"Loss: {result.get('eval_loss', 'N/A')}")
    print("\n")