FineTuning TellMeWhy Dataset on T5 Base Model

In [None]:
!pip install nltk
!pip install rouge_score
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    AdamW,
    get_linear_schedule_with_warmup
)
from tqdm import tqdm
import os
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.translate.meteor_score import meteor_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import warnings

warnings.filterwarnings('ignore')

def calculate_meteor(predictions, references, tokenizer):
    meteor_scores = []
    for pred, ref in zip(predictions, references):
        pred_tokens = tokenizer.tokenize(pred)
        ref_tokens = tokenizer.tokenize(ref)

        meteor_scores.append(meteor_score([ref_tokens], pred_tokens))

    return sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0

def calculate_bleu(predictions, references):
    smooth = SmoothingFunction().method4
    bleu_scores = []
    for pred, ref in zip(predictions, references):
        pred_tokens = pred.split()
        ref_tokens = ref.split()
        bleu_scores.append(sentence_bleu([ref_tokens], pred_tokens, smoothing_function=smooth))
    return sum(bleu_scores) / len(bleu_scores)

def calculate_rouge(predictions, references):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

    for pred, ref in zip(predictions, references):
        score = scorer.score(ref, pred)
        rouge_scores['rouge1'].append(score['rouge1'].fmeasure)
        rouge_scores['rouge2'].append(score['rouge2'].fmeasure)
        rouge_scores['rougeL'].append(score['rougeL'].fmeasure)

    # Return average F1 scores for each ROUGE metric
    avg_rouge1 = sum(rouge_scores['rouge1']) / len(rouge_scores['rouge1'])
    avg_rouge2 = sum(rouge_scores['rouge2']) / len(rouge_scores['rouge2'])
    avg_rougeL = sum(rouge_scores['rougeL']) / len(rouge_scores['rougeL'])

    return [avg_rouge1, avg_rouge2, avg_rougeL]

class TellMeWhyDataset(Dataset):
    def __init__(self, data_path: str, tokenizer, max_length: int = 384):  # Balanced sequence length
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = self.load_data(data_path)

    def load_data(self, data_path: str):
        processed_data = []
        with open(data_path, 'r', encoding='utf-8') as f:
            for line in f:
                example = json.loads(line)
                # Only include answerable questions for better training
                if example['is_ques_answerable'] == 'Answerable':
                    processed_data.append({
                        'narrative': example['narrative'],
                        'question': example['question'],
                        'answer': example['answer']
                    })
        return processed_data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        # Efficient but clear input format
        input_text = f"Question: {item['question']} Context: {item['narrative']}"
        target_text = item['answer']

        # Efficient tokenization
        inputs = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Shorter max length for answers
        targets = self.tokenizer(
            target_text,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': targets['input_ids'].squeeze()
        }


def train_t5(
    data_path: str,
    output_dir: str,
    num_epochs: int = 3,
    batch_size: int = 12,
    gradient_accumulation_steps: int = 16,
    learning_rate: float = 1e-4,
    max_length: int = 384,
    save_steps: int = 1000
):

    # Initialize model and tokenizer
    model_name = 't5-base'
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    # Enable gradient checkpointing for memory efficiency
    model.gradient_checkpointing_enable()

    # Prepare dataset and dataloader
    dataset = TellMeWhyDataset(data_path, tokenizer, max_length=max_length)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=2,
        pin_memory=True
    )

    # Setup training
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
    total_steps = (len(dataloader) // gradient_accumulation_steps) * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=total_steps // 10,
        num_training_steps=total_steps
    )

    # Track losses and metrics
    record_losses, record_metrics, record_step_nos = [], [], []

    global_step = 0
    best_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        optimizer.zero_grad()
        progress_bar = tqdm(dataloader, desc=f'Epoch {epoch+1}/{num_epochs}')

        all_preds, all_labels = [], []

        for step, batch in enumerate(progress_bar):
            # Move batch to GPU
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            # Calculate loss
            loss = outputs.loss / gradient_accumulation_steps
            loss.backward()

            # Update weights every gradient_accumulation_steps
            if (step + 1) % gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1

                # Update progress bar
                progress_bar.set_postfix({
                    'loss': f'{loss.item() * gradient_accumulation_steps:.4f}',
                    'lr': f'{scheduler.get_last_lr()[0]:.7f}'
                })

                total_loss += loss.item() * gradient_accumulation_steps

            if step+1 % 1000 == 0:
                # Decode predictions for metric calculation
                with torch.no_grad():
                    preds = model.generate(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        max_new_tokens=128
                    )
                    decoded_preds = [tokenizer.decode(p, skip_special_tokens=True) for p in preds]
                    decoded_labels = [tokenizer.decode(l, skip_special_tokens=True) for l in labels]

                    all_preds.extend(decoded_preds)
                    all_labels.extend(decoded_labels)

        # Decode predictions for metric calculation
        with torch.no_grad():
            preds = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=128
            )
            decoded_preds = [tokenizer.decode(p, skip_special_tokens=True) for p in preds]
            decoded_labels = [tokenizer.decode(l, skip_special_tokens=True) for l in labels]

            all_preds.extend(decoded_preds)
            all_labels.extend(decoded_labels)

        # Calculate and log metrics at the end of epoch
        accuracy = accuracy_score(all_labels, all_preds)
        precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
        recall = recall_score(all_labels, all_preds, average='weighted', zero_division=0)
        f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)

        record_metrics.append({
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        })

        # Epoch-end statistics
        avg_loss = total_loss / len(dataloader)
        record_losses.append(avg_loss)
        record_step_nos.append(global_step)

        bleu_score = calculate_bleu(all_preds, all_labels)
        rouge_score = calculate_rouge(all_preds, all_labels)
        meteor_score_val = calculate_meteor(all_preds, all_labels, tokenizer)

        print(f'Epoch {epoch+1} - Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, BLEU: {bleu_score:.4f}, ROUGE Averages: {rouge_score}, Meteor: {meteor_score_val:.4f}')

        # Save epoch checkpoint
        model.save_pretrained(f'{output_dir}/epoch-{epoch+1}')
        tokenizer.save_pretrained(f'{output_dir}/epoch-{epoch+1}')

    # Save final model
    model.save_pretrained(f'{output_dir}/final_model')
    tokenizer.save_pretrained(f'{output_dir}/final_model')

    return list(zip(record_losses, record_step_nos, record_metrics))



Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=f29802250526dc69b60fa5ef30d31b931fab37ae89945b58c64a908ddf34868c
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


FineTuning TellMeWhy Dataset on Base GPT2 Model

In [None]:
!pip install nltk
!pip install rouge_score
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    AdamW,
    get_linear_schedule_with_warmup
)
from tqdm import tqdm
import os
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.translate.meteor_score import meteor_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import warnings

warnings.filterwarnings('ignore')

def calculate_meteor(predictions, references, tokenizer):
    meteor_scores = []
    for pred, ref in zip(predictions, references):
        pred_tokens = tokenizer.tokenize(pred)
        ref_tokens = tokenizer.tokenize(ref)

        meteor_scores.append(meteor_score([ref_tokens], pred_tokens))

    return sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0

def calculate_bleu(predictions, references):
    smooth = SmoothingFunction().method4
    bleu_scores = []
    for pred, ref in zip(predictions, references):
        pred_tokens = pred.split()
        ref_tokens = ref.split()
        bleu_scores.append(sentence_bleu([ref_tokens], pred_tokens, smoothing_function=smooth))
    return sum(bleu_scores) / len(bleu_scores)

def calculate_rouge(predictions, references):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

    for pred, ref in zip(predictions, references):
        score = scorer.score(ref, pred)
        rouge_scores['rouge1'].append(score['rouge1'].fmeasure)
        rouge_scores['rouge2'].append(score['rouge2'].fmeasure)
        rouge_scores['rougeL'].append(score['rougeL'].fmeasure)

    # Return average F1 scores for each ROUGE metric
    avg_rouge1 = sum(rouge_scores['rouge1']) / len(rouge_scores['rouge1'])
    avg_rouge2 = sum(rouge_scores['rouge2']) / len(rouge_scores['rouge2'])
    avg_rougeL = sum(rouge_scores['rougeL']) / len(rouge_scores['rougeL'])

    return [avg_rouge1, avg_rouge2, avg_rougeL]

class TellMeWhyDataset(Dataset):
    def __init__(self, data_path: str, tokenizer, max_length: int = 384):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = self.load_data(data_path)

    def load_data(self, data_path: str):
        processed_data = []
        with open(data_path, 'r', encoding='utf-8') as f:
            for line in f:
                example = json.loads(line)
                if example['is_ques_answerable'] == 'Answerable':
                    processed_data.append({
                        'narrative': example['narrative'],
                        'question': example['question'],
                        'answer': example['answer']
                    })
        return processed_data

    def __getitem__(self, idx):
        item = self.data[idx]
        # Efficient prompt format
        input_text = f"Question: {item['question']}\nContext: {item['narrative']}\nAnswer: {item['answer']}"

        encodings = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encodings['input_ids'].squeeze(),
            'attention_mask': encodings['attention_mask'].squeeze(),
            'labels': encodings['input_ids'].squeeze()
        }

    def __len__(self):
        return len(self.data)

def train_gpt2(
    data_path: str,
    output_dir: str,
    num_epochs: int = 5,
    batch_size: int = 12,
    gradient_accumulation_steps: int = 8,
    learning_rate: float = 3e-6,
    max_length: int = 384,
    save_steps: int = 1000
):
    os.makedirs(output_dir, exist_ok=True)

    # Initialize model and tokenizer
    model_name = 'gpt2'
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Add padding token
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'left'
    model.config.pad_token_id = model.config.eos_token_id

    # Enable gradient checkpointing
    model.gradient_checkpointing_enable()

    # Mixed precision training
    scaler = torch.cuda.amp.GradScaler()

    # Prepare dataset and dataloader
    dataset = TellMeWhyDataset(data_path, tokenizer, max_length=max_length)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=2,
        pin_memory=True
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Optimizer with weight decay
    optimizer = AdamW(
        model.parameters(),
        lr=learning_rate,
        weight_decay=0.01
    )

    # Track losses and metrics
    record_losses, record_metrics, record_step_nos = [], [], []

    total_steps = (len(dataloader) // gradient_accumulation_steps) * num_epochs

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=total_steps // 10,
        num_training_steps=total_steps
    )

    global_step = 0
    best_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        optimizer.zero_grad()

        progress_bar = tqdm(dataloader, desc=f'Epoch {epoch+1}/{num_epochs}')

        all_preds, all_labels = [], []

        for step, batch in enumerate(progress_bar):
            # Move batch to GPU
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Mixed precision forward pass
            with torch.cuda.amp.autocast():
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss = outputs.loss / gradient_accumulation_steps

            # Scale loss and backward pass
            scaler.scale(loss).backward()

            if (step + 1) % gradient_accumulation_steps == 0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                scaler.step(optimizer)
                scaler.update()
                scheduler.step()
                optimizer.zero_grad()

                global_step += 1

                progress_bar.set_postfix({
                    'loss': f'{loss.item() * gradient_accumulation_steps:.4f}',
                    'lr': f'{scheduler.get_last_lr()[0]:.7f}'
                })

                total_loss += loss.item() * gradient_accumulation_steps

            if step+1 % 1000 == 0:
                # Decode predictions for metric calculation
                with torch.no_grad():
                    preds = model.generate(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        max_new_tokens=128
                    )
                    decoded_preds = [tokenizer.decode(p, skip_special_tokens=True) for p in preds]
                    decoded_labels = [tokenizer.decode(l, skip_special_tokens=True) for l in labels]

                    all_preds.extend(decoded_preds)
                    all_labels.extend(decoded_labels)

        # Decode predictions for metric calculation
        with torch.no_grad():
            preds = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=128
            )
            decoded_preds = [tokenizer.decode(p, skip_special_tokens=True) for p in preds]
            decoded_labels = [tokenizer.decode(l, skip_special_tokens=True) for l in labels]

            all_preds.extend(decoded_preds)
            all_labels.extend(decoded_labels)

        # Calculate metrics
        accuracy = accuracy_score(all_labels, all_preds)
        precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
        recall = recall_score(all_labels, all_preds, average='weighted', zero_division=0)
        f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)

        record_metrics.append({
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        })

        # Epoch-end statistics
        avg_loss = total_loss / len(dataloader)
        record_losses.append(avg_loss)
        record_step_nos.append(global_step)

        bleu_score = calculate_bleu(all_preds, all_labels)
        rouge_score = calculate_rouge(all_preds, all_labels)
        meteor_score_val = calculate_meteor(all_preds, all_labels, tokenizer)

        print(f'Epoch {epoch+1} - Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, BLEU: {bleu_score:.4f}, ROUGE Averages: {rouge_score}, Meteor: {meteor_score_val:.4f}')

        # Save epoch checkpoint
        model.save_pretrained(f'{output_dir}/epoch-{epoch+1}')
        tokenizer.save_pretrained(f'{output_dir}/epoch-{epoch+1}')

    # Save final model
    model.save_pretrained(f'{output_dir}/final_model')
    tokenizer.save_pretrained(f'{output_dir}/final_model')

    # Return tracked losses and metrics for further analysis
    return list(zip(record_losses, record_step_nos, record_metrics))

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=a2895386dd4888a9b81fc364ddf559c430bb859a39951668df81315e62b7cfe6
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


DistilBert

In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    DistilBertTokenizer,
    DistilBertForQuestionAnswering,
    AdamW,
    get_linear_schedule_with_warmup
)
from tqdm import tqdm
import os
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

class TellMeWhyDataset(Dataset):
    def __init__(self, data_path: str, tokenizer, max_length: int = 384):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = self.load_data(data_path)

    def load_data(self, data_path: str):
        processed_data = []
        with open(data_path, 'r', encoding='utf-8') as f:
            for line in f:
                example = json.loads(line)
                if example['is_ques_answerable'] == 'Answerable':
                    processed_data.append({
                        'narrative': example['narrative'],
                        'question': example['question'],
                        'answer': example['answer'],
                        'original_sentence_for_question' : example['original_sentence_for_question']
                    })
        return processed_data

    def __getitem__(self, idx):
        item = self.data[idx]

        # Treat "original_sentence_for_question" as the answer
        answer = item['original_sentence_for_question']
        context = item['narrative']

        # Generate the input text for tokenization
        input_text = f"<start_of_turn>user\nQuestion: {item['question']}\nContext: {context}<end_of_turn>\n<start_of_turn>assistant\nAnswer: {answer}<end_of_turn>"

        # Tokenize the input text
        encodings = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Find the start and end positions of the answer in the context (treat answer as a substring)
        start_pos = context.find(answer)
        end_pos = start_pos + len(answer) - 1

        # Convert to token indices
        start_token = len(self.tokenizer.encode(context[:start_pos]))
        end_token = len(self.tokenizer.encode(context[:end_pos+1]))

        # Adjust for padding (ensure indices are within max_length)
        start_token = min(start_token, self.max_length - 1)
        end_token = min(end_token, self.max_length - 1)

        return {
            'input_ids': encodings['input_ids'].squeeze(),
            'attention_mask': encodings['attention_mask'].squeeze(),
            'labels': encodings['input_ids'].squeeze(),
            'start_positions': torch.tensor(start_token),
            'end_positions': torch.tensor(end_token)
        }

    def __len__(self):
        return len(self.data)

def train_distilbert(
    data_path: str,
    output_dir: str,
    num_epochs: int = 5,
    batch_size: int = 12,
    gradient_accumulation_steps: int = 16,
    learning_rate: float = 1e-4,
    max_length: int = 384,
    save_steps: int = 1000
):
    os.makedirs(output_dir, exist_ok=True)

    # Initialize model and tokenizer
    model_name = "distilbert-base-uncased-distilled-squad"
    tokenizer = DistilBertTokenizer.from_pretrained(model_name)
    model = DistilBertForQuestionAnswering.from_pretrained(model_name)

    # Enable gradient checkpointing
    model.gradient_checkpointing_enable()

    # Prepare dataset and dataloader
    dataset = TellMeWhyDataset(data_path, tokenizer, max_length=max_length)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=0,
        pin_memory=True
    )

    # Optimizer with weight decay
    optimizer = AdamW(
        model.parameters(),
        lr=learning_rate,
        weight_decay=0.01
    )

    total_steps = (len(dataloader) // gradient_accumulation_steps) * num_epochs

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=total_steps // 10,
        num_training_steps=total_steps
    )

    # Setup training
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Track losses and metrics
    record_losses, record_metrics, record_step_nos = [], [], []

    global_step = 0
    best_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        optimizer.zero_grad()

        progress_bar = tqdm(dataloader, desc=f'Epoch {epoch+1}/{num_epochs}')

        all_preds, all_labels = [], []

        for step, batch in enumerate(progress_bar):
            # Move batch to GPU
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            # Forward pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                start_positions=start_positions,
                end_positions=end_positions
            )

            loss = outputs.loss / gradient_accumulation_steps
            loss.backward()

            if (step + 1) % gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

                global_step += 1

                progress_bar.set_postfix({
                    'loss': f'{loss.item() * gradient_accumulation_steps:.4f}',
                    'lr': f'{scheduler.get_last_lr()[0]:.7f}'
                })

            total_loss += loss.item() * gradient_accumulation_steps

            # Decode predictions for metric calculation
            with torch.no_grad():
                start_logits = outputs.start_logits.argmax(dim=1)
                end_logits = outputs.end_logits.argmax(dim=1)

                for i in range(len(start_logits)):
                    pred_answer = tokenizer.decode(
                        input_ids[i][start_logits[i]:end_logits[i] + 1]
                    )
                    true_answer = tokenizer.decode(
                        input_ids[i][start_positions[i]:end_positions[i] + 1]
                    )

                    all_preds.append(pred_answer)
                    all_labels.append(true_answer)

        # Calculate metrics
        accuracy = accuracy_score(all_labels, all_preds)
        precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
        recall = recall_score(all_labels, all_preds, average='weighted', zero_division=0)
        f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)

        record_metrics.append({
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        })

        avg_loss = total_loss / len(dataloader)
        record_losses.append(avg_loss)
        record_step_nos.append(global_step)

        print(f'Epoch {epoch+1} - Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}')

        # Save epoch checkpoint
        model.save_pretrained(f'{output_dir}/epoch-{epoch+1}')
        tokenizer.save_pretrained(f'{output_dir}/epoch-{epoch+1}')

    # Save final model
    model.save_pretrained(f'{output_dir}/final_model')
    tokenizer.save_pretrained(f'{output_dir}/final_model')

    # Return tracked losses and metrics for further analysis
    return list(zip(record_losses, record_step_nos, record_metrics))

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Set paths
data_path = '/content/drive/MyDrive/TellMeWhy/Dataset/train.json'  #
output_dir = '/content/drive/MyDrive/TellMeWhy/Output/T5'

Mounted at /content/drive


In [None]:
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    DistilBertForQuestionAnswering,
    DistilBertTokenizer
)

def query_t5(model, tokenizer, input_text: str, max_length: int = 384, max_new_tokens: int = 128):
    """
    Query the trained model with an input and get the generated output.

    Args:
        model: Trained model.
        tokenizer: Corresponding tokenizer for the model.
        input_text: Input string to query the model.
        max_length: Maximum sequence length for tokenization.
        max_new_tokens: Maximum number of new tokens to generate.

    Returns:
        str: The generated output from the model.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    # Tokenize the input
    inputs = tokenizer(
        input_text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    ).to(device)

    # Generate output
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=max_new_tokens
        )

    # Decode and return the output
    generated_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_output

def query_gpt2(model, tokenizer, input_text: str):
    # """
    # Query the trained GPT-2 model with an input and get the generated output.

    # Args:
    #     model: Trained GPT-2 model.
    #     tokenizer: Corresponding tokenizer for the GPT-2 model.
    #     input_text: Input string to query the model.
    #     max_length: Maximum sequence length for tokenization.
    #     max_new_tokens: Maximum number of new tokens to generate.

    # Returns:
    #     str: The generated output from the model.
    # """
    # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # model.to(device)
    # model.eval()

    # # Tokenize the input
    # inputs = tokenizer(
    #     input_text,
    #     max_length=max_length,
    #     padding='max_length',
    #     truncation=True,
    #     return_tensors='pt'
    # ).to(device)

    # # Generate output
    # with torch.no_grad():
    #     outputs = model.generate(
    #         input_ids=inputs['input_ids'],
    #         attention_mask=inputs['attention_mask'],
    #         max_new_tokens=max_new_tokens
    #     )

    # # Decode and return the output
    # generated_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # return generated_output

    # Add padding token if not already added
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'left'
    model.config.pad_token_id = model.config.eos_token_id

    # Prepare the input text in the same format as the model was trained with
    input_text = f"{input_text}\nAnswer:"

    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=384)

    # Move the inputs to the same device as the model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Generate the answer
    with torch.no_grad():
        output = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=128,
            num_beams=5,
            no_repeat_ngram_size=2,
            temperature=0.7,
            early_stopping=True,
            eos_token_id=tokenizer.eos_token_id  # Ensure it stops at the eos token
        )

    # Decode the generated output (this will be the answer)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract the answer from the generated text (after "Answer:" token)
    answer = generated_text.split('Answer:')[-1].strip()

    return answer

def query_distilbert(model, tokenizer, question: str, context: str, max_length: int = 384):
    """
    Query the trained DistilBERT model for Question Answering.

    Args:
        model: Trained DistilBERT model.
        tokenizer: Corresponding tokenizer for the DistilBERT model.
        question: The question to ask.
        context: The context where the answer might be located.
        max_length: Maximum sequence length for tokenization.

    Returns:
        str: The predicted answer from the model.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    # Tokenize the input question and context
    inputs = tokenizer.encode_plus(
        question,
        context,
        add_special_tokens=True,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        outputs = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"]
        )

    # Extract start and end logits
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    # Identify the start and end tokens of the answer
    start_idx = torch.argmax(start_logits, dim=1).item()
    end_idx = torch.argmax(end_logits, dim=1).item()

    # Decode the answer
    all_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    answer = tokenizer.convert_tokens_to_string(all_tokens[start_idx:end_idx + 1])

    # Post-process the answer (remove unnecessary spaces)
    #answer = answer.strip()
    return answer


In [None]:
output_dir = "/content/drive/MyDrive/BestModels/T5Output/final_model"
model = T5ForConditionalGeneration.from_pretrained(f"{output_dir}")
tokenizer = T5Tokenizer.from_pretrained(f"{output_dir}")

# Query the model
input_text = "Question: Why did Lily open the door? Context: One rainy evening, Lily sat by the window, watching the droplets race down the glass. She had just moved to the old house at the edge of town and still wasn't used to the silence. Suddenly, she heard a soft knock on the door. Startled, she opened it to find a small, bedraggled cat with a collar but no owner in sight. Lily smiled, deciding to let the cat in, thinking that maybe this new house wouldn’t be so lonely after all."
generated_output = query_t5(model, tokenizer, input_text)

print("\nInput:", input_text)
print("Generated Output:", generated_output)

input_text = "Question: Why did He look for his pizza cutter? Context: Cam ordered a pizza and took it home. He opened the box to take out a slice. Cam discovered that the store did not cut the pizza for him. He looked for his pizza cutter but did not find it. He had to use his chef knife to cut a slice."
generated_output = query_t5(model, tokenizer, input_text)

print("\nInput:", input_text)
print("Generated Output:", generated_output)

input_text = "Question: Why is the sky blue? Context: The sky appears blue due to the scattering of sunlight by the atmosphere. This scattering causes shorter wavelengths of light, such as blue, to scatter more than other wavelengths."
generated_output = query_t5(model, tokenizer, input_text)

print("\nInput:", input_text)
print("Generated Output:", generated_output)


Input: Question: Why did Lily open the door? Context: One rainy evening, Lily sat by the window, watching the droplets race down the glass. She had just moved to the old house at the edge of town and still wasn't used to the silence. Suddenly, she heard a soft knock on the door. Startled, she opened it to find a small, bedraggled cat with a collar but no owner in sight. Lily smiled, deciding to let the cat in, thinking that maybe this new house wouldn’t be so lonely after all.
Generated Output: she heard a knock on the door.

Input: Question: Why did He look for his pizza cutter? Context: Cam ordered a pizza and took it home. He opened the box to take out a slice. Cam discovered that the store did not cut the pizza for him. He looked for his pizza cutter but did not find it. He had to use his chef knife to cut a slice.
Generated Output: Cam was not sure if the store cut the pizza for him.

Input: Question: Why is the sky blue? Context: The sky appears blue due to the scattering of sun

In [None]:
output_dir = "/content/drive/MyDrive/BestModels/GPT2OutputNewNew/final_model"
model = GPT2LMHeadModel.from_pretrained(f"{output_dir}")
tokenizer = GPT2Tokenizer.from_pretrained(f"{output_dir}")

# Query the model
input_text = "Question: Why did Lily open the door?\nContext: One rainy evening, Lily sat by the window, watching the droplets race down the glass. She had just moved to the old house at the edge of town and still wasn't used to the silence. Suddenly, she heard a soft knock on the door. Startled, she opened it to find a small, bedraggled cat with a collar but no owner in sight. Lily smiled, deciding to let the cat in, thinking that maybe this new house wouldn’t be so lonely after all."
generated_output = query_gpt2(model, tokenizer, input_text)

print("\nInput:", input_text)
print("Generated Output:", generated_output)

input_text = "Question: Why did He look for his pizza cutter?\nContext: Cam ordered a pizza and took it home. He opened the box to take out a slice. Cam discovered that the store did not cut the pizza for him. He looked for his pizza cutter but did not find it. He had to use his chef knife to cut a slice."
generated_output = query_gpt2(model, tokenizer, input_text)

print("\nInput:", input_text)
print("Generated Output:", generated_output)

input_text = "Question: Why is the sky blue?\nContext: The sky appears blue due to the scattering of sunlight by the atmosphere. This scattering causes shorter wavelengths of light, such as blue, to scatter more than other wavelengths."
generated_output = query_gpt2(model, tokenizer, input_text)

print("\nInput:", input_text)
print("Generated Output:", generated_output)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Input: Question: Why did Lily open the door?
Context: One rainy evening, Lily sat by the window, watching the droplets race down the glass. She had just moved to the old house at the edge of town and still wasn't used to the silence. Suddenly, she heard a soft knock on the door. Startled, she opened it to find a small, bedraggled cat with a collar but no owner in sight. Lily smiled, deciding to let the cat in, thinking that maybe this new house wouldn’t be so lonely after all.
Generated Output: Lily opened the front door of the house and found


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Input: Question: Why did He look for his pizza cutter?
Context: Cam ordered a pizza and took it home. He opened the box to take out a slice. Cam discovered that the store did not cut the pizza for him. He looked for his pizza cutter but did not find it. He had to use his chef knife to cut a slice.
Generated Output: The pizza was too small to fit in his pocket. The chef cut it in half and placed it on the countertop of his kitchen counter. When he opened it, he found that it had been cut out of a piece of pizza.

Input: Question: Why is the sky blue?
Context: The sky appears blue due to the scattering of sunlight by the atmosphere. This scattering causes shorter wavelengths of light, such as blue, to scatter more than other wavelengths.
Generated Output: This phenomenon is caused by an imbalance in the light spectrum between the Earth's atmosphere and the Sun. The Sun's light is absorbed by our atmosphere, which absorbs more light than it absorbs from the sun. Therefore, the blue sky i

In [None]:
output_dir = "/content/drive/MyDrive/BestModels/DistilBertOutputFinal/final_model"

# Load the trained model and tokenizer
model = DistilBertForQuestionAnswering.from_pretrained(f"{output_dir}")
tokenizer = DistilBertTokenizer.from_pretrained(f"{output_dir}")

question = "Why did Lily open the door?"
context = "One rainy evening, Lily sat by the window, watching the droplets race down the glass. She had just moved to the old house at the edge of town and still wasn't used to the silence. Suddenly, she heard a soft knock on the door. Startled, she opened it to find a small, bedraggled cat with a collar but no owner in sight. Lily smiled, deciding to let the cat in, thinking that maybe this new house wouldn’t be so lonely after all."
predicted_answer = query_distilbert(model, tokenizer, question, context)

print("\nQuestion:", question)
print("Context:", context)
print("Predicted Answer:", predicted_answer)

question = "Why did He look for his pizza cutter?"
context = "Cam ordered a pizza and took it home. He opened the box to take out a slice. Cam discovered that the store did not cut the pizza for him. He looked for his pizza cutter but did not find it. He had to use his chef knife to cut a slice."
predicted_answer = query_distilbert(model, tokenizer, question, context)

print("\nQuestion:", question)
print("Context:", context)
print("Predicted Answer:", predicted_answer)

question = "Why is the sky blue?"
context = "The sky appears blue due to the scattering of sunlight by the atmosphere. This scattering causes shorter wavelengths of light, such as blue, to scatter more than other wavelengths."
predicted_answer = query_distilbert(model, tokenizer, question, context)

print("\nQuestion:", question)
print("Context:", context)
print("Predicted Answer:", predicted_answer)


Question: Why did Lily open the door?
Context: One rainy evening, Lily sat by the window, watching the droplets race down the glass. She had just moved to the old house at the edge of town and still wasn't used to the silence. Suddenly, she heard a soft knock on the door. Startled, she opened it to find a small, bedraggled cat with a collar but no owner in sight. Lily smiled, deciding to let the cat in, thinking that maybe this new house wouldn’t be so lonely after all.
Predicted Answer: 

Question: Why did He look for his pizza cutter?
Context: Cam ordered a pizza and took it home. He opened the box to take out a slice. Cam discovered that the store did not cut the pizza for him. He looked for his pizza cutter but did not find it. He had to use his chef knife to cut a slice.
Predicted Answer: take out a slice . cam discovered that the store did

Question: Why is the sky blue?
Context: The sky appears blue due to the scattering of sunlight by the atmosphere. This scattering causes sho

In [None]:
from google.colab import drive
drive.mount('/content/drive')
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    DistilBertForQuestionAnswering,
    DistilBertTokenizer
)
import torch
from torch.utils.data import Dataset, DataLoader
import json
from tqdm import tqdm
import nltk
nltk.download('wordnet')

Mounted at /content/drive


[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
max_length = 384
model_dir = '/content/drive/MyDrive/BestModels/T5Output/final_model'
validation_data_path = '/content/drive/MyDrive/TellMeWhy/Dataset/validation.json'
test_data_path = '/content/drive/MyDrive/TellMeWhy/Dataset/test.json'
batch_size = 8

# Initialize model and tokenizer
tokenizer = T5Tokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)

# Load validation and test datasets
validation_dataset = TellMeWhyDataset(validation_data_path, tokenizer, max_length=max_length)
test_dataset = TellMeWhyDataset(test_data_path, tokenizer, max_length=max_length)

# DataLoader for validation and test datasets
validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False, num_workers=2)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

# Move model to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

def evaluate_on_dataset(dataloader):
    model.eval()
    all_preds, all_labels = [], []

    for batch in tqdm(dataloader, desc="Evaluating"):
        # Move batch to GPU
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Generate predictions
        with torch.no_grad():
            preds = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=128
            )
            decoded_preds = [tokenizer.decode(p, skip_special_tokens=True) for p in preds]
            decoded_labels = [tokenizer.decode(l, skip_special_tokens=True) for l in labels]

            all_preds.extend(decoded_preds)
            all_labels.extend(decoded_labels)

    return all_preds, all_labels

# Evaluate on validation dataset
print("Evaluating on validation data...")
val_preds, val_labels = evaluate_on_dataset(validation_dataloader)

# Evaluate on test dataset
print("Evaluating on test data...")
test_preds, test_labels = evaluate_on_dataset(test_dataloader)

# Calculate metrics for validation and test sets
val_accuracy = accuracy_score(val_labels, val_preds)
val_precision = precision_score(val_labels, val_preds, average='weighted', zero_division=0)
val_recall = recall_score(val_labels, val_preds, average='weighted', zero_division=0)
val_f1 = f1_score(val_labels, val_preds, average='weighted', zero_division=0)
val_bleu = calculate_bleu(val_preds, val_labels)
val_rouge = calculate_rouge(val_preds, val_labels)
val_meteor = calculate_meteor(val_preds, val_labels, tokenizer)

test_accuracy = accuracy_score(test_labels, test_preds)
test_precision = precision_score(test_labels, test_preds, average='weighted', zero_division=0)
test_recall = recall_score(test_labels, test_preds, average='weighted', zero_division=0)
test_f1 = f1_score(test_labels, test_preds, average='weighted', zero_division=0)
test_bleu = calculate_bleu(test_preds, test_labels)
test_rouge = calculate_rouge(test_preds, test_labels)
test_meteor = calculate_meteor(test_preds, test_labels, tokenizer)

# Print results
print("\nValidation Metrics:")
print(f'Accuracy: {val_accuracy:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, F1: {val_f1:.4f}')
print(f'BLEU: {val_bleu:.4f}, ROUGE: {val_rouge}, METEOR: {val_meteor:.4f}')

print("\nTest Metrics:")
print(f'Accuracy: {test_accuracy:.4f}, Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, F1: {test_f1:.4f}')
print(f'BLEU: {test_bleu:.4f}, ROUGE: {test_rouge}, METEOR: {test_meteor:.4f}')

Evaluating on validation data...


Evaluating: 100%|██████████| 821/821 [08:03<00:00,  1.70it/s]


Evaluating on test data...


Evaluating: 100%|██████████| 937/937 [09:07<00:00,  1.71it/s]



Validation Metrics:
Accuracy: 0.0379, Precision: 0.0164, Recall: 0.0379, F1: 0.0219
BLEU: 0.1177, ROUGE: [0.3491483212093372, 0.17959917507822765, 0.3352649566432692], METEOR: 0.3447

Test Metrics:
Accuracy: 0.0295, Precision: 0.0121, Recall: 0.0295, F1: 0.0164
BLEU: 0.1023, ROUGE: [0.3346535153368372, 0.16048854343006091, 0.3206969901793555], METEOR: 0.3296


In [None]:
max_length = 384
model_dir = '/content/drive/MyDrive/BestModels/GPT2Output/final_model'
validation_data_path = '/content/drive/MyDrive/TellMeWhy/Dataset/validation.json'
test_data_path = '/content/drive/MyDrive/TellMeWhy/Dataset/test.json'
batch_size = 8

warnings.filterwarnings('ignore')

# Initialize model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
model = GPT2LMHeadModel.from_pretrained(model_dir)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'
model.config.pad_token_id = model.config.eos_token_id

# Load validation and test datasets
validation_dataset = TellMeWhyDataset(validation_data_path, tokenizer, max_length=max_length)
test_dataset = TellMeWhyDataset(test_data_path, tokenizer, max_length=max_length)

# DataLoader for validation and test datasets
validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False, num_workers=2)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

# Move model to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

def evaluate_on_dataset(dataloader):
    model.eval()
    all_preds, all_labels = [], []

    for batch in tqdm(dataloader, desc="Evaluating"):
        # Move batch to GPU
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Generate predictions
        with torch.no_grad():
            preds = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=128,
                pad_token_id=model.config.pad_token_id
            )
            decoded_preds = [tokenizer.decode(p, skip_special_tokens=True) for p in preds]
            decoded_labels = [tokenizer.decode(l, skip_special_tokens=True) for l in labels]

            all_preds.extend(decoded_preds)
            all_labels.extend(decoded_labels)

    return all_preds, all_labels

# Evaluate on validation dataset
print("Evaluating on validation data...")
val_preds, val_labels = evaluate_on_dataset(validation_dataloader)

# Evaluate on test dataset
print("Evaluating on test data...")
test_preds, test_labels = evaluate_on_dataset(test_dataloader)

# Calculate metrics for validation and test sets
val_accuracy = accuracy_score(val_labels, val_preds)
val_precision = precision_score(val_labels, val_preds, average='weighted', zero_division=0)
val_recall = recall_score(val_labels, val_preds, average='weighted', zero_division=0)
val_f1 = f1_score(val_labels, val_preds, average='weighted', zero_division=0)
val_bleu = calculate_bleu(val_preds, val_labels)
val_rouge = calculate_rouge(val_preds, val_labels)
val_meteor = calculate_meteor(val_preds, val_labels, tokenizer)

test_accuracy = accuracy_score(test_labels, test_preds)
test_precision = precision_score(test_labels, test_preds, average='weighted', zero_division=0)
test_recall = recall_score(test_labels, test_preds, average='weighted', zero_division=0)
test_f1 = f1_score(test_labels, test_preds, average='weighted', zero_division=0)
test_bleu = calculate_bleu(test_preds, test_labels)
test_rouge = calculate_rouge(test_preds, test_labels)
test_meteor = calculate_meteor(test_preds, test_labels, tokenizer)

# Print results
print("\nValidation Metrics:")
print(f'Accuracy: {val_accuracy:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, F1: {val_f1:.4f}')
print(f'BLEU: {val_bleu:.4f}, ROUGE: {val_rouge}, METEOR: {val_meteor:.4f}')

print("\nTest Metrics:")
print(f'Accuracy: {test_accuracy:.4f}, Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, F1: {test_f1:.4f}')
print(f'BLEU: {test_bleu:.4f}, ROUGE: {test_rouge}, METEOR: {test_meteor:.4f}')

Evaluating on validation data...


Evaluating: 100%|██████████| 821/821 [25:53<00:00,  1.89s/it]


Evaluating on test data...


Evaluating: 100%|██████████| 937/937 [29:10<00:00,  1.87s/it]



Validation Metrics:
Accuracy: 0.0000, Precision: 0.0000, Recall: 0.0000, F1: 0.0000
BLEU: 0.3927, ROUGE: [0.5875726354593594, 0.5836508605027191, 0.5875726354593594], METEOR: 0.7677

Test Metrics:
Accuracy: 0.0000, Precision: 0.0000, Recall: 0.0000, F1: 0.0000
BLEU: 0.3881, ROUGE: [0.5788999865306321, 0.574883147749692, 0.5788999865306321], METEOR: 0.7639


In [None]:
max_length = 384
model_dir = '/content/drive/MyDrive/BestModels/DistilBertOutputFinal/final_model'
validation_data_path = '/content/drive/MyDrive/TellMeWhy/Dataset/validation.json'
test_data_path = '/content/drive/MyDrive/TellMeWhy/Dataset/test.json'
batch_size = 8

# Initialize model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained(model_dir)
model = DistilBertForQuestionAnswering.from_pretrained(model_dir)

# Load validation and test datasets
validation_dataset = TellMeWhyDataset(validation_data_path, tokenizer, max_length=max_length)
test_dataset = TellMeWhyDataset(test_data_path, tokenizer, max_length=max_length)

# DataLoader for validation and test datasets
validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False, num_workers=2)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

# Move model to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

def evaluate_on_dataset(dataloader):
    model.eval()
    all_preds, all_labels = [], []

    for batch in tqdm(dataloader, desc="Evaluating"):
        # Move batch to GPU
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Generate predictions
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions
        )

        with torch.no_grad():
            start_logits = outputs.start_logits.argmax(dim=1)
            end_logits = outputs.end_logits.argmax(dim=1)

            for i in range(len(start_logits)):
                pred_answer = tokenizer.decode(
                    input_ids[i][start_logits[i]:end_logits[i] + 1]
                )
                true_answer = tokenizer.decode(
                    input_ids[i][start_positions[i]:end_positions[i] + 1]
                )

                all_preds.append(pred_answer)
                all_labels.append(true_answer)
    print(len(all_preds))
    print(len(all_labels))

    return all_preds, all_labels

# Evaluate on validation dataset
print("Evaluating on validation data...")
val_preds, val_labels = evaluate_on_dataset(validation_dataloader)

# Evaluate on test dataset
print("Evaluating on test data...")
test_preds, test_labels = evaluate_on_dataset(test_dataloader)

# Calculate metrics for validation and test sets
val_accuracy = accuracy_score(val_labels, val_preds)
val_precision = precision_score(val_labels, val_preds, average='weighted', zero_division=0)
val_recall = recall_score(val_labels, val_preds, average='weighted', zero_division=0)
val_f1 = f1_score(val_labels, val_preds, average='weighted', zero_division=0)

test_accuracy = accuracy_score(test_labels, test_preds)
test_precision = precision_score(test_labels, test_preds, average='weighted', zero_division=0)
test_recall = recall_score(test_labels, test_preds, average='weighted', zero_division=0)
test_f1 = f1_score(test_labels, test_preds, average='weighted', zero_division=0)

# Print results
print("\nValidation Metrics:")
print(f'Accuracy: {val_accuracy}, Precision: {val_precision}, Recall: {val_recall}, F1: {val_f1}')

print("\nTest Metrics:")
print(f'Accuracy: {test_accuracy}, Precision: {test_precision}, Recall: {test_recall}, F1: {test_f1}')

Evaluating on validation data...


Evaluating: 100%|██████████| 821/821 [01:20<00:00, 10.14it/s]


6564
6564
Evaluating on test data...


Evaluating: 100%|██████████| 937/937 [01:29<00:00, 10.44it/s]


7494
7494

Validation Metrics:
Accuracy: 0.9675502742230347, Precision: 0.967560661459199, Recall: 0.9675502742230347, F1: 0.9675498539578474

Test Metrics:
Accuracy: 0.9675740592473979, Precision: 0.967627581919015, Recall: 0.9675740592473979, F1: 0.9675726360599367


In [None]:
plot_losses = train_t5(
    data_path=data_path,
    output_dir=output_dir,
)

# Plot the loss over time
record_losses, record_step_nos, record_metrics = zip(*plot_losses)
plt.plot(record_step_nos, record_losses)
plt.xlabel("Training Step")
plt.ylabel("Average Training Loss")
plt.title("T5 Training Loss")
plt.show()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Epoch 1/3:   0%|          | 0/4501 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Epoch 1/3:   2%|▏         | 86/4501 [02:00<1:42:52,  1.40s/it, loss=22.1992, lr=0.0000060]


KeyboardInterrupt: 

In [None]:
plot_losses = train_gpt2(
    data_path=data_path,
    output_dir=output_dir,
)

# Plot the loss over time
record_losses, record_step_nos, record_metrics = zip(*plot_losses)
plt.plot(record_step_nos, record_losses)
plt.xlabel("Training Step")
plt.ylabel("Average Training Loss")
plt.title("GPT2 Training Loss")
plt.show()

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Epoch 1/3:   0%|          | 0/4501 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
Epoch 1/3:   0%|          | 15/4501 [00:10<39:00,  1.92it/s, loss=10.6498, lr=0.0000024]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Epoch 1/3:   1%|          | 31/4501 [01:17<47:03,  1.58it/s, loss=10.6982, lr=0.0000048]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Epoch 1/3:   1%|          | 47/4501 [02:23<46:14,  1.61it/s, loss=10.5805, lr=0.0000071]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Epoch 1/3:   1%|▏         | 63/4501 [03:29<46:08,  1.60it/s, loss=10.6919, lr=0.0000095]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Epoch 1/3:   2%|▏         | 79/4501 [04:35<46:03,  1.60it/s, loss=10.4975, lr=0.0000119]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Epoch 1/3:   2%|▏         | 95/4501 [05:41<45:45,  1.60it/s, lo

KeyboardInterrupt: 

In [None]:
plot_losses = train_distilbert(
    data_path=data_path,
    output_dir=output_dir,
)

# Plot the loss over time
record_losses, record_step_nos, record_metrics = zip(*plot_losses)
plt.plot(record_step_nos, record_losses)
plt.xlabel("Training Step")
plt.ylabel("Average Training Loss")
plt.title("DistilBert Training Loss")
plt.show()