<a href="https://colab.research.google.com/github/Mian-fahdiii/Do-List-Whisper-AI/blob/main/f219125.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Question 2**

In [1]:
# ✅ Disable Weights & Biases logging (prevents API key prompt)
import os
os.environ["WANDB_DISABLED"] = "true"

# Install required libraries
!pip install transformers datasets torch

# Import necessary libraries
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import pandas as pd

# Load the dataset
dataset_path = "/content/sample_data/math_meme_repair_dataset.csv"  # Ensure correct path
df = pd.read_csv(dataset_path)

# Prepare the dataset for fine-tuning
incorrect_memes = df["Incorrect Meme"].tolist()
correct_explanations = df["Correct Explanation"].tolist()

# Combine memes and corrections for training
training_texts = [f"Incorrect: {meme}\nCorrect: {explanation}" for meme, explanation in zip(incorrect_memes, correct_explanations)]

# Convert into a Dataset format
dataset = Dataset.from_dict({"text": training_texts})

# Load tokenizer and model (GPT-2 base)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to eos_token

model = GPT2LMHeadModel.from_pretrained("gpt2")

# Explicitly define loss type to prevent warning
model.config.loss_type = "ForCausalLMLoss"

# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=256)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Data collator for training
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# ✅ Improved Training Arguments (2 or 3 epochs)
training_args = TrainingArguments(
    output_dir="/content/gpt2_math_meme_fine_tuned",
    eval_strategy="no",
    learning_rate=5e-5,  # ✅ Slightly increased learning rate for faster adaptation
    per_device_train_batch_size=4,
    num_train_epochs=3,  # ✅ Adjust to 2 or 3 epochs as required
    weight_decay=0.01,
    save_total_limit=1,
    save_strategy="epoch",
    logging_dir="/content/logs"
)

# ✅ Trainer instance (Optimized)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)

# ✅ Start fine-tuning process
trainer.train()

# ✅ Save the fine-tuned model
fine_tuned_model_path = "/content/gpt2_math_meme_fine_tuned"
model.save_pretrained(fine_tuned_model_path)
tokenizer.save_pretrained(fine_tuned_model_path)

print(f"🎯 Fine-tuned model saved at: {fine_tuned_model_path}")


Collecting datasets
  Downloading datasets-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


🎯 Fine-tuned model saved at: /content/gpt2_math_meme_fine_tuned


In [2]:
from transformers import pipeline

# Load the fine-tuned model
model_path = "/content/gpt2_math_meme_fine_tuned"  # Ensure correct path
generator = pipeline("text-generation", model=model_path, tokenizer=model_path)

# Test with new incorrect memes
test_memes = [
    "Wrong: 6 ÷ 2(1+2) = 1",
    "Wrong: (2+3)² = 2² + 3²",
    "Wrong: 0.999... = 0"
]

# Generate corrected explanations
print("🎭 **Model's Math Meme Fixes:**\n")
for i, meme in enumerate(test_memes):
    output = generator(f"Incorrect: {meme}\nCorrect:", max_length=50, num_return_sequences=1, temperature=0.6,
                       repetition_penalty=1.8, truncation=True, top_p=0.9, top_k=40)
    print(f"🔹 Meme {i+1}: {output[0]['generated_text']}\n")


Device set to use cpu


🎭 **Model's Math Meme Fixes:**

🔹 Meme 1: Incorrect: Wrong: 6 ÷ 2(1+2) = 1
Correct: Correct answer (in the correct form): 3 × 5 + 4² is a bit odd. Note that in addition to adding an extra number, we need

🔹 Meme 2: Incorrect: Wrong: (2+3)² = 2² + 3²
Correct: Correct answer is 1.1 × 10 − 4/10(4*5)/12^-8=0, which means that the correct number

🔹 Meme 3: Incorrect: Wrong: 0.999... = 0
Correct: Correct answer (in fact, correct): 1/3 of a factor is the same as 2 × 3 + 4 - 5 * 10^2(10)²+4/(



**Question 3**

In [3]:
# ✅ Disable Weights & Biases logging (prevents API key prompt)
import os
os.environ["WANDB_DISABLED"] = "true"

# Install required libraries
!pip install transformers datasets torch

# Import necessary libraries
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import pandas as pd

# Load the emoji math dataset
dataset_path = "/content/sample_data/emoji_math_dataset.csv"  # Ensure correct path
df = pd.read_csv(dataset_path)

# Prepare the dataset for fine-tuning
emoji_problems = df["Emoji Problem"].tolist()
emoji_solutions = df["Solution"].tolist()

# Combine emoji problems and solutions for training
training_texts = [f"Problem: {problem}\nSolution: {solution}" for problem, solution in zip(emoji_problems, emoji_solutions)]

# Convert into a Dataset format
dataset = Dataset.from_dict({"text": training_texts})

# Load tokenizer and model (GPT-2 base)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to eos_token

model = GPT2LMHeadModel.from_pretrained("gpt2")

# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Data collator for training
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# ✅ Training Arguments (2-3 epochs)
training_args = TrainingArguments(
    output_dir="/content/gpt2_emoji_math_fine_tuned",
    eval_strategy="no",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,  # ✅ Adjustable: Use 2 or 3 epochs
    weight_decay=0.01,
    save_total_limit=1,
    save_strategy="epoch",
    logging_dir="/content/logs"
)

# ✅ Trainer instance (Optimized)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)

# ✅ Start fine-tuning process
trainer.train()

# ✅ Save the fine-tuned model
fine_tuned_model_path = "/content/gpt2_emoji_math_fine_tuned"
model.save_pretrained(fine_tuned_model_path)
tokenizer.save_pretrained(fine_tuned_model_path)

print(f"🎯 Fine-tuned model saved at: {fine_tuned_model_path}")




Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


🎯 Fine-tuned model saved at: /content/gpt2_emoji_math_fine_tuned


In [4]:
from transformers import pipeline

# Load the fine-tuned model
model_path = "/content/gpt2_emoji_math_fine_tuned"  # Ensure correct path
generator = pipeline("text-generation", model=model_path, tokenizer=model_path)

# New emoji problems for testing
test_problems = [
    "Problem: 🦉 + 🦉 + 🦉 = 30\nSolution:",
    "Problem: 🏆 + 🏆 = 14\nSolution:",
    "Problem: 🍔 + 🍔 + 🍔 = 21\nSolution:"
]

# Generate answers
print("🎭 **Model's Emoji Math Solutions:**\n")
for i, problem in enumerate(test_problems):
    output = generator(problem, max_length=50, num_return_sequences=1, temperature=0.6,
                       repetition_penalty=1.7, truncation=True, top_p=0.9, top_k=40)
    print(f"🔹 Problem {i+1}: {output[0]['generated_text']}\n")


Device set to use cpu


🎭 **Model's Emoji Math Solutions:**

🔹 Problem 1: Problem: 🦉 + 🦉 + 🦉 = 30
Solution: 😐😂 += 10

🔹 Problem 2: Problem: 🏆 + 🏆 = 14
Solution: 😎😌+🍁= 4

🔹 Problem 3: Problem: 🍔 + 🍔 + 🍔 = 21
Solution: 😀 (9) Solution #2 – 5 / 9.5 * 7 ** 6 === 12 == 24



**question 1**

In [None]:
# ✅ Disable Weights & Biases logging (prevents API key prompt)
import os
os.environ["WANDB_DISABLED"] = "true"

# Install required libraries
!pip install transformers datasets torch

# Import necessary libraries
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import pandas as pd

# Load the cleaned math riddles dataset
dataset_path = "/content/sample_data/cleaned_math_riddles_fine_tune_dataset.csv"  # Ensure correct path
df = pd.read_csv(dataset_path)

# Prepare the dataset for fine-tuning
riddles = df["Riddle"].tolist()
solutions = df["Solution"].tolist()

# Combine riddles and solutions for training
training_texts = [f"Riddle: {riddle}\nSolution: {solution}" for riddle, solution in zip(riddles, solutions)]

# Convert into a Dataset format
dataset = Dataset.from_dict({"text": training_texts})

# Load tokenizer and model (GPT-2 base)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to eos_token

model = GPT2LMHeadModel.from_pretrained("gpt2")

# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=256)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Data collator for training
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# ✅ Training Arguments (2-3 epochs)
training_args = TrainingArguments(
    output_dir="/content/gpt2_math_riddle_fine_tuned",
    eval_strategy="no",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,  # ✅ Adjustable: Use 2 or 3 epochs
    weight_decay=0.01,
    save_total_limit=1,
    save_strategy="epoch",
    logging_dir="/content/logs"
)

# ✅ Trainer instance (Optimized)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)

# ✅ Start fine-tuning process
trainer.train()

# ✅ Save the fine-tuned model
fine_tuned_model_path = "/content/gpt2_math_riddle_fine_tuned"
model.save_pretrained(fine_tuned_model_path)
tokenizer.save_pretrained(fine_tuned_model_path)

print(f"🎯 Fine-tuned model saved at: {fine_tuned_model_path}")




Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


In [None]:
from transformers import pipeline

# Load the fine-tuned model
model_path = "/content/gpt2_math_riddle_fine_tuned"  # Ensure correct path
generator = pipeline("text-generation", model=model_path, tokenizer=model_path)

# New math riddle prompts for testing
test_riddles = [
    "Riddle: What number is half of its double?",
    "Riddle: A number is three times the sum of its digits. What is the number?",
    "Riddle: What comes next in the sequence: 1, 4, 9, 16, 25, ...?",
    "Riddle: If a hen and a half lays an egg and a half in a day and a half, how many eggs do three hens lay in three days?",
    "Riddle: I am a two-digit number. My tens digit is twice my ones digit. What number am I?"
]

# Generate answers
print("🎭 **Model's Generated Math Riddles:**\n")
for i, riddle in enumerate(test_riddles):
    output = generator(riddle, max_length=50, num_return_sequences=1, temperature=0.6,
                       repetition_penalty=1.7, truncation=True, top_p=0.9, top_k=40)
    print(f"🔹 Riddle {i+1}: {output[0]['generated_text']}\n")
