In [None]:
!pip install transformers datasets gradio

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting gradio
  Downloading gradio-5.19.0-py3-none-any.whl.metadata (16 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.8-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.2 (from gradio)
  Downloading gradio_client-1.7.2-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Dow

In [20]:
### Data Processing
import torch
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset
import pandas as pd
import json

# Verify and Load SQuAD v2 dataset
def load_and_verify_json(file_path):
    """Parse a UTF-8 JSON file and report whether it is well-formed.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document, or ``None`` when the file content is not
        valid JSON (the parser error is printed in that case).

    Errors raised while opening the file are not caught and propagate to
    the caller.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        try:
            parsed = json.load(handle)
        except json.JSONDecodeError as err:
            # Malformed content is reported, not raised; the caller checks for None.
            print(f"JSON format error: {err}")
            return None
        else:
            print("JSON file loaded successfully!")
            return parsed

# Single source of truth for the dataset location, so the verification step and
# the loading step below can never point at different files.
SQUAD_TRAIN_PATH = "train-v2.0.json"

# Check that the file parses as JSON before handing it to `datasets`, and stop
# with a readable message if it does not.
squad_data = load_and_verify_json(SQUAD_TRAIN_PATH)
if squad_data is None:
    raise ValueError("Invalid JSON file. Please check and re-upload the dataset.")

# Load dataset
# field="data" tells the JSON loader to read the records from the top-level "data" key.
dataset = load_dataset("json", data_files={"train": SQUAD_TRAIN_PATH}, field="data")

# Extract context and questions
def prepare_data(dataset):
    """Flatten SQuAD-style nested records into aligned context/question columns.

    Args:
        dataset: Mapping whose ``"train"`` value is an iterable of entries.
            Each entry may hold a ``"paragraphs"`` list; each paragraph may
            hold a ``"context"`` string and a ``"qas"`` list of dicts with a
            ``"question"`` key.

    Returns:
        A dict with two equally long lists, ``"context"`` and ``"questions"``.
        The paragraph's context is repeated once per question. Entries,
        paragraphs or question dicts missing the expected keys are skipped.
    """
    pairs = [
        (paragraph["context"], qa["question"])
        for entry in dataset["train"]
        if "paragraphs" in entry
        for paragraph in entry["paragraphs"]
        if "qas" in paragraph
        for qa in paragraph["qas"]
        if "context" in paragraph and "question" in qa
    ]
    return {
        "context": [context for context, _ in pairs],
        "questions": [question for _, question in pairs],
    }

data = prepare_data(dataset)

# prepare_data appends a context and its question in the same step, so the two
# columns are aligned by construction. Assert that invariant instead of
# truncating to a common length, which would only hide a real misalignment.
assert len(data["context"]) == len(data["questions"]), (
    "context and questions columns have different lengths"
)

# Convert to dataset
dataset = Dataset.from_dict(data)

# Save dataset as CSV for download
df = pd.DataFrame(data)
df.to_csv("squad_data.csv", index=False)

# Load tokenizer
model_name = "microsoft/DialoGPT-medium"
# Load the tokenizer once (the original loaded it a second time just to read
# eos_token) and reuse EOS as the padding token so padding="max_length" works.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples):
    """Tokenize a batch of context/question pairs to a fixed length.

    Args:
        examples: Batch mapping with ``"context"`` and ``"questions"``
            columns (parallel lists), as passed by ``Dataset.map`` with
            ``batched=True``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the paired
        inputs, truncated and padded to 512 tokens.
    """
    # Every sequence is truncated and padded to exactly 512 tokens.
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    return tokenizer(examples["context"], examples["questions"], **tokenizer_options)

# Verify columns before calling remove_columns
print("Dataset columns before tokenization:", dataset.column_names)

# Tokenize dataset and remove original context and questions
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=["context", "questions"])

# Train-Test Split
# A fixed seed makes the shuffle deterministic, so Restart & Run All produces
# the same train/eval partition (and comparable eval metrics) every time.
tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.1, seed=42)
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]

JSON file loaded successfully!
Dataset columns before tokenization: ['context', 'questions']


Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

In [None]:
### Model Training
from transformers import (
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Define model name
model_name = "microsoft/DialoGPT-medium"

# Load pre-trained model
model = AutoModelForCausalLM.from_pretrained(model_name)

# The tokenized dataset has no `labels` column, and a causal LM only returns a
# loss when labels are supplied, so without this collator trainer.train() fails
# with "The model did not return a loss". With mlm=False the collator copies
# input_ids into labels and sets padding positions to -100 so the loss ignores
# them. It relies on tokenizer.pad_token being set.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,  # keep only the two most recent checkpoints on disk
    push_to_hub=False,
    report_to="none",  # no external experiment tracker
    remove_unused_columns=False
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the trained model
# The tokenizer is saved next to the weights so the directory can be reloaded
# on its own with from_pretrained("./trained_chatbot").
model.save_pretrained("./trained_chatbot")
tokenizer.save_pretrained("./trained_chatbot")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
### Chatbot Interaction
import gradio as gr
import torch

def chatbot_response(user_input):
    """Generate a single chatbot reply for one user message.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        user_input: Text typed by the user.

    Returns:
        The decoded reply only (the prompt is not echoed back), or a short
        hint when the input is empty or whitespace.
    """
    # An empty prompt would tokenize to a zero-length sequence that generate()
    # cannot extend, so answer directly instead.
    if not user_input or not user_input.strip():
        return "Please enter a question."

    # DialoGPT's documented usage ends each dialogue turn with EOS; without it
    # the model tends to continue the user's sentence rather than answer it.
    inputs = tokenizer(user_input + tokenizer.eos_token, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        reply_ids = model.generate(
            **inputs,
            # max_length would count the prompt tokens too, leaving no room for
            # a reply on long prompts; max_new_tokens bounds the reply itself.
            max_new_tokens=30,
            num_return_sequences=1,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.2,
            do_sample=True,
            # early_stopping was dropped: it only applies to beam search.
            # The model has no dedicated pad token, so pad with EOS explicitly.
            pad_token_id=tokenizer.eos_token_id,
        )

    # generate() returns prompt + continuation; keep only the new tokens.
    return tokenizer.decode(reply_ids[0][prompt_length:], skip_special_tokens=True)

# Create Gradio interface
# Single text box in, single text box out; every submission calls chatbot_response.
iface = gr.Interface(
    fn=chatbot_response,
    inputs="text",
    outputs="text",
    title="Education Chatbot",
    description="Ask me any educational question!",
)

# Launch Gradio app
# share=True requests a temporary public URL (see the gradio.live link in the
# cell output), so anyone holding the link can query the model while it runs.
if __name__ == "__main__":
    iface.launch(share=True)

# NOTE(review): this message refers to the CSV written in the Data Processing
# cell; nothing in this cell writes squad_data.csv. Consider moving it next to
# the df.to_csv(...) call.
print("Dataset saved as squad_data.csv. You can download it from your environment.")


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://44de4dba2d5964040b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Dataset saved as squad_data.csv. You can download it from your environment.
