In [4]:
!pip install transformers datasets gradio
!pip install gradio
!pip install datasets
!pip install datasets transformers
!pip install gradio
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting gradio
  Downloading gradio-5.20.0-py3-none-any.whl.metadata (16 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.2 (from gradio)
  Downloading gradio_client-1.7.2-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downlo



In [None]:
### Data Processing
import torch
from transformers import AutoTokenizer
from datasets import Dataset
import pandas as pd

# Load dataset from CSV
df = pd.read_csv("/content/text_data_toc.csv")  # Update path if needed
print("Dataset loaded successfully!")
print("Dataset Columns:", df.columns)

# Rename columns to match expected names (returns a new frame; no in-place mutation)
df = df.rename(columns={"words": "context", "file": "questions"})

# Ensure the required columns exist, and report exactly which ones are absent
required_columns = ["context", "questions"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"Dataset is missing required columns: {missing_columns}")

# Drop rows with missing text BEFORE casting: astype(str) would otherwise turn
# NaN into the literal string "nan", which would then be tokenized as real text.
# The index is reset so Dataset.from_pandas does not carry the old index along as a column.
df = df.dropna(subset=required_columns).reset_index(drop=True)

# Convert columns to string type to ensure compatibility with tokenizer
df[required_columns] = df[required_columns].astype(str)

# Convert to Hugging Face dataset
dataset = Dataset.from_pandas(df)

# Save dataset as CSV for reference
df.to_csv("squad_data.csv", index=False)

# Load tokenizer
model_name = "microsoft/DialoGPT-medium"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Reuse EOS as the padding token so padding="max_length" works

# Tokenization function
def preprocess_function(examples):
    """Tokenize a batch of (context, questions) text pairs.

    Args:
        examples: A batch mapping with "context" and "questions" entries, each a
            list of strings (this is what ``Dataset.map(..., batched=True)`` passes).

    Returns:
        The tokenizer output for the batch, truncated/padded to 256 tokens and
        including the attention mask.
    """
    # No return_tensors here: Dataset.map stores plain lists, so building torch
    # tensors per batch only adds a conversion that is immediately undone.
    return tokenizer(
        examples["context"],
        examples["questions"],
        truncation=True,
        padding="max_length",
        max_length=256,  # Reduce max_length to speed up processing
        return_attention_mask=True,
    )

# Print dataset columns before tokenization
print("Dataset columns before tokenization:", dataset.column_names)

# Tokenize dataset and remove original text columns
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["context", "questions"],
)

# Train-Test Split (90/10). A fixed seed keeps the shuffled split identical across
# re-runs, so results stay comparable after a kernel restart.
tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.1, shuffle=True, seed=42)
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]

print("Data processing completed successfully!")

Dataset loaded successfully!
Dataset Columns: Index(['words', 'file'], dtype='object')
Dataset columns before tokenization: ['context', 'questions']


Map:   0%|          | 0/165 [00:00<?, ? examples/s]

Data processing completed successfully!


In [5]:
import torch
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
from huggingface_hub import login
import os

# ✅ Disable WANDB logging (optional)
os.environ["WANDB_MODE"] = "disabled"

# ✅ Authenticate with Hugging Face using a token taken from the environment.
# Never hard-code access tokens in a notebook: anyone who can read the file can use them.
# The token that used to be embedded here should be treated as leaked and revoked.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    # Only a public checkpoint is downloaded below, so authentication is optional.
    print("ℹ️ HF_TOKEN is not set; continuing without Hugging Face authentication.")

# ✅ Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"✅ Using device: {device}")

# ✅ Fix: Read dataset correctly (TSV → CSV)
original_file = "merged_question_answer_pairs.csv"  # Ensure this file is uploaded

# ✅ Read dataset as TSV (force tab separator & correct encoding)
df = pd.read_csv(original_file, sep="\t", encoding="utf-8-sig", engine="python")

# ✅ Debug: Check detected columns
print("\n📌 Detected Columns:", df.columns)

# ✅ Check if dataset is being read as a single column
if len(df.columns) == 1:
    print("\n❌ ERROR: Dataset is not properly split! Fixing column separation...")

    expected_columns = ["ArticleTitle", "Question", "Answer", "DifficultyFromQuestioner", "DifficultyFromAnswerer", "ArticleFile"]

    # ✅ Split each raw line on tabs into exactly len(expected_columns) fields:
    #    - n=... caps the number of splits, so stray extra tabs stay in the last field
    #      instead of creating surplus columns;
    #    - reindex pads with empty columns when every row has fewer fields.
    #    Either case previously failed with a length-mismatch error on the rename below.
    df = (
        df.iloc[:, 0]
        .str.split("\t", n=len(expected_columns) - 1, expand=True)
        .reindex(columns=range(len(expected_columns)))
    )

    # ✅ Rename columns manually
    df.columns = expected_columns

# ✅ Fail early, with a readable message, if the text columns are absent
missing_columns = [name for name in ("Question", "Answer") if name not in df.columns]
if missing_columns:
    raise KeyError(f"Dataset is missing expected columns {missing_columns}; found {list(df.columns)}")

# ✅ Drop missing values in `Question` or `Answer` columns (new frame, no in-place mutation)
# NOTE(review): the recorded preview shows repeated (Question, Answer) rows; consider
# de-duplicating before the train/validation split to avoid leakage — confirm intent.
df = df.dropna(subset=["Question", "Answer"])

# ✅ Debug: Show first few rows
print("\n📌 First 5 Rows of Fixed Dataset:")
print(df.head())

# ✅ Save it as a proper CSV with commas
fixed_file = "fixed_dataset.csv"
df.to_csv(fixed_file, index=False)

print(f"\n✅ Successfully converted dataset: {fixed_file}")

# ✅ Reload dataset with correct column separation
dataset = load_dataset("csv", data_files=fixed_file, split="train")

# ✅ Debugging: Print dataset columns
print(f"\n📌 Available Dataset Columns: {dataset.column_names}")

# ✅ Fix column selection
question_column_name = "Question"
answer_column_name = "Answer"

if question_column_name not in dataset.column_names or answer_column_name not in dataset.column_names:
    raise KeyError(f"❌ ERROR: Dataset does not contain expected columns!\n"
                   f"✅ Available Columns: {dataset.column_names}")

print(f"✅ Detected Column Names: Question → {question_column_name}, Answer → {answer_column_name}")

# ✅ Split dataset into training & validation (90/10).
#    The seed is fixed so the same rows land in each split on every run; without it
#    the validation loss is not comparable between runs.
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# ✅ Fetch the pre-trained checkpoint: tokenizer first, then the seq2seq model,
#    which is moved onto the selected device once loaded
model_name = "facebook/bart-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

# ✅ Tokenization function (Handle Missing Values Safely)
def process_data_for_training(dataset):
    """Tokenize the question/answer columns of ``dataset`` for seq2seq training.

    Questions become the model inputs and answers become the labels. Both are
    padded/truncated to 128 tokens, and padding positions in the labels are
    replaced with -100 so they can be excluded from the loss.

    Args:
        dataset: An object exposing ``map(fn, batched=True)`` whose batches contain
            the columns named by ``question_column_name`` and ``answer_column_name``.

    Returns:
        Whatever ``dataset.map`` returns: the dataset extended with ``input_ids``,
        ``attention_mask`` and ``labels``.
    """
    def clean_text(value):
        # Missing cells become empty strings; anything else is coerced to str first,
        # so a numeric cell (e.g. an answer the CSV loader parsed as a number)
        # no longer raises AttributeError on .strip().
        return "" if value is None else str(value).strip()

    def tokenize_function(examples):
        # ✅ Ensure no missing or non-string values before processing
        inputs = [clean_text(value) for value in examples[question_column_name]]
        targets = [clean_text(value) for value in examples[answer_column_name]]

        tokenized_inputs = tokenizer(inputs, padding="max_length", truncation=True, max_length=128)
        tokenized_targets = tokenizer(targets, padding="max_length", truncation=True, max_length=128)

        # Mask padding in the labels; pad id is looked up once per batch
        pad_id = tokenizer.pad_token_id
        labels = [
            [-100 if token == pad_id else token for token in label]
            for label in tokenized_targets["input_ids"]
        ]

        return {
            "input_ids": tokenized_inputs["input_ids"],
            "attention_mask": tokenized_inputs["attention_mask"],
            "labels": labels
        }

    return dataset.map(tokenize_function, batched=True)

# ✅ Tokenize both splits (train first, then validation)
train_dataset, eval_dataset = (
    process_data_for_training(split) for split in (train_dataset, eval_dataset)
)

# ✅ Training configuration (optimized for Colab), collected in one place
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before upgrading.
training_config = dict(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    fp16=torch.cuda.is_available(),  # Mixed precision only when a GPU is present
    dataloader_num_workers=2,
)
training_args = TrainingArguments(**training_config)

# ✅ Build the trainer and run fine-tuning
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset)
trainer.train()

# ✅ Persist the fine-tuned model, then its tokenizer, to the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("./trained_chatbot")

# ✅ Function to generate chatbot responses
def generate_response(input_text):
    """Return the model's decoded reply to ``input_text``.

    The text is stripped of surrounding whitespace, tokenized (truncated to 128
    tokens) and moved to ``device``; generation is capped at 50 new tokens and
    special tokens are dropped from the decoded string.
    """
    encoded = tokenizer(
        input_text.strip(),
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,
    ).to(device)

    # Gradients are not needed for inference
    with torch.no_grad():
        generated = model.generate(**encoded, max_new_tokens=50)

    return tokenizer.decode(generated[0], skip_special_tokens=True)

# ✅ Example Usage: smoke-test the fine-tuned model with one hand-written question
# NOTE(review): the recorded run printed an empty reply for this prompt — worth
# checking with a question that resembles the training data.
user_input = "What areas did Beyonce compete in when she was growing up?"
response = generate_response(user_input)
# Show the decoded reply next to a chatbot label
print("🤖 Chatbot:", response)

✅ Using device: cuda

📌 Detected Columns: Index(['ArticleTitle\tQuestion\tAnswer\tDifficultyFromQuestioner\tDifficultyFromAnswerer\tArticleFile'], dtype='object')

❌ ERROR: Dataset is not properly split! Fixing column separation...

📌 First 5 Rows of Fixed Dataset:
       ArticleTitle                                           Question Answer  \
0  Alessandro_Volta                    Was Volta an Italian physicist?    yes   
1  Alessandro_Volta                    Was Volta an Italian physicist?    yes   
2  Alessandro_Volta         Is Volta buried in the city of Pittsburgh?     no   
3  Alessandro_Volta         Is Volta buried in the city of Pittsburgh?     no   
4  Alessandro_Volta  Did Volta have a passion for the study of elec...    yes   

  DifficultyFromQuestioner DifficultyFromAnswerer   ArticleFile  
0                     easy                   easy  S09_set4_a10  
1                     easy                   easy  S09_set4_a10  
2                     easy                   easy

Generating train split: 0 examples [00:00, ? examples/s]


📌 Available Dataset Columns: ['ArticleTitle', 'Question', 'Answer', 'DifficultyFromQuestioner', 'DifficultyFromAnswerer', 'ArticleFile']
✅ Detected Column Names: Question → Question, Answer → Answer


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/2048 [00:00<?, ? examples/s]

Map:   0%|          | 0/228 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,3.0935,2.405855
2,1.7744,2.682912
3,1.2032,2.11421




🤖 Chatbot: 


In [6]:
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Reload the fine-tuned QA checkpoint (BART or T5) saved by the training cell
model_name = "./trained_chatbot"  # Directory containing the fine-tuned model

# Prefer the GPU when one is visible to torch
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

# Define the chatbot response function for answering questions
def chatbot_response(user_input):
    """Answer a question typed into the Gradio textbox.

    Args:
        user_input: The raw text from the UI. ``None``, empty, or whitespace-only
            input is answered with a short prompt instead of being sent to the model.

    Returns:
        The decoded model reply with special tokens removed, or a prompt asking
        for a question when the input is blank.
    """
    # Normalize the input the same way generate_response does, and short-circuit
    # on blank input so the model is never run on an empty prompt.
    question = (user_input or "").strip()
    if not question:
        return "Please enter a question."

    # Preprocess and tokenize the input text
    inputs = tokenizer(question, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)

    # Generate the response using the model (sampling; no gradients needed)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_length=50,  # Limit the response length
            num_return_sequences=1,
            temperature=0.7,  # Control randomness
            top_p=0.92,       # Use top-p sampling
            repetition_penalty=1.2,
            do_sample=True,
            # early_stopping was dropped: it only affects beam search, which is not used here
        )

    # Decode and return the answer (skip special tokens)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Create Gradio interface: one text box in, one text box out, backed by chatbot_response
iface = gr.Interface(
    fn=chatbot_response,
    inputs=gr.Textbox(label="Ask a question"),  # Label input
    outputs=gr.Textbox(label="Chatbot response"),  # Label output
    title="Education Chatbot",
    description="Ask me any educational question!",
    # theme="compact" was removed: it is not one of Gradio's built-in theme names.
    # NOTE(review): presumably the source of the "Sorry, we can't find the page you are
    # looking for." message in the recorded output (failed theme lookup) — confirm.
)

# Launch Gradio app (share=True also exposes a temporary public URL)
if __name__ == "__main__":
    iface.launch(share=True)


Sorry, we can't find the page you are looking for.


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://3c10d93e6bb1b114e1.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
