In [None]:
!pip install transformers datasets accelerate

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (f

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import json

# === Load dataset ===
with open("mcq_dataset_2000.json", "r") as f:
    data = json.load(f)

# === Preprocess data ===
def format_example(item):
    options = item["options"]
    option_labels = ['A', 'B', 'C', 'D']
    formatted_options = "\n".join([f"{label}) {opt}" for label, opt in zip(option_labels, options)])
    correct_index = options.index(item["answer"])
    correct_label = option_labels[correct_index]

    input_text = f"generate mcq: {item['text']}"
    target_text = f"Q: {item['question']}\n{formatted_options}\nAnswer: {correct_label}"

    return {"input_text": input_text, "target_text": target_text}

dataset = [format_example(item) for item in data]
dataset = Dataset.from_list(dataset)

# === Train/test split ===
dataset = dataset.train_test_split(test_size=0.2)
train_data = dataset["train"]
test_data = dataset["test"]

# === Load tokenizer and model ===
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# === Tokenize ===
def tokenize(example):
    model_inputs = tokenizer(example["input_text"], max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(example["target_text"], max_length=256, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_data = train_data.map(tokenize, batched=True, remove_columns=["input_text", "target_text"])
test_data = test_data.map(tokenize, batched=True, remove_columns=["input_text", "target_text"])

# === Training arguments ===
training_args = TrainingArguments(
    output_dir="./t5_mcq_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_dir="./logs",
    weight_decay=0.01,
    logging_steps=10,
    save_total_limit=2
)

# === Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    tokenizer=tokenizer
)

# === Train ===
trainer.train()

# === Save model and tokenizer ===
model.save_pretrained("./t5_mcq_model")
tokenizer.save_pretrained("./t5_mcq_model")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/3115 [00:00<?, ? examples/s]

Map:   0%|          | 0/779 [00:00<?, ? examples/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtitusjerwin[0m ([33mtitusjerwin-kits[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.1577,0.092675
2,0.0952,0.04819
3,0.0551,0.039987


('./t5_mcq_model/tokenizer_config.json',
 './t5_mcq_model/special_tokens_map.json',
 './t5_mcq_model/spiece.model',
 './t5_mcq_model/added_tokens.json')

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the fine-tuned model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("./t5_mcq_model")
tokenizer = T5Tokenizer.from_pretrained("./t5_mcq_model")

# Prepare the input text related to Java
input_text = (
    "generate mcq: In Java, the 'final' keyword can be applied to variables, methods, and classes. "
    "When a variable is declared as final, its value cannot be modified once assigned. "
    "Declaring a method as final prevents it from being overridden by subclasses. "
    "Similarly, declaring a class as final prevents it from being subclassed."
)

# Tokenize the input text
inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

# Generate the MCQ
model.eval()  # Set the model to evaluation mode
outputs = model.generate(**inputs, max_length=256)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the generated MCQ
print(generated_text)


Q: What does a method as final do? A) Subclasses B) Subclasses C) Subclasses D) Subclasses Answer: A


In [None]:
!pip install transformers huggingface_hub



In [None]:
from huggingface_hub import notebook_login

notebook_login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Replace 'your-username' with your actual Hugging Face username
model.push_to_hub("jerwinTitus/t5_mcq_model")
tokenizer.push_to_hub("jerwinTitus/t5_mcq_model")


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jerwinTitus/t5_mcq_model/commit/683549737a405411193ff01366f3b9589ec06de9', commit_message='Upload tokenizer', commit_description='', oid='683549737a405411193ff01366f3b9589ec06de9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jerwinTitus/t5_mcq_model', endpoint='https://huggingface.co', repo_type='model', repo_id='jerwinTitus/t5_mcq_model'), pr_revision=None, pr_num=None)

In [None]:
!pip install -q streamlit
!npm install localtunnel

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.8/9.8 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K
added 22 packages in 3s
[1G[0K⠦[1G[0K
[1G[0K⠦[1G[0K3 packages are looking for funding
[1G[0K⠦[1G[0K  run `npm fund` for details
[1G[0K⠦[1G[0K

In [None]:
%%writefile app.py
import streamlit as st
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Load the fine-tuned model and tokenizer
model_name = "t5_mcq_model"  # Replace with your model's name or path
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def generate_mcqs(input_text, num_questions=5):
    """
    Generate multiple-choice questions based on the input text.

    Args:
        input_text (str): The input text for MCQ generation.
        num_questions (int): Number of MCQs to generate.

    Returns:
        list: A list of generated MCQs.
    """
    input_prompt = f"generate {num_questions} mcqs: {input_text}"
    inputs = tokenizer(input_prompt, return_tensors="pt", max_length=512, truncation=True)
    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=512,
            num_return_sequences=num_questions,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.8
        )
    questions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return questions

# Streamlit UI
st.title("MCQ Generator")

# Text area for user input
input_text = st.text_area("Enter the text for MCQ generation:", height=200)

# Slider to select the number of MCQs to generate
num_questions = st.slider("Number of MCQs to generate:", min_value=1, max_value=10, value=5)

if st.button("Generate MCQs"):
    if input_text.strip():
        with st.spinner("Generating MCQs..."):
            mcqs = generate_mcqs(input_text, num_questions)
            for idx, mcq in enumerate(mcqs, 1):
                st.subheader(f"MCQ {idx}")
                st.write(mcq)
    else:
        st.warning("Please enter some text to generate MCQs.")



Overwriting app.py


In [None]:
import urllib
external_ip = urllib.request.urlopen('https://ipv4.icanhazip.com').read().decode('utf8').strip()
print(f"External IP: {external_ip}")


External IP: 35.233.183.147


In [None]:
!streamlit run app.py &>/content/logs.txt &


In [None]:
!npx localtunnel --port 8501


[1G[0K⠙[1G[0Kyour url is: https://grumpy-places-act.loca.lt
^C


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [1]:
# Define what to include
!zip -r /content/clean_project.zip \
  /content/t5_mcq_model \
  /content/mcq_dataset_2000.json \
  /content/app.py \
  /content/package.json \
  /content/package-lock.json \
  /content/logs.txt \
  -x "/content/sample_data/*" "/content/drive/*" "/content/node_modules/*" "/content/wandb/*"





zip error: Nothing to do! (/content/clean_project.zip)


In [2]:
from google.colab import files
files.download('/content/clean_project.zip')

FileNotFoundError: Cannot find file: /content/clean_project.zip