In [None]:
!pip install transformers sentence-transformers spacy gradio --quiet
!python -m spacy download en_core_web_sm

import random
import re

import gradio as gr
import spacy
from sentence_transformers import SentenceTransformer, util
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load every model once, at cell/module execution time, so the functions
# defined below can reuse the same instances on each call.
print("Loading models...")
# spaCy pipeline used for sentence splitting, named entities and noun chunks.
nlp = spacy.load("en_core_web_sm")

# T5 checkpoint used by generate_question; the prompt marks the answer span
# with <hl> tokens.
t5_model_name = "valhalla/t5-base-qg-hl"
t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name)
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)

# NOTE(review): embedding_model is not referenced by any function visible in
# this cell — confirm whether it is still needed before keeping the download.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print("Models loaded!")

def extract_answer_sentences(text):
    """Extract sentences with candidate keywords (entities or nouns).

    The passage is split into sentences with the module-level spaCy
    pipeline ``nlp``; each stripped sentence is then parsed on its own and
    its single-word named entities and single-word noun chunks are collected
    as candidate answers.

    Keywords are de-duplicated while keeping first-seen order (entities
    before noun chunks). A ``set`` would make the order depend on string
    hashing, so the keyword picked as the answer by callers that take
    ``keywords[0]`` would change from run to run.

    Args:
        text: The passage to analyse.

    Returns:
        A tuple ``(answer_sentences, all_keywords)`` where
        ``answer_sentences`` is a list of ``(sentence, keywords)`` pairs for
        every sentence that produced at least one keyword, and
        ``all_keywords`` is the de-duplicated list of every keyword found,
        in order of first appearance.
    """
    doc = nlp(text)
    answer_sentences = []
    # A dict is used as an insertion-ordered set of every keyword seen so far.
    all_keywords = {}
    for sent in doc.sents:
        sentence = sent.text.strip()
        sent_doc = nlp(sentence)
        candidates = [ent.text for ent in sent_doc.ents]
        candidates += [chunk.text for chunk in sent_doc.noun_chunks]
        # Keep single-word candidates only, dropping repeats but not order.
        keywords = list(
            dict.fromkeys(c for c in candidates if len(c.split()) == 1)
        )
        if keywords:
            answer_sentences.append((sentence, keywords))
            all_keywords.update(dict.fromkeys(keywords))
    return answer_sentences, list(all_keywords)

def generate_question(context, answer):
    """Generate a question using the highlight-based T5 model.

    Exactly one occurrence of ``answer`` in ``context`` is wrapped in
    ``<hl>`` markers before the prompt is built. The first whole-word match
    is preferred so that, for example, the answer "cell" is not highlighted
    inside "cells"; if there is no whole-word match the first plain
    occurrence is used. When ``answer`` is empty or absent from ``context``
    the context is sent without a highlight.

    Args:
        context: The sentence the question should be about.
        answer: The answer span to highlight inside ``context``.

    Returns:
        The decoded question string produced by the model.
    """
    highlighted_context = context
    if answer:  # str.replace("") would insert markers between every character
        marked = f"<hl> {answer} <hl>"
        whole_word = rf"(?<!\w){re.escape(answer)}(?!\w)"
        # A callable replacement keeps backslashes in `answer` literal.
        highlighted_context, hits = re.subn(
            whole_word, lambda _match: marked, context, count=1
        )
        if not hits:
            highlighted_context = context.replace(answer, marked, 1)
    input_text = f"generate question: {highlighted_context}"
    input_ids = t5_tokenizer(input_text, return_tensors="pt").input_ids
    outputs = t5_model.generate(input_ids, max_length=64, num_beams=4, early_stopping=True)
    question = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return question

def generate_distractors(answer, all_keywords, top_n=3):
    """Generate plausible distractors for a multiple-choice question.

    Candidates are the single-word entries of ``all_keywords`` that differ
    from ``answer`` ignoring case. Case variants of the same keyword (for
    example "London" and "london") count once, keeping the first spelling
    seen, so the same word cannot appear twice among the options. The
    candidates are shuffled and, when there are fewer than ``top_n`` of
    them, the list is padded with the placeholder ``"OptionX"``.

    Args:
        answer: The correct answer, which must not appear as a distractor.
        all_keywords: Iterable of keyword strings to draw distractors from.
        top_n: Number of distractors to return; values <= 0 give ``[]``.

    Returns:
        A list of exactly ``top_n`` strings (empty when ``top_n`` <= 0).
    """
    if top_n <= 0:
        # A negative count would otherwise turn into a negative-index slice.
        return []
    seen = {answer.lower()}
    distractors = []
    for kw in all_keywords:
        key = kw.lower()
        if key in seen or len(kw.split()) != 1:
            continue
        seen.add(key)
        distractors.append(kw)
    random.shuffle(distractors)
    if len(distractors) < top_n:
        distractors += ["OptionX"] * (top_n - len(distractors))
    return distractors[:top_n]

def generate_mcq(text):
    """Build multiple-choice questions from a passage.

    One question is produced for every sentence that
    ``extract_answer_sentences`` pairs with keywords. The first keyword of
    the sentence is the correct answer, the question text comes from
    ``generate_question`` and the wrong options from
    ``generate_distractors``.

    Returns:
        A list of dicts with the keys ``"question"``, ``"options"`` (the
        shuffled distractors plus the answer) and ``"answer"``.
    """
    sentence_keywords, keyword_pool = extract_answer_sentences(text)
    questions = []
    for sentence, candidates in sentence_keywords:
        correct = candidates[0]
        stem = generate_question(sentence, correct)
        choices = [*generate_distractors(correct, keyword_pool), correct]
        # Shuffle so the correct answer is not always the last option.
        random.shuffle(choices)
        entry = {
            "question": stem,
            "options": choices,
            "answer": correct
        }
        questions.append(entry)
    return questions

def format_mcq_output(text):
    """Render the MCQs generated from *text* as plain text for Gradio.

    Each question is written as a ``Q<n>:`` line, followed by one indented
    line per option lettered from ``A``, then an ``Answer:`` line and a
    blank line. An empty string is returned when no questions are generated.
    """
    pieces = []
    for number, item in enumerate(generate_mcq(text), start=1):
        pieces.append(f"Q{number}: {item['question']}\n")
        pieces.extend(
            f"   {chr(65+position)}. {choice}\n"
            for position, choice in enumerate(item['options'])
        )
        pieces.append(f"Answer: {item['answer']}\n\n")
    return "".join(pieces)

# Gradio UI: a single text-in / text-out form whose handler is
# format_mcq_output.
iface = gr.Interface(
    fn=format_mcq_output,
    # Multi-line box where the user pastes the source passage.
    inputs=gr.Textbox(
        lines=10,
        placeholder="Paste your textbook passage here..."
    ),
    # Read-only, unlabeled box that displays the formatted questions.
    outputs=gr.Textbox(
        lines=30,
        placeholder="Generated MCQs will appear here...",
        interactive=False,
        show_label=False
    ),
    title="Student MCQ Generator",
    description="Paste a section of text from a textbook and generate multiple-choice questions."
)

# Start serving the interface. NOTE: per the captured output of this cell,
# Colab turns on share=True automatically and needs debug=True in launch()
# for handler errors to be shown in the notebook.
iface.launch()


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m64.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Loading models...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/15.0 [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Models loaded!
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://99b66d309e7d98a5c2.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


