In [None]:
# Install the third-party packages this notebook imports (run once per environment).
# NOTE(review): versions are unpinned, so results may vary between runs — consider pinning.
pip install transformers datasets pdfplumber evaluate tqdm pymupdf


In [None]:
import pdfplumber
import re
from tqdm import tqdm
from transformers import (
    T5TokenizerFast,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
from datasets import load_dataset, Dataset
import evaluate

# Load the SQuAD dataset; later cells index the result by its 'train' and 'validation' splits.
squad = load_dataset('squad')

In [None]:
def h_ans(example):
    """
    Wrap the first reference answer of an example in ' <h> ' markers.

    Args:
        example: mapping with a 'context' string and an 'answers' mapping that
            holds a 'text' list and, optionally, a parallel 'answer_start'
            list of character offsets into the context.

    Returns:
        dict with a single key, 'answer_highlighted_context', holding the
        complete context with the answer span surrounded by ' <h> ' on both
        sides. If the answer cannot be located in the context, the context is
        returned unchanged.
    """
    context = example['context']
    answers = example['answers']
    answer = answers['text'][0]

    # Prefer the annotated character offset: it identifies the intended
    # occurrence when the answer string appears more than once in the context.
    starts = answers.get('answer_start')
    start = starts[0] if starts is not None and len(starts) > 0 else -1

    # Fall back to the first textual occurrence when the offset is missing or
    # does not actually point at the answer text.
    if start < 0 or context[start:start + len(answer)] != answer:
        start = context.find(answer) if answer else -1

    if start < 0:
        return {'answer_highlighted_context': context}

    end = start + len(answer)
    # Slice around the span instead of str.split(): splitting on a repeated
    # answer would silently drop everything after its second occurrence.
    highlighted_context = context[:start] + ' <h> ' + answer + ' <h> ' + context[end:]

    return {'answer_highlighted_context': highlighted_context}


# Apply h_ans to every example so each one gains an 'answer_highlighted_context' field.
h_ans_squad = squad.map(h_ans)

In [None]:

def prepare_instruction_dataset(example):
    """
    Build the question-generation prompt for one example.

    Reads example['answer_highlighted_context'] and embeds it between
    triple-backtick delimiters below a fixed instruction line. The lines after
    the instruction are indented by four spaces, and the prompt ends with a
    newline followed by four spaces.

    Returns:
        dict with a single key, 'instruction_prompt'.
    """
    highlighted = example['answer_highlighted_context']

    # One entry per line of the prompt; the final entry is the trailing indent.
    prompt_lines = [
        "Generate a question whose answer is highlighted by <h> from the context delimited by the triple backticks.",
        "    context:",
        "    ```",
        f"    {highlighted}",
        "    ```",
        "    ",
    ]

    return {'instruction_prompt': "\n".join(prompt_lines)}


In [None]:
def tokenize_dataset(batch):
    """
    Tokenize a batch of prompts and target questions for seq2seq training.

    Uses the notebook-level `tokenizer`.

    Args:
        batch: mapping with parallel 'instruction_prompt' and 'question' lists
            of strings.

    Returns:
        The tokenizer's encoding of the prompts (truncated to 512 tokens and
        padded to the longest prompt in the batch) with an added 'labels'
        entry: the question token ids (truncated to 128 tokens) in which every
        padding id is replaced by -100.
    """
    model_inputs = tokenizer(batch['instruction_prompt'], max_length=512, truncation=True, padding=True)

    # `text_target=` is the supported replacement for the deprecated
    # `as_target_tokenizer()` context manager. It is a separate call so the
    # targets keep their own, shorter max_length.
    labels = tokenizer(text_target=batch['question'], max_length=128, truncation=True, padding=True)

    pad_id = tokenizer.pad_token_id
    # -100 is the label value conventionally ignored by the training loss, so
    # padded positions do not contribute to it.
    model_inputs['labels'] = [
        [(token_id if token_id != pad_id else -100) for token_id in label_ids]
        for label_ids in labels['input_ids']
    ]
    return model_inputs

# Build the 'instruction_prompt' column that tokenize_dataset reads. Without
# this step `instruction_squad` is undefined on a fresh kernel.
instruction_squad = h_ans_squad.map(prepare_instruction_dataset)

# Tokenize in batches and drop every pre-existing column (including the
# intermediate prompt columns) so only the model inputs remain.
tokenized_squad = instruction_squad.map(
    tokenize_dataset,
    batched=True,
    remove_columns=instruction_squad['train'].column_names,
)


In [None]:
# Training configuration consumed by the Trainer built in the next cell.
# NOTE(review): `tokenizer` and `model` are used here (and by tokenize_dataset)
# but no earlier cell in this notebook defines them; they must be loaded before
# this point for a fresh-kernel run — confirm which base checkpoint was intended.
training_args = TrainingArguments(
    output_dir='t5-small-squad-qg',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy='steps',
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=500,
    save_steps=1000,
    save_total_limit=2,  # keep at most two checkpoints in output_dir
    load_best_model_at_end=True,  # "best" is defined by the two settings at the end of this call
    fp16=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False  # lower eval_loss counts as better
)
# Collator handed to the Trainer to assemble batches from the tokenized examples.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Wire the model, training arguments, tokenized splits and collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad['train'],
    eval_dataset=tokenized_squad['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer
)

# Run fine-tuning with the configuration above.
trainer.train()

# Persist the model and tokenizer side by side in the same directory.
model.save_pretrained("t5-small-squad-qg")
tokenizer.save_pretrained("t5-small-squad-qg")

In [None]:
# Save a second copy under "t51-small-squad-qg". The inference cells further down
# load "/content/t51-small-squad-qg" — presumably this same directory when the
# working directory is /content (TODO confirm).
model.save_pretrained("t51-small-squad-qg")
tokenizer.save_pretrained("t51-small-squad-qg")

In [None]:
def extract_text_from_pdf(pdf_path):
    """
    Read a PDF with pdfplumber and return its text as a single string.

    Pages for which no text is extracted are skipped; the text of every other
    page is followed by a newline.
    """
    with pdfplumber.open(pdf_path) as document:
        extracted = (page.extract_text() for page in document.pages)
        # Consume the generator while the file is still open.
        return "".join(content + '\n' for content in extracted if content)


In [None]:
def generate_questions_from_text(text, model, tokenizer):
    """
    Generate one question per non-blank paragraph of `text`.

    Paragraphs are the pieces of `text` separated by two consecutive newlines.
    Pieces that are empty or whitespace-only are skipped, so no question is
    generated from an empty context.

    Args:
        text: the source text.
        model: object exposing `generate(input_ids, ...)`; if it has a
            `device` attribute the encoded input is moved there first.
        tokenizer: object exposing `encode(...)` and `decode(...)`.

    Returns:
        List of decoded questions, in paragraph order (empty for blank text).
    """
    questions = []
    # Inputs must live on the same device as the model (e.g. after GPU training).
    device = getattr(model, 'device', None)

    for paragraph in text.split('\n\n'):
        if not paragraph.strip():
            continue

        input_text = f"Generate a question based on the following context: {paragraph}"
        input_ids = tokenizer.encode(input_text, return_tensors='pt', truncation=True)
        if device is not None:
            input_ids = input_ids.to(device)

        outputs = model.generate(input_ids, max_length=128, num_beams=5, early_stopping=True)
        # Only the first returned sequence is kept for each paragraph.
        questions.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

    return questions

In [None]:
#trial func
def generate_questions_from_pdfs(pdf_paths, model, tokenizer):
    """
    Generate questions for each PDF in `pdf_paths`.

    Each file is converted to text with extract_text_from_pdf and the text is
    passed to generate_questions_from_text.

    Returns:
        dict mapping each PDF path to its list of generated questions.
    """
    return {
        path: generate_questions_from_text(extract_text_from_pdf(path), model, tokenizer)
        for path in pdf_paths
    }

In [None]:
# Reload the fine-tuned model and its tokenizer from the saved checkpoint directory.
model = T5ForConditionalGeneration.from_pretrained("/content/t51-small-squad-qg")
tokenizer = T5TokenizerFast.from_pretrained("/content/t51-small-squad-qg")
# PDF files to generate questions from.
# NOTE(review): these are absolute /content paths — presumably a Colab workspace; confirm.
pdf_paths = [
    '/content/MLsample.pdf',
    '/content/NNsample.pdf',
    '/content/cvsample.pdf',
    '/content/nlpsample.pdf',
    '/content/pythonsample.pdf'
]


In [None]:
import fitz
from transformers import T5ForConditionalGeneration, T5Tokenizer
from tqdm import tqdm


def extract_text_from_pdf(pdf_path):
    """
    Read a PDF with PyMuPDF (`fitz`) and return the text of all pages.

    The per-page text is concatenated in page order with no separator added.
    The document is opened as a context manager so it is closed even if text
    extraction fails.

    Args:
        pdf_path: path of the PDF file to read.

    Returns:
        One string containing the text of every page.
    """
    with fitz.open(pdf_path) as doc:
        # join() avoids repeated string re-allocation on large documents.
        return "".join(page.get_text() for page in doc)

def split_text_into_chunks(text, max_chunk_length=512):
    """
    Group the sentences of `text` into chunks of limited character length.

    Sentences are the pieces obtained by splitting on '. '. Each piece is
    stripped, blank pieces are dropped, and a '.' is appended unless the piece
    already ends in '.', '!' or '?'. Sentences are then joined with single
    spaces into chunks no longer than `max_chunk_length` characters.

    A single sentence longer than `max_chunk_length` is not split; it becomes
    a chunk of its own.

    Args:
        text: the text to split.
        max_chunk_length: maximum number of characters per chunk.

    Returns:
        List of non-empty chunk strings, in order (empty list for blank text).
    """
    chunks = []
    current_chunk = ""

    for sentence in text.split('. '):
        sentence = sentence.strip()
        if not sentence:
            continue
        # Restore the terminator removed by the split without doubling one
        # that is already there (e.g. on the final sentence of the text).
        if not sentence.endswith(('.', '!', '?')):
            sentence += '.'

        candidate = f"{current_chunk} {sentence}" if current_chunk else sentence
        # Measure the chunk as it will actually be emitted, separator included.
        # An empty current chunk always accepts the sentence, so no empty
        # chunk is ever flushed ahead of an oversized sentence.
        if not current_chunk or len(candidate) <= max_chunk_length:
            current_chunk = candidate
        else:
            chunks.append(current_chunk)
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

In [None]:
# Force a clean reinstall of PyMuPDF, the package that provides the `fitz` module.
# NOTE(review): `fitz` is already imported in an earlier cell, so a kernel restart is
# presumably needed before the reinstalled package is picked up — confirm.
!pip install --force-reinstall pymupdf

In [None]:
# Reload the fine-tuned checkpoint; this rebinds the notebook-level `tokenizer` and `model`.
# NOTE(review): this uses T5Tokenizer, while an earlier cell loads the same directory
# with T5TokenizerFast — confirm the switch is intentional.
model_ckpt = "/content/t51-small-squad-qg"
tokenizer = T5Tokenizer.from_pretrained(model_ckpt)
model = T5ForConditionalGeneration.from_pretrained(model_ckpt)
def generate_questions_from_text_chunks(chunks, model, tokenizer, num_questions=3):
    """
    Generate several candidate questions for every non-blank text chunk.

    Args:
        chunks: iterable of text chunks.
        model: object exposing `generate(input_ids, ...)`; if it has a
            `device` attribute the encoded input is moved there first.
        tokenizer: object exposing `encode(...)` and `decode(...)`.
        num_questions: number of sequences requested per chunk (default 3).

    Returns:
        Flat list of decoded questions, grouped by chunk in input order.
        Empty or whitespace-only chunks are skipped.
    """
    questions = []
    # Inputs must live on the same device as the model.
    device = getattr(model, 'device', None)
    # Beam search cannot return more sequences than it has beams.
    num_beams = max(5, num_questions)

    for chunk in tqdm(chunks):
        if not chunk.strip():
            continue

        input_text = f"generate question: {chunk}"
        inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
        if device is not None:
            inputs = inputs.to(device)

        outputs = model.generate(
            inputs,
            max_length=150,
            num_beams=num_beams,
            num_return_sequences=num_questions,
        )

        # Decode whatever was returned instead of indexing a fixed count, so
        # the loop cannot drift out of sync with num_return_sequences.
        questions.extend(
            tokenizer.decode(sequence, skip_special_tokens=True) for sequence in outputs
        )

    return questions

In [None]:
# For each PDF: extract its text, split it into chunks, generate questions per
# chunk, then print the questions as a numbered list.
for pdf_path in pdf_paths:
    text = extract_text_from_pdf(pdf_path)
    chunks = split_text_into_chunks(text)

    questions = generate_questions_from_text_chunks(chunks, model, tokenizer)

    print(f"\nQuestions from {pdf_path}:")
    # Number the questions starting from 1.
    for i, question in enumerate(questions, 1):
        print(f"{i}. {question}")