1_DistillBert

In [4]:
# Install dependencies
!pip install pdfplumber transformers torch sentence-transformers

# Import libraries
from google.colab import drive
import pdfplumber
import torch
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering, Trainer, TrainingArguments
from sentence_transformers import SentenceTransformer, util
import numpy as np
import re
import os
from tqdm import tqdm

# Mount Google Drive
drive.mount('/content/drive')

# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
    except Exception as e:
        print(f"Error extracting text: {e}")
    return text

# Replace with your PDF path
pdf_path = '/content/cse.pdf'  # Update this path
pdf_text = extract_text_from_pdf(pdf_path)
print(f"Extracted text length: {len(pdf_text)} characters")

# Step 2: Preprocess and chunk text
def chunk_text(text, max_length=512):
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) < max_length:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

text_chunks = chunk_text(pdf_text)
print(f"Number of chunks: {len(text_chunks)}")

# Step 3: Generate synthetic question-answer pairs
def generate_synthetic_qa(chunks, num_questions=100):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    qa_pairs = []
    for chunk in tqdm(chunks, desc="Generating QA pairs"):
        sentences = re.split(r'(?<=[.!?])\s+', chunk)
        for sentence in sentences:
            if len(sentence.strip()) > 20:
                question = f"What is mentioned about {sentence[:30].strip()}...?"
                answer = sentence.strip()
                qa_pairs.append({"question": question, "answer": answer, "context": chunk})
                if len(qa_pairs) >= num_questions:
                    break
        if len(qa_pairs) >= num_questions:
            break
    return qa_pairs

qa_pairs = generate_synthetic_qa(text_chunks)
print(f"Generated {len(qa_pairs)} QA pairs")

# Step 4: Prepare dataset for fine-tuning
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

class QADataset(torch.utils.data.Dataset):
    def __init__(self, qa_pairs):
        self.qa_pairs = qa_pairs
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.qa_pairs)

    def __getitem__(self, idx):
        qa = self.qa_pairs[idx]
        question = qa['question']
        context = qa['context']
        answer = qa['answer']

        # Encode question and context
        encoding = self.tokenizer.encode_plus(
            question,
            context,
            add_special_tokens=True,
            max_length=512,
            return_token_type_ids=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # Find start and end positions of answer in context
        answer_encoding = self.tokenizer.encode(answer, add_special_tokens=False)
        input_ids = encoding['input_ids'].squeeze()
        answer_ids = self.tokenizer.encode(answer, add_special_tokens=False)

        start_positions = -1
        end_positions = -1
        for i in range(len(input_ids) - len(answer_ids)):
            if input_ids[i:i+len(answer_ids)].tolist() == answer_ids:
                start_positions = i
                end_positions = i + len(answer_ids) - 1
                break

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'token_type_ids': encoding['token_type_ids'].squeeze(),
            'start_positions': start_positions,
            'end_positions': end_positions
        }

dataset = QADataset(qa_pairs)

# Step 5: Fine-tune the model
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/qa_model',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=500,
    save_total_limit=2,
    learning_rate=2e-5,
    fp16=True,  # Enable mixed precision for GPU
    logging_dir='/content/drive/MyDrive/logs',
    logging_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

trainer.train()

# Step 6: Save the model
model.save_pretrained('/content/drive/MyDrive/qa_model')
tokenizer.save_pretrained('/content/drive/MyDrive/qa_model')
print("Model and tokenizer saved to /content/drive/MyDrive/qa_model")

# Step 7: Function to answer questions using the saved model
def answer_question(question, context, model_path='/content/drive/MyDrive/qa_model'):
    model = DistilBertForQuestionAnswering.from_pretrained(model_path)
    tokenizer = DistilBertTokenizer.from_pretrained(model_path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    inputs = tokenizer.encode_plus(
        question,
        context,
        add_special_tokens=True,
        max_length=512,
        return_tensors='pt',
        truncation=True
    )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    start_idx = torch.argmax(start_scores)
    end_idx = torch.argmax(end_scores) + 1

    answer_tokens = input_ids[0][start_idx:end_idx]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
    return answer

# Example usage
question = "What is the main topic of the document?"
context = text_chunks[0]  # Use first chunk as context or select relevant chunk
answer = answer_question(question, context)
print(f"Question: {question}")
print(f"Answer: {answer}")

Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
[?25l     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m42.8/42.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m48.2/48.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.6-py3-none-any.w



Mounted at /content/drive




Extracted text length: 160843 characters
Number of chunks: 353


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating QA pairs:  10%|‚ñà         | 36/353 [00:00<00:00, 60157.35it/s]

Generated 100 QA pairs





tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mimran012x[0m ([33mimran012x-eternalbit[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


Model and tokenizer saved to /content/drive/MyDrive/qa_model
Question: What is the main topic of the document?
Answer: bou. ac. bd, www. bousst. edu. bd facebook page : www. facebook. com / sstbou program handbook published by publishing printing and distribution division bangladesh open university gazipur - 1705.


2_roberta-large

In [9]:
# Install dependencies
!pip install pdfplumber transformers torch sentence-transformers

# Import libraries
from google.colab import drive
import pdfplumber
import torch
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering, Trainer, TrainingArguments
from sentence_transformers import SentenceTransformer, util
import numpy as np
import re
import os
from tqdm import tqdm

# Mount Google Drive
drive.mount('/content/drive')

# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
    except Exception as e:
        print(f"Error extracting text: {e}")
    return text

# Replace with your PDF path
pdf_path = '/content/cse.pdf'  # Update this path
pdf_text = extract_text_from_pdf(pdf_path)
print(f"Extracted text length: {len(pdf_text)} characters")

# Step 2: Preprocess and chunk text
def chunk_text(text, max_length=384):  # Reduced max_length for RoBERTa
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) < max_length:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

text_chunks = chunk_text(pdf_text)
print(f"Number of chunks: {len(text_chunks)}")

# Step 3: Generate synthetic question-answer pairs
def generate_synthetic_qa(chunks, num_questions=200):  # Increased number of questions
    model = SentenceTransformer('all-mpnet-base-v2')  # More powerful embedding model
    qa_pairs = []
    for chunk in tqdm(chunks, desc="Generating QA pairs"):
        sentences = re.split(r'(?<=[.!?])\s+', chunk)
        for sentence in sentences:
            if len(sentence.strip()) > 20:
                # Create multiple question types
                question_types = [
                    f"What is mentioned about {sentence[:30].strip()}...?",
                    f"Can you explain {sentence[:30].strip()}...?",
                    f"What details are provided about {sentence[:30].strip()}...?"
                ]
                for question in question_types:
                    qa_pairs.append({"question": question, "answer": sentence.strip(), "context": chunk})
                    if len(qa_pairs) >= num_questions:
                        break
        if len(qa_pairs) >= num_questions:
            break
    return qa_pairs

qa_pairs = generate_synthetic_qa(text_chunks)
print(f"Generated {len(qa_pairs)} QA pairs")

# Step 4: Prepare dataset for fine-tuning
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-large')

class QADataset(torch.utils.data.Dataset):
    def __init__(self, qa_pairs):
        self.qa_pairs = qa_pairs
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.qa_pairs)

    def __getitem__(self, idx):
        qa = self.qa_pairs[idx]
        question = qa['question']
        context = qa['context']
        answer = qa['answer']

        # Encode question and context
        encoding = self.tokenizer.encode_plus(
            question,
            context,
            add_special_tokens=True,
            max_length=384,  # RoBERTa optimal length
            return_token_type_ids=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # Find start and end positions of answer in context
        answer_encoding = self.tokenizer.encode(answer, add_special_tokens=False)
        input_ids = encoding['input_ids'].squeeze()
        answer_ids = self.tokenizer.encode(answer, add_special_tokens=False)

        start_positions = -1
        end_positions = -1
        for i in range(len(input_ids) - len(answer_ids)):
            if input_ids[i:i+len(answer_ids)].tolist() == answer_ids:
                start_positions = i
                end_positions = i + len(answer_ids) - 1
                break

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'token_type_ids': encoding['token_type_ids'].squeeze(),
            'start_positions': start_positions,
            'end_positions': end_positions
        }

dataset = QADataset(qa_pairs)

# Step 5: Fine-tune the model
model = RobertaForQuestionAnswering.from_pretrained('roberta-large')

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/LLM/2_roberta-large/qa_model_roberta',
    num_train_epochs=3,
    per_device_train_batch_size=4,  # Smaller batch size for large model
    gradient_accumulation_steps=4,  # Accumulate gradients to simulate larger batch size
    save_steps=500,
    save_total_limit=2,
    learning_rate=1e-5,  # Lower learning rate for large model
    fp16=True,  # Enable mixed precision for GPU
    logging_dir='/content/drive/MyDrive/logs',
    logging_steps=100,
    report_to="none"  # Disable W&B logging
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

trainer.train()

# Step 6: Save the model
model.save_pretrained('/content/drive/MyDrive/LLM/2_roberta-large/qa_model_roberta')
tokenizer.save_pretrained('/content/drive/MyDrive/LLM/2_roberta-large/qa_model_roberta')
print("Model and tokenizer saved to /content/drive/MyDrive/LLM/2_roberta-large/qa_model_roberta")

# Step 7: Function to answer questions using the saved model with context selection
def answer_question(question, chunks, model_path='/content/drive/MyDrive/LLM/2_roberta-large/qa_model_roberta'):
    # Load model and tokenizer
    model = RobertaForQuestionAnswering.from_pretrained(model_path)
    tokenizer = RobertaTokenizerFast.from_pretrained(model_path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    # Use sentence-transformer to find the most relevant chunk
    embedder = SentenceTransformer('all-mpnet-base-v2')
    question_embedding = embedder.encode(question, convert_to_tensor=True)
    chunk_embeddings = embedder.encode(chunks, convert_to_tensor=True)
    cos_scores = util.cos_sim(question_embedding, chunk_embeddings)[0]
    best_chunk_idx = torch.argmax(cos_scores).item()
    context = chunks[best_chunk_idx]

    # Encode inputs
    inputs = tokenizer.encode_plus(
        question,
        context,
        add_special_tokens=True,
        max_length=384,
        return_tensors='pt',
        truncation=True
    )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    start_idx = torch.argmax(start_scores)
    end_idx = torch.argmax(end_scores) + 1

    answer_tokens = input_ids[0][start_idx:end_idx]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
    return answer, context

# Example usage
question = "What is the main topic of the document?"
answer, selected_context = answer_question(question, text_chunks)
print(f"Question: {question}")
print(f"Answer: {answer}")
print(f"Selected context: {selected_context[:100]}...")  # Print first 100 chars of context





Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




Extracted text length: 160843 characters
Number of chunks: 450


Generating QA pairs:   6%|‚ñå         | 27/450 [00:00<00:00, 35028.21it/s]


Generated 201 QA pairs


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Model and tokenizer saved to /content/drive/MyDrive/LLM/2_roberta-large/qa_model_roberta
Question: What is the main topic of the document?
Answer: 
Selected context: The paper will be evaluated based on the ability to understand
a topic, communicate it and identify ...


In [16]:
# Step 7: Function to answer questions using the saved model with context selection
def answer_question(question, chunks, model_path='/content/drive/MyDrive/LLM/2_roberta-large/qa_model_roberta'):
    # Load model and tokenizer
    model = RobertaForQuestionAnswering.from_pretrained(model_path)
    tokenizer = RobertaTokenizerFast.from_pretrained(model_path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    # Use sentence-transformer to find the most relevant chunk
    embedder = SentenceTransformer('all-mpnet-base-v2')
    question_embedding = embedder.encode(question, convert_to_tensor=True)
    chunk_embeddings = embedder.encode(chunks, convert_to_tensor=True)
    cos_scores = util.cos_sim(question_embedding, chunk_embeddings)[0]
    best_chunk_idx = torch.argmax(cos_scores).item()
    context = chunks[best_chunk_idx]

    # Encode inputs
    inputs = tokenizer.encode_plus(
        question,
        context,
        add_special_tokens=True,
        max_length=384,
        return_tensors='pt',
        truncation=True
    )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    start_idx = torch.argmax(start_scores)
    end_idx = torch.argmax(end_scores) + 1

    answer_tokens = input_ids[0][start_idx:end_idx]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
    return answer, context

# Example usage
question = "Give all the teachers phone numbers?"
answer, selected_context = answer_question(question, text_chunks)
print(f"Question: {question}")
print(f"Answer: {answer}")
print(f"Selected context: {selected_context[:1000]}...")  # Print first 100 chars of context

Question: Give all the teachers phone numbers?
Answer: 
Selected context: For any query about tutors and tutorial services, you may contact with respective study
centre coordinator. For any problem don‚Äôt hesitate to contact the Dean's office of the
School of Science and Technology (SST), BOU, Gazipur, Tel:9291111....


In [None]:
# Mount Google Drive
drive.mount('/content/drive')

# Step 7: Function to answer questions using the saved model with context selection
def answer_question(question, chunks, model_path='/content/drive/MyDrive/LLM/4_Qwen/qa_model_qwen2'):
    model = AutoModelForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    # Select most relevant chunk
    embedder = SentenceTransformer('all-mpnet-base-v2')
    question_embedding = embedder.encode(question, convert_to_tensor=True)
    chunk_embeddings = embedder.encode(chunks, convert_to_tensor=True)
    cos_scores = util.cos_sim(question_embedding, chunk_embeddings)[0]
    best_chunk_idx = torch.argmax(cos_scores).item()
    context = chunks[best_chunk_idx]

    # Format input for Qwen2
    prompt = f"Question: {question}\nContext: {context}\nAnswer: "
    inputs = tokenizer(
        prompt,
        add_special_tokens=True,
        max_length=512,
        return_tensors='pt',
        truncation=True
    )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=100,
            do_sample=False
        )

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer = answer.split("Answer: ")[-1].strip()
    return answer, context

# Example usage
question = "What is the main topic of the document?"
answer, selected_context = answer_question(question, text_chunks)
print(f"Question: {question}")
print(f"Answer: {answer}")
print(f"Selected context: {selected_context[:100]}...")

3_DeepSeek-R1

In [21]:
# Install dependencies
!pip install pdfplumber transformers torch sentence-transformers datasets accelerate peft huggingface_hub

# Import libraries
from google.colab import drive
import pdfplumber
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
from sentence_transformers import SentenceTransformer, util
from datasets import Dataset
import numpy as np
import re
import os
from tqdm import tqdm
from huggingface_hub import login

# Mount Google Drive
drive.mount('/content/drive')

# Step 1: Authenticate with Hugging Face
# Replace with your Hugging Face Read token from https://huggingface.co/settings/tokens
hf_token = "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"  # Update with your Read token
if hf_token:
    login(hf_token)
    print("Hugging Face login successful")
else:
    raise ValueError("Hugging Face Read token required for LLaMA 3.1. Set `hf_token` or run `huggingface-cli login`.")

# Step 2: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
    except Exception as e:
        print(f"Error extracting text: {e}")
    return text

# Replace with your PDF path
pdf_path = '/content/drive/MyDrive/cse.pdf'  # Update this path
pdf_text = extract_text_from_pdf(pdf_path)
if not pdf_text:
    raise ValueError("No text extracted from PDF. Check the file path or content.")
print(f"Extracted text length: {len(pdf_text)} characters")

# Step 3: Preprocess and chunk text
def chunk_text(text, max_length=512):
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) < max_length:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

text_chunks = chunk_text(pdf_text)
print(f"Number of chunks: {len(text_chunks)}")

# Step 4: Generate synthetic question-answer pairs
def generate_synthetic_qa(chunks, num_questions=500):
    model = SentenceTransformer('all-mpnet-base-v2')
    qa_pairs = []
    for chunk in tqdm(chunks, desc="Generating QA pairs"):
        sentences = re.split(r'(?<=[.!?])\s+', chunk)
        for sentence in sentences:
            if len(sentence.strip()) > 20:
                question_types = [
                    f"What is discussed about {sentence[:30].strip()}...?",
                    f"Can you explain {sentence[:30].strip()}...?",
                    f"What details are given about {sentence[:30].strip()}...?",
                    f"What is the significance of {sentence[:30].strip()}...?"
                ]
                for question in question_types:
                    qa_pairs.append({"question": question, "answer": sentence.strip(), "context": chunk})
                    if len(qa_pairs) >= num_questions:
                        break
        if len(qa_pairs) >= num_questions:
            break
    return qa_pairs

qa_pairs = generate_synthetic_qa(text_chunks)
print(f"Generated {len(qa_pairs)} QA pairs")

# Step 5: Prepare dataset for fine-tuning
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.1-8B-Instruct')
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

class QADataset(Dataset):
    def __init__(self, qa_pairs):
        self.qa_pairs = qa_pairs
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.qa_pairs)

    def __getitem__(self, idx):
        qa = self.qa_pairs[idx]
        question = qa['question']
        context = qa['context']
        answer = qa['answer']

        # Format input for LLaMA 3.1
        prompt = f"<|begin_of_text|>Question: {question}\nContext: {context}\nAnswer: {answer}<|end_of_text|>"
        encoding = self.tokenizer(
            prompt,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': encoding['input_ids'].squeeze()
        }

dataset = Dataset.from_list([QADataset(qa_pairs).__getitem__(i) for i in range(len(qa_pairs))])

# Step 6: Fine-tune the model with LoRA
model = AutoModelForCausalLM.from_pretrained('meta-llama/Llama-3.1-8B-Instruct')
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/qa_model_llama3_1',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    save_steps=500,
    save_total_limit=2,
    learning_rate=2e-5,
    fp16=True,
    logging_dir='/content/drive/MyDrive/logs',
    logging_steps=100,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

trainer.train()

# Step 7: Save the model
model.save_pretrained('/content/drive/MyDrive/qa_model_llama3_1')
tokenizer.save_pretrained('/content/drive/MyDrive/qa_model_llama3_1')
print("Model and tokenizer saved to /content/drive/MyDrive/qa_model_llama3_1")

# Step 8: Function to answer questions using the saved model with context selection
def answer_question(question, chunks, model_path='/content/drive/MyDrive/qa_model_llama3_1'):
    model = AutoModelForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    # Select most relevant chunk
    embedder = SentenceTransformer('all-mpnet-base-v2')
    question_embedding = embedder.encode(question, convert_to_tensor=True)
    chunk_embeddings = embedder.encode(chunks, convert_to_tensor=True)
    cos_scores = util.cos_sim(question_embedding, chunk_embeddings)[0]
    best_chunk_idx = torch.argmax(cos_scores).item()
    context = chunks[best_chunk_idx]

    # Format input for LLaMA 3.1
    prompt = f"<|begin_of_text|>Question: {question}\nContext: {context}\nAnswer: "
    inputs = tokenizer(
        prompt,
        add_special_tokens=True,
        max_length=512,
        return_tensors='pt',
        truncation=True
    )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=100,
            do_sample=False
        )

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer = answer.split("Answer: ")[-1].strip()
    return answer, context

# Example usage
question = "What is the main topic of the document?"
answer, selected_context = answer_question(question, text_chunks)
print(f"Question: {question}")
print(f"Answer: {answer}")
print(f"Selected context: {selected_context[:100]}...")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


HTTPError: Invalid user token.

4_Qwen-7B

In [1]:
# Install dependencies
!pip install pdfplumber transformers torch sentence-transformers datasets accelerate peft

# Import libraries
from google.colab import drive
import pdfplumber
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
from sentence_transformers import SentenceTransformer, util
from datasets import Dataset
import numpy as np
import re
import os
from tqdm import tqdm

# Mount Google Drive
drive.mount('/content/drive')

# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
    except Exception as e:
        print(f"Error extracting text: {e}")
    return text

# Replace with your PDF path
pdf_path = '/content/drive/MyDrive/cse.pdf'  # Update this path
pdf_text = extract_text_from_pdf(pdf_path)
if not pdf_text:
    raise ValueError("No text extracted from PDF. Check the file path or content.")
print(f"Extracted text length: {len(pdf_text)} characters")

# Step 2: Preprocess and chunk text
def chunk_text(text, max_length=512):
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) < max_length:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

text_chunks = chunk_text(pdf_text)
print(f"Number of chunks: {len(text_chunks)}")

# Step 3: Generate synthetic question-answer pairs
def generate_synthetic_qa(chunks, num_questions=500):
    model = SentenceTransformer('all-mpnet-base-v2')
    qa_pairs = []
    for chunk in tqdm(chunks, desc="Generating QA pairs"):
        sentences = re.split(r'(?<=[.!?])\s+', chunk)
        for sentence in sentences:
            if len(sentence.strip()) > 20:
                question_types = [
                    f"What is discussed about {sentence[:30].strip()}...?",
                    f"Can you explain {sentence[:30].strip()}...?",
                    f"What details are given about {sentence[:30].strip()}...?",
                    f"What is the significance of {sentence[:30].strip()}...?"
                ]
                for question in question_types:
                    qa_pairs.append({"question": question, "answer": sentence.strip(), "context": chunk})
                    if len(qa_pairs) >= num_questions:
                        break
        if len(qa_pairs) >= num_questions:
            break
    return qa_pairs

qa_pairs = generate_synthetic_qa(text_chunks)
print(f"Generated {len(qa_pairs)} QA pairs")

# Step 4: Prepare dataset for fine-tuning
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2-7B-Instruct')
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

class QADataset(Dataset):
    def __init__(self, qa_pairs):
        self.qa_pairs = qa_pairs
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.qa_pairs)

    def __getitem__(self, idx):
        qa = self.qa_pairs[idx]
        question = qa['question']
        context = qa['context']
        answer = qa['answer']

        # Format input for Qwen2
        prompt = f"Question: {question}\nContext: {context}\nAnswer: {answer}"
        encoding = self.tokenizer(
            prompt,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': encoding['input_ids'].squeeze()
        }

dataset = Dataset.from_list([QADataset(qa_pairs).__getitem__(i) for i in range(len(qa_pairs))])

# Step 5: Fine-tune the model with LoRA
model = AutoModelForCausalLM.from_pretrained('Qwen/Qwen2-7B-Instruct')
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/qa_model_qwen2',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    save_steps=500,
    save_total_limit=2,
    learning_rate=2e-5,
    fp16=True,
    logging_dir='/content/drive/MyDrive/logs',
    logging_steps=100,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

trainer.train()

# Step 6: Save the model
model.save_pretrained('/content/drive/MyDrive/qa_model_qwen2')
tokenizer.save_pretrained('/content/drive/MyDrive/qa_model_qwen2')
print("Model and tokenizer saved to /content/drive/MyDrive/qa_model_qwen2")

# Step 7: Function to answer questions using the saved model with context selection
def answer_question(question, chunks, model_path='/content/drive/MyDrive/qa_model_qwen2'):
    model = AutoModelForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    # Select most relevant chunk
    embedder = SentenceTransformer('all-mpnet-base-v2')
    question_embedding = embedder.encode(question, convert_to_tensor=True)
    chunk_embeddings = embedder.encode(chunks, convert_to_tensor=True)
    cos_scores = util.cos_sim(question_embedding, chunk_embeddings)[0]
    best_chunk_idx = torch.argmax(cos_scores).item()
    context = chunks[best_chunk_idx]

    # Format input for Qwen2
    prompt = f"Question: {question}\nContext: {context}\nAnswer: "
    inputs = tokenizer(
        prompt,
        add_special_tokens=True,
        max_length=512,
        return_tensors='pt',
        truncation=True
    )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=100,
            do_sample=False
        )

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer = answer.split("Answer: ")[-1].strip()
    return answer, context

# Example usage
question = "What is the main topic of the document?"
answer, selected_context = answer_question(question, text_chunks)
print(f"Question: {question}")
print(f"Answer: {answer}")
print(f"Selected context: {selected_context[:100]}...")

Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
[?25l     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m42.8/42.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m48.2/48.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 



Extracted text length: 160843 characters
Number of chunks: 353


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating QA pairs:  12%|‚ñà‚ñè        | 44/353 [00:00<00:00, 43119.01it/s]

Generated 501 QA pairs





tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


OutOfMemoryError: CUDA out of memory. Tried to allocate 74.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 32.88 MiB is free. Process 14785 has 39.52 GiB memory in use. Of the allocated memory 38.88 GiB is allocated by PyTorch, and 138.59 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [4]:
# Set environment variable to reduce memory fragmentation
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Install dependencies
!pip install pdfplumber transformers torch sentence-transformers datasets accelerate peft bitsandbytes

# Import libraries
from google.colab import drive
import pdfplumber
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
from sentence_transformers import SentenceTransformer, util
from datasets import Dataset
import numpy as np
import re
from tqdm import tqdm

# Mount Google Drive
drive.mount('/content/drive')

# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
    except Exception as e:
        print(f"Error extracting text: {e}")
    return text

# Replace with your PDF path
pdf_path = '/content/drive/MyDrive/cse.pdf'  # Update this path
pdf_text = extract_text_from_pdf(pdf_path)
if not pdf_text:
    raise ValueError("No text extracted from PDF. Check the file path or content.")
print(f"Extracted text length: {len(pdf_text)} characters")

# Step 2: Preprocess and chunk text
def chunk_text(text, max_length=512):
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) < max_length:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

text_chunks = chunk_text(pdf_text)
print(f"Number of chunks: {len(text_chunks)}")

# Step 3: Generate synthetic question-answer pairs
def generate_synthetic_qa(chunks, num_questions=500):
    model = SentenceTransformer('all-mpnet-base-v2')
    qa_pairs = []
    for chunk in tqdm(chunks, desc="Generating QA pairs"):
        sentences = re.split(r'(?<=[.!?])\s+', chunk)
        for sentence in sentences:
            if len(sentence.strip()) > 20:
                question_types = [
                    f"What is discussed about {sentence[:30].strip()}...?",
                    f"Can you explain {sentence[:30].strip()}...?",
                    f"What details are given about {sentence[:30].strip()}...?",
                    f"What is the significance of {sentence[:30].strip()}...?"
                ]
                for question in question_types:
                    qa_pairs.append({"question": question, "answer": sentence.strip(), "context": chunk})
                    if len(qa_pairs) >= num_questions:
                        break
        if len(qa_pairs) >= num_questions:
            break
    return qa_pairs

qa_pairs = generate_synthetic_qa(text_chunks)
print(f"Generated {len(qa_pairs)} QA pairs")

# Step 4: Prepare dataset for fine-tuning
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2-7B-Instruct')
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

class QADataset(Dataset):
    def __init__(self, qa_pairs):
        self.qa_pairs = qa_pairs
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.qa_pairs)

    def __getitem__(self, idx):
        qa = self.qa_pairs[idx]
        question = qa['question']
        context = qa['context']
        answer = qa['answer']

        prompt = f"Question: {question}\nContext: {context}\nAnswer: {answer}"
        encoding = self.tokenizer(
            prompt,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': encoding['input_ids'].squeeze()
        }

dataset = Dataset.from_list([QADataset(qa_pairs).__getitem__(i) for i in range(len(qa_pairs))])

# Step 5: Fine-tune the model with LoRA
model = AutoModelForCausalLM.from_pretrained(
    'Qwen/Qwen2-7B-Instruct',
    torch_dtype=torch.float16,
    device_map="auto"
)
model.gradient_checkpointing_enable()
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/qa_model_qwen2',
    num_train_epochs=5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    save_steps=500,
    save_total_limit=2,
    learning_rate=2e-5,
    fp16=True,
    logging_dir='/content/drive/MyDrive/logs',
    logging_steps=100,
    report_to="none",
    optim="adamw_8bit"  # Requires bitsandbytes
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

trainer.train()

# Step 6: Save the model
model.save_pretrained('/content/drive/MyDrive/qa_model_qwen2')
tokenizer.save_pretrained('/content/drive/MyDrive/qa_model_qwen2')
print("Model and tokenizer saved to /content/drive/MyDrive/qa_model_qwen2")

# Step 7: Function to answer questions using the saved model with context selection
def answer_question(question, chunks, model_path='/content/drive/MyDrive/qa_model_qwen2'):
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    # Select most relevant chunk
    embedder = SentenceTransformer('all-mpnet-base-v2')
    question_embedding = embedder.encode(question, convert_to_tensor=True)
    chunk_embeddings = embedder.encode(chunks, convert_to_tensor=True)
    cos_scores = util.cos_sim(question_embedding, chunk_embeddings)[0]
    best_chunk_idx = torch.argmax(cos_scores).item()
    context = chunks[best_chunk_idx]

    prompt = f"Question: {question}\nContext: {context}\nAnswer: "
    inputs = tokenizer(
        prompt,
        add_special_tokens=True,
        max_length=512,
        return_tensors='pt',
        truncation=True
    )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=100,
            do_sample=False
        )

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer = answer.split("Answer: ")[-1].strip()
    return answer, context

# Example usage
question = "Phone number of the teachers?"
answer, selected_context = answer_question(question, text_chunks)
print(f"Question: {question}")
print(f"Answer: {answer}")
print(f"Selected context: {selected_context[:100]}...")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: Who is the dean?
Answer: The dean is not explicitly mentioned in the given context. However, based on the information provided, it can be inferred that the dean might be involved in the appointment process of members for the examination committee or the evaluation of projects. The context mentions the roles of the chairman of the examination committee, the chairman of the relevant semester, one expert member who must hold the rank of an Associate Professor or above, and the Vice-Chancellor's role in making appointments when a member is unable to fulfill their duties.
Selected context: Chairman of the examination committee
Chairman
of the relevant semester
2. One expert member and not...


In [5]:
# Example usage
question = "Phone number of the teachers?"
answer, selected_context = answer_question(question, text_chunks)
print(f"Question: {question}")
print(f"Answer: {answer}")
print(f"Selected context: {selected_context[:100]}...")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: Phone number of the teachers?
Answer: To contact the Tutor for your program at Bangladesh Open University (BOU), you can follow these steps:

1. **Check the BOU Website**: Visit the official website of Bangladesh Open University (www.bou.ac.bd). On the website, look for the "Contact Us" or "Faculty" section. There might be a directory listing all the tutors along with their names, contact numbers, email addresses, and sometimes even their office locations.

2. **Study Center Information**: If you are enrolled
Selected context: According to the BOU concept, a teacher who delivers lectures and provides
tutorial services is call...


#all-MiniLM-L6-v2

In [3]:
# ‚úÖ Install dependencies (only needs to run once)
!pip install -q transformers faiss-cpu sentence-transformers pymupdf nltk

# ‚úÖ Imports
import os
import fitz  # PyMuPDF
import numpy as np
import faiss
import pickle
import nltk
nltk.download('punkt')  # For sentence tokenization

from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# ========================
# üîπ 1. Load PDF Text
# ========================
def load_pdf_text(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

pdf_path = "/content/cse.pdf"  # üîÑ Replace with your actual PDF path
text = load_pdf_text(pdf_path)

# ========================
# üîπ 2. Chunk Text (based on word count)
# ========================
def chunk_text(text, max_tokens=500):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        sentence_words = sentence.split()
        current_words = current_chunk.split()

        if len(current_words) + len(sentence_words) <= max_tokens:
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

chunks = chunk_text(text)

# ========================
# üîπ 3. Embed Text and Build FAISS Index
# ========================
embedder = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedder.encode(chunks, show_progress_bar=True)

dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings))

# ========================
# üîπ 4. Save Everything
# ========================
faiss.write_index(index, "faiss_index.index")
with open("chunks.pkl", "wb") as f:
    pickle.dump(chunks, f)

embedder.save("embedding_model")

print("‚úÖ Model, FAISS index, and text chunks saved successfully.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
# ========================
# üîπ 4. Load & Answer
# ========================
# Load all assets
index = faiss.read_index("faiss_index.index")
with open("chunks.pkl", "rb") as f:
    chunks = pickle.load(f)
embedder = SentenceTransformer("embedding_model")

# Use a QA model (choose one)
qa_model = pipeline("text-generation", model="google/flan-t5-xl", tokenizer="google/flan-t5-xl", device=0)

# Function to search and answer
def get_top_chunks(query, k=3):
    query_embedding = embedder.encode([query])
    distances, indices = index.search(query_embedding, k)
    return [chunks[i] for i in indices[0]]

def answer_question(query):
    context_chunks = get_top_chunks(query, k=3)
    context = " ".join(context_chunks)
    prompt = f"Answer the question based on the context below.\n\nContext: {context}\n\nQuestion: {query}\nAnswer:"
    answer = qa_model(prompt, max_new_tokens=200, do_sample=True)[0]['generated_text']
    return answer.split("Answer:")[-1].strip()

# üß™ Example
query = "What is the topic of the document?"
print("Q:", query)
print("A:", answer_question(query))
