#1_DistilBert

In [None]:
# Install dependencies
!pip install pdfplumber transformers torch sentence-transformers

# Import libraries
from google.colab import drive
import pdfplumber
import torch
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering, Trainer, TrainingArguments
from sentence_transformers import SentenceTransformer, util
import numpy as np
import re
import os
from tqdm import tqdm

# Mount Google Drive
drive.mount('/content/drive')

# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of all pages of the PDF at *pdf_path*.

    Each page's text is followed by a newline; pages for which
    ``extract_text()`` returns nothing are skipped.  Any exception raised
    while opening or reading the file is printed instead of propagated, and
    the text gathered up to that point is returned (``""`` if none).
    """
    collected = []
    try:
        with pdfplumber.open(pdf_path) as document:
            for pdf_page in document.pages:
                content = pdf_page.extract_text()
                if not content:
                    continue
                collected.append(content + "\n")
    except Exception as e:
        print(f"Error extracting text: {e}")
    return "".join(collected)

# Replace with your PDF path
pdf_path = '/content/cse.pdf'  # Update this path
pdf_text = extract_text_from_pdf(pdf_path)
print(f"Extracted text length: {len(pdf_text)} characters")

# Step 2: Preprocess and chunk text
def chunk_text(text, max_length=512):
    """Split *text* into sentence-aligned chunks of roughly *max_length* characters.

    The text is split after ``.``, ``!`` or ``?`` followed by whitespace, and
    consecutive sentences are packed into one chunk while the running length
    (every sentence counted together with one separating space) stays below
    *max_length*.  The budget is measured in characters, not tokens.  A single
    sentence longer than *max_length* is kept whole as its own chunk.

    Args:
        text: Raw text to split.
        max_length: Character budget for one chunk.

    Returns:
        A list of non-empty, whitespace-stripped chunks; ``[]`` when *text*
        contains no non-whitespace characters.
    """
    chunks = []
    current_chunk = ""
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if not sentence.strip():
            # Trailing whitespace after the last sentence yields an empty
            # fragment; it carries no content.
            continue
        if len(current_chunk) + len(sentence) < max_length:
            current_chunk += sentence + " "
            continue
        # Budget exceeded: flush what was gathered.  The buffer is empty when
        # the very first sentence is already over budget, and an empty chunk
        # must not be emitted in that case.
        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        current_chunk = sentence + " "
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks

text_chunks = chunk_text(pdf_text)
print(f"Number of chunks: {len(text_chunks)}")

# Step 3: Generate synthetic question-answer pairs
def generate_synthetic_qa(chunks, num_questions=100):
    """Build template-based question/answer pairs from text chunks.

    Every sentence of a chunk whose stripped length exceeds 20 characters
    becomes one pair: the question is a fixed template around the first 30
    characters of the sentence, the answer is the stripped sentence and the
    context is the whole chunk.

    Args:
        chunks: Iterable of chunk strings.
        num_questions: Maximum number of pairs to produce.

    Returns:
        A list of at most *num_questions* dicts with the keys ``question``,
        ``answer`` and ``context``.
    """
    qa_pairs = []
    if num_questions <= 0:
        return qa_pairs
    # No embedding model is needed here: the pairs are purely template based,
    # so nothing is loaded or downloaded.
    for chunk in tqdm(chunks, desc="Generating QA pairs"):
        for sentence in re.split(r'(?<=[.!?])\s+', chunk):
            answer = sentence.strip()
            if len(answer) <= 20:
                continue
            question = f"What is mentioned about {sentence[:30].strip()}...?"
            qa_pairs.append({"question": question, "answer": answer, "context": chunk})
            if len(qa_pairs) >= num_questions:
                return qa_pairs
    return qa_pairs

qa_pairs = generate_synthetic_qa(text_chunks)
print(f"Generated {len(qa_pairs)} QA pairs")

# Step 4: Prepare dataset for fine-tuning
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

class QADataset(torch.utils.data.Dataset):
    """Torch dataset that turns QA pairs into extractive-QA training features.

    Each item is encoded with the module-level ``tokenizer`` as a
    (question, context) pair padded/truncated to 512 tokens.  The answer span
    is located by searching for the answer's token ids inside the encoded
    input ids.
    """

    def __init__(self, qa_pairs):
        """Keep *qa_pairs*, a sequence of dicts with ``question``, ``answer`` and ``context``."""
        self.qa_pairs = qa_pairs
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of QA pairs."""
        return len(self.qa_pairs)

    def __getitem__(self, idx):
        """Encode pair *idx* and locate its answer span.

        Returns:
            A dict with the 1-D tensors ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` plus the integer ``start_positions`` and
            ``end_positions`` (inclusive token indices).  Both positions are
            ``-1`` when the answer's token ids do not occur in the input.
        """
        qa = self.qa_pairs[idx]

        # Encode question and context as one padded/truncated pair.
        encoding = self.tokenizer.encode_plus(
            qa['question'],
            qa['context'],
            add_special_tokens=True,
            max_length=512,
            return_token_type_ids=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # squeeze(0) drops only the batch axis added by return_tensors='pt'.
        input_ids = encoding['input_ids'].squeeze(0)
        answer_ids = self.tokenizer.encode(qa['answer'], add_special_tokens=False)

        # NOTE(review): -1 marks "answer not found"; confirm how the model's
        # loss treats out-of-range positions before relying on it.
        start_positions = -1
        end_positions = -1
        id_list = input_ids.tolist()  # convert once instead of per window
        span = len(answer_ids)
        if span:
            # "+ 1" so that a match ending on the very last token is reachable.
            for i in range(len(id_list) - span + 1):
                if id_list[i:i + span] == answer_ids:
                    start_positions = i
                    end_positions = i + span - 1
                    break

        # NOTE(review): token_type_ids is kept for backward compatibility;
        # verify that the model's forward() accepts it.
        return {
            'input_ids': input_ids,
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'token_type_ids': encoding['token_type_ids'].squeeze(0),
            'start_positions': start_positions,
            'end_positions': end_positions
        }

dataset = QADataset(qa_pairs)

# Step 5: Fine-tune the model
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/qa_model',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=500,
    save_total_limit=2,
    learning_rate=2e-5,
    fp16=True,  # Enable mixed precision for GPU
    logging_dir='/content/drive/MyDrive/logs',
    logging_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

trainer.train()

# Step 6: Save the model
model.save_pretrained('/content/drive/MyDrive/qa_model')
tokenizer.save_pretrained('/content/drive/MyDrive/qa_model')
print("Model and tokenizer saved to /content/drive/MyDrive/qa_model")

# Step 7: Function to answer questions using the saved model
def answer_question(question, context, model_path='/content/drive/MyDrive/qa_model'):
    """Extract the answer to *question* from *context* with the saved QA model.

    Loads the DistilBERT question-answering model and tokenizer stored at
    *model_path*, encodes the (question, context) pair truncated to 512
    tokens and decodes the selected token span.

    Args:
        question: Question text.
        context: Passage expected to contain the answer.
        model_path: Directory the fine-tuned model and tokenizer were saved to.

    Returns:
        The decoded answer string (special tokens removed).
    """
    # NOTE: model and tokenizer are reloaded from disk on every call.
    model = DistilBertForQuestionAnswering.from_pretrained(model_path)
    tokenizer = DistilBertTokenizer.from_pretrained(model_path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    inputs = tokenizer.encode_plus(
        question,
        context,
        add_special_tokens=True,
        max_length=512,
        return_tensors='pt',
        truncation=True
    )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    start_logits = outputs.start_logits[0]
    end_logits = outputs.end_logits[0]

    start_idx = int(torch.argmax(start_logits))
    # Choose the best end at or after the start: two independent argmax
    # calls can place the end before the start and yield an empty answer.
    end_idx = start_idx + int(torch.argmax(end_logits[start_idx:])) + 1

    answer_tokens = input_ids[0][start_idx:end_idx]
    return tokenizer.decode(answer_tokens, skip_special_tokens=True)

# Example usage
question = "What is the main topic of the document?"
context = text_chunks[0]  # Use first chunk as context or select relevant chunk
answer = answer_question(question, context)
print(f"Question: {question}")
print(f"Answer: {answer}")

Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.6-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250327-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#2_roberta-large

In [None]:
# Install dependencies
!pip install pdfplumber transformers torch sentence-transformers

# Import libraries
from google.colab import drive
import pdfplumber
import torch
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering, Trainer, TrainingArguments
from sentence_transformers import SentenceTransformer, util
import numpy as np
import re
import os
from tqdm import tqdm

# Mount Google Drive
drive.mount('/content/drive')

# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of all pages of the PDF at *pdf_path*.

    Each page's text is followed by a newline; pages for which
    ``extract_text()`` returns nothing are skipped.  Any exception raised
    while opening or reading the file is printed instead of propagated, and
    the text gathered up to that point is returned (``""`` if none).
    """
    collected = []
    try:
        with pdfplumber.open(pdf_path) as document:
            for pdf_page in document.pages:
                content = pdf_page.extract_text()
                if not content:
                    continue
                collected.append(content + "\n")
    except Exception as e:
        print(f"Error extracting text: {e}")
    return "".join(collected)

# Replace with your PDF path
pdf_path = '/content/cse.pdf'  # Update this path
pdf_text = extract_text_from_pdf(pdf_path)
print(f"Extracted text length: {len(pdf_text)} characters")

# Step 2: Preprocess and chunk text
def chunk_text(text, max_length=384):  # Reduced max_length for RoBERTa
    """Split *text* into sentence-aligned chunks of roughly *max_length* characters.

    The text is split after ``.``, ``!`` or ``?`` followed by whitespace, and
    consecutive sentences are packed into one chunk while the running length
    (every sentence counted together with one separating space) stays below
    *max_length*.  The budget is measured in characters, not tokens.  A single
    sentence longer than *max_length* is kept whole as its own chunk.

    Args:
        text: Raw text to split.
        max_length: Character budget for one chunk.

    Returns:
        A list of non-empty, whitespace-stripped chunks; ``[]`` when *text*
        contains no non-whitespace characters.
    """
    chunks = []
    current_chunk = ""
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if not sentence.strip():
            # Trailing whitespace after the last sentence yields an empty
            # fragment; it carries no content.
            continue
        if len(current_chunk) + len(sentence) < max_length:
            current_chunk += sentence + " "
            continue
        # Budget exceeded: flush what was gathered.  The buffer is empty when
        # the very first sentence is already over budget, and an empty chunk
        # must not be emitted in that case.
        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        current_chunk = sentence + " "
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks

text_chunks = chunk_text(pdf_text)
print(f"Number of chunks: {len(text_chunks)}")

# Step 3: Generate synthetic question-answer pairs
def generate_synthetic_qa(chunks, num_questions=200):  # Increased number of questions
    """Build template-based question/answer pairs from text chunks.

    Every sentence of a chunk whose stripped length exceeds 20 characters
    yields three pairs, one per question template built around the first 30
    characters of the sentence.  The answer is the stripped sentence and the
    context is the whole chunk.

    Args:
        chunks: Iterable of chunk strings.
        num_questions: Maximum number of pairs to produce.

    Returns:
        A list of at most *num_questions* dicts with the keys ``question``,
        ``answer`` and ``context``.
    """
    qa_pairs = []
    if num_questions <= 0:
        return qa_pairs
    # No embedding model is needed here: the pairs are purely template based,
    # so nothing is loaded or downloaded.
    for chunk in tqdm(chunks, desc="Generating QA pairs"):
        for sentence in re.split(r'(?<=[.!?])\s+', chunk):
            answer = sentence.strip()
            if len(answer) <= 20:
                continue
            topic = sentence[:30].strip()
            # Create multiple question types
            question_types = [
                f"What is mentioned about {topic}...?",
                f"Can you explain {topic}...?",
                f"What details are provided about {topic}...?"
            ]
            for question in question_types:
                qa_pairs.append({"question": question, "answer": answer, "context": chunk})
                # Return as soon as the cap is hit; a plain `break` would only
                # leave this innermost loop and let later sentences add more.
                if len(qa_pairs) >= num_questions:
                    return qa_pairs
    return qa_pairs

qa_pairs = generate_synthetic_qa(text_chunks)
print(f"Generated {len(qa_pairs)} QA pairs")

# Step 4: Prepare dataset for fine-tuning
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-large')

class QADataset(torch.utils.data.Dataset):
    """Torch dataset that turns QA pairs into extractive-QA training features.

    Each item is encoded with the module-level ``tokenizer`` (a fast
    tokenizer, required for offset mappings) as a (question, context) pair
    padded/truncated to 384 tokens.  The answer span is located by aligning
    the answer's character position in the context with the token offsets.
    """

    def __init__(self, qa_pairs):
        """Keep *qa_pairs*, a sequence of dicts with ``question``, ``answer`` and ``context``."""
        self.qa_pairs = qa_pairs
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of QA pairs."""
        return len(self.qa_pairs)

    @staticmethod
    def _token_span(offsets, sequence_ids, char_start, char_end):
        """Map the context character span ``[char_start, char_end)`` to token indices.

        Args:
            offsets: Per-token ``(start, end)`` character offsets.
            sequence_ids: Per-token sequence id; only tokens with id ``1``
                (the context) are considered.
            char_start: First character of the answer within the context.
            char_end: One past the last character of the answer.

        Returns:
            ``(first, last)`` inclusive token indices, or ``(-1, -1)`` when no
            context token overlaps the span or the span's end was truncated.
        """
        first = last = -1
        for idx, (seq_id, (tok_start, tok_end)) in enumerate(zip(sequence_ids, offsets)):
            if seq_id != 1 or tok_start == tok_end:
                continue  # question, special or padding token
            if tok_end <= char_start or tok_start >= char_end:
                continue  # no overlap with the answer
            if first < 0:
                first = idx
            last = idx
        if first < 0 or offsets[last][1] < char_end:
            return -1, -1
        return first, last

    def __getitem__(self, idx):
        """Encode pair *idx* and locate its answer span.

        Returns:
            A dict with the 1-D tensors ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` plus the integer ``start_positions`` and
            ``end_positions`` (inclusive token indices, ``-1`` when the answer
            cannot be located in the encoded context).
        """
        qa = self.qa_pairs[idx]
        question = qa['question']
        context = qa['context']
        answer = qa['answer']

        # Encode question and context; offsets are needed for span alignment.
        encoding = self.tokenizer(
            question,
            context,
            add_special_tokens=True,
            max_length=384,  # RoBERTa optimal length
            return_token_type_ids=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_offsets_mapping=True,
            return_tensors='pt'
        )

        # Token ids of a standalone-encoded answer do not match the ids the
        # same text gets inside the context, so align by characters instead.
        # NOTE(review): -1 marks "answer not found"; confirm how the model's
        # loss treats out-of-range positions before relying on it.
        start_positions = -1
        end_positions = -1
        char_start = context.find(answer) if answer else -1
        if char_start >= 0:
            offsets = encoding['offset_mapping'].squeeze(0).tolist()
            start_positions, end_positions = self._token_span(
                offsets,
                encoding.sequence_ids(0),
                char_start,
                char_start + len(answer),
            )

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'token_type_ids': encoding['token_type_ids'].squeeze(0),
            'start_positions': start_positions,
            'end_positions': end_positions
        }

dataset = QADataset(qa_pairs)

# Step 5: Fine-tune the model
model = RobertaForQuestionAnswering.from_pretrained('roberta-large')

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/LLM/2_roberta-large/qa_model_roberta',
    num_train_epochs=3,
    per_device_train_batch_size=4,  # Smaller batch size for large model
    gradient_accumulation_steps=4,  # Accumulate gradients to simulate larger batch size
    save_steps=500,
    save_total_limit=2,
    learning_rate=1e-5,  # Lower learning rate for large model
    fp16=True,  # Enable mixed precision for GPU
    logging_dir='/content/drive/MyDrive/logs',
    logging_steps=100,
    report_to="none"  # Disable W&B logging
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

trainer.train()

# Step 6: Save the model
model.save_pretrained('/content/drive/MyDrive/LLM/2_roberta-large/qa_model_roberta')
tokenizer.save_pretrained('/content/drive/MyDrive/LLM/2_roberta-large/qa_model_roberta')
print("Model and tokenizer saved to /content/drive/MyDrive/LLM/2_roberta-large/qa_model_roberta")

# Step 7: Function to answer questions using the saved model with context selection
def answer_question(question, chunks, model_path='/content/drive/MyDrive/LLM/2_roberta-large/qa_model_roberta'):
    """Answer *question* from the most relevant chunk with the saved QA model.

    The chunk with the highest cosine similarity to the question (per the
    ``all-mpnet-base-v2`` sentence embedder) is used as context for the
    RoBERTa question-answering model stored at *model_path*.

    Args:
        question: Question text.
        chunks: Non-empty list of candidate context strings.
        model_path: Directory the fine-tuned model and tokenizer were saved to.

    Returns:
        A ``(answer, context)`` tuple: the decoded answer string and the
        chunk it was extracted from.

    Raises:
        ValueError: If *chunks* is empty.
    """
    if not chunks:
        # Fail before loading any model; there is nothing to search.
        raise ValueError("chunks must contain at least one context string")

    # Load model and tokenizer
    # NOTE: model, tokenizer and embedder are reloaded on every call.
    model = RobertaForQuestionAnswering.from_pretrained(model_path)
    tokenizer = RobertaTokenizerFast.from_pretrained(model_path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    # Use sentence-transformer to find the most relevant chunk
    embedder = SentenceTransformer('all-mpnet-base-v2')
    question_embedding = embedder.encode(question, convert_to_tensor=True)
    chunk_embeddings = embedder.encode(chunks, convert_to_tensor=True)
    cos_scores = util.cos_sim(question_embedding, chunk_embeddings)[0]
    best_chunk_idx = torch.argmax(cos_scores).item()
    context = chunks[best_chunk_idx]

    # Encode inputs
    inputs = tokenizer.encode_plus(
        question,
        context,
        add_special_tokens=True,
        max_length=384,
        return_tensors='pt',
        truncation=True
    )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    start_logits = outputs.start_logits[0]
    end_logits = outputs.end_logits[0]

    start_idx = int(torch.argmax(start_logits))
    # Choose the best end at or after the start: two independent argmax
    # calls can place the end before the start and yield an empty answer.
    end_idx = start_idx + int(torch.argmax(end_logits[start_idx:])) + 1

    answer_tokens = input_ids[0][start_idx:end_idx]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
    return answer, context

# Example usage
question = "What is the main topic of the document?"
answer, selected_context = answer_question(question, text_chunks)
print(f"Question: {question}")
print(f"Answer: {answer}")
print(f"Selected context: {selected_context[:100]}...")  # Print first 100 chars of context

#3_DeepSeek-R1

In [None]:
# Install dependencies
!pip install pdfplumber transformers torch sentence-transformers datasets accelerate peft huggingface_hub

# Import libraries
from google.colab import drive
import pdfplumber
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
from sentence_transformers import SentenceTransformer, util
from datasets import Dataset
import numpy as np
import re
import os
from tqdm import tqdm
from huggingface_hub import login

# Mount Google Drive
drive.mount('/content/drive')

# Step 1: Authenticate with Hugging Face
# SECURITY: never paste an access token into the notebook source. Any token
# that was ever committed here must be treated as leaked and revoked at
# https://huggingface.co/settings/tokens.
# Provide a Read token through the HF_TOKEN environment variable instead
# (e.g. populate it from a Colab secret before running this cell).
hf_token = os.environ.get("HF_TOKEN", "").strip()
if hf_token:
    login(hf_token)
    print("Hugging Face login successful")
else:
    raise ValueError("Hugging Face Read token required for LLaMA 3.1. Set the HF_TOKEN environment variable or run `huggingface-cli login`.")

# Step 2: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of all pages of the PDF at *pdf_path*.

    Each page's text is followed by a newline; pages for which
    ``extract_text()`` returns nothing are skipped.  Any exception raised
    while opening or reading the file is printed instead of propagated, and
    the text gathered up to that point is returned (``""`` if none).
    """
    collected = []
    try:
        with pdfplumber.open(pdf_path) as document:
            for pdf_page in document.pages:
                content = pdf_page.extract_text()
                if not content:
                    continue
                collected.append(content + "\n")
    except Exception as e:
        print(f"Error extracting text: {e}")
    return "".join(collected)

# Replace with your PDF path
pdf_path = '/content/drive/MyDrive/cse.pdf'  # Update this path
pdf_text = extract_text_from_pdf(pdf_path)
if not pdf_text:
    raise ValueError("No text extracted from PDF. Check the file path or content.")
print(f"Extracted text length: {len(pdf_text)} characters")

# Step 3: Preprocess and chunk text
def chunk_text(text, max_length=512):
    """Split *text* into sentence-aligned chunks of roughly *max_length* characters.

    The text is split after ``.``, ``!`` or ``?`` followed by whitespace, and
    consecutive sentences are packed into one chunk while the running length
    (every sentence counted together with one separating space) stays below
    *max_length*.  The budget is measured in characters, not tokens.  A single
    sentence longer than *max_length* is kept whole as its own chunk.

    Args:
        text: Raw text to split.
        max_length: Character budget for one chunk.

    Returns:
        A list of non-empty, whitespace-stripped chunks; ``[]`` when *text*
        contains no non-whitespace characters.
    """
    chunks = []
    current_chunk = ""
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if not sentence.strip():
            # Trailing whitespace after the last sentence yields an empty
            # fragment; it carries no content.
            continue
        if len(current_chunk) + len(sentence) < max_length:
            current_chunk += sentence + " "
            continue
        # Budget exceeded: flush what was gathered.  The buffer is empty when
        # the very first sentence is already over budget, and an empty chunk
        # must not be emitted in that case.
        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        current_chunk = sentence + " "
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks

text_chunks = chunk_text(pdf_text)
print(f"Number of chunks: {len(text_chunks)}")

# Step 4: Generate synthetic question-answer pairs
def generate_synthetic_qa(chunks, num_questions=500):
    """Build template-based question/answer pairs from text chunks.

    Every sentence of a chunk whose stripped length exceeds 20 characters
    yields four pairs, one per question template built around the first 30
    characters of the sentence.  The answer is the stripped sentence and the
    context is the whole chunk.

    Args:
        chunks: Iterable of chunk strings.
        num_questions: Maximum number of pairs to produce.

    Returns:
        A list of at most *num_questions* dicts with the keys ``question``,
        ``answer`` and ``context``.
    """
    qa_pairs = []
    if num_questions <= 0:
        return qa_pairs
    # No embedding model is needed here: the pairs are purely template based,
    # so nothing is loaded or downloaded.
    for chunk in tqdm(chunks, desc="Generating QA pairs"):
        for sentence in re.split(r'(?<=[.!?])\s+', chunk):
            answer = sentence.strip()
            if len(answer) <= 20:
                continue
            topic = sentence[:30].strip()
            question_types = [
                f"What is discussed about {topic}...?",
                f"Can you explain {topic}...?",
                f"What details are given about {topic}...?",
                f"What is the significance of {topic}...?"
            ]
            for question in question_types:
                qa_pairs.append({"question": question, "answer": answer, "context": chunk})
                # Return as soon as the cap is hit; a plain `break` would only
                # leave this innermost loop and let later sentences add more.
                if len(qa_pairs) >= num_questions:
                    return qa_pairs
    return qa_pairs

qa_pairs = generate_synthetic_qa(text_chunks)
print(f"Generated {len(qa_pairs)} QA pairs")

# Step 5: Prepare dataset for fine-tuning
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.1-8B-Instruct')
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

class QADataset(Dataset):
    """Formats QA pairs as causal-LM training examples.

    Each item is one prompt string (question, context and answer) encoded
    with the module-level ``tokenizer`` and padded/truncated to 512 tokens.
    """

    def __init__(self, qa_pairs):
        """Keep *qa_pairs*, a sequence of dicts with ``question``, ``answer`` and ``context``."""
        self.qa_pairs = qa_pairs
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of QA pairs."""
        return len(self.qa_pairs)

    def __getitem__(self, idx):
        """Encode pair *idx*.

        Returns:
            A dict with the 1-D tensors ``input_ids``, ``attention_mask`` and
            ``labels``.  ``labels`` is a copy of ``input_ids`` in which every
            padding position is set to ``-100``.
        """
        qa = self.qa_pairs[idx]
        question = qa['question']
        context = qa['context']
        answer = qa['answer']

        # Format input for LLaMA 3.1
        # NOTE(review): the prompt spells out <|begin_of_text|> while
        # add_special_tokens=True is also set; check whether the tokenizer
        # then emits the begin-of-text token twice.
        prompt = f"<|begin_of_text|>Question: {question}\nContext: {context}\nAnswer: {answer}<|end_of_text|>"
        encoding = self.tokenizer(
            prompt,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        # Exclude padding from the loss.  The mask is taken from
        # attention_mask rather than the pad id because the pad token may be
        # the EOS token, and a genuine EOS must stay a training target.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

dataset = Dataset.from_list([QADataset(qa_pairs).__getitem__(i) for i in range(len(qa_pairs))])

# Step 6: Fine-tune the model with LoRA
model = AutoModelForCausalLM.from_pretrained('meta-llama/Llama-3.1-8B-Instruct')
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/qa_model_llama3_1',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    save_steps=500,
    save_total_limit=2,
    learning_rate=2e-5,
    fp16=True,
    logging_dir='/content/drive/MyDrive/logs',
    logging_steps=100,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

trainer.train()

# Step 7: Save the model
model.save_pretrained('/content/drive/MyDrive/qa_model_llama3_1')
tokenizer.save_pretrained('/content/drive/MyDrive/qa_model_llama3_1')
print("Model and tokenizer saved to /content/drive/MyDrive/qa_model_llama3_1")

# Step 8: Function to answer questions using the saved model with context selection
def answer_question(question, chunks, model_path='/content/drive/MyDrive/qa_model_llama3_1'):
    """Generate an answer to *question* from the most relevant chunk.

    The chunk with the highest cosine similarity to the question (per the
    ``all-mpnet-base-v2`` sentence embedder) is placed in the prompt, and the
    causal LM stored at *model_path* greedily generates up to 100 new tokens.

    Args:
        question: Question text.
        chunks: Non-empty list of candidate context strings.
        model_path: Directory the fine-tuned model and tokenizer were saved to.

    Returns:
        A ``(answer, context)`` tuple: the generated text (prompt excluded)
        and the chunk that was used as context.

    Raises:
        ValueError: If *chunks* is empty.
    """
    if not chunks:
        # Fail before loading any model; there is nothing to search.
        raise ValueError("chunks must contain at least one context string")

    # NOTE: model, tokenizer and embedder are reloaded on every call.
    model = AutoModelForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    # Select most relevant chunk
    embedder = SentenceTransformer('all-mpnet-base-v2')
    question_embedding = embedder.encode(question, convert_to_tensor=True)
    chunk_embeddings = embedder.encode(chunks, convert_to_tensor=True)
    cos_scores = util.cos_sim(question_embedding, chunk_embeddings)[0]
    best_chunk_idx = torch.argmax(cos_scores).item()
    context = chunks[best_chunk_idx]

    # Format input for LLaMA 3.1
    prompt = f"<|begin_of_text|>Question: {question}\nContext: {context}\nAnswer: "
    inputs = tokenizer(
        prompt,
        add_special_tokens=True,
        max_length=512,
        return_tensors='pt',
        truncation=True
    )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=100,
            do_sample=False
        )

    # Decode only the newly generated tokens.  Splitting the full text on
    # "Answer: " breaks when the prompt was truncated or when the context or
    # the generation itself contains that marker.
    generated_tokens = outputs[0][input_ids.shape[1]:]
    answer = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    return answer, context

# Example usage
question = "What is the main topic of the document?"
answer, selected_context = answer_question(question, text_chunks)
print(f"Question: {question}")
print(f"Answer: {answer}")
print(f"Selected context: {selected_context[:100]}...")

#4_Qwen-7B

In [None]:
# Install dependencies
!pip install pdfplumber transformers torch sentence-transformers datasets accelerate peft


# Set environment variable to reduce memory fragmentation
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Install dependencies
!pip install pdfplumber transformers torch sentence-transformers datasets accelerate peft bitsandbytes

# Import libraries
from google.colab import drive
import pdfplumber
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
from sentence_transformers import SentenceTransformer, util
from datasets import Dataset
import numpy as np
import re
from tqdm import tqdm

# Mount Google Drive
drive.mount('/content/drive')

# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of all pages of the PDF at *pdf_path*.

    Each page's text is followed by a newline; pages for which
    ``extract_text()`` returns nothing are skipped.  Any exception raised
    while opening or reading the file is printed instead of propagated, and
    the text gathered up to that point is returned (``""`` if none).
    """
    collected = []
    try:
        with pdfplumber.open(pdf_path) as document:
            for pdf_page in document.pages:
                content = pdf_page.extract_text()
                if not content:
                    continue
                collected.append(content + "\n")
    except Exception as e:
        print(f"Error extracting text: {e}")
    return "".join(collected)

# Replace with your PDF path
pdf_path = '/content/drive/MyDrive/cse.pdf'  # Update this path
pdf_text = extract_text_from_pdf(pdf_path)
if not pdf_text:
    raise ValueError("No text extracted from PDF. Check the file path or content.")
print(f"Extracted text length: {len(pdf_text)} characters")

# Step 2: Preprocess and chunk text
def chunk_text(text, max_length=512):
    """Split *text* into sentence-aligned chunks of roughly *max_length* characters.

    The text is split after ``.``, ``!`` or ``?`` followed by whitespace, and
    consecutive sentences are packed into one chunk while the running length
    (every sentence counted together with one separating space) stays below
    *max_length*.  The budget is measured in characters, not tokens.  A single
    sentence longer than *max_length* is kept whole as its own chunk.

    Args:
        text: Raw text to split.
        max_length: Character budget for one chunk.

    Returns:
        A list of non-empty, whitespace-stripped chunks; ``[]`` when *text*
        contains no non-whitespace characters.
    """
    chunks = []
    current_chunk = ""
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if not sentence.strip():
            # Trailing whitespace after the last sentence yields an empty
            # fragment; it carries no content.
            continue
        if len(current_chunk) + len(sentence) < max_length:
            current_chunk += sentence + " "
            continue
        # Budget exceeded: flush what was gathered.  The buffer is empty when
        # the very first sentence is already over budget, and an empty chunk
        # must not be emitted in that case.
        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        current_chunk = sentence + " "
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks

text_chunks = chunk_text(pdf_text)
print(f"Number of chunks: {len(text_chunks)}")

# Step 3: Generate synthetic question-answer pairs
def generate_synthetic_qa(chunks, num_questions=500):
    """Build template-based question/answer pairs from text chunks.

    Every sentence of a chunk whose stripped length exceeds 20 characters
    yields four pairs, one per question template built around the first 30
    characters of the sentence.  The answer is the stripped sentence and the
    context is the whole chunk.

    Args:
        chunks: Iterable of chunk strings.
        num_questions: Maximum number of pairs to produce.

    Returns:
        A list of at most *num_questions* dicts with the keys ``question``,
        ``answer`` and ``context``.
    """
    qa_pairs = []
    if num_questions <= 0:
        return qa_pairs
    # No embedding model is needed here: the pairs are purely template based,
    # so nothing is loaded or downloaded.
    for chunk in tqdm(chunks, desc="Generating QA pairs"):
        for sentence in re.split(r'(?<=[.!?])\s+', chunk):
            answer = sentence.strip()
            if len(answer) <= 20:
                continue
            topic = sentence[:30].strip()
            question_types = [
                f"What is discussed about {topic}...?",
                f"Can you explain {topic}...?",
                f"What details are given about {topic}...?",
                f"What is the significance of {topic}...?"
            ]
            for question in question_types:
                qa_pairs.append({"question": question, "answer": answer, "context": chunk})
                # Return as soon as the cap is hit; a plain `break` would only
                # leave this innermost loop and let later sentences add more.
                if len(qa_pairs) >= num_questions:
                    return qa_pairs
    return qa_pairs

qa_pairs = generate_synthetic_qa(text_chunks)
print(f"Generated {len(qa_pairs)} QA pairs")

# Step 4: Prepare dataset for fine-tuning
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2-7B-Instruct')
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

class QADataset(Dataset):
    """Formats QA pairs as causal-LM training examples.

    Each item is one prompt string (question, context and answer) encoded
    with the module-level ``tokenizer`` and padded/truncated to 512 tokens.
    """

    def __init__(self, qa_pairs):
        """Keep *qa_pairs*, a sequence of dicts with ``question``, ``answer`` and ``context``."""
        self.qa_pairs = qa_pairs
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of QA pairs."""
        return len(self.qa_pairs)

    def __getitem__(self, idx):
        """Encode pair *idx*.

        Returns:
            A dict with the 1-D tensors ``input_ids``, ``attention_mask`` and
            ``labels``.  ``labels`` is a copy of ``input_ids`` in which every
            padding position is set to ``-100``.
        """
        qa = self.qa_pairs[idx]
        question = qa['question']
        context = qa['context']
        answer = qa['answer']

        prompt = f"Question: {question}\nContext: {context}\nAnswer: {answer}"
        encoding = self.tokenizer(
            prompt,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        # Exclude padding from the loss.  The mask is taken from
        # attention_mask rather than the pad id because the pad token may be
        # the EOS token, and a genuine EOS must stay a training target.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

dataset = Dataset.from_list([QADataset(qa_pairs).__getitem__(i) for i in range(len(qa_pairs))])

# Step 5: Fine-tune the model with LoRA
model = AutoModelForCausalLM.from_pretrained(
    'Qwen/Qwen2-7B-Instruct',
    torch_dtype=torch.float16,
    device_map="auto"
)
model.gradient_checkpointing_enable()
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/qa_model_qwen2',
    num_train_epochs=5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    save_steps=500,
    save_total_limit=2,
    learning_rate=2e-5,
    fp16=True,
    logging_dir='/content/drive/MyDrive/logs',
    logging_steps=100,
    report_to="none",
    optim="adamw_8bit"  # Requires bitsandbytes
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

trainer.train()

# Step 6: Save the model
model.save_pretrained('/content/drive/MyDrive/qa_model_qwen2')
tokenizer.save_pretrained('/content/drive/MyDrive/qa_model_qwen2')
print("Model and tokenizer saved to /content/drive/MyDrive/qa_model_qwen2")

# Step 7: Function to answer questions using the saved model with context selection
def answer_question(question, chunks, model_path='/content/drive/MyDrive/qa_model_qwen2'):
    """Generate an answer to *question* from the most relevant chunk.

    The chunk with the highest cosine similarity to the question (per the
    ``all-mpnet-base-v2`` sentence embedder) is placed in the prompt, and the
    causal LM stored at *model_path* greedily generates up to 100 new tokens.

    Args:
        question: Question text.
        chunks: Non-empty list of candidate context strings.
        model_path: Directory the fine-tuned model and tokenizer were saved to.

    Returns:
        A ``(answer, context)`` tuple: the generated text (prompt excluded)
        and the chunk that was used as context.

    Raises:
        ValueError: If *chunks* is empty.
    """
    if not chunks:
        # Fail before loading any model; there is nothing to search.
        raise ValueError("chunks must contain at least one context string")

    # NOTE: model, tokenizer and embedder are reloaded on every call.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # device_map="auto" has already placed the weights; moving the model
    # again with .to() conflicts with that placement, so the inputs are sent
    # to the device the model reports instead.
    device = model.device
    model.eval()

    # Select most relevant chunk
    embedder = SentenceTransformer('all-mpnet-base-v2')
    question_embedding = embedder.encode(question, convert_to_tensor=True)
    chunk_embeddings = embedder.encode(chunks, convert_to_tensor=True)
    cos_scores = util.cos_sim(question_embedding, chunk_embeddings)[0]
    best_chunk_idx = torch.argmax(cos_scores).item()
    context = chunks[best_chunk_idx]

    prompt = f"Question: {question}\nContext: {context}\nAnswer: "
    inputs = tokenizer(
        prompt,
        add_special_tokens=True,
        max_length=512,
        return_tensors='pt',
        truncation=True
    )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=100,
            do_sample=False
        )

    # Decode only the newly generated tokens.  Splitting the full text on
    # "Answer: " breaks when the prompt was truncated or when the context or
    # the generation itself contains that marker.
    generated_tokens = outputs[0][input_ids.shape[1]:]
    answer = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    return answer, context

# Example usage
question = "Phone number of the teachers?"
answer, selected_context = answer_question(question, text_chunks)
print(f"Question: {question}")
print(f"Answer: {answer}")
print(f"Selected context: {selected_context[:100]}...")

In [None]:
# Install dependencies
!pip install pdfplumber transformers torch sentence-transformers datasets

# Import libraries
from google.colab import drive
import pdfplumber
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, util
import numpy as np
import re

# Mount Google Drive
drive.mount('/content/drive')

# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of all pages of the PDF at *pdf_path*.

    Each page's text is followed by a newline; pages for which
    ``extract_text()`` returns nothing are skipped.  Any exception raised
    while opening or reading the file is printed instead of propagated, and
    the text gathered up to that point is returned (``""`` if none).
    """
    collected = []
    try:
        with pdfplumber.open(pdf_path) as document:
            for pdf_page in document.pages:
                content = pdf_page.extract_text()
                if not content:
                    continue
                collected.append(content + "\n")
    except Exception as e:
        print(f"Error extracting text: {e}")
    return "".join(collected)

# Replace with your PDF path
pdf_path = '/content/drive/MyDrive/cse.pdf'  # Update this path
pdf_text = extract_text_from_pdf(pdf_path)
if not pdf_text:
    raise ValueError("No text extracted from PDF. Check the file path or content.")
print(f"Extracted text length: {len(pdf_text)} characters")

# Step 2: Preprocess and chunk text
def chunk_text(text, max_length=512):
    """Split ``text`` into sentence-aligned chunks of about ``max_length`` characters.

    The text is split after ``.``, ``!`` or ``?`` followed by whitespace, and
    consecutive sentences are packed into one chunk while the running length
    stays below ``max_length``. A single sentence longer than ``max_length``
    is kept whole as its own chunk rather than being cut.

    Args:
        text: The text to split.
        max_length: Soft upper bound, in characters, for a chunk.

    Returns:
        A list of non-empty, whitespace-stripped chunks. Empty or
        whitespace-only input yields an empty list.
    """
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) < max_length:
            current_chunk += sentence + " "
            continue
        # The sentence does not fit: flush what has been gathered so far.
        # When the very first sentence is already too long there is nothing
        # gathered yet, so skip the flush instead of emitting an empty chunk.
        finished = current_chunk.strip()
        if finished:
            chunks.append(finished)
        current_chunk = sentence + " "
    tail = current_chunk.strip()
    if tail:
        chunks.append(tail)
    return chunks

# Chunk the whole document using chunk_text's default max_length of 512.
text_chunks = chunk_text(pdf_text)
print(f"Number of chunks: {len(text_chunks)}")

## Step 3 (earlier non-streaming version, kept commented out for reference): answer questions using the saved model with context selection
# def answer_question(question, chunks, model_path='/content/drive/MyDrive/LLM/4_Qwen/qa_model_qwen2'):
#     model = AutoModelForCausalLM.from_pretrained(model_path)
#     tokenizer = AutoTokenizer.from_pretrained(model_path)
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     model.to(device)
#     model.eval()

#     # Select most relevant chunk
#     embedder = SentenceTransformer('all-mpnet-base-v2')
#     question_embedding = embedder.encode(question, convert_to_tensor=True)
#     chunk_embeddings = embedder.encode(chunks, convert_to_tensor=True)
#     cos_scores = util.cos_sim(question_embedding, chunk_embeddings)[0]
#     best_chunk_idx = torch.argmax(cos_scores).item()
#     context = chunks[best_chunk_idx]

#     # Format input for Qwen2
#     prompt = f"Question: {question}\nContext: {context}\nAnswer: "
#     inputs = tokenizer(
#         prompt,
#         add_special_tokens=True,
#         max_length=512,
#         return_tensors='pt',
#         truncation=True
#     )

#     input_ids = inputs['input_ids'].to(device)
#     attention_mask = inputs['attention_mask'].to(device)

#     with torch.no_grad():
#         outputs = model.generate(
#             input_ids=input_ids,
#             attention_mask=attention_mask,
#             max_new_tokens=100,
#             do_sample=False
#         )

#     answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     answer = answer.split("Answer: ")[-1].strip()
#     return answer, context


from transformers import TextIteratorStreamer
import threading

def answer_question(question, chunks, model_path='/content/drive/MyDrive/LLM/4_Qwen/qa_model_qwen2'):
    """Answer ``question`` from the most similar chunk, streaming the output.

    The chunk whose sentence embedding has the highest cosine similarity to
    the question is used as context. The answer is sampled from the causal LM
    stored at ``model_path`` and printed piece by piece as it is generated.

    Args:
        question: The question to answer.
        chunks: Non-empty list of text chunks to pick the context from.
        model_path: Directory of the saved model and tokenizer.

    Returns:
        A ``(answer, context)`` tuple: the stripped generated text and the
        chunk that was used as context.

    Raises:
        ValueError: If ``chunks`` is empty.
        Exception: Any error raised by ``model.generate`` in the worker
            thread is re-raised here after the stream has been drained.
    """
    if not chunks:
        raise ValueError("No text chunks available to answer from.")

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        # Half precision is only used when a GPU is available; fall back to
        # float32 on CPU.
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map='auto'
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model.eval()

    # Find the most relevant chunk
    embedder = SentenceTransformer('all-mpnet-base-v2')
    question_embedding = embedder.encode(question, convert_to_tensor=True)
    chunk_embeddings = embedder.encode(chunks, convert_to_tensor=True)
    cos_scores = util.cos_sim(question_embedding, chunk_embeddings)[0]
    best_chunk_idx = torch.argmax(cos_scores).item()
    context = chunks[best_chunk_idx]

    # Format input. The tensors go to the device the model was actually
    # placed on by device_map='auto', not to a separately guessed one.
    prompt = f"Question: {question}\nContext: {context}\nAnswer: "
    inputs = tokenizer(prompt, return_tensors='pt', max_length=512, truncation=True).to(model.device)

    # Use a TextIteratorStreamer for real-time output
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=150,
        do_sample=True,
        top_p=0.9,
        temperature=0.8,
    )

    generation_errors = []

    def _generate():
        # Runs in the worker thread. If generate() fails, the streamer would
        # never receive its end-of-stream signal and the consumer loop below
        # would block forever, so close the stream explicitly and keep the
        # error for the caller.
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:
            generation_errors.append(exc)
            streamer.end()

    # Generate in background thread to stream output
    thread = threading.Thread(target=_generate)
    thread.start()

    print("Answer:")
    pieces = []
    for token in streamer:
        print(token, end='', flush=True)
        pieces.append(token)
    thread.join()

    print("\n")
    if generation_errors:
        raise generation_errors[0]
    return "".join(pieces).strip(), context





# Example usage
question = "What is the main topic of the document?"
# answer_question already prints the answer while it is being generated;
# the prints below repeat it as a summary together with the question.
answer, selected_context = answer_question(question, text_chunks)
print(f"Question: {question}")
print(f"Answer: {answer}")
# Only the first 100 characters of the selected chunk are printed.
print(f"Selected context: {selected_context[:100]}...")


In [None]:
# Example usage: a second question against the same chunks.
question = "Who are the teachers?"
answer, selected_context = answer_question(question, text_chunks)
print(f"Question: {question}")
print(f"Answer: {answer}")
# Only the first 100 characters of the selected chunk are printed.
print(f"Selected context: {selected_context[:100]}...")

#5_Retrieval-Augmented Generation (RAG) / document QA with a powerful model such as LLaMA 3, Mistral, or a GPT-style model

In [None]:
# Clean old broken installs
!pip uninstall -y faiss-cpu faiss-gpu faiss farm-haystack

# Install compatible FAISS and Haystack
!pip install -q faiss-cpu==1.7.4
!pip install -q farm-haystack==1.17.1
!pip install -q pypdf sentence-transformers


from google.colab import files

# Ask for a file through the Colab upload widget. The keys of the returned
# mapping are used as the uploaded file names.
uploaded = files.upload()
if not uploaded:
    # Without this check a cancelled upload fails with an opaque IndexError.
    raise ValueError("No file was uploaded; upload a PDF to continue.")
pdf_path = next(iter(uploaded))  # Automatically gets the first uploaded filename




from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import EmbeddingRetriever, TransformersReader, PreProcessor
from haystack.pipelines import ExtractiveQAPipeline
from haystack import Document
# This cell installs `pypdf`, so import from it first; fall back to the older
# `PyPDF2` package name for environments that still have that one installed.
try:
    from pypdf import PdfReader
except ImportError:
    from PyPDF2 import PdfReader

# Extract text from PDF
def extract_pdf_text(path):
    """Return the text of all pages of the PDF at ``path``, joined by newlines.

    Pages for which no text is extracted are skipped. Each page is extracted
    exactly once.
    """
    reader = PdfReader(path)
    page_texts = (page.extract_text() for page in reader.pages)
    return "\n".join(page_text for page_text in page_texts if page_text)

text = extract_pdf_text(pdf_path)
if not text.strip():
    # Nothing to index: fail here with a clear message instead of building an
    # empty document store and failing later at query time.
    raise ValueError("No text could be extracted from the uploaded PDF.")

# Split into smaller chunks
preprocessor = PreProcessor(split_by="word", split_length=200, split_respect_sentence_boundary=True)
docs = preprocessor.process([Document(content=text)])

# Set up FAISS vector store
# NOTE(review): embedding_dim presumably has to match the vector size of the
# retriever model configured below - confirm if the model is changed.
document_store = FAISSDocumentStore(embedding_dim=768)

# Use sentence-transformer for embeddings
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/all-mpnet-base-v2",
    use_gpu=True
)

# Documents must be written before their embeddings can be computed.
document_store.write_documents(docs)
document_store.update_embeddings(retriever)

# Use a strong reader model
# NOTE(review): flan-t5 is a text-to-text model while this pipeline is
# extractive; verify that TransformersReader really supports it, or switch to
# an extractive QA checkpoint.
reader = TransformersReader(
    model_name_or_path="google/flan-t5-base",  # You can also try flan-t5-xl if enough GPU
    tokenizer="google/flan-t5-base",
    use_gpu=True
)

# Create pipeline
pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)

# Ask your question
query = "What is the phone number of John Doe?"  # Change this!
prediction = pipe.run(query=query, params={"Retriever": {"top_k": 5}, "Reader": {"top_k": 1}})

# Show answer (the reader may return no answer at all)
answers = prediction["answers"]
if answers:
    print("Answer:", answers[0].answer)
else:
    print("Answer: (no answer found)")


#6_Open Ai with langchain

In [None]:
# STEP 1: Uninstall everything causing conflict
!pip uninstall -y farm-haystack protobuf pydantic typing-extensions langchain-core langchain numpy torch torchaudio

# STEP 2: Reinstall correct compatible versions
!pip install -q \
  "protobuf>=3.20.3,<6.0.0" \
  "pydantic>=2.7.4" \
  "typing-extensions>=4.11.0" \
  "numpy<2" \
  langchain \
  openai \
  faiss-cpu \
  pymupdf \
  tiktoken

# STEP 3: ✅ Test imports
# Smoke test for the install above: import one LangChain symbol and report
# the ImportError instead of raising, so the cell always finishes.
try:
    from langchain.text_splitter import CharacterTextSplitter
    print("✅ LangChain is working.")
except ImportError as e:
    print("❌ Import failed:", e)







# ✅ STEP 1: Install Clean Packages
!pip install -q langchain openai faiss-cpu pymupdf tiktoken

# ✅ STEP 2: Set your OpenAI API key
import os
from getpass import getpass

# Keep a key that is already present in the environment; otherwise prompt for
# it without echoing. Assigning a literal here would overwrite a valid key
# (with an empty string in the original) and invite committing a secret.
if not os.environ.get("OPENAI_API_KEY"):
    _entered_key = getpass("Enter your OpenAI API key: ").strip()
    if not _entered_key:
        raise ValueError("An OpenAI API key is required.")
    os.environ["OPENAI_API_KEY"] = _entered_key





# ✅ STEP 1: Uninstall conflicting packages
!pip uninstall -y farm-haystack protobuf pydantic typing-extensions torch torchaudio

# ✅ STEP 2: Install only what you need for LangChain PDF QA
!pip install -q langchain openai faiss-cpu pymupdf tiktoken pydantic typing-extensions

# # ✅ STEP 3: Set up OpenAI API Key
# import os
# os.environ["OPENAI_API_KEY"] = "sk-..."  # Replace with your real key

# ✅ STEP 4: Load PDF from path
import fitz  # PyMuPDF
from langchain.text_splitter import CharacterTextSplitter

def load_pdf(path):
    """Return the text of every page of the PDF at ``path``, joined by newlines.

    The document is opened as a context manager so that it is closed again
    once the text has been read, even if extraction fails part-way.
    """
    with fitz.open(path) as doc:
        return "\n".join(page.get_text() for page in doc)

# NOTE(review): this path is on Google Drive, but this cell does not mount
# Drive itself; it presumably relies on an earlier cell having done so.
pdf_path = '/content/drive/MyDrive/cse.pdf'
text = load_pdf(pdf_path)

# ✅ STEP 5: Split Text into Chunks
# Splitter configured with chunk_size=1000 and chunk_overlap=100.
splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.create_documents([text])

# ✅ STEP 6: Embed with OpenAI and store in FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# NOTE(review): no key is passed here; presumably it is read from the
# OPENAI_API_KEY environment variable - make sure a real key is set first.
embedding = OpenAIEmbeddings()
db = FAISS.from_documents(chunks, embedding)

# ✅ STEP 7: Ask Questions via RetrievalQA
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# The chain retrieves from the FAISS store built above and answers with a
# chat model configured with temperature=0.
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(temperature=0),
    chain_type="stuff",
    retriever=db.as_retriever()
)

# ✅ STEP 8: Ask a question
query = "What is the phone number of John Doe?"
result = qa.run(query)
print("🔍 Answer:", result)



#7_all-MiniLM-L6-v2

In [None]:
# STEP 1: Install all dependencies (run this first if in Colab)
!pip install -q faiss-cpu sentence-transformers transformers pymupdf

# STEP 2: Import all necessary modules
import fitz  # PyMuPDF
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# STEP 3: Load and extract text from PDF
def load_pdf(path):
    """Return the concatenated text of every page of the PDF at ``path``.

    Page texts are joined without any separator. The document is opened as a
    context manager so that it is closed again once the text has been read.
    """
    with fitz.open(path) as doc:
        return "".join(page.get_text() for page in doc)

# STEP 4: Split the text into chunks
def split_text(text, max_len=500):
    """Split ``text`` into sentence-aligned chunks of about ``max_len`` characters.

    Sentences are delimited by ``". "``. The period removed by the split is
    restored on every piece except the last one, which had no delimiter after
    it, so the chunks reproduce the original punctuation instead of gaining
    an extra trailing period. A sentence longer than ``max_len`` is kept whole
    as its own chunk.

    Args:
        text: The text to split.
        max_len: Soft upper bound, in characters, for a chunk.

    Returns:
        A list of non-empty, whitespace-stripped chunks. Empty or
        whitespace-only input yields an empty list.
    """
    pieces = text.split(". ")
    last_index = len(pieces) - 1
    chunks = []
    current = []
    # Running size of the current chunk, counted as len(piece) + 2 per
    # sentence (the piece plus its ". " delimiter).
    current_len = 0
    for i, piece in enumerate(pieces):
        sentence = piece if i == last_index else piece + "."
        if not sentence.strip():
            continue
        if current and current_len + len(piece) > max_len:
            chunks.append(" ".join(current).strip())
            current = []
            current_len = 0
        current.append(sentence)
        current_len += len(piece) + 2
    if current:
        chunks.append(" ".join(current).strip())
    return chunks

# STEP 5: Embed chunks using sentence transformer
def embed_chunks(chunks):
    """Encode ``chunks`` with the ``all-MiniLM-L6-v2`` sentence transformer.

    Returns:
        A ``(model, embeddings)`` pair: the loaded model, so the caller can
        encode queries with it as well, and the embeddings of ``chunks``.
    """
    encoder = SentenceTransformer("all-MiniLM-L6-v2")
    return encoder, encoder.encode(chunks)

# STEP 6: Create FAISS index
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index over ``embeddings``.

    Args:
        embeddings: A non-empty 2-D array-like with one vector per row.

    Returns:
        A ``faiss.IndexFlatL2`` containing all rows of ``embeddings``.

    Raises:
        ValueError: If ``embeddings`` is empty or not two-dimensional.
    """
    # FAISS expects contiguous float32 input, so normalise dtype and layout
    # here rather than relying on what the encoder happened to return.
    vectors = np.ascontiguousarray(np.asarray(embeddings, dtype=np.float32))
    if vectors.ndim != 2 or vectors.shape[0] == 0:
        raise ValueError("embeddings must be a non-empty 2-D array of vectors")
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

# STEP 7: Load QA model and answer questions
_QA_PIPELINES = {}


def _get_qa_pipeline(model_name):
    """Return the question-answering pipeline for ``model_name``, loading it once."""
    if model_name not in _QA_PIPELINES:
        _QA_PIPELINES[model_name] = pipeline("question-answering", model=model_name)
    return _QA_PIPELINES[model_name]


def answer_question(query, index, chunks, embedder, top_k=5):
    """Answer ``query`` with the best extractive answer from the nearest chunks.

    The query is embedded with ``embedder``, the ``top_k`` nearest chunks are
    looked up in ``index``, the QA model is run on each of them and the answer
    with the highest score is returned.

    Args:
        query: The question to answer.
        index: FAISS index built over the embeddings of ``chunks``.
        chunks: The text chunks, in the same order as the indexed vectors.
        embedder: Model whose ``encode`` produced the indexed embeddings.
        top_k: Maximum number of chunks to read.

    Returns:
        The answer text with the highest score, or an empty string when there
        is no chunk to read.
    """
    # Never ask for more neighbours than exist: FAISS pads missing results
    # with index -1, which would silently select chunks[-1].
    k = min(top_k, index.ntotal, len(chunks))
    if k <= 0:
        return ""

    query_embedding = embedder.encode([query])
    _, indices = index.search(np.array(query_embedding), k)
    # Skip padding indices and blank chunks, which give the reader nothing to
    # extract from.
    top_chunks = [chunks[i] for i in indices[0] if i >= 0 and chunks[i].strip()]
    if not top_chunks:
        return ""

    # The pipeline is cached so the interactive loop does not reload the
    # model for every question.
    qa = _get_qa_pipeline("deepset/roberta-base-squad2")
    results = [qa(question=query, context=context) for context in top_chunks]
    best = max(results, key=lambda result: result["score"])
    return best["answer"]

# STEP 8: Run everything
from google.colab import drive
# Drive is mounted at import time, before the __main__ guard, because the PDF
# path used below lives under /content/drive.
drive.mount('/content/drive')
if __name__ == "__main__":
    # Build the retrieval state once: text -> chunks -> embeddings -> index.
    pdf_path = "/content/drive/MyDrive/cse.pdf"  # UPDATE to your actual path
    raw_text = load_pdf(pdf_path)
    chunks = split_text(raw_text)
    embedder, embeddings = embed_chunks(chunks)
    index = create_faiss_index(embeddings)

    # Interactive loop: answer questions until the user types "exit"
    # (compared case-insensitively).
    while True:
        query = input("\nAsk a question (or type 'exit'): ")
        if query.lower() == "exit":
            break
        answer = answer_question(query, index, chunks, embedder)
        print("Answer:", answer)
