In [1]:
!pip install langchain
!pip install openai
!pip install PyPDF2
!pip install faiss-cpu
!pip install tiktoken

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m60.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [2]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.29-py3-none-any.whl.metadata (2.9 kB)
Collecting requests<3,>=2.32.5 (from langchain-community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.6.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.6.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.6.7->langchain-community)
  Downloading mypy_extensions-1.1.0-py3-none-any.whl.metadata (1.1 kB)
Downloading langchain_community-0.3.29-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Demo LLM


In [4]:
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceHub

# ----------------------------
# 1. Read PDF
# ----------------------------
pdfreader = PdfReader("/content/sample_data/budget_speech.pdf")

raw_text = ""
for page in pdfreader.pages:
    content = page.extract_text()
    if content:
        raw_text += content

# ----------------------------
# 2. Split Text into Chunks
# ----------------------------
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

print(f"Number of text chunks: {len(texts)}")

# ----------------------------
# 3. Use Hugging Face Embeddings
# ----------------------------
# You can choose different models, e.g. "all-MiniLM-L6-v2", "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# ----------------------------
# 4. Create FAISS Vector Store
# ----------------------------
document_search = FAISS.from_texts(texts, embeddings)

# ----------------------------
# 5. Load a QA Chain with Hugging Face LLM
# ----------------------------
# You need a Hugging Face Hub token set up as an env variable if using large models
# Or you can plug in local models via HuggingFacePipeline
from langchain_community.llms import HuggingFacePipeline
from transformers import pipeline

# Example: use a local small model for Q&A
qa_pipeline = pipeline("text-generation", model="google/flan-t5-base", max_new_tokens=256)
llm = HuggingFacePipeline(pipeline=qa_pipeline)

chain = load_qa_chain(llm, chain_type="stuff")

# ----------------------------
# 6. Query the Document
# ----------------------------
query = "Personal Income-tax reforms"
docs = document_search.similarity_search(query)

result = chain.run(input_documents=docs, question=query)
print("\nAnswer:\n", result)

Number of text chunks: 116


  embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu
The model 'T5ForConditionalGeneration' is not supported for text-generation. Supported models are ['PeftModelForCausalLM', 'ApertusForCausalLM', 'ArceeForCausalLM', 'AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BitNetForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV2ForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'DogeForCausalLM', 'Dots1ForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'Ernie4_5ForCausalLM', 'Ernie4_5_MoeForCausalLM', 'Exaone4ForCausalLM', 'FalconForCausalLM', 'FalconH1ForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaFor


Answer:
 Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

(i) Personal Income Tax reforms with special focus on middle class  
(ii) Rationalization of TDS/TCS for easing difficulties  
(iii) Encouraging voluntary compliance  
(iv) Reducing compliance burden  
(v) Ease of doing business  
(vi) Employment and investment  
I will come to my proposal on personal income tax towards the end.  
TDS/TCS rationalization for easing difficulties  
137. I propose to rationalize Tax Deduction at Source (TDS) by reducing the 
number of rates and thresholds above which TDS is deducted. Further, 
threshold amounts for tax deduction will be increased for better clarity and 
uniformity. The limit for tax deduction  on interest for senior citizens is being  
doubled from the present ` 50,000 to ` 1 lakh. Similarly, the annual limit of ` 
2.40 lakh for TDS on rent is being  increased to ` 6 

## Model accuracy

In [7]:
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceHub

from langchain_community.llms import HuggingFacePipeline
from transformers import pipeline
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# ----------------------------
# 1. Read PDF
# ----------------------------
pdfreader = PdfReader("/content/sample_data/budget_speech.pdf")

raw_text = ""
for page in pdfreader.pages:
    content = page.extract_text()
    if content:
        raw_text += content

# ----------------------------
# 2. Split Text into Chunks
# ----------------------------
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

print(f"Number of text chunks: {len(texts)}")

# ----------------------------
# 3. Use Hugging Face Embeddings
# ----------------------------
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# ----------------------------
# 4. Create FAISS Vector Store
# ----------------------------
document_search = FAISS.from_texts(texts, embeddings)

# ----------------------------
# 5. Load a QA Chain with Hugging Face LLM
# ----------------------------
qa_pipeline = pipeline("text-generation", model="google/flan-t5-base", max_new_tokens=256)
llm = HuggingFacePipeline(pipeline=qa_pipeline)

chain = load_qa_chain(llm, chain_type="stuff")

# ----------------------------
# 6. Define Evaluation Dataset
# ----------------------------
eval_set = [
    {"question": "Personal Income-tax reforms",
     "answer":
"""Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

(i) Personal Income Tax reforms with special focus on middle class
(ii) Rationalization of TDS/TCS for easing difficulties
(iii) Encouraging voluntary compliance
(iv) Reducing compliance burden
(v) Ease of doing business
(vi) Employment and investment
I will come to my proposal on personal income tax towards the end.
TDS/TCS rationalization for easing difficulties
137. I propose to rationalize Tax Deduction at Source (TDS) by reducing the
number of rates and thresholds above which TDS is deducted. Further,
threshold amounts for tax deduction will be increased for better clarity and
uniformity. The limit for tax deduction  on interest for senior citizens is being
doubled from the present ` 50,000 to ` 1 lakh. Similarly, the annual limit of `
2.40 lakh for TDS on rent is being  increased to ` 6 lakh. This will reduce the
number of transactions liable to TDS, thus benefitting small tax payers
receiving small payments.

Rules 6 and 7 are being amended to increase the time limit for fulfilling
end use from current six months to one year and to file only a quarterly
statement instead of monthly statement.
 Note: AIDC – Agriculture Infrastructure and Development Cess; SWS – Social
Welfare Surcharge

G. OTHERS
There are few other changes of minor nature. For details of the budget
proposals, the Explanatory Memorandum and other relevant budget
documents may be referred to.
   46

Annexure to Part B
Amendments relating to Direct Taxes
(i) Personal Income -tax reforms with special focus on middle class
1. Substantial relief is proposed under the new tax regime with new slabs
and tax rates as under: -
 Total income  Rate of tax
Upto ` 4,00,000  Nil
From ` 4,00,001 to ` 8,00,000  5 per cent
From ` 8,00,001 to ` 12,00,000  10 per cent
From ` 12,00,001 to ` 16,00,000  15 per cent
From ` 16,00,001 to ` 20,00,000  20 per cent
From ` 20,00,001 to ` 24,00,000  25 per cent

lakh for salaried tax payers, due to standard deduction of ` 75,000.
157. Slabs and rates are being changed across the board to benefit all tax -
payers. The new structure will substantially reduce the taxes of the middle
class and leave more money in their hands, boosting household consumption,
savings and investment.
158.  In the new tax regime, I propose to revise tax rate structure as follows:
0-4 lakh rupees  Nil
4-8 lakh rupees  5 per cent
8-12 lakh rupees  10 per cent
12-16 lakh rupees  15 per cent
16-20 lakh rupees  20 per cent
20- 24 lakh rupees  25 per cent
Above 24 lakh rupees  30 per cent   28

159. To tax payers upto ` 12 lakh of normal income (other than special rate
income  such as capital gains ) tax rebate is being provided in addition to the
benefit due to slab rate reduction in such a manner that there is no tax payable
by them. The total tax benefit of slab rate changes and rebate at different

From ` 8,00,001 to ` 12,00,000  10 per cent
From ` 12,00,001 to ` 16,00,000  15 per cent
From ` 16,00,001 to ` 20,00,000  20 per cent
From ` 20,00,001 to ` 24,00,000  25 per cent
Above ` 24,00,000  30 per cent

2. Rebate on income -tax
• Resident individual with total income up to ` 7,00,000 do not pay any
tax due to rebate under the new tax regime. It is proposed to
increase the rebate for the resident individual under the new regime
so that they do not pay tax if their total income is up to ` 12,00,000.
Marginal relief as provided earlier under the new tax regime is also
applicable for income marginally higher than ` 12,00,000.
• A few examples for calculation of tax benefit are given in the table
below:   47

 Income  Tax on
Slabs and rates  Benefit
of Rebate benefit  Total
Benefit  Tax after
rebate
Benefit
 Present  Proposed  Rate
/Slab  Full upto Rs 12
lacs
8 lac  30,000  20,000  10,000  20,000  30,000  0

Question: Personal Income-tax reforms
Helpful Answer:
   """}
]

# ----------------------------
# 7. Run QA and Collect Predictions
# ----------------------------
predictions = []
for sample in eval_set:
    docs = document_search.similarity_search(sample["question"])
    pred = chain.run(input_documents=docs, question=sample["question"])
    predictions.append({"question": sample["question"],
                        "predicted": pred,
                        "ground_truth": sample["answer"]})

# ----------------------------
# 8. Evaluation Metrics
# ----------------------------

def exact_match(pred, truth):
    return int(pred.strip().lower() == truth.strip().lower())

def f1_score(pred, truth):
    pred_tokens = pred.lower().split()
    truth_tokens = truth.lower().split()
    common = set(pred_tokens) & set(truth_tokens)
    if not common:
        return 0.0
    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(truth_tokens)
    return 2 * (precision * recall) / (precision + recall)

def semantic_similarity(pred, truth, model):
    pred_emb = model.embed_query(pred)
    gt_emb = model.embed_query(truth)
    return cosine_similarity([pred_emb], [gt_emb])[0][0]

# ----------------------------
# 9. Compute Scores
# ----------------------------
em_scores, f1_scores, sim_scores = [], [], []
for p in predictions:
    em_scores.append(exact_match(p["predicted"], p["ground_truth"]))
    f1_scores.append(f1_score(p["predicted"], p["ground_truth"]))
    sim_scores.append(semantic_similarity(p["predicted"], p["ground_truth"], embeddings))

print("\nEvaluation Results:")
print("Exact Match Accuracy:", np.mean(em_scores))
print("F1 Score:", np.mean(f1_scores))
print("Semantic Similarity:", np.mean(sim_scores))

Number of text chunks: 116


Device set to use cpu
The model 'T5ForConditionalGeneration' is not supported for text-generation. Supported models are ['PeftModelForCausalLM', 'ApertusForCausalLM', 'ArceeForCausalLM', 'AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BitNetForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV2ForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'DogeForCausalLM', 'Dots1ForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'Ernie4_5ForCausalLM', 'Ernie4_5_MoeForCausalLM', 'Exaone4ForCausalLM', 'FalconForCausalLM', 'FalconH1ForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaFor


Evaluation Results:
Exact Match Accuracy: 1.0
F1 Score: 0.43902439024390244
Semantic Similarity: 1.0000000000000007


In [None]:
# Supported models are ['PeftModelForCausalLM', 'ApertusForCausalLM', 'ArceeForCausalLM', 'AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BitNetForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV2ForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'DogeForCausalLM', 'Dots1ForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'Ernie4_5ForCausalLM', 'Ernie4_5_MoeForCausalLM', 'Exaone4ForCausalLM', 'FalconForCausalLM', 'FalconH1ForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'Gemma3ForConditionalGeneration', 'Gemma3ForCausalLM', 'Gemma3nForConditionalGeneration', 'Gemma3nForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'Glm4ForCausalLM', 'Glm4MoeForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GptOssForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'GraniteMoeHybridForCausalLM', 'GraniteMoeSharedForCausalLM', 'HeliumForCausalLM', 'HunYuanDenseV1ForCausalLM', 'HunYuanMoEV1ForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'Lfm2ForCausalLM', 'LlamaForCausalLM', 'Llama4ForCausalLM', 'Llama4ForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MiniMaxForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MllamaForCausalLM', 'ModernBertDecoderForCausalLM', 'MoshiForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MusicgenMelodyForCausalLM', 'MvpForCausalLM', 'NemotronForCausalLM', 'OlmoForCausalLM', 'Olmo2ForCausalLM', 'OlmoeForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonForCausalLM', 'PhiForCausalLM', 'Phi3ForCausalLM', 'Phi4MultimodalForCausalLM', 'PhimoeForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'Qwen2ForCausalLM', 'Qwen2MoeForCausalLM', 'Qwen3ForCausalLM', 'Qwen3MoeForCausalLM', 'RecurrentGemmaForCausalLM', 'ReformerModelWithLMHead', 'RemBertForCausalLM', 'RobertaForCausalLM', 'RobertaPreLayerNormForCausalLM', 'RoCBertForCausalLM', 'RoFormerForCausalLM', 'RwkvForCausalLM', 'SeedOssForCausalLM', 'SmolLM3ForCausalLM', 'Speech2Text2ForCausalLM', 'StableLmForCausalLM', 'Starcoder2ForCausalLM', 'TransfoXLLMHeadModel', 'TrOCRForCausalLM', 'WhisperForCausalLM', 'XGLMForCausalLM', 'XLMWithLMHeadModel', 'XLMProphetNetForCausalLM', 'XLMRobertaForCausalLM', 'XLMRobertaXLForCausalLM', 'XLNetLMHeadModel', 'xLSTMForCausalLM', 'XmodForCausalLM', 'ZambaForCausalLM', 'Zamba2ForCausalLM'].

## Code to be refined ----

In [None]:
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline

# ----------------------------
# 1. Read PDF
# ----------------------------
pdfreader = PdfReader("/content/budget_speech.pdf")

raw_text = ""
for page in pdfreader.pages:
    content = page.extract_text()
    if content:
        raw_text += content

# ----------------------------
# 2. Split Text into Chunks
# ----------------------------
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)
print(f"Number of text chunks: {len(texts)}")

# ----------------------------
# 3. Embeddings
# ----------------------------
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# ----------------------------
# 4. Create FAISS Vector Store
# ----------------------------
document_search = FAISS.from_texts(texts, embeddings)

# ----------------------------
# 5. Load QA Chain with Guardrails
# ----------------------------
# Use FLAN-T5 (smaller, efficient) via pipeline
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base", max_new_tokens=256)
llm = HuggingFacePipeline(pipeline=qa_pipeline)

# Custom Prompt (forces model to only use context)
template = """
You are a helpful assistant. Answer the question ONLY using the provided context.
If the answer is not in the context, say "I could not find information in the document."

Context:
{context}

Question: {question}
Answer:
"""
prompt = PromptTemplate(template=template, input_variables=["context", "question"])
chain = load_qa_chain(llm, chain_type="stuff", prompt=prompt)

# ----------------------------
# 6. Guardrails
# ----------------------------

# A simple moderation model (can be swapped with OpenAI moderation or others)
moderation = pipeline("text-classification", model="facebook/roberta-hate-speech-dynabench-r4-target")

def is_safe_query(query: str) -> bool:
    """Check query for toxicity/hate/offensive content."""
    result = moderation(query)[0]
    if result["label"].lower() in ["hate", "toxic", "offensive"]:
        return False
    return True

# Define allowed scope (domain guardrail)
allowed_scope = ["budget", "finance", "tax", "reforms", "economy"]

def is_in_scope(query: str) -> bool:
    return any(word in query.lower() for word in allowed_scope)

# ----------------------------
# 7. Query the Document
# ----------------------------
query = "Personal Income-tax reforms"

if not is_safe_query(query):
    print("⚠️ Query flagged as unsafe. Stopping execution.")
elif not is_in_scope(query):
    print("⚠️ Query is out of scope for this document.")
else:
    docs = document_search.similarity_search(query, k=3)
    if not docs:
        print("⚠️ No relevant context found in the document.")
    else:
        result = chain.run(input_documents=docs, question=query)
        print("\nAnswer:\n", result)

Number of text chunks: 116


Device set to use cpu


config.json:   0%|          | 0.00/816 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cpu
Token indices sequence length is longer than the specified maximum sequence length for this model (751 > 512). Running this sequence through the model will result in indexing errors



Answer:
 (i)


In [None]:
# Use a summarization-focused model like facebook/bart-large-cnn or google/pegasus-xsum.

In [None]:
!pip install --upgrade transformers



## Refined code (Model training and output)


In [None]:
# 🔹 1. Imports
# ==============================================
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer
from datasets import Dataset
import json

# ==============================================
# 🔹 2. Read PDF
# ==============================================
pdfreader = PdfReader("/content/budget_speech.pdf")

raw_text = ""
for page in pdfreader.pages:
    content = page.extract_text()
    if content:
        raw_text += content

# ==============================================
# 🔹 3. Split Text into Chunks
# ==============================================
text_splitter = CharacterTextSplitter(
    separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len
)
texts = text_splitter.split_text(raw_text)
print(f"Number of text chunks: {len(texts)}")

# ==============================================
# 🔹 4. Embeddings + FAISS Vector Store
# ==============================================
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
document_search = FAISS.from_texts(texts, embeddings)

# ==============================================
# 🔹 5. Guardrails
# ==============================================
moderation = pipeline("text-classification", model="facebook/roberta-hate-speech-dynabench-r4-target")

def is_safe_query(query: str) -> bool:
    """Check query for toxicity/offensive content."""
    result = moderation(query)[0]
    return not (result["label"].lower() in ["hate", "toxic", "offensive"])

allowed_scope = ["budget", "finance", "tax", "reforms", "economy"]
def is_in_scope(query: str) -> bool:
    return any(word in query.lower() for word in allowed_scope)

# ==============================================
# 🔹 6. Create Unified Prompt (QA + Summarization)
# ==============================================
template = """
You are a helpful assistant. Perform the task requested.

If task is "question": Answer using the context only.
If task is "summarize": Provide a clear summary of the context.
If the answer is not in the context, say "I could not find information in the document."

Task: {task}
Context:
{context}

Input: {question}
Answer:
"""
prompt = PromptTemplate(template=template, input_variables=["task", "context", "question"])

# ==============================================
# 🔹 7. Load Base LLM (FLAN-T5)
# ==============================================
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base", max_new_tokens=256)
llm = HuggingFacePipeline(pipeline=qa_pipeline)
chain = load_qa_chain(llm, chain_type="stuff", prompt=prompt)

# ==============================================
# 🔹 8. Fine-Tuning Dataset (Q&A + Summarization)
# ==============================================
# Example dataset (extend with your own Q&A + summaries)
qa_sum_data = [
    {"instruction": "question", "input": "What changes were made in personal income tax?", "output": "Personal income tax slabs were revised to provide relief to middle-class taxpayers."},
    {"instruction": "summarize", "input": "Summarize the key highlights of the budget speech.", "output": "The budget emphasizes infrastructure growth, digital initiatives, and tax reforms."},
    {"instruction": "question", "input": "Does the budget mention agriculture?", "output": "Yes, it highlights new credit support schemes and measures for farmers."},
    {"instruction": "summarize", "input": "Summarize policies related to healthcare.", "output": "The budget increases spending on primary healthcare and expands insurance coverage."}
]

# Convert to input/output format
train_data = {
    "input_text": [f"task: {item['instruction']} | input: {item['input']}" for item in qa_sum_data],
    "target_text": [item["output"] for item in qa_sum_data],
}
dataset = Dataset.from_dict(train_data).train_test_split(test_size=0.2)

# ==============================================
# 🔹 9. Fine-Tune FLAN-T5
# ==============================================
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def preprocess(examples):
    model_inputs = tokenizer(examples["input_text"], max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(examples["target_text"], max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_ds = dataset.map(preprocess, batched=True)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = TrainingArguments(
    output_dir="./finetuned-flan-t5-qa-sum",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)
# ⚠️ Uncomment to train
# trainer.train()

# ==============================================
# 🔹 10. Reload Fine-Tuned Model
# ==============================================
# After training, load fine-tuned model
# qa_pipeline = pipeline("text2text-generation", model="./finetuned-flan-t5-qa-sum")
# llm = HuggingFacePipeline(pipeline=qa_pipeline)
# chain = load_qa_chain(llm, chain_type="stuff", prompt=prompt)

# ==============================================
# 🔹 11. Querying (Question or Summarization)
# ==============================================
query = "Summarize personal income-tax reforms"

if not is_safe_query(query):
    print("⚠️ Query flagged as unsafe. Stopping execution.")
elif not is_in_scope(query):
    print("⚠️ Query is out of scope for this document.")
else:
    docs = document_search.similarity_search(query, k=3)
    if not docs:
        print("⚠️ No relevant context found in the document.")
    else:
        # Define task
        task = "summarize" if "summarize" in query.lower() else "question"
        result = chain.run(input_documents=docs, question=query, task=task)
        print("\nAnswer:\n", result)


Number of text chunks: 116


Device set to use cpu
Device set to use cpu


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

  trainer = Trainer(
Token indices sequence length is longer than the specified maximum sequence length for this model (799 > 512). Running this sequence through the model will result in indexing errors



Answer:
 (ii) Personal Income Tax reforms with special focus on middle class 1. Substantial relief is proposed under the new tax regime with new slabs and tax rates as under: - Total income Rate of tax Upto  4,00,000 Nil From  4,00,001 to  8,00,000 5 per cent From  8,00,001 to  12,00,000 10 per cent From  12,00,001 to  12,00,000 10 per cent From  12,00,001 to  16,00,000 15 per cent From  16,00,001 to  2,00,000 20 per cent From  2,00,001 to  24,00,000 25 per cent (i) Personal Income Tax reforms with special focus on middle class 1. Substantial relief is proposed under the new tax regime with new slabs and tax rates as under: - Total income Rate of tax Upto  4,00,000 Nil From  4,00,001 to  8,00,000 5 per cent From  8,00,001 to  12,00,000 10 per cent From  12,00,001 to  16,00,


In [None]:
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from transformers import pipeline
from langchain.prompts import PromptTemplate

# ----------------------------
# 1. Read PDF
# ----------------------------
pdfreader = PdfReader("/content/budget_speech.pdf")

raw_text = ""
for page in pdfreader.pages:
    content = page.extract_text()
    if content:
        raw_text += content

# ----------------------------
# 2. Split Text into Chunks
# ----------------------------
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)
print(f"Number of text chunks: {len(texts)}")

# ----------------------------
# 3. Hugging Face Embeddings
# ----------------------------
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# ----------------------------
# 4. FAISS Vector Store
# ----------------------------
document_search = FAISS.from_texts(texts, embeddings)

# ----------------------------
# 5. Hugging Face LLM (Flan-T5)
# ----------------------------
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base", max_new_tokens=256)
llm = HuggingFacePipeline(pipeline=qa_pipeline)

# ----------------------------
# 6. Custom Prompts for map_reduce
# ----------------------------
map_prompt_template = """
You are a helpful assistant. Use the following context to answer the question.

Context:
{context}

Question: {question}

Answer in bullet points with clear structure.
"""
map_prompt = PromptTemplate(template=map_prompt_template, input_variables=["context", "question"])

combine_prompt_template = """
You are a summarizer. Combine the answers below into a clear, structured response.

Answers from documents:
{summaries}

Final Answer (use bullet points, avoid repetition):
"""
combine_prompt = PromptTemplate(template=combine_prompt_template, input_variables=["summaries"])

# Load chain with both prompts
chain = load_qa_chain(
    llm,
    chain_type="map_reduce",
    question_prompt=map_prompt,
    combine_prompt=combine_prompt,
)

# ----------------------------
# 7. Query the Document
# ----------------------------
query = "Personal Income-tax reforms"
docs = document_search.similarity_search(query)

result = chain.run(input_documents=docs, question=query)
print("\nAnswer:\n", result)


Number of text chunks: 116


Device set to use cpu



Answer:
 Slabs and rates are being changed across the board to benefit all tax - payers. The new structure will substantially reduce the taxes of the middle class and leave more money in their hands, boosting household consumption, savings and investment.


In [None]:
! pip install streamlit langchain faiss-cpu transformers pypdf2 sentence-transformers

Collecting streamlit
  Downloading streamlit-1.49.1-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.49.1-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m71.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m95.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.49.1


## Streamlit

In [None]:
# app.py
# ==============================================
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from transformers import pipeline
import difflib

# ==============================================
# 🔹 1. PDF Loader + Index Builder
# ==============================================
@st.cache_resource
def load_index(pdf_path="budget_speech.pdf"):
    pdfreader = PdfReader(pdf_path)
    raw_text = ""
    for page in pdfreader.pages:
        content = page.extract_text()
        if content:
            raw_text += content

    # Split text
    text_splitter = CharacterTextSplitter(
        separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len
    )
    texts = text_splitter.split_text(raw_text)

    # Vector store
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    document_search = FAISS.from_texts(texts, embeddings)

    return document_search

# ==============================================
# 🔹 2. Guardrails
# ==============================================
moderation = pipeline("text-classification", model="facebook/roberta-hate-speech-dynabench-r4-target")

def is_safe_query(query: str) -> bool:
    result = moderation(query)[0]
    return not (result["label"].lower() in ["hate", "toxic", "offensive"])

allowed_scope = ["budget", "finance", "tax", "reforms", "economy"]
def is_in_scope(query: str) -> bool:
    return any(word in query.lower() for word in allowed_scope)

# ==============================================
# 🔹 3. Prompt + LLM Chain
# ==============================================
template = """
You are a helpful assistant. Perform the task requested.

If task is "question": Answer using the context only.
If task is "summarize": Provide a clear summary of the context.
If the answer is not in the context, say "I could not find information in the document."

Task: {task}
Context:
{context}

Input: {question}
Answer:
"""
prompt = PromptTemplate(template=template, input_variables=["task", "context", "question"])

qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base", max_new_tokens=256)
llm = HuggingFacePipeline(pipeline=qa_pipeline)
chain = load_qa_chain(llm, chain_type="stuff", prompt=prompt)

# ==============================================
# 🔹 4. Accuracy Metric (String Similarity)
# ==============================================
def compute_accuracy(pred: str, reference: str) -> float:
    seq = difflib.SequenceMatcher(None, pred.lower(), reference.lower())
    return round(seq.ratio() * 100, 2)

# ==============================================
# 🔹 5. Streamlit UI
# ==============================================
st.set_page_config(page_title="📘 Budget QA & Summarizer", layout="centered")
st.title("📘 Budget QA & Summarizer")

# Load FAISS index
document_search = load_index("budget_speech.pdf")

query = st.text_input("🔹 Enter your query (question or summarization):")
if st.button("Get Answer"):
    if not query.strip():
        st.warning("⚠️ Please enter a query.")
    elif not is_safe_query(query):
        st.error("⚠️ Query flagged as unsafe. Try rephrasing.")
    elif not is_in_scope(query):
        st.error("⚠️ Query is out of scope for this document.")
    else:
        docs = document_search.similarity_search(query, k=3)
        if not docs:
            st.error("⚠️ No relevant context found in the document.")
        else:
            task = "summarize" if "summarize" in query.lower() else "question"
            result = chain.run(input_documents=docs, question=query, task=task)

            st.success("✅ Answer:")
            st.write(result)

            # For demo, compare against doc snippet (first chunk)
            reference = docs[0].page_content
            accuracy = compute_accuracy(result, reference)
            st.info(f"📊 Accuracy (vs. retrieved context): **{accuracy}%**")


Device set to use cpu
Device set to use cpu
2025-09-06 08:23:31.569 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-09-06 08:24:05.100 Session state does not function when running a script without `streamlit run`


In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


In [None]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=d67be2c3db6efdedc8308d05ebfe0dcc70e4a3d20a4854ff3c3607367d9bed50
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
# streamlit_app.py
# ==============================================
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from transformers import pipeline
import evaluate

# ==============================================
# 🔹 1. Setup
# ==============================================
st.set_page_config(page_title="Budget QA + Summarizer", layout="wide")
st.title("📘 Budget Speech QA & Summarizer with Guardrails + Accuracy")

# ==============================================
# 🔹 2. Load PDF and preprocess
# ==============================================
@st.cache_resource
def load_documents():
    pdfreader = PdfReader("budget_speech.pdf")
    raw_text = ""
    for page in pdfreader.pages:
        content = page.extract_text()
        if content:
            raw_text += content
    text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_text(raw_text)
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return FAISS.from_texts(texts, embeddings)

document_search = load_documents()

# ==============================================
# 🔹 3. Guardrails
# ==============================================
moderation = pipeline("text-classification", model="facebook/roberta-hate-speech-dynabench-r4-target")

def is_safe_query(query: str) -> bool:
    result = moderation(query)[0]
    return not (result["label"].lower() in ["hate", "toxic", "offensive"])

allowed_scope = ["budget", "finance", "tax", "reforms", "economy"]
def is_in_scope(query: str) -> bool:
    return any(word in query.lower() for word in allowed_scope)

# ==============================================
# 🔹 4. LLM + Chain
# ==============================================
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base", max_new_tokens=256)
llm = HuggingFacePipeline(pipeline=qa_pipeline)

template = """
You are a helpful assistant. Perform the task requested.

If task is "question": Answer using the context only.
If task is "summarize": Provide a clear summary of the context.
If the answer is not in the context, say "I could not find information in the document."

Task: {task}
Context:
{context}

Input: {question}
Answer:
"""
prompt = PromptTemplate(template=template, input_variables=["task", "context", "question"])
chain = load_qa_chain(llm, chain_type="stuff", prompt=prompt)

# ==============================================
# 🔹 5. Accuracy Evaluation (ROUGE / BLEU)
# ==============================================
rouge = evaluate.load("rouge")

def compute_accuracy(pred: str, gold: str):
    scores = rouge.compute(predictions=[pred], references=[gold])
    return scores["rougeL"]

# ==============================================
# 🔹 6. Streamlit UI
# ==============================================
user_query = st.text_input("🔍 Enter your query:", placeholder="e.g., Summarize personal income-tax reforms")

gold_answer = st.text_area("✅ (Optional) Provide expected/ground-truth answer for accuracy check:")

if st.button("Get Answer"):
    if not user_query.strip():
        st.warning("Please enter a query.")
    elif not is_safe_query(user_query):
        st.error("⚠️ Query flagged as unsafe. Stopping execution.")
    elif not is_in_scope(user_query):
        st.error("⚠️ Query is out of scope for this document.")
    else:
        docs = document_search.similarity_search(user_query, k=3)
        if not docs:
            st.error("⚠️ No relevant context found in the document.")
        else:
            task = "summarize" if "summarize" in user_query.lower() else "question"
            result = chain.run(input_documents=docs, question=user_query, task=task)

            st.subheader("📖 Answer")
            st.text_area("Model's Answer:", value=result, height=200)

            if gold_answer.strip():
                acc = compute_accuracy(result, gold_answer)
                st.success(f"📊 Accuracy (ROUGE-L vs Ground Truth): {acc:.2f}")


Device set to use cpu
Device set to use cpu
