**Install necessary libraries**


In [1]:

!pip install PyPDF2 sentence-transformers faiss-cpu openai nltk


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2, faiss-cpu
Successfully installed PyPDF2-3.0.1 faiss-cpu-1.9.0.post1


**upload the pdf file**

In [2]:
from google.colab import files
uploaded = files.upload()

Saving task1_pdf.pdf to task1_pdf.pdf


In [3]:
!rm -rf /root/nltk_data  # Remove existing NLTK data folder


In [4]:
import nltk
nltk.download('punkt', force=True)
nltk.data.path.append('/root/nltk_data')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [10]:
!pip install pdfplumber




**find the tables in the pdf**

In [20]:
import pdfplumber
import pandas as pd


pdf_path = "/content/task1_pdf.pdf"

# Extract tables from the PDF
def extract_tables_from_pdf(pdf_path):
    all_tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages):
            tables = page.extract_tables()
            for table in tables:
                df = pd.DataFrame(table[1:], columns=table[0])
                df['Page'] = page_number + 1
                all_tables.append(df)
    return all_tables


tables = extract_tables_from_pdf(pdf_path)
print(f"Total tables extracted: {len(tables)}")


if tables:
    print("\nFirst Table Extracted:")
    print(tables[0])  # Print the first table
else:
    print("No tables found.")


Total tables extracted: 2

First Table Extracted:
                                                Year      2010      2011  \
0                                     All Industries  26093515  27535971   
1                                      Manufacturing   4992521   5581942   
2  Finance,\nInsurance, Real\nEstate, Rental,\nLe...   4522451   4618678   
3  Arts,\nEntertainment,\nRecreation,\nAccommodat...    964032   1015238   
4                                              Other  15614511  16320113   

       2012      2013      2014      2015  Page  
0  28663246  29601191  30895407  31397023     6  
1   5841608   5953299   6047477   5829554     6  
2   4797313   5031881   5339678   5597018     6  
3   1076249   1120496   1189646   1283813     6  
4  16948076  17495515  18318606  18686638     6  


**answering the some table related queries**

In [21]:

queries = [
    {"keyword": "Manufacturing", "year": "2013"},
    {"keyword": "Finance", "year": "2014"},
    {"keyword": "Arts", "year": "2015"}
]


def query_gdp(dataframe, keyword, year):
    result = dataframe[dataframe['Year'].str.contains(keyword, case=False, na=False)]
    if not result.empty and year in dataframe.columns:
        return result[year].values[0]
    return "Data not found"

for query in queries:
    result = query_gdp(tables[0], query["keyword"], query["year"])
    print(f"GDP for {query['keyword']} in {query['year']}: {result}")


GDP for Manufacturing in 2013: 5953299
GDP for Finance in 2014: 5339678
GDP for Arts in 2015: 1283813


**extract both tables and text in pdf and divide the text into some nuber of chunks**

In [22]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np


def extract_text_and_tables(pdf_path):
    all_content = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages):

            text = page.extract_text()
            if text:
                all_content.append((f"Page {page_number+1} Text", text))


            tables = page.extract_tables()
            for table in tables:
                df = pd.DataFrame(table[1:], columns=table[0])
                table_text = df.to_string(index=False)  # Convert table to text
                all_content.append((f"Page {page_number+1} Table", table_text))

    return all_content

# Path to PDF
pdf_path = "/content/task1_pdf.pdf"

# Extract and combine content
content_chunks = extract_text_and_tables(pdf_path)

# Print the number of chunks
print(f"Total Chunks (Text + Tables): {len(content_chunks)}")
print("\nFirst Chunk:")
print(content_chunks[0])


Total Chunks (Text + Tables): 20

First Chunk:
('Page 1 Text', 'Tables, Charts, and\nGraphs\nwith Examples from History, Economics,\nEducation, Psychology, Urban Affairs and\nEveryday Life\nREVISED: MICHAEL LOLKUS 2018')


generate the word embenddings and store them in FAISS(which will act as a database to store the word embenddings)

In [23]:

model = SentenceTransformer('all-MiniLM-L6-v2')

chunk_texts = [chunk[1] for chunk in content_chunks]
embeddings = model.encode(chunk_texts, show_progress_bar=True)


embedding_dim = embeddings[0].shape[0]
index = faiss.IndexFlatL2(embedding_dim)


embeddings_array = np.array(embeddings).astype('float32')
index.add(embeddings_array)

print(f"Total Embeddings Added to FAISS Index: {index.ntotal}")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Total Embeddings Added to FAISS Index: 20


**Search FAISS for text-based answers**

In [24]:
# Search FAISS for text-based answers
def semantic_search(query, model, index, content_chunks, top_k=3):
    query_embedding = model.encode([query])
    distances, indices = index.search(np.array(query_embedding).astype('float32'), top_k)
    results = [content_chunks[i] for i in indices[0]]
    return results


def query_table(tables, keyword, year):
    results = []
    for table in tables:
        for col in table.columns:
            if year in col:
                matching_rows = table[table.iloc[:, 0].str.contains(keyword, case=False, na=False)]
                for _, row in matching_rows.iterrows():
                    results.append((keyword, year, row[col]))
    return results

#querry
user_query = "What is the GDP for Manufacturing in 2013?"

# Search chunks
semantic_results = semantic_search(user_query, model, index, content_chunks, top_k=2)
print("\nTop Semantic Search Results:")
for result in semantic_results:
    print(f"{result[0]}: {result[1]}")

#results
table_results = query_table([tables[0]], "Manufacturing", "2013")
print("\nTable Search Results:")
for result in table_results:
    print(f"GDP for {result[0]} in {result[1]}: {result[2]}")



Top Semantic Search Results:
Page 7 Text: • The chart below is called a pie chart. It shows what
percent “of the pie” a particular category occupies
out of the whole.
• If total GDP in 2015 is the entire pie, then
manufacturing makes up 19% of that pie and finance
makes up 18%. Notice that visually speaking, since 19%
and 18% are so close to each other in value, their
respective slices of the pie are similarly sized.
2015 U.S. GDP (in millions of dollars)
Manufacturing
19%
Finance, insurance, real
estate, rental, and
leasing
18% Arts, entertainment,
59%
recreation,
accommodation, and
food services
Other
4%
Page 6 Text: Table of Yearly U.S. GDP by
Industry (in millions of dollars)
Source: U.S. Bureau of Labor Statistics
Year 2010 2011 2012 2013 2014 2015
All Industries 26093515 27535971 28663246 29601191 30895407 31397023
Manufacturing 4992521 5581942 5841608 5953299 6047477 5829554
Finance,
Insurance, Real
4522451 4618678 4797313 5031881 5339678 5597018
Estate, Rental,
Leasing
Arts,
E

**now generating the response by using the LLM / transformer**

In [25]:
from transformers import pipeline

#using google/flan-t5-base model to generate the text
llm_pipeline = pipeline("text-generation", model="google/flan-t5-base", max_length=200)

print("LLM pipeline loaded successfully!")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
The model 'T5ForConditionalGeneration' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMo

LLM pipeline loaded successfully!


**generating the response for query prompt**

In [29]:
def prepare_llm_input(query, semantic_results, table_results):
    """
    Combine retrieved semantic chunks and table search results into a single input prompt for the LLM.
    Filter the results to prioritize chunks that contain year-based data (e.g., 2013).
    """
    # Filter semantic results and table results for relevance
    relevant_semantic_results = [result for result in semantic_results if "2013" in result[1]]
    relevant_table_results = [result for result in table_results if "2013" in result[1]]


    prompt = f"Question: '{query}'\n\n"


    prompt += "Context from Retrieved Information (Filtered for relevance):\n"
    for i, (source, text) in enumerate(relevant_semantic_results, 1):
        prompt += f"{i}. {source}: {text.strip()}\n"


    prompt += "\nAdditional Data from Tables (Filtered for 2013):\n"
    for result in relevant_table_results:
        keyword, year, value = result
        prompt += f"- The GDP for {keyword} in {year} is {value}.\n"


    prompt += "\nAnswer the question concisely using only the necessary information provided above."

    return prompt

# query
query = "What is the GDP for Manufacturing in 2013?"

semantic_results = [
    ("Page 7 Text", "The chart below shows the GDP for Manufacturing, Finance, etc., including 2015 data."),
    ("Page 6 Text", "Table of Yearly U.S. GDP by Industry (in millions of dollars) with data from 2010 to 2015.")
]

table_results = [
    ("Manufacturing", "2013", "5953299"),
    ("Finance", "2013", "5031881"),
    ("Arts", "2013", "1120496")
]


llm_input = prepare_llm_input(query, semantic_results, table_results)


print("Generated Input for LLM:")
print(llm_input)

# Generate the final answer using the LLM
llm_output = llm_pipeline(llm_input, truncation=True)[0]["generated_text"]

# Display the final answer
print("\nFinal Answer Generated by the LLM:")
print(llm_output)


Generated Input for LLM:
Question: 'What is the GDP for Manufacturing in 2013?'

Context from Retrieved Information (Filtered for relevance):

Additional Data from Tables (Filtered for 2013):
- The GDP for Manufacturing in 2013 is 5953299.
- The GDP for Finance in 2013 is 5031881.
- The GDP for Arts in 2013 is 1120496.

Answer the question concisely using only the necessary information provided above.

Final Answer Generated by the LLM:
Question: 'What is the GDP for Manufacturing in 2013?'

Context from Retrieved Information (Filtered for relevance):

Additional Data from Tables (Filtered for 2013):
- The GDP for Manufacturing in 2013 is 5953299.
- The GDP for Finance in 2013 is 5031881.
- The GDP for Arts in 2013 is 1120496.

Answer the question concisely using only the necessary information provided above.
