In [None]:
!pip install transformers sentence-transformers langchain==0.3.25 torch faiss-cpu numpy langchain_community pypdf sentence_transformers langchain_huggingface

Collecting langchain==0.3.25
  Downloading langchain-0.3.25-py3-none-any.whl.metadata (7.8 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting pypdf
  Downloading pypdf-5.7.0-py3-none-any.whl.metadata (7.2 kB)
Collecting langchain_huggingface
  Downloading langchain_huggingface-0.3.0-py3-none-any.whl.metadata (996 bytes)
Collecting langsmith<0.4,>=0.1.17 (from langchain==0.3.25)
  Downloading langsmith-0.3.45-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Do

In [None]:
import os
from urllib.request import urlretrieve

import numpy as np
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.vectorstores import FAISS

# Data loading

In [None]:
# Download documents from the U.S. Census Bureau to a local directory.
# Files that are already on disk are skipped, so re-running the notebook does
# not fetch them again.
os.makedirs("us_census", exist_ok=True)
files = [
    "https://www.census.gov/content/dam/Census/library/publications/2022/demo/p70-178.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-017.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-016.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-015.pdf",
]
for url in files:
    # The local file name is the last path segment of the URL.
    file_path = os.path.join("us_census", url.rpartition("/")[2])
    if os.path.exists(file_path):
        continue
    # Fetch under a temporary name and rename on success, so an interrupted
    # download is never mistaken for a complete PDF on the next run.
    partial_path = file_path + ".part"
    urlretrieve(url, partial_path)
    os.replace(partial_path, file_path)

In [None]:
# Read the PDF files found in the local ./us_census/ directory and report how
# many documents the loader produced.
loader = PyPDFDirectoryLoader("./us_census/")
docs_before_split = loader.load()

loaded_count = len(docs_before_split)
print(loaded_count)

63


# Chunking

In [None]:
# Split the loaded documents into chunks (chunk_size=700, chunk_overlap=50).
splitter_options = {"chunk_size": 700, "chunk_overlap": 50}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
docs_after_split = text_splitter.split_documents(docs_before_split)

# Display the first chunk as the cell's output.
docs_after_split[0]

Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 18.5 (Windows)', 'creationdate': '2023-10-19T11:35:38-04:00', 'author': 'U.S. Census Bureau', 'keywords': 'household income in states and metropolitan areas 2022', 'moddate': '2023-11-30T12:35:09+00:00', 'title': 'Household Income in States and Metropolitan Areas: 2022', 'trapped': '/false', 'source': 'us_census/acsbr-017.pdf', 'total_pages': 9, 'page': 0, 'page_label': '1'}, page_content='KEY DEFINITIONS\nHousehold income: Includes income of the \nhouseholder and all other people 15 years and \nolder in the household, whether or not they are \nrelated to the householder.\nMedian: The point that divides the household \nincome distribution into halves, one half with \nincome above the median and the other with \nincome below the median. The median is based \non the income distribution of all households, \nincluding those with no income.\nGini index: A summary measure of income \ninequality. The Gini inde

In [None]:
# Keep only the first MAX_CHUNKS chunks (presumably to keep CPU embedding quick).
# NOTE: everything downstream (index, retrieval, answer) only sees these chunks,
# so content from later pages/files cannot be retrieved.
# Set MAX_CHUNKS = None to keep every chunk.
MAX_CHUNKS = 50
docs_after_split = docs_after_split[:MAX_CHUNKS]

# Embedding Model Initialization

In [None]:
# all-MiniLM-L6-v2 is a general sentence-transformers model, so it is loaded
# with the generic HuggingFaceEmbeddings wrapper. The BGE-specific wrapper
# prepends a BGE retrieval instruction to every query, which is only meant for
# BGE models and skews query embeddings for other models.
huggingface_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-l6-v2",  # light and fast model
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

  huggingface_embeddings = HuggingFaceBgeEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Embedding Model Test

In [None]:
# Sanity check: embed the text of the first chunk and inspect the vector.
first_chunk_text = docs_after_split[0].page_content
sample_embedding = np.array(huggingface_embeddings.embed_query(first_chunk_text))

print("Sample embedding of a document chunk: ", sample_embedding)
print("Size of the embedding: ", sample_embedding.shape)

Sample embedding of a document chunk:  [-4.50778641e-02  1.50797488e-02  2.46991422e-02 -6.65423111e-04
 -4.38780524e-02 -1.07891159e-02 -5.07413708e-02  3.44191343e-02
 -5.41927777e-02 -9.30074137e-03 -1.30154910e-02 -5.37789166e-02
  2.13211495e-02  1.02954237e-02 -9.92281437e-02 -6.54240847e-02
 -3.09051052e-02 -1.58301797e-02  1.41140707e-02  6.87583685e-02
  9.43290964e-02  2.07248311e-02  7.63684185e-03 -4.11620140e-02
  1.31620556e-01 -3.66067402e-02  7.67383585e-03 -3.41178812e-02
  2.31406316e-02  1.17237911e-01  2.92219147e-02  7.96835274e-02
  1.58325985e-01 -3.14751416e-02 -3.04871630e-02 -2.70445719e-02
  5.88974282e-02 -1.05799651e-02  9.17627141e-02 -3.91758494e-02
  2.69899424e-02 -5.07464521e-02 -5.43461367e-03 -5.83154075e-02
  2.54384838e-02 -2.80841272e-02  1.25270542e-02  6.83660731e-02
  4.43656109e-02  3.13634425e-02 -9.45478603e-02  6.73463047e-02
  4.78056408e-02  3.92849743e-02  5.03921211e-02  1.96698923e-02
 -1.25939446e-02 -4.76628207e-02  5.71067259e-02  5

# Embeddings creation & vector DB data ingestion

In [None]:
# Embed the chunks with the embedding model and build a FAISS vector store
# from them.
vectorstore = FAISS.from_documents(
    documents=docs_after_split,
    embedding=huggingface_embeddings,
)

# Retrieving Chunks

In [None]:
# Sample question; change it to any other question you are interested in.
query = "Details of Clayton Gumber and Briana Sullivan"

# Ask the vector store for the 4 chunks most similar to the query.
relevant_documents = vectorstore.similarity_search(query, k=4)

retrieved_count = len(relevant_documents)
print(f'There are {retrieved_count} documents retrieved which are relevant to the query. Display the first one:\n')

top_chunk = relevant_documents[0]
print(top_chunk.page_content)

There are 4 documents retrieved which are relevant to the query. Display the first one:

North Carolina .......... 66,986 584 67,4 81 623 0.7 1.3 0.480 0.004 0.477 0.004 –0.7 1.1
North Dakota  ........... 71,901 1,970 71,970 2,072 0.1 4.0 0.453 0.012 0.468 0.013 3.2 3.8
Ohio ................... 67, 29 9 493 65,720 618 *–2.3 1.2 0.469 0.003 0.469 0.004 0.1 1.0
Oklahoma  .............. 60,342 674 59,673 689 –1.1 1.6 0.462 0.004 0.474 0.005 *2.7 1.4
Oregon ................. 77,352 1,144 75,657 1,181 *–2.2 2.1 0.461 0.005 0.468 0.005 1.5 1.5
Pennsylvania  ........... 74,537 545 71,798 557 *–3.7 1.0 0.472 0.002 0.478 0.003 *1.2 0.8
Rhode Island  ........... 79,996 2,574 81,854 2,182 2.3 4.2 0.468 0.011 0.464 0.014 –0.9 3.8


In [None]:
# Merge the retrieved chunks into one context string. A blank line between
# chunks keeps the end of one chunk from running straight into the start of
# the next one.
final_content = "\n\n".join(doc.page_content for doc in relevant_documents)
final_content

'North Carolina .......... 66,986 584 67,4 81 623 0.7 1.3 0.480 0.004 0.477 0.004 –0.7 1.1\nNorth Dakota  ........... 71,901 1,970 71,970 2,072 0.1 4.0 0.453 0.012 0.468 0.013 3.2 3.8\nOhio ................... 67, 29 9 493 65,720 618 *–2.3 1.2 0.469 0.003 0.469 0.004 0.1 1.0\nOklahoma  .............. 60,342 674 59,673 689 –1.1 1.6 0.462 0.004 0.474 0.005 *2.7 1.4\nOregon ................. 77,352 1,144 75,657 1,181 *–2.2 2.1 0.461 0.005 0.468 0.005 1.5 1.5\nPennsylvania  ........... 74,537 545 71,798 557 *–3.7 1.0 0.472 0.002 0.478 0.003 *1.2 0.8\nRhode Island  ........... 79,996 2,574 81,854 2,182 2.3 4.2 0.468 0.011 0.464 0.014 –0.9 3.8most populous metropolitan areas.\n2, 3 It also includes \nselected demographic characteristics of the house-\nholder. Changes in the Gini index between 2021 and \n2022 are presented for the nation, states, the District \nof Columbia, and Puerto Rico.  \nThe ACS data (which include the PRCS) provide \ndetailed estimates of demographic, social, economic,

In [None]:
# Prompt skeleton: answering rules first, then the retrieved context, then the
# user's question.
prompt_template = """Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

{final_content}

Question: {query}

Helpful Answer:
"""

# Fill in the context and the question, then show the finished prompt.
final_prompt = prompt_template.format(final_content=final_content, query=query)
final_prompt

'Use the following pieces of context to answer the question at the end. Please follow the following rules:\n1. If you don\'t know the answer, don\'t try to make up an answer. Just say "I can\'t find the final answer but you may want to check the following links".\n2. If you find the answer, write the answer in a concise way with five sentences maximum.\n\nNorth Carolina .......... 66,986 584 67,4 81 623 0.7 1.3 0.480 0.004 0.477 0.004 –0.7 1.1\nNorth Dakota  ........... 71,901 1,970 71,970 2,072 0.1 4.0 0.453 0.012 0.468 0.013 3.2 3.8\nOhio ................... 67, 29 9 493 65,720 618 *–2.3 1.2 0.469 0.003 0.469 0.004 0.1 1.0\nOklahoma  .............. 60,342 674 59,673 689 –1.1 1.6 0.462 0.004 0.474 0.005 *2.7 1.4\nOregon ................. 77,352 1,144 75,657 1,181 *–2.2 2.1 0.461 0.005 0.468 0.005 1.5 1.5\nPennsylvania  ........... 74,537 545 71,798 557 *–3.7 1.0 0.472 0.002 0.478 0.003 *1.2 0.8\nRhode Island  ........... 79,996 2,574 81,854 2,182 2.3 4.2 0.468 0.011 0.464 0.014 –0.9 3

In [None]:
!pip install groq

Collecting groq
  Downloading groq-0.29.0-py3-none-any.whl.metadata (16 kB)
Downloading groq-0.29.0-py3-none-any.whl (130 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.8/130.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.29.0


# Answer Generation

In [None]:
from groq import Groq

# Read the API key from the environment instead of writing it into the
# notebook, so the secret is never saved or shared with the file.
# Set GROQ_API_KEY before running this cell.
client = Groq(api_key=os.environ["GROQ_API_KEY"])

# Send the retrieval-augmented prompt built in the earlier cell: `final_prompt`
# already contains the answering rules, the retrieved context and the question.
# Overwriting it with a bare question would drop the context and let the model
# answer from memory alone.
chat_completion = client.chat.completions.create(
    model="llama3-70b-8192",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": final_prompt},
    ],
)

# Print the generated answer.
print(chat_completion.choices[0].message.content)

Clayton Gumber and Briana Sullivan are both experts in the field of economics and statistics, and they have a significant connection to the U.S. Census Bureau and economic data.

Clayton Gumber is a former Associate Director for Economic Programs at the U.S. Census Bureau. In this role, he oversaw the development and publication of various economic data programs, including the Census of Retail Trade, the Census of Wholesale Trade, and the Economic Census. Gumber also played a key role in the development of the North American Industry Classification System (NAICS), which is used to classify businesses for data collection purposes.

Briana Sullivan is a statistician and economist who has worked at the U.S. Census Bureau, specifically in the Center for Economic Studies (CES). CES is a research arm of the Census Bureau that focuses on producing and analyzing economic data, particularly at the micro level (i.e., data on individual firms, households, and establishments). Sullivan's work has 

In [None]:
!pip freeze

absl-py==1.4.0
accelerate==1.7.0
aiofiles==24.1.0
aiohappyeyeballs==2.6.1
aiohttp==3.11.15
aiosignal==1.3.2
alabaster==1.0.0
albucore==0.0.24
albumentations==2.0.8
ale-py==0.11.1
altair==5.5.0
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.9.0
argon2-cffi==25.1.0
argon2-cffi-bindings==21.2.0
array_record==0.7.2
arviz==0.21.0
astropy==7.1.0
astropy-iers-data==0.2025.6.2.0.38.23
astunparse==1.6.3
atpublic==5.1
attrs==25.3.0
audioread==3.0.1
autograd==1.8.0
babel==2.17.0
backcall==0.2.0
backports.tarfile==1.2.0
beautifulsoup4==4.13.4
betterproto==2.0.0b6
bigframes==2.5.0
bigquery-magics==0.9.0
bleach==6.2.0
blinker==1.9.0
blis==1.3.0
blobfile==3.0.0
blosc2==3.3.4
bokeh==3.7.3
Bottleneck==1.4.2
bqplot==0.12.45
branca==0.8.1
build==1.2.2.post1
CacheControl==0.14.3
cachetools==5.5.2
catalogue==2.0.10
certifi==2025.4.26
cffi==1.17.1
chardet==5.2.0
charset-normalizer==3.4.2
chex==0.1.89
clarabel==0.11.0
click==8.2.1
cloudpathlib==0.21.1
cloudpickle==3.1.1
cmake==3.31.6
cmdstanpy