In [2]:
%pip install pandas

Collecting pandas
  Downloading pandas-3.0.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (79 kB)
Downloading pandas-3.0.0-cp313-cp313-macosx_11_0_arm64.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m29.2 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: pandas
Successfully installed pandas-3.0.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd

# Path to your census codebook
file_path = "/Users/kwatchomahinanda/Downloads/census_codebook.csv"

# Only the load itself is guarded. Keeping the reporting code outside the
# `try` means a problem in the summary is not mislabelled as a load error.
try:
    census_df = pd.read_csv(file_path)
except Exception as e:
    # Best-effort preview: report why the file could not be read and move on.
    print(f"Error: {e}")
else:
    print("\n--- CENSUS CODEBOOK SUMMARY ---")
    print(f"Total Variables: {census_df.shape[0]}")
    print(f"Columns: {census_df.columns.tolist()}")

    print("\n--- FIRST 3 ROWS ---")
    print(census_df.head(3))

    print("\n--- DETAILED LOOK AT FIRST VARIABLE ---")
    if len(census_df) == 0:
        # A header-only CSV has no row 0; say so instead of letting
        # `.iloc[0]` fail with an out-of-bounds IndexError.
        print("(codebook has no rows)")
    else:
        # Build the first-row Series once rather than once per column.
        first_row = census_df.iloc[0]
        for col, value in first_row.items():
            print(f"{col}: {value}")


--- CENSUS CODEBOOK SUMMARY ---
Total Variables: 496956
Columns: ['title', 'description', 'vintage', 'variable', 'label']

--- FIRST 3 ROWS ---
                                               title  \
0  Apr 1989 Current Population Survey: Basic Monthly   
1  Apr 1989 Current Population Survey: Basic Monthly   
2  Apr 1989 Current Population Survey: Basic Monthly   

                                         description  vintage  variable  \
0  To provide estimates of employment, unemployme...     1989  A_ABSREA   
1  To provide estimates of employment, unemployme...     1989   A_AG_NA   
2  To provide estimates of employment, unemployme...     1989     A_AGE   

                                         label  
0  Labor Force-reasons for absence, pay status  
1     Indus.&Occ.-agriculture, non-agriculture  
2                              Demographic-age  

--- DETAILED LOOK AT FIRST VARIABLE ---
title: Apr 1989 Current Population Survey: Basic Monthly
description: To provide estimates o

In [6]:
import pandas as pd
import os

# Locations of the BWDC education extract and its Excel data dictionary
edu_path = "/Users/kwatchomahinanda/Downloads/explore_data_edu_01/edu_01.csv"
dict_path = "/Users/kwatchomahinanda/Downloads/explore_data_edu_01/data-dictionary.xlsx"

# Education CSV: report its shape, at most the first ten column names
# (plus the total column count), and the first two rows.
print("\n--- BWDC EDUCATION DATA ---")
try:
    edu_df = pd.read_csv(edu_path)
    n_columns = edu_df.shape[1]
    leading_columns = edu_df.columns.tolist()[:10]
    print(f"Shape: {edu_df.shape}")
    print(f"Columns: {leading_columns}... (Total: {n_columns})")
    print("\nFirst 2 rows:")
    print(edu_df.head(2))
except Exception as e:
    # Any failure in this section is reported rather than raised.
    print(f"Error loading CSV: {e}")

# Data dictionary workbook: list every column name and the first three entries.
print("\n--- DATA DICTIONARY (EXCEL) ---")
try:
    dict_df = pd.read_excel(dict_path)
    dictionary_columns = dict_df.columns.tolist()
    print(f"Columns: {dictionary_columns}")
    print("\nFirst 3 entries:")
    print(dict_df.head(3))
except Exception as e:
    # Any failure in this section is reported rather than raised.
    print(f"Error loading Excel: {e}")


--- BWDC EDUCATION DATA ---
Shape: (312, 5)
Columns: ['sex', 'educational_attainment', 'year', 'race_or_ethnicity', 'rate']... (Total: 5)

First 2 rows:
      sex       educational_attainment  year race_or_ethnicity  rate
0    Male  Bachelor's or higher degree  2014             Black  21.0
1  Female  Bachelor's or higher degree  2017          Hispanic  18.6

--- DATA DICTIONARY (EXCEL) ---
Columns: ['column_name', 'visual name', 'variable name on visual', 'visual_unique_id', 'data table', 'publisher', 'description']

First 3 entries:
         column_name                                        visual name  \
0               year  Median Total Assets for Households, by Race/Et...   
1               race  Median Total Assets for Households, by Race/Et...   
2  education_buckets  Median Total Assets for Households, by Race/Et...   

      variable name on visual visual_unique_id                     data table  \
0                        Year           adt_01  \nSummary Extract Public Data

In [8]:
import pandas as pd
import chromadb
from chromadb.utils import embedding_functions

# 1. Setup ChromaDB (Local Persistence)
# The client is backed by the ./invest_atlanta_vault directory, relative to
# the kernel's working directory.
client = chromadb.PersistentClient(
    path="./invest_atlanta_vault",
)

# Sentence-transformers embedding function using the all-MiniLM-L6-v2 model
# (expected to be downloaded automatically the first time it is needed).
emb_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2",
)

# Collection for the education statistics; an existing "bwdc_education"
# collection is reused, otherwise a new one is created.
edu_collection = client.get_or_create_collection(
    name="bwdc_education",
    embedding_function=emb_fn,
)

# 2. Load the CSV
df = pd.read_csv("/Users/kwatchomahinanda/Downloads/explore_data_edu_01/edu_01.csv")

# Three parallel lists, one entry per CSV row: the text to embed, the
# metadata stored with it, and a unique id.
documents, metadatas, ids = [], [], []

print("Vectorizing data... please wait.")

for idx, row in df.iterrows():
    # Turn the row into a plain-English sentence for the embedding model.
    documents.append(
        f"In {row['year']}, the educational attainment rate for {row['race_or_ethnicity']} "
        f"{row['sex']} individuals labeled as '{row['educational_attainment']}' was {row['rate']}%."
    )
    # Year is stored as a built-in int; race is stored exactly as it appears in the CSV.
    metadatas.append({"year": int(row['year']), "race": row['race_or_ethnicity']})
    # Ids are derived from the DataFrame index label of the row.
    ids.append(f"edu_row_{idx}")

# 3. Write to ChromaDB
# `upsert` (rather than `add`) keeps this cell safe to re-run: the ids are
# fixed ("edu_row_<n>") and the collection persists between kernel sessions,
# so a second `add` would collide with the ids already stored and could not
# refresh them. `upsert` inserts new ids and overwrites existing ones.
if documents:
    edu_collection.upsert(
        documents=documents,
        metadatas=metadatas,
        ids=ids,
    )

print(f"Successfully vectorized {len(documents)} rows into the local vault.")

# No explicit save step is needed: PersistentClient writes to its directory
# on its own, and `client.heartbeat()` is only a liveness ping, not a flush.
# Confirm the write by asking the collection how many records it holds.
print(f"Vault now holds {edu_collection.count()} records (persisted automatically).")

Vectorizing data... please wait.
Successfully vectorized 312 rows into the local vault.
Vault is locked and saved.


In [9]:
# 1. The natural-language question to look up
query = "What is the bachelor's degree rate for Black men?"

# 2. Ask the collection for the three closest documents to the question
results = edu_collection.query(query_texts=[query], n_results=3)

# 3. Print the matches, numbered from 1.
#    results['documents'] holds one list per query text; only one was sent.
print(f"--- RESULTS FOR: '{query}' ---")
top_matches = results['documents'][0]
for rank, doc in enumerate(top_matches, start=1):
    print(f"Match {rank}: {doc}")

--- RESULTS FOR: 'What is the bachelor's degree rate for Black men?' ---
Match 1: In 2017, the educational attainment rate for Black Male individuals labeled as 'Bachelor's or higher degree' was 22.6%.
Match 2: In 2018, the educational attainment rate for Black Male individuals labeled as 'Bachelor's or higher degree' was 23.7%.
Match 3: In 2019, the educational attainment rate for Black Male individuals labeled as 'Bachelor's or higher degree' was 24.4%.
