## Explore `censusdis`

In [1]:
!pip install censusdis



In [2]:
import censusdis.data as ced
from censusdis import states

In [3]:
df_datasets = ced.variables.all_data_sets()

df_datasets[df_datasets['DATASET'].str.contains('acs5')].to_csv('acs5_datasets.csv', index=False)

In [50]:
acs5_profile_groups = ced.variables.all_groups('acs/acs5/profile', 2020)

acs5_profile_groups.to_csv('acs5_profile_groups.csv', index=False)

In [51]:
acs5_groups = ced.variables.all_groups('acs/acs5', 2020)

acs5_groups.to_csv('acs5_groups.csv', index=False)

In [52]:
ced.variables.all_variables('acs/acs5/profile', 2020, 'DP04')

Unnamed: 0,YEAR,DATASET,GROUP,VARIABLE,LABEL,SUGGESTED_WEIGHT,VALUES
0,2020,acs/acs5/profile,DP04,DP04_0001E,Estimate!!HOUSING OCCUPANCY!!Total housing units,,
1,2020,acs/acs5/profile,DP04,DP04_0001PE,Percent!!HOUSING OCCUPANCY!!Total housing units,,
2,2020,acs/acs5/profile,DP04,DP04_0001PM,Percent Margin of Error!!HOUSING OCCUPANCY!!To...,,
3,2020,acs/acs5/profile,DP04,DP04_0002E,Estimate!!HOUSING OCCUPANCY!!Total housing uni...,,
4,2020,acs/acs5/profile,DP04,DP04_0002PE,Percent!!HOUSING OCCUPANCY!!Total housing unit...,,
...,...,...,...,...,...,...,...
424,2020,acs/acs5/profile,DP04,DP04_0142PE,Percent!!GROSS RENT AS A PERCENTAGE OF HOUSEHO...,,
425,2020,acs/acs5/profile,DP04,DP04_0142PM,Percent Margin of Error!!GROSS RENT AS A PERCE...,,
426,2020,acs/acs5/profile,DP04,DP04_0143E,Estimate!!GROSS RENT AS A PERCENTAGE OF HOUSEH...,,
427,2020,acs/acs5/profile,DP04,DP04_0143PE,Percent!!GROSS RENT AS A PERCENTAGE OF HOUSEHO...,,


In [7]:
ced.download(
    dataset='acs/acs5',
    vintage=2020, 
    download_variables=['B02001_002E'],
    state=states.MA
)

Unnamed: 0,STATE,B02001_002E
0,25,5261787


In [8]:
ced.download(
    dataset='acs/acs5',
    vintage=2020, 
    download_variables=['B02008_001E'],
    state=states.MA
)

Unnamed: 0,STATE,B02008_001E
0,25,5546174


## Tokenize descriptions

In [9]:
! pip install nltk
! pip install stopwords
! pip install KMeans
! pip install Word2Vec



In [11]:
import nltk

# Download the punkt package
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/mia694/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mia694/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

In [15]:
descriptions = acs5_groups['DESCRIPTION'].tolist()

# Preprocess descriptions
stop_words = set(stopwords.words('english'))
preprocessed_descriptions = []
for description in descriptions:
    tokens = word_tokenize(description.lower())
    filtered_tokens = [w for w in tokens if w not in stop_words and w not in string.punctuation]
    preprocessed_descriptions.append(filtered_tokens)
    
preprocessed_descriptions


[['sex', 'age'],
 ['sex', 'age', 'white', 'alone'],
 ['sex', 'age', 'black', 'african', 'american', 'alone'],
 ['sex', 'age', 'american', 'indian', 'alaska', 'native', 'alone'],
 ['sex', 'age', 'asian', 'alone'],
 ['sex', 'age', 'native', 'hawaiian', 'pacific', 'islander', 'alone'],
 ['sex', 'age', 'race', 'alone'],
 ['sex', 'age', 'two', 'races'],
 ['sex', 'age', 'white', 'alone', 'hispanic', 'latino'],
 ['sex', 'age', 'hispanic', 'latino'],
 ['median', 'age', 'sex'],
 ['median', 'age', 'sex', 'white', 'alone'],
 ['median', 'age', 'sex', 'black', 'african', 'american', 'alone'],
 ['median', 'age', 'sex', 'american', 'indian', 'alaska', 'native', 'alone'],
 ['median', 'age', 'sex', 'asian', 'alone'],
 ['median',
  'age',
  'sex',
  'native',
  'hawaiian',
  'pacific',
  'islander',
  'alone'],
 ['median', 'age', 'sex', 'race', 'alone'],
 ['median', 'age', 'sex', 'two', 'races'],
 ['median', 'age', 'sex', 'white', 'alone', 'hispanic', 'latino'],
 ['median', 'age', 'sex', 'hispanic', 'la

## Import embeddings

In [34]:
!pip install gensim




In [36]:
from gensim import downloader as api

# Download and load the embeddings model
model = api.load("glove-wiki-gigaword-300")



In [45]:
import numpy as np

# Assuming `preprocessed_descriptions` is a list of tokenized descriptions
vectors = []
for description in preprocessed_descriptions:
    # Filter out words not in the model's vocabulary
    valid_words = [word for word in description if word in model.key_to_index]
    
    # Only process descriptions with at least one valid word
    if valid_words:
        word_vectors = [model[word] for word in valid_words]
        description_vector = np.mean(word_vectors, axis=0)
        vectors.append(description_vector)
    else:
        # Handle descriptions with no valid words (e.g., by appending a zero vector)
        vectors.append(np.zeros(model.vector_size))

# Now `vectors` contains the averaged word vectors for each description,
# excluding any out-of-vocabulary words.
        
vectors


[array([-4.73675013e-01,  3.99347469e-02,  1.01050064e-02,  1.57455020e-02,
        -1.21304989e-01,  5.98909974e-01,  5.36400042e-02, -4.87389982e-01,
         3.24770004e-01, -1.22232509e+00,  7.24520013e-02,  1.29459500e-01,
        -2.12298006e-01,  2.70054996e-01,  2.60399997e-01, -5.91785014e-01,
         2.71444973e-02, -3.01869988e-01, -1.20894998e-01, -9.94170010e-02,
        -2.92539984e-01,  7.32504964e-01, -4.53880519e-01,  2.62840003e-01,
        -7.76995003e-01,  2.06949972e-02, -6.64085001e-02, -4.65871990e-01,
         1.43000007e-01, -1.05497494e-01, -3.02013516e-01,  4.77440000e-01,
        -3.55920017e-01, -3.09749484e-01, -5.77790022e-01,  3.59997153e-04,
        -2.37719998e-01, -3.39149982e-02,  2.61398494e-01,  3.72599959e-02,
         8.21259975e-01,  1.04999542e-03, -2.68416017e-01, -4.53345001e-01,
         9.44714993e-02,  2.48949975e-02,  1.35605991e-01,  1.56354994e-01,
         4.41904999e-02, -3.11903000e-01,  1.65700004e-01, -2.21378997e-01,
        -1.5

## Cluster the vectors using KMeans

In [46]:
from sklearn.cluster import KMeans

In [61]:
# Cluster the vectors
num_clusters = 15  # Adjust based on your needs
kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(vectors)

# Assign descriptions to clusters
clusters = kmeans.labels_

In [62]:
# Output the results
for i, cluster in enumerate(clusters):
    print(f"Description: {descriptions[i]} -> Cluster: {cluster}")

Description: SEX BY AGE -> Cluster: 2
Description: SEX BY AGE (WHITE ALONE) -> Cluster: 2
Description: SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE) -> Cluster: 2
Description: SEX BY AGE (AMERICAN INDIAN AND ALASKA NATIVE ALONE) -> Cluster: 6
Description: SEX BY AGE (ASIAN ALONE) -> Cluster: 2
Description: SEX BY AGE (NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE) -> Cluster: 6
Description: SEX BY AGE (SOME OTHER RACE ALONE) -> Cluster: 2
Description: SEX BY AGE (TWO OR MORE RACES) -> Cluster: 2
Description: SEX BY AGE (WHITE ALONE, NOT HISPANIC OR LATINO) -> Cluster: 14
Description: SEX BY AGE (HISPANIC OR LATINO) -> Cluster: 14
Description: MEDIAN AGE BY SEX -> Cluster: 2
Description: MEDIAN AGE BY SEX (WHITE ALONE) -> Cluster: 2
Description: MEDIAN AGE BY SEX (BLACK OR AFRICAN AMERICAN ALONE) -> Cluster: 2
Description: MEDIAN AGE BY SEX (AMERICAN INDIAN AND ALASKA NATIVE ALONE) -> Cluster: 6
Description: MEDIAN AGE BY SEX (ASIAN ALONE) -> Cluster: 2
Description: MEDIAN AGE BY SEX 

In [63]:
import numpy as np

# Assuming `clusters` is your list of cluster labels for each description
# And `descriptions` is the list of all descriptions

# Organize descriptions by cluster
clustered_descriptions = {}
for i, cluster in enumerate(clusters):
    if cluster not in clustered_descriptions:
        clustered_descriptions[cluster] = []
    clustered_descriptions[cluster].append(descriptions[i])

# Sample or show a few entries from each cluster
samples_per_cluster = 5
for cluster, descs in clustered_descriptions.items():
    print(f"Cluster {cluster}:")
    if len(descs) > samples_per_cluster:
        sample_descs = np.random.choice(descs, samples_per_cluster, replace=False)
    else:
        sample_descs = descs
    for desc in sample_descs:
        print(f" - {desc}")
    print()  # Add an empty line for better readability


Cluster 2:
 - MEDIAN AGE AT FIRST MARRIAGE (BLACK OR AFRICAN AMERICAN ALONE)
 - SEX BY AGE BY EMPLOYMENT STATUS FOR THE POPULATION 16 YEARS AND OVER (SOME OTHER RACE ALONE)
 - SEX BY AGE BY VETERAN STATUS FOR THE CIVILIAN POPULATION 18 YEARS AND OVER (SOME OTHER RACE ALONE)
 - MEDIAN AGE AT FIRST MARRIAGE (WHITE ALONE)
 - MARRIAGES ENDING IN WIDOWHOOD IN THE LAST YEAR BY SEX BY MARITAL STATUS FOR THE POPULATION 15 YEARS AND OVER

Cluster 6:
 - SEX BY OCCUPATION FOR THE CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER (AMERICAN INDIAN AND ALASKA NATIVE ALONE)
 - HEALTH INSURANCE COVERAGE STATUS BY AGE (NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE)
 - OCCUPANTS PER ROOM (NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE HOUSEHOLDER)
 - GROUP QUARTERS TYPE (3 TYPES) (AMERICAN INDIAN AND ALASKA NATIVE ALONE)
 - SEX BY OCCUPATION FOR THE CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER (NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE)

Cluster 14:
 - MEDIAN AGE BY SEX (WHITE ALONE, NOT HISP