In [1]:
#### Preamble ####
# Purpose: Extracts, downloads, and saves the data from IPUMS USA
# Author: Jiazhou(Justin) Bi and Weiyang Li
# Date: 3 October 2024
# Contact: justin.bi@mail.utoronto.ca or weiyang.li@mail.utoronto.ca
# License: MIT
# Pre-requisites: Python 3.10.5 or above, with ipumspy and python-dotenv installed (pathlib is part of the standard library)
# Any other information needed? None

# Disclaimer

Currently, data extraction through the IPUMS API has fewer features than the web UI for data extraction and download. For example, a custom sample size is not supported through the API. Details are available at: https://developer.ipums.org/docs/v2/apiprogram/apis/microdata/. Therefore, the dataset downloaded here contains all the individuals in the sample. If we need to clean the dataset, we will do so in the "03-clean_data.ipynb" notebook file.

In [1]:
from pathlib import Path
from ipumspy import IpumsApiClient, MicrodataExtract, readers, ddi
import os
import glob
from dotenv import load_dotenv
# Set up a .env file containing your IPUMS API key (IPUMS_API_KEY=<your key>) in the same
# folder as this notebook, so the key itself never appears in the notebook.
load_dotenv()
IPUMS_API_KEY = os.getenv('IPUMS_API_KEY')
# os.getenv returns None when the variable is missing; stop here with a clear message
# instead of building a client that can only fail later with an opaque API error.
if not IPUMS_API_KEY:
    raise RuntimeError(
        "IPUMS_API_KEY is not set. Add it to a .env file next to this notebook "
        "or export it as an environment variable."
    )
ipums = IpumsApiClient(IPUMS_API_KEY)

In [3]:
# Build the extract request: sample "us2022a" from the "usa" collection, delivered as CSV.
sample_ids = ["us2022a"]
variable_names = [
    "STATEICP", "CITY", "OWNERSHP", "MORTGAGE", "GQ", "SEX", "AGE",
    "MARST", "EDUC", "SCHLTYPE", "OCC2010", "VETSTAT", "IND1990", "INCTOT",
]
extract = MicrodataExtract(
    "usa",
    sample_ids,
    variable_names,
    data_format='csv',
    description="first data extraction",
)
# Send the request to IPUMS; the returned extract object is shown as the cell output.
ipums.submit_extract(extract)

<ipumspy.api.extract.MicrodataExtract at 0x104aaad40>

In [3]:
# Download the extract once IPUMS has finished building it.
# wait_for_extract() polls the API and blocks until the extract is ready, so this cell
# also works under "Restart Kernel -> Run All" right after submission (previously it
# printed "not yet completed" and the rename cell below then failed). It raises if the
# extract fails or the wait times out.
# To look at the status without blocking, call ipums.extract_status(extract) in a separate cell.
file_download_path = '../data/01-raw_data'
# Create the target directory up front so the download does not fail on a fresh checkout.
Path(file_download_path).mkdir(parents=True, exist_ok=True)
ipums.wait_for_extract(extract)
ipums.download_extract(extract, download_dir=file_download_path)
print(f"Extract downloaded to: {file_download_path}")

Extract downloaded to: ../data/01-raw_data


In [4]:
# Rename the downloaded extract to a stable file name.
# Only files other than the target itself are considered, so re-running this cell never
# "renames" raw_data.csv.gz onto itself; if several downloads are present, the most
# recently modified one is used.
download_path = '../data/01-raw_data'
new_file_name = 'raw_data.csv.gz'
new_file_path = os.path.join(download_path, new_file_name)
candidate_paths = [
    path
    for path in glob.glob(os.path.join(download_path, '*.csv.gz'))
    if os.path.basename(path) != new_file_name
]
if candidate_paths:
    old_file_path = max(candidate_paths, key=os.path.getmtime)
    # os.replace overwrites an existing target on every platform (os.rename fails on Windows).
    os.replace(old_file_path, new_file_path)
    print(f"File renamed to {new_file_name}")
elif os.path.exists(new_file_path):
    # Nothing new was downloaded; the file from an earlier run is already in place.
    print(f"{new_file_name} already exists; nothing to rename")
else:
    raise FileNotFoundError(
        f"No '*.csv.gz' file found in {download_path}; download the extract first."
    )

File renamed to raw_data.csv.gz
