In [1]:
#### Preamble ####
# Purpose: Cleans and saves the data from IPUMS USA
# Author: Jiazhou(Justin) Bi and Weiyang Li
# Date: 3 October 2024
# Contact: justin.bi@mail.utoronto.ca or weiyang.li@mail.utoronto.ca
# License: MIT
# Pre-requisites: Python 3.10.5 or above with pandas installed; the raw data file at ../data/01-raw_data/raw_data.csv.gz
# Any other information needed? None

# Loading the data

In [30]:
import pandas as pd

In [33]:
# Read the raw extract (path relative to this notebook) into a DataFrame,
# then print the first rows as a quick sanity check of the columns.
raw_csv_path = '../data/01-raw_data/raw_data.csv.gz'
df = pd.read_csv(raw_csv_path)
print(df.head())

   YEAR  SAMPLE  SERIAL       CBSERIAL  HHWT        CLUSTER  STATEICP  CITY  \
0  2022  202201       1  2022010000031  69.0  2022000000011        41     0   
1  2022  202201       2  2022010000111  22.0  2022000000021        41     0   
2  2022  202201       3  2022010000200  45.0  2022000000031        41     0   
3  2022  202201       4  2022010000261   4.0  2022000000041        41     0   
4  2022  202201       5  2022010000296  47.0  2022000000051        41     0   

   STRATA  GQ  ...  SEX  AGE  MARST  EDUC  EDUCD  SCHLTYPE  OCC2010  INCTOT  \
0  280301   3  ...    2   85      5     7     71         1     9920   18800   
1  200001   3  ...    1   51      5     6     64         1     5620   12500   
2  280301   3  ...    2   36      6     2     26         1     8800   16400   
3  110001   4  ...    1   74      6     0      2         1     9920    8600   
4  150201   3  ...    1   49      4     7     71         1     6230    5000   

   VETSTAT  VETSTATD  
0        1        11  
1   

In [19]:
# Checking our sample size (check is left commented out; uncomment to re-run it)
# print(df.shape)
# Result recorded from an earlier run: (3373378, 25)

# Dropping Columns

In [34]:
# Columns removed from the loaded data; every other column is kept.
columns_to_drop = [
    'YEAR', 'SAMPLE', 'SERIAL', 'CBSERIAL', 'HHWT', 'CLUSTER',
    'STRATA', 'OWNERSHPD', 'PERWT', 'EDUCD', 'VETSTATD', 'PERNUM',
]

# `df` is reassigned here, so on a second execution of this cell the columns
# are already gone and a plain drop() would raise KeyError. errors='ignore'
# makes the cell safe to re-run; the trade-off is that a misspelled name in
# the list above is skipped silently, so check the preview below.
df = df.drop(columns=columns_to_drop, errors='ignore')
print(df.head())

   STATEICP  CITY  GQ  OWNERSHP  MORTGAGE  SEX  AGE  MARST  EDUC  SCHLTYPE  \
0        41     0   3         0         0    2   85      5     7         1   
1        41     0   3         0         0    1   51      5     6         1   
2        41     0   3         0         0    2   36      6     2         1   
3        41     0   4         0         0    1   74      6     0         1   
4        41     0   3         0         0    1   49      4     7         1   

   OCC2010  INCTOT  VETSTAT  
0     9920   18800        1  
1     5620   12500        1  
2     8800   16400        1  
3     9920    8600        1  
4     6230    5000        2  


# Data Validation

In [22]:
# Missing-value check (left commented out; uncomment the two lines below to re-run it)
# missing_values = df.isnull().sum()
# print(missing_values)
# Result recorded from an earlier run: PASS. No missing values found in this dataset.

# Changing to Correct Data Types

In [36]:
# Display the dtype of every remaining column (bare last expression, so the
# notebook renders it as the cell output).
df.dtypes
# NOTE(review): the listing below was captured when this cell was executed
# AFTER the conversion cell that follows it (execution count 36 here vs. 35
# there), so it shows the post-conversion dtypes. On a fresh top-to-bottom
# run this cell executes before the conversion and will presumably report
# the dtypes inferred by read_csv instead -- TODO confirm, and consider
# moving this check below the conversion cell.
# STATEICP    object
# CITY        object
# GQ          object
# OWNERSHP    object
# MORTGAGE    object
# SEX         object
# AGE          int64
# MARST       object
# EDUC        object
# SCHLTYPE    object
# OCC2010     object
# INCTOT       int64
# VETSTAT     object
# dtype: object

STATEICP    object
CITY        object
GQ          object
OWNERSHP    object
MORTGAGE    object
SEX         object
AGE          int64
MARST       object
EDUC        object
SCHLTYPE    object
OCC2010     object
INCTOT       int64
VETSTAT     object
dtype: object

In [35]:
# Columns cast to string; AGE and INCTOT are intentionally left untouched.
string_columns = [
    'STATEICP', 'CITY', 'GQ', 'OWNERSHP', 'MORTGAGE', 'SEX',
    'MARST', 'EDUC', 'SCHLTYPE', 'OCC2010', 'VETSTAT',
]

# Same per-column assignment as before, driven by the list instead of
# eleven copy-pasted lines.
for column_name in string_columns:
    df[column_name] = df[column_name].astype('str')

# Saving the DataFrame as a csv file

In [37]:
# Write the cleaned frame to the analysis-data folder; index=False keeps the
# DataFrame's row index out of the CSV.
cleaned_csv_path = '../data/02-analysis_data/cleaned_data.csv'
df.to_csv(cleaned_csv_path, index=False)