In [1]:
#### Preamble ####
# Purpose: Cleans and saves the data from IPUMS USA
# Author: Jiazhou(Justin) Bi and Weiyang Li
# Date: 3 October 2024
# Contact: justin.bi@mail.utoronto.ca or weiyang.li@mail.utoronto.ca
# License: MIT
# Pre-requisites: Python 3.10.5 or above, with the pandas package installed
# Any other information needed? None

# Loading the data

In [2]:
from pathlib import Path

import pandas as pd

In [3]:
# Read the raw extract into a DataFrame and preview it.
# The file is gzip-compressed; pandas infers the compression from the ".gz" suffix.
raw_data_path = '../data/01-raw_data/raw_data.csv.gz'
df = pd.read_csv(raw_data_path)

# Show the first five rows as plain text
print(df.head(5))

   YEAR  SAMPLE  SERIAL       CBSERIAL  HHWT        CLUSTER  STRATA  GQ  \
0  2022  202201       1  2022010000031  69.0  2022000000011  280301   3   
1  2022  202201       2  2022010000111  22.0  2022000000021  200001   3   
2  2022  202201       3  2022010000200  45.0  2022000000031  280301   3   
3  2022  202201       4  2022010000261   4.0  2022000000041  110001   4   
4  2022  202201       5  2022010000296  47.0  2022000000051  150201   3   

   PERNUM  PERWT  SEX  AGE  MARST  EDUC  EDUCD  SCHLTYPE  OCC2010  INCTOT  
0       1   69.0    2   85      5     7     71         1     9920   18800  
1       1   22.0    1   51      5     6     64         1     5620   12500  
2       1   45.0    2   36      6     2     26         1     8800   16400  
3       1    4.0    1   74      6     0      2         1     9920    8600  
4       1   47.0    1   49      4     7     71         1     6230    5000  


In [4]:
# Report the sample size as a (rows, columns) tuple
n_rows, n_cols = df.shape
print((n_rows, n_cols))
# Recorded output: (3373378, 18)

(3373378, 18)


In [5]:
# Inspect the distinct values of the PERNUM column before it is dropped.
# (Previously stored under the misleading name `unique_year`; the values are
# PERNUM codes, not years.)
unique_pernum = df['PERNUM'].unique()
unique_pernum

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20])

# Dropping Columns

In [6]:
# Columns removed from the analysis data.
columns_to_drop = ['YEAR', 'SAMPLE', 'SERIAL', 'CBSERIAL', 'HHWT', 'PERNUM']

# `df` is rebound to its own result, so re-running this cell would otherwise
# raise KeyError because the columns are already gone. errors='ignore' makes
# the cell idempotent; on a fresh kernel the result is identical.
# NOTE(review): this also means a column missing from the raw file no longer raises.
df = df.drop(columns=columns_to_drop, errors='ignore')
print(df.head())

         CLUSTER  STRATA  GQ  PERWT  SEX  AGE  MARST  EDUC  EDUCD  SCHLTYPE  \
0  2022000000011  280301   3   69.0    2   85      5     7     71         1   
1  2022000000021  200001   3   22.0    1   51      5     6     64         1   
2  2022000000031  280301   3   45.0    2   36      6     2     26         1   
3  2022000000041  110001   4    4.0    1   74      6     0      2         1   
4  2022000000051  150201   3   47.0    1   49      4     7     71         1   

   OCC2010  INCTOT  
0     9920   18800  
1     5620   12500  
2     8800   16400  
3     9920    8600  
4     6230    5000  


# Data Validation

In [7]:
# Count null (NaN/None) entries in each remaining column.
missing_values = df.isnull().sum()
print(missing_values)

# Enforce the check so the notebook stops here if nulls ever appear, instead of
# relying on a reader to inspect the printout.
# NOTE(review): isnull() only detects NaN/None; any numeric codes the source
# uses to mark "not available" would not be caught — confirm against the codebook.
total_missing = int(missing_values.sum())
assert total_missing == 0, (
    f"Found {total_missing} missing value(s):\n{missing_values[missing_values > 0]}"
)
#PASS. No missing values found in this dataset.

CLUSTER     0
STRATA      0
GQ          0
PERWT       0
SEX         0
AGE         0
MARST       0
EDUC        0
EDUCD       0
SCHLTYPE    0
OCC2010     0
INCTOT      0
dtype: int64


# Saving the DataFrame as a CSV file

In [8]:
# Write the cleaned DataFrame to the analysis-data folder without the row index.
cleaned_data_path = Path('../data/02-analysis_data/cleaned_data.csv')

# to_csv does not create missing directories; make sure the target folder
# exists so the save also works on a fresh checkout.
cleaned_data_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(cleaned_data_path, index=False)