In [56]:
#### Preamble ####
# Purpose: Validate all of the variables found the dataset
# Author: Jiazhou(Justin) Bi and Weiyang Li
# Date: 7 October 2024
# Contact: justin.bi@mail.utoronto.ca or weiyang.li@mail.utoronto.ca
# License: MIT
# Pre-requisites: python 3.10.5 or above, with pandas installed for python
# Any other information needed? None

# Loading the Dataset and Necessary Libraries 

In [57]:
import pandas as pd
df = pd.read_csv('../data/02-analysis_data/cleaned_data.csv')
df.head()

Unnamed: 0,STATEICP,GQ,OWNERSHP,MORTGAGE,SEX,AGE,MARST,EDUC,SCHLTYPE,OCC2010,INCTOT,VETSTAT
0,41,3,0,0,2,85,5,7,1,9920,18800,1
1,41,3,0,0,1,51,5,6,1,5620,12500,1
2,41,3,0,0,2,36,6,2,1,8800,16400,1
3,41,4,0,0,1,74,6,0,1,9920,8600,1
4,41,3,0,0,1,49,4,7,1,6230,5000,2


# STATE

In [58]:
# Checking it there are only 50 states, plus 4 other 
unique_state = df['STATEICP'].nunique()
unique_state

51

In [59]:
# Checking which values are missing from the variable
missing_state = df['STATEICP'].value_counts().sort_index()
missing_state
# 96                  State groupings (1980 Urban/rural sample)
# 97                  Military/Mil. Reservations
# 99                  State not identified
#The aboved entries are missing from the dataset. 

1      32037
2      12708
3      63066
4      12263
5       9073
6       5966
11      8335
12     78826
13    174579
14    113239
21    107426
22     58022
23     86387
24    101651
25     52297
31     27999
32     24784
33     49031
34     53938
35     16514
36      6645
37      7567
40     74844
41     43783
42     26300
43    189646
44     91841
45     37932
46     25023
47     93101
48     46515
49    242939
51     38947
52     52416
53     32310
54     61037
56     15557
61     62814
62     50668
63     16119
64      9337
65     25908
66     17236
67     27800
68      4992
71    331423
72     37475
73     67872
81      5548
82     12750
98      6016
Name: STATEICP, dtype: int64

# GQ

In [60]:
# There should be a maximum of 7 values for this variable (0 to 6)
# 0                   Vacant unit
#                     Households:
# 1                   Households under 1970 definition
# 2                   Additional households under 1990 definition
#                     Group Quarters:
# 3                   Group quarters--Institutions
# 4                   Other group quarters
# 5                   Additional households under 2000 definition
# 6                   Fragment
value_counts_GQ = df['GQ'].value_counts()
value_counts_GQ
#PASS

1    2664493
4      96405
3      83119
2       5213
5       1272
Name: GQ, dtype: int64

# Ownership

In [61]:
# A maximum of 3 values of OWNERSHP
# OWNERSHP            Ownership of dwelling (tenure) [general version]
# 0                   N/A
# 1                   Owned or being bought (loan)
# 2                   Rented
value_counts_OWNERSHP = df['OWNERSHP'].value_counts()
value_counts_OWNERSHP

1    2018514
2     652464
0     179524
Name: OWNERSHP, dtype: int64

# Mortgage

In [62]:
# 0                   N/A
# 1                   No, owned free and clear
# 2                   Check mark on manuscript (probably yes)
# 3                   Yes, mortgaged/ deed of trust or similar debt
# 4                   Yes, contract to purchase
value_counts_MORTGAGE = df['MORTGAGE'].value_counts()
value_counts_MORTGAGE

3    1156283
1     847218
0     831988
4      15013
Name: MORTGAGE, dtype: int64

# Sex

In [63]:
# 1                   Male
# 2                   Female
# 9                   Missing/blank
value_counts_SEX = df['SEX'].value_counts()
value_counts_SEX

2    1461772
1    1388730
Name: SEX, dtype: int64

# Age

In [64]:
#Checking if this variable is working as intended
median_AGE = df['AGE'].median()
print(median_AGE)
#PASS

50.0


# MARST (Marital Status)

In [65]:
# 1                   Married, spouse present
# 2                   Married, spouse absent
# 3                   Separated
# 4                   Divorced
# 5                   Widowed
# 6                   Never married/single
# 9                   Blank, missing
value_counts_MARST = df['MARST'].value_counts()
value_counts_MARST

1    1382852
6     875582
4     301831
5     185070
2      63717
3      41450
Name: MARST, dtype: int64

# EDUC (Educational Attainment)

In [66]:
# 00                  N/A or no schooling
# 01                  Nursery school to grade 4
# 02                  Grade 5, 6, 7, or 8
# 03                  Grade 9
# 04                  Grade 10
# 05                  Grade 11
# 06                  Grade 12
# 07                  1 year of college
# 08                  2 years of college
# 09                  3 years of college
# 10                  4 years of college
# 11                  5+ years of college
# 99                  Missing
value_counts_EDUC = df['EDUC'].value_counts()
value_counts_EDUC

6     1003224
10     556018
7      366908
11     360991
8      230535
5       81345
4       70671
2       61009
3       58534
0       50420
1       10847
Name: EDUC, dtype: int64

# SCHLTYPE (School Type)

In [67]:
# 0                   N/A
# 1                   Not enrolled
# 2                   Public school
# 3                   Private school (1960,1990-2000,ACS,PRCS)
# 4                   Church-related (1980)
# 5                   Parochial (1970)
# 6                   Other private, 1980
# 7                   Other private, 1970
value_counts_SCHLTYPE = df['SCHLTYPE'].value_counts()
value_counts_SCHLTYPE

1    2495007
2     274184
3      81311
Name: SCHLTYPE, dtype: int64

# OCC2010 (Occupation)

In [68]:
value_counts_OCC2010= df['OCC2010'].value_counts()
value_counts_OCC2010

9920    853596
430      66809
4700     46543
3130     46278
9130     44366
         ...  
8850       125
3900       117
5340       110
6740       104
8910        94
Name: OCC2010, Length: 427, dtype: int64

# VETSTAT (Veteran Status)

In [69]:
# 0                   N/A
# 1                   Not a veteran
# 2                   Veteran
# 9                   Unknown
value_counts_VETSTAT= df['VETSTAT'].value_counts()
value_counts_VETSTAT

1    2576515
2     192112
0      81875
Name: VETSTAT, dtype: int64

# INCTOT (Total Income)

In [70]:
mean_INCTOT = df['INCTOT'].mean()
median_INCTOT = df['INCTOT'].median()
print(f"The mean total income of this dataset is {mean_INCTOT}.")
print(f"The median total income of this dataset is {median_INCTOT}.")

The mean total income of this dataset is 49422.73825908559.
The median total income of this dataset is 30000.0.
