In [2]:
#### Preamble ####
# Purpose: Validate all of the variables found the dataset
# Author: Jiazhou(Justin) Bi
# Date: 16 Nov 2024
# Contact: justin.bi@mail.utoronto.ca
# License: MIT
# Pre-requisites: see requirements.txt
# Any other information needed? None

# Loading the Dataset

In [1]:
import pandas as pd
df_1m = pd.read_parquet('../data/02-analysis_data/cleaned_data_1m.parquet')
df_1h = pd.read_parquet('../data/02-analysis_data/cleaned_data_1h.parquet')
df_1d = pd.read_parquet('../data/02-analysis_data/cleaned_data_1d.parquet')
df_1h.head()

Unnamed: 0,index,open,high,low,close,volume,direction
0,2017-08-17 05:00:00,4308.83,4328.69,4291.37,4315.32,23.234916,1
1,2017-08-17 06:00:00,4330.29,4345.45,4309.37,4324.35,7.229691,1
2,2017-08-17 07:00:00,4316.62,4349.99,4287.41,4349.99,4.443249,1
3,2017-08-17 08:00:00,4333.32,4377.85,4333.32,4360.69,0.972807,1
4,2017-08-17 09:00:00,4360.0,4445.78,4360.0,4444.0,10.763623,1


# Open

The open price should always be lower than or equal to the high price, and always higher or equal to the low price. This subsection will check if this logic is correct.

for i in len(df_1m['open']):

In [5]:
if all((df_1m['open'] <= df_1m['high']) & (df_1m['open'] >= df_1m['low'])):
    print('PASS')
else:
    print('FAIL')

FAIL


In [6]:
all_pass = True
for i in range(len(df_1m)):
    if not (df_1m.loc[i, 'open'] <= df_1m.loc[i, 'high'] and df_1m.loc[i, 'open'] >= df_1m.loc[i, 'low']):
        all_pass = False
        break

if all_pass:
    print('PASS')
else:
    print('FAIL')

FAIL


In [7]:
# Evaluate the condition for all rows
condition = (df_1m['open'] <= df_1m['high']) & (df_1m['open'] >= df_1m['low'])

# Find rows where the condition is False
failing_rows = df_1m[~condition]

# Print the failing rows
if failing_rows.empty:
    print("All rows passed.")
else:
    print("Failing rows:")
    print(failing_rows)


Failing rows:
                      index  open  high  low  close  volume  direction
29520   2017-09-06 16:01:00   NaN   NaN  NaN    NaN     NaN          0
29521   2017-09-06 16:02:00   NaN   NaN  NaN    NaN     NaN          0
29522   2017-09-06 16:03:00   NaN   NaN  NaN    NaN     NaN          0
29523   2017-09-06 16:04:00   NaN   NaN  NaN    NaN     NaN          0
29524   2017-09-06 16:05:00   NaN   NaN  NaN    NaN     NaN          0
...                     ...   ...   ...  ...    ...     ...        ...
2945394 2023-03-24 13:55:00   NaN   NaN  NaN    NaN     NaN          0
2945395 2023-03-24 13:56:00   NaN   NaN  NaN    NaN     NaN          0
2945396 2023-03-24 13:57:00   NaN   NaN  NaN    NaN     NaN          0
2945397 2023-03-24 13:58:00   NaN   NaN  NaN    NaN     NaN          0
2945398 2023-03-24 13:59:00   NaN   NaN  NaN    NaN     NaN          0

[8632 rows x 7 columns]


# High

# Low

# Close

# Volume

# Direction

In [3]:
# Checking it there are only 50 states, plus 4 other 
unique_state = df['STATEICP'].nunique()
unique_state

51

In [4]:
# Checking which values are missing from the variable
missing_state = df['STATEICP'].value_counts().sort_index()
missing_state
# 96                  State groupings (1980 Urban/rural sample)
# 97                  Military/Mil. Reservations
# 99                  State not identified
#The aboved entries are missing from the dataset. 

1      17194
11      4538
12     42396
13     80090
14     61003
2       6913
21     59413
22     32023
23     47745
24     54054
25     30055
3      32262
31     16101
32     13593
33     30192
34     28140
35      9502
36      3827
37      4237
4       7251
40     38695
41     21327
42     12266
43     88745
44     46615
45     18322
46     11901
47     45372
48     23739
49    121865
5       4682
51     19504
52     29319
53     14884
54     30796
56      7443
6       3494
61     29469
62     27327
63      8473
64      4898
65     12181
66      8135
67     15948
68      2788
71    140065
72     17620
73     33881
81      2751
82      5964
98      2353
Name: STATEICP, dtype: int64

# GQ

In [5]:
# There should be a maximum of 7 values for this variable (0 to 6)
# 0                   Vacant unit
#                     Households:
# 1                   Households under 1970 definition
# 2                   Additional households under 1990 definition
#                     Group Quarters:
# 3                   Group quarters--Institutions
# 4                   Other group quarters
# 5                   Additional households under 2000 definition
# 6                   Fragment
value_counts_GQ = df['GQ'].value_counts()
value_counts_GQ
#PASS

1    1429322
2       1785
5        244
Name: GQ, dtype: int64

# Mortgage

In [6]:
# 0                   N/A
# 1                   No, owned free and clear
# 2                   Check mark on manuscript (probably yes)
# 3                   Yes, mortgaged/ deed of trust or similar debt
# 4                   Yes, contract to purchase
value_counts_MORTGAGE = df['MORTGAGE'].value_counts()
value_counts_MORTGAGE

3    921093
1    499181
4     11077
Name: MORTGAGE, dtype: int64

# Sex

In [7]:
# 1                   Male
# 2                   Female
# 9                   Missing/blank
value_counts_SEX = df['SEX'].value_counts()
value_counts_SEX

1    734219
2    697132
Name: SEX, dtype: int64

# Age

In [8]:
#Checking if this variable is working as intended
median_AGE = df['AGE'].median()
print(median_AGE)
#PASS

49.0


# MARST (Marital Status)

In [9]:
# 1                   Married, spouse present
# 2                   Married, spouse absent
# 3                   Separated
# 4                   Divorced
# 5                   Widowed
# 6                   Never married/single
# 9                   Blank, missing
value_counts_MARST = df['MARST'].value_counts()
value_counts_MARST

1    875028
6    334779
4    140832
5     42638
2     22902
3     15172
Name: MARST, dtype: int64

# EDUC (Educational Attainment)

In [10]:
# 00                  N/A or no schooling
# 01                  Nursery school to grade 4
# 02                  Grade 5, 6, 7, or 8
# 03                  Grade 9
# 04                  Grade 10
# 05                  Grade 11
# 06                  Grade 12
# 07                  1 year of college
# 08                  2 years of college
# 09                  3 years of college
# 10                  4 years of college
# 11                  5+ years of college
# 99                  Missing
value_counts_EDUC = df['EDUC'].value_counts()
value_counts_EDUC

6     448566
10    345871
11    236958
7     174428
8     138555
5      28262
4      18698
0      15559
2      12809
3       9093
1       2552
Name: EDUC, dtype: int64

# EDUC_new (Regrouped)

In [11]:
# 0: No Schooling
# 1: Nursery School
# 2: Primary School
# 3: Middle School
# 4: College 1-4 Years
# 5: College 5+ Years
value_counts_EDUC_new = df['EDUC_new'].value_counts()
value_counts_EDUC_new

4    658854
3    504619
5    236958
0     15559
2     12809
1      2552
Name: EDUC_new, dtype: int64

# SCHLTYPE (School Type)

In [12]:
# 0                   N/A
# 1                   Not enrolled
# 2                   Public school
# 3                   Private school (1960,1990-2000,ACS,PRCS)
# 4                   Church-related (1980)
# 5                   Parochial (1970)
# 6                   Other private, 1980
# 7                   Other private, 1970
value_counts_SCHLTYPE = df['SCHLTYPE'].value_counts()
value_counts_SCHLTYPE

1    1311397
2      94571
3      25383
Name: SCHLTYPE, dtype: int64

# OCC2010 (Occupation)

In [13]:
value_counts_OCC2010= df['OCC2010'].value_counts()
value_counts_OCC2010

430     53906
3130    38810
4700    34828
2310    33784
9130    31788
        ...  
6740       85
5340       84
7830       80
8850       78
8910       76
Name: OCC2010, Length: 427, dtype: int64

# VETSTAT (Veteran Status)

In [14]:
# 0                   N/A
# 1                   Not a veteran
# 2                   Veteran
# 9                   Unknown
value_counts_VETSTAT= df['VETSTAT'].value_counts()
value_counts_VETSTAT

1    1334292
2      85654
0      11405
Name: VETSTAT, dtype: int64

# INCTOT (Total Income)

In [15]:
mean_INCTOT = df['INCTOT'].mean()
median_INCTOT = df['INCTOT'].median()
print(f"The mean total income of this dataset is {mean_INCTOT}.")
print(f"The median total income of this dataset is {median_INCTOT}.")

The mean total income of this dataset is 68730.31508204486.
The median total income of this dataset is 46800.0.
