### 1. Data Load

In [1]:
import pandas as pd

In [2]:
# Read the source CSV (path is relative to the notebook's working directory).
df = pd.read_csv("./Access_to_Care_Dataset.csv")

# Report the frame's (rows, columns) tuple, then preview the first five rows.
n_rows_cols = df.shape
print("No.of rows & columns:", n_rows_cols)
print("First 5 rows of the data:")
df.head(n=5)

No.of rows & columns: (26208, 25)
First 5 rows of the data:


Unnamed: 0,TOPIC,SUBTOPIC,SUBTOPIC_ID,TAXONOMY,TAXONOMY_ID,CLASSIFICATION,CLASSIFICATION_ID,GROUP,GROUP_ID,GROUP_ORDER,...,ESTIMATE_TYPE,ESTIMATE_TYPE_ID,TIME_PERIOD,TIME_PERIOD_ID,ESTIMATE,STANDARD_ERROR,ESTIMATE_LCI,ESTIMATE_UCI,FLAG,FOOTNOTE_ID_LIST
0,Angina/angina pectoris,,,Cardiovascular diseases,60,Total,0,Total,1,1,...,"Percent of population, crude",1,2019,,1.7,,1.5,1.9,,"NT_NHISA00,NT_NHISA999,FN_NHISA18,SC_NHISA00"
1,Angina/angina pectoris,,,Cardiovascular diseases,60,Total,0,Total,1,1,...,"Percent of population, crude",1,2020,,1.5,,1.3,1.6,,"NT_NHISA00,NT_NHISA999,FN_NHISA18,SC_NHISA00"
2,Angina/angina pectoris,,,Cardiovascular diseases,60,Total,0,Total,1,1,...,"Percent of population, crude",1,2021,,1.5,,1.4,1.7,,"NT_NHISA00,NT_NHISA999,FN_NHISA18,SC_NHISA00"
3,Angina/angina pectoris,,,Cardiovascular diseases,60,Total,0,Total,1,1,...,"Percent of population, crude",1,2022,,1.6,,1.5,1.8,,"NT_NHISA00,NT_NHISA999,FN_NHISA18,SC_NHISA00"
4,Angina/angina pectoris,,,Cardiovascular diseases,60,Total,0,Total,1,1,...,"Percent of population, crude",1,2023,,1.6,,1.4,1.8,,"NT_NHISA00,NT_NHISA999,FN_NHISA18,SC_NHISA00"


### 2. Inspect Features

In [3]:
# Show every column label of the loaded frame.
feature_names = df.columns
print("Features: ", feature_names)

Features:  Index(['TOPIC', 'SUBTOPIC', 'SUBTOPIC_ID', 'TAXONOMY', 'TAXONOMY_ID',
       'CLASSIFICATION', 'CLASSIFICATION_ID', 'GROUP', 'GROUP_ID',
       'GROUP_ORDER', 'SUBGROUP', 'SUBGROUP_ID', 'SUBGROUP_ORDER',
       'NESTING_LABEL', 'NESTING_LABEL_ID', 'ESTIMATE_TYPE',
       'ESTIMATE_TYPE_ID', 'TIME_PERIOD', 'TIME_PERIOD_ID', 'ESTIMATE',
       'STANDARD_ERROR', 'ESTIMATE_LCI', 'ESTIMATE_UCI', 'FLAG',
       'FOOTNOTE_ID_LIST'],
      dtype='object')


### 3. Select Columns

In [4]:
# Narrow the loaded frame to the six columns listed here.
selected_columns = [
    'TOPIC',
    'GROUP',
    'SUBGROUP',
    'TIME_PERIOD',
    'ESTIMATE',
    'FLAG',
]
df_clean = df[selected_columns]

# Preview the first five rows of the narrowed frame.
df_clean.head(n=5)

Unnamed: 0,TOPIC,GROUP,SUBGROUP,TIME_PERIOD,ESTIMATE,FLAG
0,Angina/angina pectoris,Total,18 years and older,2019,1.7,
1,Angina/angina pectoris,Total,18 years and older,2020,1.5,
2,Angina/angina pectoris,Total,18 years and older,2021,1.5,
3,Angina/angina pectoris,Total,18 years and older,2022,1.6,
4,Angina/angina pectoris,Total,18 years and older,2023,1.6,


### 4. Check Null Values

In [5]:
# Count missing values per column: build the boolean null mask, then sum it column-wise.
null_mask = df_clean.isna()
null_mask.sum()

TOPIC              0
GROUP              0
SUBGROUP           0
TIME_PERIOD        0
ESTIMATE        2369
FLAG           23609
dtype: int64

### 5. Extracting Rows with Reliable Estimates

In [6]:
# A missing FLAG is treated as marking a reliable estimate, so keep only those rows.
flag_missing = df_clean['FLAG'].isna()
df_preprocessed = df_clean[flag_missing]

print("No.of rows & columns after preprocessing: ", df_preprocessed.shape)

# Re-count missing values per column on the filtered frame.
df_preprocessed.isna().sum()

No.of rows & columns after preprocessing:  (23609, 6)


TOPIC              0
GROUP              0
SUBGROUP           0
TIME_PERIOD        0
ESTIMATE           0
FLAG           23609
dtype: int64

In [7]:
# Preview the first ten rows of the filtered frame.
df_preprocessed.iloc[:10]

Unnamed: 0,TOPIC,GROUP,SUBGROUP,TIME_PERIOD,ESTIMATE,FLAG
0,Angina/angina pectoris,Total,18 years and older,2019,1.7,
1,Angina/angina pectoris,Total,18 years and older,2020,1.5,
2,Angina/angina pectoris,Total,18 years and older,2021,1.5,
3,Angina/angina pectoris,Total,18 years and older,2022,1.6,
4,Angina/angina pectoris,Total,18 years and older,2023,1.6,
5,Angina/angina pectoris,Total,18 years and older,2024,1.6,
6,Angina/angina pectoris,Age groups with 65 years and older,18-34 years,2019,0.2,
7,Angina/angina pectoris,Age groups with 65 years and older,18-34 years,2020,0.3,
8,Angina/angina pectoris,Age groups with 65 years and older,18-34 years,2021,0.4,
9,Angina/angina pectoris,Age groups with 65 years and older,18-34 years,2022,0.2,


### 5.1. Historic Years

In [8]:
# List the distinct TIME_PERIOD values present after filtering.
time_periods = df_preprocessed["TIME_PERIOD"]
time_periods.unique()

array([2019, 2020, 2021, 2022, 2023, 2024])

### 6. Writing Data Back to CSV

In [9]:
# Write the filtered frame to disk; index=False leaves the row index out of the file.
df_preprocessed.to_csv(path_or_buf='./preprocessed_data.csv', index=False)

In [10]:
# Read the exported file back in and display it.
df_roundtrip = pd.read_csv('./preprocessed_data.csv')
df_roundtrip

Unnamed: 0,TOPIC,GROUP,SUBGROUP,TIME_PERIOD,ESTIMATE,FLAG
0,Angina/angina pectoris,Total,18 years and older,2019,1.7,
1,Angina/angina pectoris,Total,18 years and older,2020,1.5,
2,Angina/angina pectoris,Total,18 years and older,2021,1.5,
3,Angina/angina pectoris,Total,18 years and older,2022,1.6,
4,Angina/angina pectoris,Total,18 years and older,2023,1.6,
...,...,...,...,...,...,...
23604,Wellness visit,Poverty level,≥200% FPL,2020,78.9,
23605,Wellness visit,Poverty level,≥200% FPL,2021,77.0,
23606,Wellness visit,Poverty level,≥200% FPL,2022,78.8,
23607,Wellness visit,Poverty level,≥200% FPL,2023,80.5,


In [11]:
# Look up the dtype of the TIME_PERIOD column of the in-memory filtered frame.
df_preprocessed.dtypes['TIME_PERIOD']

dtype('int64')