In [1]:
# Imports libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

import sys
sys.path.append('../')

from scripts.utils.utils import Utils as utils


In [116]:
# Step 1: Load data
df = pd.read_csv("../data/raw/data_train.csv")

df

Unnamed: 0,Id,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,ExtraMedTest,ExtraAlcoholTest,Status
0,0,0,0,1,24,1,0,0,1,0,...,0,0,0,0,8,4,5,60,0,0
1,1,0,0,1,28,0,0,0,1,1,...,1,0,0,0,2,6,8,0,-64,0
2,2,0,0,1,36,1,0,0,1,1,...,30,30,1,0,3,2,1,-46,0,0
3,3,0,1,1,35,0,0,0,1,1,...,0,0,0,0,8,6,8,-83,-188,0
4,4,0,1,1,27,0,0,0,1,0,...,0,0,0,0,9,5,4,-58,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202939,202939,0,0,1,29,0,0,0,1,0,...,10,2,0,0,8,6,7,-66,4,0
202940,202940,0,0,1,23,0,0,0,1,0,...,0,0,0,0,11,5,7,0,-84,0
202941,202941,1,1,1,38,0,0,0,1,1,...,0,2,0,1,10,6,6,-57,-16,0
202942,202942,0,1,1,22,0,0,0,1,1,...,0,14,0,0,9,6,7,59,0,0


In [117]:
# Step 2: Inspect data
# Note: Given that all features have 202944 non-null counts and the dataset has 202944 rows, it's likely that there are no null values present.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202944 entries, 0 to 202943
Data columns (total 25 columns):
 #   Column                Non-Null Count   Dtype
---  ------                --------------   -----
 0   Id                    202944 non-null  int64
 1   HighBP                202944 non-null  int64
 2   HighChol              202944 non-null  int64
 3   CholCheck             202944 non-null  int64
 4   BMI                   202944 non-null  int64
 5   Smoker                202944 non-null  int64
 6   Stroke                202944 non-null  int64
 7   HeartDiseaseorAttack  202944 non-null  int64
 8   PhysActivity          202944 non-null  int64
 9   Fruits                202944 non-null  int64
 10  Veggies               202944 non-null  int64
 11  HvyAlcoholConsump     202944 non-null  int64
 12  AnyHealthcare         202944 non-null  int64
 13  NoDocbcCost           202944 non-null  int64
 14  GenHlth               202944 non-null  int64
 15  MentHlth              202944 non-n

In [118]:
# Step 3.1: Remove unused column(s)
df.drop(
    columns=["Id"],
    inplace=True,
)

# Step 3.2: Remove duplicate row(S)
df.drop_duplicates(inplace=True)

In [119]:
# Step 4: Handle invalid datas (non numerical)

# Convert all features to numeric
df = utils.to_numeric(df.copy())

# Summarize invalid data per feature
df.isna().sum()

HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
ExtraMedTest            0
ExtraAlcoholTest        0
Status                  0
dtype: int64

In [120]:
# Step 5: Feature scaling
# Min-max scaling is applied to maintain uniformity within the data range.
# Formular: X = (X - min) / (max - min)

min_max_scaler = MinMaxScaler()

columns_to_scale = df.columns
df[columns_to_scale] = min_max_scaler.fit_transform(df[columns_to_scale])

# Inspect data after scaling
df.describe().round(3)

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,ExtraMedTest,ExtraAlcoholTest,Status
count,202736.0,202736.0,202736.0,202736.0,202736.0,202736.0,202736.0,202736.0,202736.0,202736.0,...,202736.0,202736.0,202736.0,202736.0,202736.0,202736.0,202736.0,202736.0,202736.0,202736.0
mean,0.429,0.425,0.963,0.191,0.443,0.041,0.094,0.756,0.635,0.811,...,0.107,0.142,0.169,0.44,0.587,0.81,0.722,0.481,0.481,0.176
std,0.495,0.494,0.19,0.077,0.497,0.198,0.293,0.429,0.481,0.391,...,0.248,0.291,0.375,0.496,0.254,0.197,0.296,0.191,0.191,0.381
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,0.14,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.417,0.6,0.571,0.362,0.362,0.0
50%,0.0,0.0,1.0,0.174,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.583,0.8,0.857,0.5,0.5,0.0
75%,1.0,1.0,1.0,0.221,1.0,0.0,0.0,1.0,1.0,1.0,...,0.067,0.1,0.0,1.0,0.75,1.0,1.0,0.601,0.601,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [121]:
def get_outliers_IQR(data):
    percentile_25 = np.percentile(data, 25)
    percentile_75 = np.percentile(data, 75)

    IQR = percentile_75 - percentile_25
    upper_limit = percentile_75 + IQR * 1.5
    lower_limit = percentile_25 - IQR * 1.5

    outliers = [x for x in data if x > upper_limit or x < lower_limit]

    return outliers


# for column in df.columns:
#     outliers = get_outliers_IQR(df[column])
#     print("Column:" + column + outliers)

# len(get_outliers_IQR(data=df["BMI"]))
print(get_outliers_IQR(data=df["CholCheck"]))

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

In [122]:
# df.to_csv("../data/processed/data_train_processed.csv")