# Data EDA and visualization

### Importing required libraries

In [68]:
import numpy as np
import pandas as pd
import math

In [69]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer
from sklearn.preprocessing import LabelEncoder

In [49]:
# Load the survey records from the CSV file that sits next to the notebook.
data = pd.read_csv(filepath_or_buffer='csv_data.csv')

In [50]:
# Preview the first five rows of the frame.
data.head(n=5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,,No,...,,,,No,No,Yes,No,"Yes, received tetanus shot but not sure what type",No,No
1,Alabama,Female,Excellent,0.0,0.0,,No,6.0,,No,...,1.6,68.04,26.57,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No
2,Alabama,Female,Very good,2.0,3.0,Within past year (anytime less than 12 months ...,Yes,5.0,,No,...,1.57,63.5,25.61,No,No,No,No,,No,Yes
3,Alabama,Female,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,,No,...,1.65,63.5,23.3,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
4,Alabama,Female,Fair,2.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,,No,...,1.57,53.98,21.77,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,No


### NULL values in data

In [51]:
# Count the missing entries in every column.
data.isna().sum()

State                            0
Sex                              0
GeneralHealth                 1198
PhysicalHealthDays           10927
MentalHealthDays              9067
LastCheckupTime               8308
PhysicalActivities            1093
SleepHours                    5453
RemovedTeeth                 11360
HadHeartAttack                3065
HadAngina                     4405
HadStroke                     1557
HadAsthma                     1773
HadSkinCancer                 3143
HadCOPD                       2219
HadDepressiveDisorder         2812
HadKidneyDisease              1926
HadArthritis                  2633
HadDiabetes                   1087
DeafOrHardOfHearing          20647
BlindOrVisionDifficulty      21564
DifficultyConcentrating      24240
DifficultyWalking            24012
DifficultyDressingBathing    23915
DifficultyErrands            25656
SmokerStatus                 35462
ECigaretteUsage              35660
ChestScan                    56046
RaceEthnicityCategor

We will remove every row that contains at least one NULL value.

In [52]:
# Report the shape before and after dropping every row that has at least one
# missing value (axis=0 / how='any' are the dropna defaults, spelled out here).
shape_with_nulls = data.shape
print(f"Old data shape: {shape_with_nulls}")
data = data.dropna(axis=0, how='any')
print(f"No NULL data shape: {data.shape}")

Old data shape: (445132, 40)
No NULL data shape: (246022, 40)


### Duplicate records in data

In [53]:
# Number of rows that exactly repeat an earlier row.
data.duplicated(keep='first').sum()

9

We will remove the duplicate records from the data, keeping the first occurrence of each.

In [54]:
# Report the shape before and after removing repeated rows; the first
# occurrence of each duplicated row is the one that is kept.
shape_with_duplicates = data.shape
print(f"Old data shape: {shape_with_duplicates}")
data = data.drop_duplicates(keep='first')
print(f"No duplicates data shape: {data.shape}")

Old data shape: (246022, 40)
No duplicates data shape: (246013, 40)


### Description of data

In [55]:
# Summary statistics (count, mean, std, min, quartiles, max) of the frame.
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,PhysicalHealthDays,MentalHealthDays,SleepHours,HeightInMeters,WeightInKilograms,BMI
count,246013.0,246013.0,246013.0,246013.0,246013.0,246013.0
mean,4.119055,4.167292,7.021312,1.70515,83.615522,28.668258
std,8.405803,8.102796,1.440698,0.106654,21.323232,6.514005
min,0.0,0.0,1.0,0.91,28.12,12.02
25%,0.0,0.0,6.0,1.63,68.04,24.27
50%,0.0,0.0,7.0,1.7,81.65,27.46
75%,3.0,4.0,8.0,1.78,95.25,31.89
max,30.0,30.0,24.0,2.41,292.57,97.65


### Value Counts of the target feature 'HadHeartAttack'

In [56]:
# Frequency of each class of the target column.
target_counts = data['HadHeartAttack'].value_counts()
target_counts

HadHeartAttack
No     232578
Yes     13435
Name: count, dtype: int64

As we can see, the target is heavily imbalanced: only about 5.5% of the records (13,435 of 246,013) have `HadHeartAttack` equal to `Yes`.<br>
We will address this imbalance later in the project.

### Value Counts of the remaining categorical features

In [57]:
# Collect the object-dtype columns (the target is excluded because it was
# shown separately), then print the value counts of each one in column order.
object_features = [
    name
    for name, dtype in data.dtypes.items()
    if dtype == 'object' and name != 'HadHeartAttack'
]
for name in object_features:
    print(data[name].value_counts(), end='\n\n')

State
Washington              14998
Maryland                 9163
Minnesota                9161
Ohio                     8995
New York                 8923
Texas                    7408
Florida                  7315
Kansas                   6145
Wisconsin                6126
Maine                    6013
Iowa                     5672
Hawaii                   5596
Virginia                 5565
Indiana                  5502
South Carolina           5471
Massachusetts            5465
Arizona                  5461
Utah                     5373
Michigan                 5370
Colorado                 5159
Nebraska                 5107
California               5096
Connecticut              5053
Georgia                  4978
Vermont                  4844
South Dakota             4404
Montana                  4264
Missouri                 4195
New Jersey               3966
New Hampshire            3756
Puerto Rico              3589
Idaho                    3468
Alaska                   3205
Rhod

### Correlation in the data

In [60]:
# Integer-encode every object-dtype column in place so that `data.corr()` can
# be computed over the whole frame.
#
# Each fitted encoder is stored in `label_encoders`, keyed by column name, so
# the integer codes can later be translated back to the original labels with
# `label_encoders[column].inverse_transform(...)`. Previously the encoder was
# overwritten on every iteration and the mapping was lost.
#
# Re-running this cell after the columns are already encoded finds no object
# columns and therefore leaves `label_encoders` empty.
#
# NOTE(review): LabelEncoder numbers the classes in sorted order, so the codes
# of columns with more than two categories carry no meaningful order. Confirm
# this is acceptable for the correlation analysis that follows.
label_encoders = {}
features = zip(data.columns.to_list(), data.dtypes.to_list())
for feature, feature_type in features:
    if feature_type == 'object':
        label_encoder = LabelEncoder()
        data[feature] = label_encoder.fit_transform(data[feature])
        label_encoders[feature] = label_encoder

In [75]:
# Absolute correlation of every column with the target, weakest first.
target_correlations = data.corr()['HadHeartAttack']
target_correlations.abs().sort_values()

State                        0.002085
SleepHours                   0.003634
TetanusLast10Tdap            0.010006
HIVTesting                   0.014565
ECigaretteUsage              0.015482
CovidPos                     0.020968
HighRiskLastYear             0.021129
GeneralHealth                0.021599
HeightInMeters               0.023059
HadDepressiveDisorder        0.023701
HadAsthma                    0.023759
RaceEthnicityCategory        0.024206
MentalHealthDays             0.025888
BMI                          0.030409
WeightInKilograms            0.038432
FluVaxLast12                 0.045242
HadSkinCancer                0.049415
DifficultyConcentrating      0.051661
LastCheckupTime              0.065852
BlindOrVisionDifficulty      0.072962
Sex                          0.073317
RemovedTeeth                 0.073714
AlcoholDrinkers              0.074176
SmokerStatus                 0.077334
DifficultyDressingBathing    0.083089
PhysicalActivities           0.083186
DifficultyEr

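Features with a high absolute correlation with `HadHeartAttack` are the strongest candidate predictors. Because the categorical features were label-encoded with arbitrary integer codes, the correlations of features with more than two categories should be interpreted with caution.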