## Study_1 Agenda:

1. Clean Data<br>
2. One-hot encoding<br>
3. Data Scaling<br>
4. Modelling with Pipeline

### 1. Clean Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('heart_disease_data.csv')
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4238 entries, 0 to 4237
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4238 non-null   int64  
 1   age              4238 non-null   int64  
 2   education        4133 non-null   float64
 3   currentSmoker    4238 non-null   int64  
 4   cigsPerDay       4209 non-null   float64
 5   BPMeds           4185 non-null   float64
 6   prevalentStroke  4238 non-null   int64  
 7   prevalentHyp     4238 non-null   int64  
 8   diabetes         4238 non-null   int64  
 9   totChol          4188 non-null   float64
 10  sysBP            4238 non-null   float64
 11  diaBP            4238 non-null   float64
 12  BMI              4219 non-null   float64
 13  heartRate        4237 non-null   float64
 14  glucose          3850 non-null   float64
 15  TenYearCHD       4238 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 529.9 KB


In [4]:
df.columns = df.columns.str.lower()

In [5]:
df.columns

Index(['male', 'age', 'education', 'currentsmoker', 'cigsperday', 'bpmeds',
       'prevalentstroke', 'prevalenthyp', 'diabetes', 'totchol', 'sysbp',
       'diabp', 'bmi', 'heartrate', 'glucose', 'tenyearchd'],
      dtype='object')

In [6]:
# Main goal is to explore techniques. Therefore, we will be dropping columns for easier data manipulation
df.drop(['bpmeds', 'prevalentstroke', 'prevalenthyp', 'diabetes', 'totchol', 'sysbp', 'diabp'], axis=1, inplace=True)

In [7]:
# Target variable is tenyearchd (Ten Year Coronary Heart Disease)
df.head()

Unnamed: 0,male,age,education,currentsmoker,cigsperday,bmi,heartrate,glucose,tenyearchd
0,1,39,4.0,0,0.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,23.1,85.0,85.0,0


In [53]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
male,4237.0,0.429077,0.495003,0.0,0.0,0.0,1.0,1.0
age,4237.0,49.581544,8.57031,32.0,42.0,49.0,56.0,70.0
education,4237.0,1.930139,1.053046,0.0,1.0,2.0,3.0,4.0
currentsmoker,4237.0,0.493982,0.500023,0.0,0.0,0.0,1.0,1.0
cigsperday,4237.0,9.000765,11.879972,0.0,0.0,0.0,20.0,70.0
bmi,4237.0,25.799382,4.071168,15.54,23.08,25.4,28.03,56.8
heartrate,4237.0,75.878924,12.026596,44.0,68.0,75.0,83.0,143.0
glucose,4237.0,81.601369,22.867488,40.0,72.0,78.0,85.0,394.0
tenyearchd,4237.0,0.151758,0.358829,0.0,0.0,0.0,0.0,1.0
education_encoded,4237.0,1.930139,1.053046,0.0,1.0,2.0,3.0,4.0


In [8]:
df.isnull().sum()

male               0
age                0
education        105
currentsmoker      0
cigsperday        29
bmi               19
heartrate          1
glucose          388
tenyearchd         0
dtype: int64

In [9]:
(df.isnull().sum()) / df.shape[0] * 100 # Proportion of missing values to entire dataset 

male             0.000000
age              0.000000
education        2.477584
currentsmoker    0.000000
cigsperday       0.684285
bmi              0.448325
heartrate        0.023596
glucose          9.155262
tenyearchd       0.000000
dtype: float64

In [10]:
# Fillna with 0, because there could be a person with no education level 
df.education.fillna(0, inplace=True)

In [11]:
# Median resist outliers 
df.glucose.fillna(df.glucose.median(), inplace=True)

In [12]:
df.groupby('education').agg({'cigsperday': 'mean'})

Unnamed: 0_level_0,cigsperday
education,Unnamed: 1_level_1
0.0,7.932039
1.0,8.683626
2.0,9.839228
3.0,8.090909
4.0,9.510638


In [13]:
# Fillna of cigsperday by education's mean for each group 
df['cigsperday'] = df.groupby('education')['cigsperday'].apply(lambda x: x.fillna(x.mean()))

In [14]:
df.bmi.fillna(df.bmi.median(), inplace=True)

In [15]:
df.dropna(how='any',axis=0, inplace=True) # use drop row for 1 missing values in heartrate

In [55]:
df.isnull().sum()

male                 0
age                  0
education            0
currentsmoker        0
cigsperday           0
bmi                  0
heartrate            0
glucose              0
tenyearchd           0
education_encoded    0
dtype: int64

__We have explored different ways of Data Imputation__ 

In [17]:
df.shape

(4237, 9)

In [18]:
df.dtypes

male               int64
age                int64
education        float64
currentsmoker      int64
cigsperday       float64
bmi              float64
heartrate        float64
glucose          float64
tenyearchd         int64
dtype: object

In [56]:
df[df.duplicated()] # Find duplicated value. In this case, we have none

Unnamed: 0,male,age,education,currentsmoker,cigsperday,bmi,heartrate,glucose,tenyearchd,education_encoded


In [57]:
df.isnull().values.any() # Final Check

False

### 2. One-hot Encoding

In [21]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [22]:
le = LabelEncoder()
df['education_encoded'] = le.fit_transform(df.education) # Transform education into label encoder column 

In [23]:
df.head()# Imagine education is a animal column with Dog, Cat, Fish, Bird, Snake. Label encoder assign a number for 
# each type of value in categorical column. 

Unnamed: 0,male,age,education,currentsmoker,cigsperday,bmi,heartrate,glucose,tenyearchd,education_encoded
0,1,39,4.0,0,0.0,26.97,80.0,77.0,0,4
1,0,46,2.0,0,0.0,28.73,95.0,76.0,0,2
2,1,48,1.0,1,20.0,25.34,75.0,70.0,0,1
3,0,61,3.0,1,30.0,28.58,65.0,103.0,1,3
4,0,46,3.0,1,23.0,23.1,85.0,85.0,0,3


In [24]:
df.education.value_counts()

1.0    1719
2.0    1253
3.0     687
4.0     473
0.0     105
Name: education, dtype: int64

In [25]:
df.education_encoded.value_counts()

1    1719
2    1253
3     687
4     473
0     105
Name: education_encoded, dtype: int64

In [26]:
# This line of code transform 'Label Encoder' into a Binary Matrix, respectively to the category column(education) 
encoder = OneHotEncoder(categories = 'auto')
X = encoder.fit_transform(df.education_encoded.values.reshape(-1,1)).toarray()
X

array([[0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.]])

In [52]:
unknown = encoder.fit_transform(df.education_encoded.values.reshape(-1,1)) # figure out the code without .toarray()
type(unknown) # What is this? 

scipy.sparse.csr.csr_matrix

In [27]:
dfonehot = pd.DataFrame(X) # Turn matrix to dataframe
df1 = pd.concat([df, dfonehot], axis=1) # combine old dataframe with new datafame(one-hot encoder)
df1.head()

Unnamed: 0,male,age,education,currentsmoker,cigsperday,bmi,heartrate,glucose,tenyearchd,education_encoded,0,1,2,3,4
0,1.0,39.0,4.0,0.0,0.0,26.97,80.0,77.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0
1,0.0,46.0,2.0,0.0,0.0,28.73,95.0,76.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0
2,1.0,48.0,1.0,1.0,20.0,25.34,75.0,70.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,0.0,61.0,3.0,1.0,30.0,28.58,65.0,103.0,1.0,3.0,0.0,0.0,0.0,1.0,0.0
4,0.0,46.0,3.0,1.0,23.0,23.1,85.0,85.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0


In [28]:
df1 = df1.rename(columns = {0: 'E_0', 1: 'E_1', 2: 'E_2', 3: 'E_3', 4: 'E_4'}) # Correctly rename to reflect the category
df1.head()
# We can drop the education column to reduce data dimension

Unnamed: 0,male,age,education,currentsmoker,cigsperday,bmi,heartrate,glucose,tenyearchd,education_encoded,E_0,E_1,E_2,E_3,E_4
0,1.0,39.0,4.0,0.0,0.0,26.97,80.0,77.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0
1,0.0,46.0,2.0,0.0,0.0,28.73,95.0,76.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0
2,1.0,48.0,1.0,1.0,20.0,25.34,75.0,70.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,0.0,61.0,3.0,1.0,30.0,28.58,65.0,103.0,1.0,3.0,0.0,0.0,0.0,1.0,0.0
4,0.0,46.0,3.0,1.0,23.0,23.1,85.0,85.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0


### 3. Data Scaling

In [29]:
from sklearn.preprocessing import MinMaxScaler

In [30]:
# Normalize data
scaler = MinMaxScaler()
cols = df1.columns
a = scaler.fit_transform(df1)
scaled_df1 = pd.DataFrame(a, columns=cols) # Create new dataframe with scaled array & column names
scaled_df1.head()

Unnamed: 0,male,age,education,currentsmoker,cigsperday,bmi,heartrate,glucose,tenyearchd,education_encoded,E_0,E_1,E_2,E_3,E_4
0,1.0,0.184211,1.0,0.0,0.0,0.277024,0.363636,0.10452,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.368421,0.5,0.0,0.0,0.31968,0.515152,0.101695,0.0,0.5,0.0,0.0,1.0,0.0,0.0
2,1.0,0.421053,0.25,1.0,0.285714,0.237518,0.313131,0.084746,0.0,0.25,0.0,1.0,0.0,0.0,0.0
3,0.0,0.763158,0.75,1.0,0.428571,0.316045,0.212121,0.177966,1.0,0.75,0.0,0.0,0.0,1.0,0.0
4,0.0,0.368421,0.75,1.0,0.328571,0.183228,0.414141,0.127119,0.0,0.75,0.0,0.0,0.0,1.0,0.0


In [31]:
# Data without scaling has higher range
df1.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
male,4237.0,0.429077,0.495003,0.0,0.0,0.0,1.0,1.0
age,4237.0,49.581544,8.57031,32.0,42.0,49.0,56.0,70.0
education,4237.0,1.930139,1.053046,0.0,1.0,2.0,3.0,4.0
currentsmoker,4237.0,0.493982,0.500023,0.0,0.0,0.0,1.0,1.0
cigsperday,4237.0,9.000765,11.879972,0.0,0.0,0.0,20.0,70.0
bmi,4237.0,25.799382,4.071168,15.54,23.08,25.4,28.03,56.8
heartrate,4237.0,75.878924,12.026596,44.0,68.0,75.0,83.0,143.0
glucose,4237.0,81.601369,22.867488,40.0,72.0,78.0,85.0,394.0
tenyearchd,4237.0,0.151758,0.358829,0.0,0.0,0.0,0.0,1.0
education_encoded,4237.0,1.930139,1.053046,0.0,1.0,2.0,3.0,4.0


In [32]:
# Scaled data have 0 - 1 of min and max range
scaled_df1.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
male,4237.0,0.429077,0.495003,0.0,0.0,0.0,1.0,1.0
age,4237.0,0.462672,0.225534,0.0,0.263158,0.447368,0.631579,1.0
education,4237.0,0.482535,0.263261,0.0,0.25,0.5,0.75,1.0
currentsmoker,4237.0,0.493982,0.500023,0.0,0.0,0.0,1.0,1.0
cigsperday,4237.0,0.128582,0.169714,0.0,0.0,0.0,0.285714,1.0
bmi,4237.0,0.248652,0.098671,0.0,0.182744,0.238972,0.302714,1.0
heartrate,4237.0,0.322009,0.121481,0.0,0.242424,0.313131,0.393939,1.0
glucose,4237.0,0.117518,0.064597,0.0,0.090395,0.107345,0.127119,1.0
tenyearchd,4237.0,0.151758,0.358829,0.0,0.0,0.0,0.0,1.0
education_encoded,4237.0,0.482535,0.263261,0.0,0.25,0.5,0.75,1.0


### 4. Pipeline

In [41]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler # Standardization is different from Normalization

In [43]:
# Call Target Variable and Predictors
X = df1.drop('tenyearchd', axis=1)
y = df1['tenyearchd']
print(X.shape, y.shape)

(4236, 14) (4236,)


In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# Instantiate Logistic Regression and Data Scaling 
logreg = LogisticRegression()
scaler = StandardScaler() 

# Use pipeline to combine LogisticRegression and StandardScaler
steps = [('s', scaler), ('log', logreg)]
pipeline = Pipeline(steps) # Pipeline combine all the tools into 1 function. COOOOOL !!

pipeline.fit(X_train, y_train)

score = pipeline.score(X_test, y_test)
print(f'Model Accuracy: {score:.3f}')

Model Accuracy: 0.865


__LOVE to take questions or suggestions. Please message Ricky.__