In [54]:
import pandas as pd
import numpy as np


# DataCleaning(Numerical & categorical)

In [55]:

data=pd.read_csv('cardiouncleaned.csv')

y_value_counts = data['cardio'].value_counts() # to count the values in cardio individualy
print(y_value_counts.dtype)
print(data.shape)

data.drop(labels=data[data["ap_hi"]<60].index,axis = 0,inplace = True)#delete rows that have ap_hi value less than 60
data.drop(data[data["ap_hi"]>240].index,axis = 0,inplace = True)#delete rows that have ap_hi value grater than 240
#data.drop(data[data["ap_hi"]<60 and data["ap_hi"]>240].index,axis = 0,inplace = True)
data.drop(data[data["ap_lo"]>180].index,axis = 0,inplace = True)#delete rows that have ap_lo value grater than 180
data.drop(data[data["ap_lo"]<40].index,axis = 0,inplace = True)#delete rows that have ap_lo value less than 40
#data.drop(data[data["ap_lo"]<40 and data[data["ap_lo"]>180]].index,axis = 0,inplace = True)
data.drop(data[data["weight"]<50].index,axis = 0,inplace = True)#delete rows that have weight less than 50 kg
data.drop(data[data["height"]<120].index,axis = 0,inplace = True)#delete rows that have height less than 120

data['gender']= data['gender'].replace(1,'female') #replace gender value 1 to female
data['gender']= data['gender'].replace(2,'male') #replace gender value 2 to male
data['cholesterol']= data['cholesterol'].replace(1,'c1_average') #replacing colestrol and all
data['cholesterol']= data['cholesterol'].replace(2,'c2_aboveaverage')
data['cholesterol']= data['cholesterol'].replace(3,'c3_high')
data['gluc']= data['gluc'].replace(1,'g1_average')
data['gluc']= data['gluc'].replace(2,'g2_aboveaverage')
data['gluc']= data['gluc'].replace(3,'g3_high')

print(data.shape)
print(data.head())

data.to_csv (r'cardiocleaned.csv') # saving data in a new file in csv format

int64
(70000, 13)
(67746, 13)
   id    age  gender  height  weight  ap_hi  ap_lo cholesterol        gluc  \
0   0  18393    male     168    62.0    110     80  c1_average  g1_average   
1   1  20228  female     156    85.0    140     90     c3_high  g1_average   
2   2  18857  female     165    64.0    130     70     c3_high  g1_average   
3   3  17623    male     169    82.0    150    100  c1_average  g1_average   
4   4  17474  female     156    56.0    100     60  c1_average  g1_average   

   smoke  alco  active  cardio  
0      0     0       1       0  
1      0     0       1       1  
2      0     0       0       1  
3      0     0       1       1  
4      0     0       0       0  


# Dividing dataframe into inputs(features) and output(labels)

In [56]:
data=pd.read_csv('cardiocleaned.csv')# reading the cleaned dataset

label = data['cardio'].values # output is seperated to a variable called label
features = data.drop(['cardio'], axis=1, inplace=False) #Input is seperated to variable data
print(label)
print(features)

[0 1 1 ... 1 1 0]
       Unnamed: 0     id    age  gender  height  weight  ap_hi  ap_lo  \
0               0      0  18393    male     168    62.0    110     80   
1               1      1  20228  female     156    85.0    140     90   
2               2      2  18857  female     165    64.0    130     70   
3               3      3  17623    male     169    82.0    150    100   
4               4      4  17474  female     156    56.0    100     60   
...           ...    ...    ...     ...     ...     ...    ...    ...   
67741       69995  99993  19240    male     168    76.0    120     80   
67742       69996  99995  22601  female     158   126.0    140     90   
67743       69997  99996  19066    male     183   105.0    180     90   
67744       69998  99998  22431  female     163    72.0    135     80   
67745       69999  99999  20540  female     170    72.0    120     80   

           cholesterol             gluc  smoke  alco  active  
0           c1_average       g1_average   

# Splitting data into Training,CrossValidation & Testing DataSets

In [57]:
# split data  into training, validation, testing data
from sklearn.model_selection import train_test_split # class to split data 

#For example, if variable y is a binary categorical variable with values 0 and 1 and there are 25% of zeros and 75% of ones, stratify=y will make sure that your random split has 25% of 0's and 75% of 1's.
#In this context, stratification means that the train_test_split method returns training and test subsets that have the same proportions of class labels as the input dataset.
inputtrain, inputtest, outputtrain, outputtest = train_test_split(features, label, test_size=0.2, stratify=label)
inputtrain, inputcv, outputtrain, outputcv = train_test_split(inputtrain, outputtrain, test_size=0.2, stratify=outputtrain)
#60%Train,20%CrossValidation,20%Test DataSets
print(inputtrain.shape)
print(inputcv.shape)
print(inputtest.shape)
print(outputtrain.shape)
print(outputcv.shape)
print(outputtest.shape)


(43356, 13)
(10840, 13)
(13550, 13)
(43356,)
(10840,)
(13550,)


# Data Preprocessing on Training,CV&Testing datasets

In [58]:
#fit() applied only on inputtrain datasets, transform() applied on inputtrain,inputtest,inputcv datasets
#fit() & transform() applied on each & every column(features)

In [59]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer

# scale age using min max
scale = MinMaxScaler()
 # reshape because of minmax take column and scale
scale.fit(inputtrain['age'].values.reshape(-1,1))
inputtrain_age = scale.transform(inputtrain['age'].values.reshape(-1,1))
inputcv_age = scale.transform(inputcv['age'].values.reshape(-1,1))
inputtest_age = scale.transform(inputtest['age'].values.reshape(-1,1))
print(inputtrain_age.shape)
print(inputcv_age.shape)
print(inputtest_age.shape)
print(outputtrain.shape)
print(outputcv.shape)
print(outputtest.shape)
print("..........................")




# convert gender to one hot encoding

vec = CountVectorizer()
vec.fit(inputtrain['gender'].values) 

inputtrain_gender = vec.transform(inputtrain['gender'].values)
inputcv_gender = vec.transform(inputcv['gender'].values)
inputtest_gender = vec.transform(inputtest['gender'].values)

print(inputtrain_gender.shape)
print(inputcv_gender.shape)
print(inputtest_gender.shape)
print(outputtrain.shape)
print(outputcv.shape)
print(outputtest.shape)
print("..........................")


# scale height using min max
scale = MinMaxScaler()
scale.fit(inputtrain['height'].values.reshape(-1,1)) # reshape because of minmax take column and scale

inputtrain_height = scale.transform(inputtrain['height'].values.reshape(-1,1))
inputcv_height = scale.transform(inputcv['height'].values.reshape(-1,1))
inputtest_height = scale.transform(inputtest['height'].values.reshape(-1,1))
print(inputtrain_height.shape)
print(inputcv_height.shape)
print(inputtest_height.shape)
print(outputtrain.shape)
print(outputcv.shape)
print(outputtest.shape)
print("..........................")


# scale weight using min max
scale = MinMaxScaler()
scale.fit(inputtrain['weight'].values.reshape(-1,1)) # reshape because of minmax take column and scale

inputtrain_weight = scale.transform(inputtrain['weight'].values.reshape(-1,1))
inputcv_weight = scale.transform(inputcv['weight'].values.reshape(-1,1))
inputtest_weight = scale.transform(inputtest['weight'].values.reshape(-1,1))
print(inputtrain_weight.shape)
print(inputcv_weight.shape)
print(inputtest_weight.shape)
print(outputtrain.shape)
print(outputcv.shape)
print(outputtest.shape)
print("..........................")


# scale ap_hi using min max
scale = MinMaxScaler()
scale.fit(inputtrain['ap_hi'].values.reshape(-1,1)) # reshape because of minmax take column and scale

inputtrain_ap_hi = scale.transform(inputtrain['ap_hi'].values.reshape(-1,1))
inputcv_ap_hi = scale.transform(inputcv['ap_hi'].values.reshape(-1,1))
inputtest_ap_hi = scale.transform(inputtest['ap_hi'].values.reshape(-1,1))
print(inputtrain_ap_hi.shape)
print(inputcv_ap_hi.shape)
print(inputtest_ap_hi.shape)
print(outputtrain.shape)
print(outputcv.shape)
print(outputtest.shape)
print("..........................")


# scale ap_lo using min max
scale = MinMaxScaler()
scale.fit(inputtrain['ap_lo'].values.reshape(-1,1)) # reshape because of minmax take column and scale

inputtrain_ap_lo = scale.transform(inputtrain['ap_lo'].values.reshape(-1,1))
inputcv_ap_lo = scale.transform(inputcv['ap_lo'].values.reshape(-1,1))
inputtest_ap_lo = scale.transform(inputtest['ap_lo'].values.reshape(-1,1))
print(inputtrain_ap_lo.shape)
print(inputcv_ap_lo.shape)
print(inputtest_ap_lo.shape)
print(outputtrain.shape)
print(outputcv.shape)
print(outputtest.shape)
print("..........................")


# convert cholesterol to one hot encoding

vec = CountVectorizer()
vec.fit(inputtrain['cholesterol'].values) 

inputtrain_cholesterol = vec.transform(inputtrain['cholesterol'].values)
inputcv_cholesterol = vec.transform(inputcv['cholesterol'].values)
inputtest_cholesterol = vec.transform(inputtest['cholesterol'].values)

print(inputtrain_cholesterol.shape)
print(inputcv_cholesterol.shape)
print(inputtest_cholesterol.shape)
print(outputtrain.shape)
print(outputcv.shape)
print(outputtest.shape)
print("..........................")



# convert gluc to one hot encoding

vec = CountVectorizer()
vec.fit(inputtrain['gluc'].values) 

inputtrain_gluc = vec.transform(inputtrain['gluc'].values)
inputcv_gluc = vec.transform(inputcv['gluc'].values)
inputtest_gluc = vec.transform(inputtest['gluc'].values)

print(inputtrain_gluc.shape)
print(inputcv_gluc.shape)
print(inputtest_gluc.shape)
print(outputtrain.shape)
print(outputcv.shape)
print(outputtest.shape)
print("..........................")


# reshape binary feature smoke
inputtrain_smoke = inputtrain['smoke'].values.reshape(-1,1)
inputcv_smoke = inputcv['smoke'].values.reshape(-1,1)
inputtest_smoke = inputtest['smoke'].values.reshape(-1,1)
print(inputtrain_smoke.shape)
print(inputcv_smoke.shape)
print(inputtest_smoke.shape)
print(outputtrain.shape)
print(outputcv.shape)
print(outputtest.shape)
print("..........................")



# reshape binary feature alco
inputtrain_alco = inputtrain['alco'].values.reshape(-1,1)
inputcv_alco = inputcv['alco'].values.reshape(-1,1)
inputtest_alco = inputtest['alco'].values.reshape(-1,1)
print(inputtrain_alco.shape)
print(inputcv_alco.shape)
print(inputtest_alco.shape)
print(outputtrain.shape)
print(outputcv.shape)
print(outputtest.shape)
print("..........................")


# reshape binary feature active
inputtrain_active = inputtrain['active'].values.reshape(-1,1)
inputcv_active = inputcv['active'].values.reshape(-1,1)
inputtest_active = inputtest['active'].values.reshape(-1,1)
print(inputtrain_active.shape)
print(inputcv_active.shape)
print(inputtest_active.shape)
print(outputtrain.shape)
print(outputcv.shape)
print(outputtest.shape)
print("..........................")


(43356, 1)
(10840, 1)
(13550, 1)
(43356,)
(10840,)
(13550,)
..........................
(43356, 2)
(10840, 2)
(13550, 2)
(43356,)
(10840,)
(13550,)
..........................
(43356, 1)
(10840, 1)
(13550, 1)
(43356,)
(10840,)
(13550,)
..........................
(43356, 1)
(10840, 1)
(13550, 1)
(43356,)
(10840,)
(13550,)
..........................
(43356, 1)
(10840, 1)
(13550, 1)
(43356,)
(10840,)
(13550,)
..........................
(43356, 1)
(10840, 1)
(13550, 1)
(43356,)
(10840,)
(13550,)
..........................
(43356, 3)
(10840, 3)
(13550, 3)
(43356,)
(10840,)
(13550,)
..........................
(43356, 3)
(10840, 3)
(13550, 3)
(43356,)
(10840,)
(13550,)
..........................
(43356, 1)
(10840, 1)
(13550, 1)
(43356,)
(10840,)
(13550,)
..........................
(43356, 1)
(10840, 1)
(13550, 1)
(43356,)
(10840,)
(13550,)
..........................
(43356, 1)
(10840, 1)
(13550, 1)
(43356,)
(10840,)
(13550,)
..........................


In [60]:
# stack arrays horizontally

from scipy.sparse import hstack

train = hstack((inputtrain_age,inputtrain_gender,inputtrain_height,inputtrain_weight,inputtrain_ap_hi,inputtrain_ap_lo,inputtrain_cholesterol,inputtrain_gluc,inputtrain_smoke,inputtrain_alco,inputtrain_active)).tocsr()
cv = hstack((inputcv_age,inputcv_gender,inputcv_height,inputcv_weight,inputcv_ap_hi,inputcv_ap_lo,inputcv_cholesterol,inputcv_gluc,inputcv_smoke,inputcv_alco,inputcv_active)).tocsr()
test = hstack((inputtest_age,inputtest_gender,inputtest_height,inputtest_weight,inputtest_ap_hi,inputtest_ap_lo,inputtest_cholesterol,inputtest_gluc,inputtest_smoke,inputtest_alco,inputtest_active)).tocsr()

print(train.shape)
print(cv.shape)
print(test.shape)
print(outputtrain.shape)
print(outputcv.shape)
print(outputtest.shape)
print("..........................")

print(train)#sparse Matrix
print(train.toarray())# convert sparse matrix to array format

(43356, 16)
(10840, 16)
(13550, 16)
(43356,)
(10840,)
(13550,)
..........................
  (0, 0)	0.6238482384823848
  (0, 1)	1.0
  (0, 3)	0.5641025641025643
  (0, 4)	0.3466666666666667
  (0, 5)	0.4117647058823529
  (0, 6)	0.3571428571428571
  (0, 8)	1.0
  (0, 10)	1.0
  (1, 0)	0.694463801780875
  (1, 2)	1.0
  (1, 3)	0.641025641025641
  (1, 4)	0.2466666666666667
  (1, 5)	0.3529411764705882
  (1, 6)	0.2857142857142857
  (1, 7)	1.0
  (1, 10)	1.0
  (1, 15)	1.0
  (2, 0)	0.981262098335269
  (2, 1)	1.0
  (2, 3)	0.4487179487179487
  (2, 4)	0.14
  (2, 5)	0.23529411764705888
  (2, 6)	0.2857142857142857
  (2, 7)	1.0
  (2, 10)	1.0
  :	:
  (43353, 5)	0.4117647058823529
  (43353, 6)	0.2857142857142857
  (43353, 7)	1.0
  (43353, 10)	1.0
  (43353, 15)	1.0
  (43354, 0)	0.6957801006581493
  (43354, 2)	1.0
  (43354, 3)	0.5641025641025643
  (43354, 4)	0.046666666666666634
  (43354, 5)	0.23529411764705888
  (43354, 6)	0.2142857142857143
  (43354, 8)	1.0
  (43354, 10)	1.0
  (43354, 13)	1.0
  (43354, 14)	1.

In [74]:
inputtrain['smoke'].values

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [75]:
inputtrain['smoke'].values.reshape(-1,1)

array([[0],
       [0],
       [0],
       ...,
       [0],
       [1],
       [0]], dtype=int64)