In [94]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.model_selection import  train_test_split
from sklearn.linear_model import  LogisticRegression
from sklearn.tree import  DecisionTreeClassifier
from sklearn.naive_bayes import  GaussianNB
from sklearn.ensemble import  RandomForestClassifier
from sklearn.metrics import  accuracy_score,precision_score,classification_report,confusion_matrix,recall_score,f1_score
from sklearn.impute import  SimpleImputer
from sklearn.compose import  ColumnTransformer
from sklearn.pipeline import  Pipeline
import warnings
warnings.filterwarnings('ignore')

In [96]:
dataset = pd.read_csv('/config/workspace/CardioVascular_Disease_Prediction/notebooks/data/CVD_resampled_labeled.csv')
dataset.head()

Unnamed: 0.1,Unnamed: 0,Height_(cm),Weight_(kg),BMI,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption,General_Health,Checkup,Exercise,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Smoking_History,Heart_Disease
0,0,150.0,32.66,14.54,0.0,30.0,16.0,12.0,Poor,Within the past 2 years,No,No,No,No,No,Yes,Female,70-74,Yes,No
1,1,165.0,77.11,28.29,0.0,30.0,0.0,4.0,Very Good,Within the past year,No,No,No,No,Yes,No,Female,70-74,No,Yes
2,2,163.0,88.45,33.47,4.0,12.0,3.0,16.0,Very Good,Within the past year,Yes,No,No,No,Yes,No,Female,60-64,No,No
3,3,180.0,93.44,28.73,0.0,30.0,30.0,8.0,Poor,Within the past year,Yes,No,No,No,Yes,No,Male,75-79,No,Yes
4,4,191.0,88.45,24.37,0.0,8.0,4.0,0.0,Good,Within the past year,No,No,No,No,No,No,Male,80+,Yes,No


In [98]:
dataset = dataset.drop('Unnamed: 0',axis=1)
dataset.head()

KeyError: "['Unnamed: 0'] not found in axis"

In [99]:
X = dataset.iloc[:,:-1]
y = dataset.iloc[:,-1]

In [100]:
X.head(10)

Unnamed: 0,Height_(cm),Weight_(kg),BMI,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption,General_Health,Checkup,Exercise,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Smoking_History
0,150.0,32.66,14.54,0.0,30.0,16.0,12.0,Poor,Within the past 2 years,No,No,No,No,No,Yes,Female,70-74,Yes
1,165.0,77.11,28.29,0.0,30.0,0.0,4.0,Very Good,Within the past year,No,No,No,No,Yes,No,Female,70-74,No
2,163.0,88.45,33.47,4.0,12.0,3.0,16.0,Very Good,Within the past year,Yes,No,No,No,Yes,No,Female,60-64,No
3,180.0,93.44,28.73,0.0,30.0,30.0,8.0,Poor,Within the past year,Yes,No,No,No,Yes,No,Male,75-79,No
4,191.0,88.45,24.37,0.0,8.0,4.0,0.0,Good,Within the past year,No,No,No,No,No,No,Male,80+,Yes
5,183.0,154.22,46.11,0.0,12.0,12.0,12.0,Good,Within the past year,No,No,No,Yes,No,Yes,Male,60-64,No
6,175.0,69.85,22.74,0.0,16.0,8.0,0.0,Fair,Within the past year,Yes,No,No,No,No,Yes,Male,60-64,Yes
7,165.0,108.86,39.94,3.0,30.0,8.0,8.0,Good,Within the past year,Yes,No,No,No,No,Yes,Female,65-69,Yes
8,163.0,72.57,27.46,0.0,12.0,12.0,4.0,Fair,Within the past year,No,No,No,Yes,No,No,Female,65-69,Yes
9,163.0,91.63,34.67,0.0,12.0,12.0,1.0,Fair,Within the past year,No,No,No,No,Yes,Yes,Female,70-74,No


In [101]:
y.head(10)

0     No
1    Yes
2     No
3    Yes
4     No
5     No
6    Yes
7     No
8     No
9     No
Name: Heart_Disease, dtype: object

In [106]:
Heart_Disease_catgeory = {'No':0,'Yes':1}
y = y.map(Heart_Disease_catgeory)

In [107]:
y.head(10)

0    0
1    1
2    0
3    1
4    0
5    0
6    1
7    0
8    0
9    0
Name: Heart_Disease, dtype: int64

In [108]:
numerical_features = [col for col in X.columns if X[col].dtype != 'O']
categorical_features = [col for col in X.columns if X[col].dtype == 'O']

In [109]:
numerical_features,categorical_features

(['Height_(cm)',
  'Weight_(kg)',
  'BMI',
  'Alcohol_Consumption',
  'Fruit_Consumption',
  'Green_Vegetables_Consumption',
  'FriedPotato_Consumption'],
 ['General_Health',
  'Checkup',
  'Exercise',
  'Skin_Cancer',
  'Other_Cancer',
  'Depression',
  'Diabetes',
  'Arthritis',
  'Sex',
  'Age_Category',
  'Smoking_History'])

In [110]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30,random_state=42)

In [111]:
General_Health_category = ["Poor",'Fair','Good','Very Good','Excellent'] 
Checkup_category = ['Never','Within the past year','Within the past 2 years','Within the past 5 years','5 or more years ago']
Exercise_category = ['No','Yes']
Skin_Cancer_category = ['No','Yes']
Other_Cancer_category = ['No','Yes']
Depression_category = ['No','Yes']
Diabetes_category = ['No','No, pre-diabetes or borderline diabetes','Yes, but female told only during pregnancy','Yes']
Arthritis_category = ['No','Yes']
Sex_category = ['Male','Female']
Age_Category_category = ['18-24','25-29','30-34','35-39','40-44','45-49','50-54','55-59','60-64','65-69','70-74','75-79','80+']
Smoking_History_category = ['No','Yes']

In [112]:
            num_pipeline = Pipeline(
            steps=[
                ('imputer',SimpleImputer(strategy='median')),
                ('scaler',StandardScaler())
            ]   
            )

            cat_pipeline = Pipeline(
                steps=[
                    ('imputer',SimpleImputer(strategy='most_frequent')),
                    ('encoder',OrdinalEncoder(categories=[General_Health_category,Checkup_category,Exercise_category,Skin_Cancer_category,Other_Cancer_category,Depression_category,Diabetes_category,Arthritis_category,Sex_category,Age_Category_category,Smoking_History_category])),
                ]
            )

            preprocessor = ColumnTransformer([
                ('num_pipeline',num_pipeline,numerical_features),
                ('cat_pipeline',cat_pipeline,categorical_features)
            ])

In [113]:
X_train = preprocessor.fit_transform(X_train)
X_test  = preprocessor.transform(X_test)

In [114]:
def evaluate_models(models):
    try:
        # logging.info("Initiate Evaluation of Models.")
        accuracy_score_list = []
        precision_score_list = []
        recall_score_list = []
        f1_score_list = []

        for i in range(len(list(models.keys()))):
            model = list(models.values())[i]

            model.fit(X_train,y_train)

            y_pred = model.predict(X_test)

            accuracy_score_list.append(accuracy_score(y_pred=y_pred, y_true=y_test))
            precision_score_list.append(precision_score(y_pred=y_pred, y_true=y_test))
            recall_score_list.append(recall_score(y_pred=y_pred, y_true=y_test))
            f1_score_list.append(f1_score(y_pred=y_pred, y_true=y_test))

        # logging.info("Evaluation of Models Terminated Successfully.")
        return (
            accuracy_score_list,
            precision_score_list,
            recall_score_list,
            f1_score_list
        )


    except Exception as e:
        # logging.info("Error occured in Model Evaluation Process.")
        raise CustomException(e, sys)

In [115]:
models = {
            'Logistic Regression'      : LogisticRegression(),
            'Decision Tree'            : DecisionTreeClassifier(),
            'Naive Bayes'              : GaussianNB(),
            'Random Forest Classifier' : RandomForestClassifier()
        }

accuracy_score_list, precision_score_list, recall_score_list, f1_score_list = evaluate_models(models)
best_score_index = accuracy_score_list.index(max(accuracy_score_list))

best_accuracy_score = accuracy_score_list[best_score_index]
            
model_precsison_score = precision_score_list[best_score_index]
            
model_recall_score = recall_score_list[best_score_index]
            
model_f1_score = f1_score_list[best_score_index]

model_name = list(models.keys())[best_score_index]

best_model = list(models.values())[best_score_index]

In [116]:
for i in range(len(accuracy_score_list)):
    print(list(models.keys())[i],":",accuracy_score_list[i])

Logistic Regression : 0.8556923221479663
Decision Tree : 0.9025851235010159
Naive Bayes : 0.8352673799931878
Random Forest Classifier : 0.9389307149317015


In [117]:
# ----------------Dataset resapmled and labeled------------------------------

In [69]:
df = pd.read_csv('/config/workspace/CardioVascular_Disease_Prediction/notebooks/data/CVD_resampled.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Height_(cm),Weight_(kg),BMI,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption,General_Health,Checkup,Exercise,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Smoking_History,Heart_Disease
0,0,150.0,32.66,14.54,0.0,30.0,16.0,12.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,10.0,1.0,0.0
1,1,165.0,77.11,28.29,0.0,30.0,0.0,4.0,3.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,10.0,0.0,1.0
2,2,163.0,88.45,33.47,4.0,12.0,3.0,16.0,3.0,1.0,1.0,0.0,0.0,0.0,3.0,0.0,1.0,8.0,0.0,0.0
3,3,180.0,93.44,28.73,0.0,30.0,30.0,8.0,0.0,1.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,11.0,0.0,1.0
4,4,191.0,88.45,24.37,0.0,8.0,4.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,1.0,0.0


In [70]:
df = df.drop('Unnamed: 0',axis=1)
df.head()

Unnamed: 0,Height_(cm),Weight_(kg),BMI,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption,General_Health,Checkup,Exercise,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Smoking_History,Heart_Disease
0,150.0,32.66,14.54,0.0,30.0,16.0,12.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,10.0,1.0,0.0
1,165.0,77.11,28.29,0.0,30.0,0.0,4.0,3.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,10.0,0.0,1.0
2,163.0,88.45,33.47,4.0,12.0,3.0,16.0,3.0,1.0,1.0,0.0,0.0,0.0,3.0,0.0,1.0,8.0,0.0,0.0
3,180.0,93.44,28.73,0.0,30.0,30.0,8.0,0.0,1.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,11.0,0.0,1.0
4,191.0,88.45,24.37,0.0,8.0,4.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,1.0,0.0


In [71]:
df.isna().sum()

Height_(cm)                     0
Weight_(kg)                     0
BMI                             0
Alcohol_Consumption             0
Fruit_Consumption               0
Green_Vegetables_Consumption    0
FriedPotato_Consumption         0
General_Health                  0
Checkup                         0
Exercise                        0
Skin_Cancer                     0
Other_Cancer                    0
Depression                      0
Diabetes                        0
Arthritis                       0
Sex                             0
Age_Category                    0
Smoking_History                 0
Heart_Disease                   0
dtype: int64

In [72]:
General_Health_map =    {0: "Poor",1: 'Fair',2: 'Good',3: 'Very Good',4: 'Excellent'} 
Checkup_map =           {0: 'Never',1: 'Within the past year',2: 'Within the past 2 years',3: 'Within the past 5 years',4: '5 or more years ago'}
Exercise_map =          {0: 'No',1: 'Yes'}
Skin_Cancer_map =       {0: 'No',1: 'Yes'}
Other_Cancer_map =      {0: 'No',1: 'Yes'}
Depression_map =        {0: 'No',1: 'Yes'}
Diabetes_map =          {0: 'No',1: 'No, pre-diabetes or borderline diabetes',2: 'Yes, but female told only during pregnancy',3: 'Yes'}
Arthritis_map =         {0: 'No',1: 'Yes'}
Sex_map =               {0: 'Male',1: 'Female'}
Age_Category_map =      {0: '18-24',1: '25-29',2: '30-34',3: '35-39',4: '40-44',5: '45-49',6: '50-54',7: '55-59',8: '60-64',9: '65-69',10: '70-74',11: '75-79',12: '80+'}
Smoking_History_map =   {0: 'No',1: 'Yes'}
Heart_Disease_map =     {0: 'No',1: 'Yes'}

In [73]:
        df['General_Health'] = df['General_Health'].map(General_Health_map)
        df['Checkup'] = df['Checkup'].map(Checkup_map)
        df['Exercise'] = df['Exercise'].map(Exercise_map)
        df['Skin_Cancer'] = df['Skin_Cancer'].map(Skin_Cancer_map)
        df['Other_Cancer'] = df['Other_Cancer'].map(Other_Cancer_map)
        df['Depression'] = df['Depression'].map(Depression_map)
        df['Diabetes'] = df['Diabetes'].map(Diabetes_map)
        df['Arthritis'] = df['Arthritis'].map(Arthritis_map)
        df['Sex'] = df['Sex'].map(Sex_map)
        df['Age_Category'] = df['Age_Category'].map(Age_Category_map)
        df['Smoking_History'] = df['Smoking_History'].map(Smoking_History_map)
        df['Heart_Disease'] = df['Heart_Disease'].map(Smoking_History_map)

In [74]:
df.head(10)

Unnamed: 0,Height_(cm),Weight_(kg),BMI,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption,General_Health,Checkup,Exercise,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Smoking_History,Heart_Disease
0,150.0,32.66,14.54,0.0,30.0,16.0,12.0,Poor,Within the past 2 years,No,No,No,No,No,Yes,Female,70-74,Yes,No
1,165.0,77.11,28.29,0.0,30.0,0.0,4.0,Very Good,Within the past year,No,No,No,No,Yes,No,Female,70-74,No,Yes
2,163.0,88.45,33.47,4.0,12.0,3.0,16.0,Very Good,Within the past year,Yes,No,No,No,Yes,No,Female,60-64,No,No
3,180.0,93.44,28.73,0.0,30.0,30.0,8.0,Poor,Within the past year,Yes,No,No,No,Yes,No,Male,75-79,No,Yes
4,191.0,88.45,24.37,0.0,8.0,4.0,0.0,Good,Within the past year,No,No,No,No,No,No,Male,80+,Yes,No
5,183.0,154.22,46.11,0.0,12.0,12.0,12.0,Good,Within the past year,No,No,No,Yes,No,Yes,Male,60-64,No,No
6,175.0,69.85,22.74,0.0,16.0,8.0,0.0,Fair,Within the past year,Yes,No,No,No,No,Yes,Male,60-64,Yes,Yes
7,165.0,108.86,39.94,3.0,30.0,8.0,8.0,Good,Within the past year,Yes,No,No,No,No,Yes,Female,65-69,Yes,No
8,163.0,72.57,27.46,0.0,12.0,12.0,4.0,Fair,Within the past year,No,No,No,Yes,No,No,Female,65-69,Yes,No
9,163.0,91.63,34.67,0.0,12.0,12.0,1.0,Fair,Within the past year,No,No,No,No,Yes,Yes,Female,70-74,No,No


In [75]:
            cat_pipeline = Pipeline(
                steps=[
                    ('imputer',SimpleImputer(strategy='most_frequent')),
                    # ('encoder',OrdinalEncoder(categories=[General_Health_category,Checkup_category,Exercise_category,Skin_Cancer_category,Other_Cancer_category,Depression_category,Diabetes_category,Arthritis_category,Sex_category,Age_Category_category,Smoking_History_category])),
                ]
            )

In [76]:
categorical_features = [col for col in df.columns if df[col].dtype == 'O']
categorical_features

['General_Health',
 'Checkup',
 'Exercise',
 'Skin_Cancer',
 'Other_Cancer',
 'Depression',
 'Diabetes',
 'Arthritis',
 'Sex',
 'Age_Category',
 'Smoking_History',
 'Heart_Disease']

In [77]:
df_cat = df.iloc[:,7:]
df_cat.head(10)

Unnamed: 0,General_Health,Checkup,Exercise,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Smoking_History,Heart_Disease
0,Poor,Within the past 2 years,No,No,No,No,No,Yes,Female,70-74,Yes,No
1,Very Good,Within the past year,No,No,No,No,Yes,No,Female,70-74,No,Yes
2,Very Good,Within the past year,Yes,No,No,No,Yes,No,Female,60-64,No,No
3,Poor,Within the past year,Yes,No,No,No,Yes,No,Male,75-79,No,Yes
4,Good,Within the past year,No,No,No,No,No,No,Male,80+,Yes,No
5,Good,Within the past year,No,No,No,Yes,No,Yes,Male,60-64,No,No
6,Fair,Within the past year,Yes,No,No,No,No,Yes,Male,60-64,Yes,Yes
7,Good,Within the past year,Yes,No,No,No,No,Yes,Female,65-69,Yes,No
8,Fair,Within the past year,No,No,No,Yes,No,No,Female,65-69,Yes,No
9,Fair,Within the past year,No,No,No,No,Yes,Yes,Female,70-74,No,No


In [78]:
df = df.drop(categorical_features,axis=1)
df.head()

Unnamed: 0,Height_(cm),Weight_(kg),BMI,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,150.0,32.66,14.54,0.0,30.0,16.0,12.0
1,165.0,77.11,28.29,0.0,30.0,0.0,4.0
2,163.0,88.45,33.47,4.0,12.0,3.0,16.0
3,180.0,93.44,28.73,0.0,30.0,30.0,8.0
4,191.0,88.45,24.37,0.0,8.0,4.0,0.0


In [79]:
df_cat = pd.DataFrame(cat_pipeline.fit_transform(df_cat))
df_cat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,Poor,Within the past 2 years,No,No,No,No,No,Yes,Female,70-74,Yes,No
1,Very Good,Within the past year,No,No,No,No,Yes,No,Female,70-74,No,Yes
2,Very Good,Within the past year,Yes,No,No,No,Yes,No,Female,60-64,No,No
3,Poor,Within the past year,Yes,No,No,No,Yes,No,Male,75-79,No,Yes
4,Good,Within the past year,No,No,No,No,No,No,Male,80+,Yes,No


In [80]:
df_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 567606 entries, 0 to 567605
Data columns (total 12 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   0       567606 non-null  object
 1   1       567606 non-null  object
 2   2       567606 non-null  object
 3   3       567606 non-null  object
 4   4       567606 non-null  object
 5   5       567606 non-null  object
 6   6       567606 non-null  object
 7   7       567606 non-null  object
 8   8       567606 non-null  object
 9   9       567606 non-null  object
 10  10      567606 non-null  object
 11  11      567606 non-null  object
dtypes: object(12)
memory usage: 52.0+ MB


In [81]:
df_cat.isna().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
dtype: int64

In [82]:
df_cat.columns = categorical_features
df_cat.head(10)

Unnamed: 0,General_Health,Checkup,Exercise,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Smoking_History,Heart_Disease
0,Poor,Within the past 2 years,No,No,No,No,No,Yes,Female,70-74,Yes,No
1,Very Good,Within the past year,No,No,No,No,Yes,No,Female,70-74,No,Yes
2,Very Good,Within the past year,Yes,No,No,No,Yes,No,Female,60-64,No,No
3,Poor,Within the past year,Yes,No,No,No,Yes,No,Male,75-79,No,Yes
4,Good,Within the past year,No,No,No,No,No,No,Male,80+,Yes,No
5,Good,Within the past year,No,No,No,Yes,No,Yes,Male,60-64,No,No
6,Fair,Within the past year,Yes,No,No,No,No,Yes,Male,60-64,Yes,Yes
7,Good,Within the past year,Yes,No,No,No,No,Yes,Female,65-69,Yes,No
8,Fair,Within the past year,No,No,No,Yes,No,No,Female,65-69,Yes,No
9,Fair,Within the past year,No,No,No,No,Yes,Yes,Female,70-74,No,No


In [83]:
df_cat['Heart_Disease'].value_counts()

Heart_Disease
No     283803
Yes    283803
Name: count, dtype: int64

In [84]:
df_resampled_labeled = pd.concat([df,df_cat],axis=1)

In [85]:
df_resampled_labeled.isna().sum()

Height_(cm)                     0
Weight_(kg)                     0
BMI                             0
Alcohol_Consumption             0
Fruit_Consumption               0
Green_Vegetables_Consumption    0
FriedPotato_Consumption         0
General_Health                  0
Checkup                         0
Exercise                        0
Skin_Cancer                     0
Other_Cancer                    0
Depression                      0
Diabetes                        0
Arthritis                       0
Sex                             0
Age_Category                    0
Smoking_History                 0
Heart_Disease                   0
dtype: int64

In [86]:
df_resampled_labeled['Heart_Disease'].value_counts()

Heart_Disease
No     283803
Yes    283803
Name: count, dtype: int64

In [87]:
df_resampled_labeled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 567606 entries, 0 to 567605
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Height_(cm)                   567606 non-null  float64
 1   Weight_(kg)                   567606 non-null  float64
 2   BMI                           567606 non-null  float64
 3   Alcohol_Consumption           567606 non-null  float64
 4   Fruit_Consumption             567606 non-null  float64
 5   Green_Vegetables_Consumption  567606 non-null  float64
 6   FriedPotato_Consumption       567606 non-null  float64
 7   General_Health                567606 non-null  object 
 8   Checkup                       567606 non-null  object 
 9   Exercise                      567606 non-null  object 
 10  Skin_Cancer                   567606 non-null  object 
 11  Other_Cancer                  567606 non-null  object 
 12  Depression                    567606 non-nul

In [88]:
df_resampled_labeled.shape

(567606, 19)

In [91]:
import pandas as pd

In [93]:
df_resampled_labeled.to_csv('/config/workspace/CardioVascular_Disease_Prediction/notebooks/data/CVD_resampled_labeled.csv')