In [2]:
import pandas as pd

In [126]:
df_brain_stroke = pd.read_csv('./brain_stroke_cleaned.csv', sep = ';')

In [127]:
df_brain_stroke.head()

Unnamed: 0,gender,age,bmi,smoking_status,stroke
0,1,67,36.6,2,1
1,1,80,32.5,0,1
2,0,49,34.4,3,1
3,0,79,24.0,0,1
4,1,81,29.0,2,1


In [128]:
df_cardiovascular_disease = pd.read_csv('./cardiovascular_disease_cleaned.csv')

In [129]:
df_cardiovascular_disease.head()

Unnamed: 0,age,gender,height,weight,smoke,alco,active,cardio
0,50,1,168,62.0,0,0,1,0
1,55,0,156,85.0,0,0,1,1
2,51,0,165,64.0,0,0,0,1
3,48,1,169,82.0,0,0,1,1
4,47,0,156,56.0,0,0,0,0


In [130]:
df_cardiovascular_disease.tail()

Unnamed: 0,age,gender,height,weight,smoke,alco,active,cardio
69995,52,1,168,76.0,1,0,1,0
69996,61,0,158,126.0,0,0,1,1
69997,52,1,183,105.0,0,1,0,1
69998,61,0,163,72.0,0,0,0,1
69999,56,0,170,72.0,0,0,1,0


In [131]:
df_lung_cancer = pd.read_csv('./lung_cancer_cleaned.csv')

In [132]:
df_lung_cancer.head()

Unnamed: 0,GENDER,AGE,SMOKING,FATIGUE,ALLERGY,ALCOHOL CONSUMING,LUNG_CANCER
0,1,69,0,2,1,1,1
1,1,74,1,2,2,0,1
2,0,59,0,2,1,0,0
3,1,63,1,1,1,1,0
4,0,63,0,1,1,0,0


In [133]:
df_obesity = pd.read_csv('./obesity_cleaned.csv')

In [134]:
df_obesity.head()

Unnamed: 0,Age,Gender,Height,Weight,BMI,Label
0,25,1,175,80,25.3,1
1,30,0,160,60,22.5,1
2,35,1,180,90,27.3,2
3,40,0,150,50,20.0,0
4,45,1,190,100,31.2,3


In [135]:
from sklearn.model_selection import train_test_split

In [136]:
y_brain_stroke = df_brain_stroke['stroke']

X_brain_stroke = df_brain_stroke.drop('stroke', axis = 1)
print(y_brain_stroke.shape, X_brain_stroke.shape)

(4981,) (4981, 4)


In [137]:
X_train_brain_stroke, X_test_brain_stroke, y_train_brain_stroke, y_test_brain_stroke, = train_test_split(X_brain_stroke, y_brain_stroke, test_size = 0.20, random_state = 42)

In [138]:
y_cardiovascular_disease = df_cardiovascular_disease['cardio']

X_cardiovascular_disease = df_cardiovascular_disease.drop('cardio', axis = 1)
print(y_cardiovascular_disease.shape, X_cardiovascular_disease.shape)

(70000,) (70000, 7)


In [139]:
X_train_cardiovascular_disease, X_test_cardiovascular_disease, y_train_cardiovascular_disease, y_test_cardiovascular_disease, = train_test_split(X_cardiovascular_disease, y_cardiovascular_disease, test_size = 0.20, random_state = 42)

In [141]:
y_lung_cancer = df_lung_cancer['LUNG_CANCER']

X_lung_cancer = df_lung_cancer[['GENDER', 'SMOKING', 'ALCOHOL CONSUMING', 'LUNG_CANCER']]
print(y_lung_cancer.shape, X_lung_cancer.shape)

(309,) (309, 4)


In [142]:
X_train_lung_cancer, X_test_lung_cancer, y_train_lung_cancer, y_test_lung_cancer, = train_test_split(X_lung_cancer, y_lung_cancer, test_size = 0.20, random_state = 42)

In [143]:
y_obesity = df_obesity['Label']

X_obesity = df_obesity.drop('Label', axis = 1)
print(y_obesity.shape, X_obesity.shape)

(108,) (108, 5)


In [144]:
X_train_obesity, X_test_obesity, y_train_obesity, y_test_obesity, = train_test_split(X_obesity, y_obesity, test_size = 0.20, random_state = 42)

In [145]:
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
import warnings
warnings.filterwarnings('ignore')

## Decision Tree Classifier

In [146]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()

In [147]:
tree_params = {
    'max_depth': [None, 1, 2, 3, 4, 5],
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2', None],
    'class_weight': [None, 'balanced'],
    'random_state': [42]
}

In [148]:
search_obj = RandomizedSearchCV(tree, param_distributions = tree_params, cv = 5, scoring = 'f1_macro')
fit_obj = search_obj.fit(X_train_brain_stroke, y_train_brain_stroke)
print(fit_obj.cv_results_['mean_test_score'])
model_tree_brain_stroke = fit_obj.best_estimator_

[0.48752254 0.48752254 0.40982097 0.48725843 0.4967495  0.53934707
 0.4873244  0.56663872 0.51294593 0.55010611]


In [149]:
model_tree_brain_stroke.fit(X_train_brain_stroke, y_train_brain_stroke)
print("Train: ", model_tree_brain_stroke.score(X_train_brain_stroke, y_train_brain_stroke))
print("Test: ", model_tree_brain_stroke.score(X_test_brain_stroke, y_test_brain_stroke))

Train:  0.8727409638554217
Test:  0.8635907723169508


In [150]:
y_pred_tree_brain_stroke = model_tree_brain_stroke.predict(X_test_brain_stroke)

In [151]:
accuracy = accuracy_score(y_test_brain_stroke, y_pred_tree_brain_stroke)
conf_matrix = confusion_matrix(y_test_brain_stroke, y_pred_tree_brain_stroke)
f1 = f1_score(y_test_brain_stroke, y_pred_tree_brain_stroke, average='weighted')

In [152]:
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.8635907723169508
F1 Score: 0.8847930169289691
Confusion Matrix:
 [[847  96]
 [ 40  14]]


In [153]:
search_obj = RandomizedSearchCV(tree, param_distributions = tree_params, cv = 5, scoring = 'f1_macro')
fit_obj = search_obj.fit(X_train_cardiovascular_disease, y_train_cardiovascular_disease)
print(fit_obj.cv_results_['mean_test_score'])
model_tree_cardiovascular_disease = fit_obj.best_estimator_

[0.46311536 0.5896756  0.55135787 0.58958605 0.55338964 0.55960274
 0.5608426  0.50161138 0.46311536 0.54394027]


In [154]:
model_tree_cardiovascular_disease.fit(X_train_cardiovascular_disease, y_train_cardiovascular_disease)
print("Train: ", model_tree_cardiovascular_disease.score(X_train_cardiovascular_disease, y_train_cardiovascular_disease))
print("Test: ", model_tree_cardiovascular_disease.score(X_test_cardiovascular_disease, y_test_cardiovascular_disease))

Train:  0.5927142857142857
Test:  0.5928571428571429


In [155]:
search_obj = RandomizedSearchCV(tree, param_distributions = tree_params, cv = 5, scoring = 'f1_macro')
fit_obj = search_obj.fit(X_train_lung_cancer, y_train_lung_cancer)
print(fit_obj.cv_results_['mean_test_score'])
model_tree_lung_cancer = fit_obj.best_estimator_

[1.         1.         1.         1.         1.         1.
 0.45953177 1.         1.         1.        ]


In [156]:
y_pred_tree_cardiovascular_disease = model_tree_cardiovascular_disease.predict(X_test_cardiovascular_disease)

In [157]:
accuracy = accuracy_score(y_test_cardiovascular_disease, y_pred_tree_cardiovascular_disease)
conf_matrix = confusion_matrix(y_test_cardiovascular_disease, y_pred_tree_cardiovascular_disease)
f1 = f1_score(y_test_cardiovascular_disease, y_pred_tree_cardiovascular_disease, average='weighted')

In [158]:
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.5928571428571429
F1 Score: 0.59001397357275
Confusion Matrix:
 [[4727 2261]
 [3439 3573]]


In [159]:
model_tree_lung_cancer.fit(X_train_lung_cancer, y_train_lung_cancer)
print("Train: ", model_tree_lung_cancer.score(X_train_lung_cancer, y_train_lung_cancer))
print("Test: ", model_tree_lung_cancer.score(X_test_lung_cancer, y_test_lung_cancer))

Train:  1.0
Test:  1.0


In [160]:
y_pred_tree_lung_cancer = model_tree_lung_cancer.predict(X_test_lung_cancer)

In [161]:
accuracy = accuracy_score(y_test_lung_cancer, y_pred_tree_lung_cancer)
conf_matrix = confusion_matrix(y_test_lung_cancer, y_pred_tree_lung_cancer)
f1 = f1_score(y_test_lung_cancer, y_pred_tree_lung_cancer, average='weighted')

In [162]:
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 1.0
F1 Score: 1.0
Confusion Matrix:
 [[ 2  0]
 [ 0 60]]


In [163]:
search_obj = RandomizedSearchCV(tree, param_distributions = tree_params, cv = 5, scoring = 'f1_macro')
fit_obj = search_obj.fit(X_train_obesity, y_train_obesity)
print(fit_obj.cv_results_['mean_test_score'])
model_tree_obesity = fit_obj.best_estimator_

[0.93230159 0.93230159 0.93230159 0.50356074 0.93230159 0.92517677
 0.73018232 0.84706349 0.29463203 0.50356074]


In [164]:
model_tree_obesity.fit(X_train_obesity, y_train_obesity)
print("Train: ", model_tree_obesity.score(X_train_obesity, y_train_obesity))
print("Test: ", model_tree_obesity.score(X_test_obesity, y_test_obesity))

Train:  0.9767441860465116
Test:  0.9090909090909091


In [165]:
y_pred_tree_obesity = model_tree_obesity.predict(X_test_obesity)

In [166]:
accuracy = accuracy_score(y_test_obesity, y_pred_tree_obesity)
conf_matrix = confusion_matrix(y_test_obesity, y_pred_tree_obesity)
f1 = f1_score(y_test_obesity, y_pred_tree_obesity, average='weighted')

In [167]:
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9090909090909091
F1 Score: 0.9090909090909091
Confusion Matrix:
 [[8 0 0 0]
 [0 4 2 0]
 [0 0 4 0]
 [0 0 0 4]]


## Random Forest

In [168]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier()

In [169]:
random_forest_params = {
    'n_estimators': [50, 100, 200, 300],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False],
    'class_weight': [None, 'balanced'],
    'random_state': [42]
}

In [170]:
search_obj = RandomizedSearchCV(random_forest, param_distributions = random_forest_params, cv = 5, scoring = 'f1_macro')
fit_obj = search_obj.fit(X_train_brain_stroke, y_train_brain_stroke)
print(fit_obj.cv_results_['mean_test_score'])
model_random_forest_brain_stroke = fit_obj.best_estimator_

[0.49682603 0.50142377 0.50859335 0.53567838 0.5664929  0.49900174
 0.57275362 0.49082471 0.55201453 0.49801276]


In [171]:
model_random_forest_brain_stroke.fit(X_train_brain_stroke, y_train_brain_stroke)
print("Train: ", model_random_forest_brain_stroke.score(X_train_brain_stroke, y_train_brain_stroke))
print("Test: ", model_random_forest_brain_stroke.score(X_test_brain_stroke, y_test_brain_stroke))

Train:  0.9317269076305221
Test:  0.8846539618856569


In [172]:
y_pred_random_forest_brain_stroke = model_random_forest_brain_stroke.predict(X_test_brain_stroke)

In [173]:
accuracy = accuracy_score(y_test_brain_stroke, y_pred_random_forest_brain_stroke)
conf_matrix = confusion_matrix(y_test_brain_stroke, y_pred_random_forest_brain_stroke)
f1 = f1_score(y_test_brain_stroke, y_pred_random_forest_brain_stroke, average='weighted')

In [174]:
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.8846539618856569
F1 Score: 0.899242096161714
Confusion Matrix:
 [[865  78]
 [ 37  17]]


In [175]:
search_obj = RandomizedSearchCV(random_forest, param_distributions = random_forest_params, cv = 5, scoring = 'f1_macro')
fit_obj = search_obj.fit(X_train_cardiovascular_disease, y_train_cardiovascular_disease)
print(fit_obj.cv_results_['mean_test_score'])
model_random_forest_cardiovascular_disease = fit_obj.best_estimator_

[0.59214249 0.5998741  0.58087729 0.58263381 0.59729741 0.58367983
 0.622007   0.59719614 0.59679656 0.62092543]


In [176]:
model_random_forest_cardiovascular_disease.fit(X_train_cardiovascular_disease, y_train_cardiovascular_disease)
print("Train: ", model_random_forest_cardiovascular_disease.score(X_train_cardiovascular_disease, y_train_cardiovascular_disease))
print("Test: ", model_random_forest_cardiovascular_disease.score(X_test_cardiovascular_disease, y_test_cardiovascular_disease))

Train:  0.6472321428571428
Test:  0.6301428571428571


In [177]:
y_pred_random_forest_cardiovascular_disease = model_random_forest_cardiovascular_disease.predict(X_test_cardiovascular_disease)

In [178]:
accuracy = accuracy_score(y_test_cardiovascular_disease, y_pred_random_forest_cardiovascular_disease)
conf_matrix = confusion_matrix(y_test_cardiovascular_disease, y_pred_random_forest_cardiovascular_disease)
f1 = f1_score(y_test_cardiovascular_disease, y_pred_random_forest_cardiovascular_disease, average='weighted')

In [179]:
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.6301428571428571
F1 Score: 0.6299732122584685
Confusion Matrix:
 [[4255 2733]
 [2445 4567]]


In [180]:
search_obj = RandomizedSearchCV(random_forest, param_distributions = random_forest_params, cv = 5, scoring = 'f1_macro')
fit_obj = search_obj.fit(X_train_lung_cancer, y_train_lung_cancer)
print(fit_obj.cv_results_['mean_test_score'])
model_random_forest_lung_cancer = fit_obj.best_estimator_

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [181]:
model_random_forest_lung_cancer.fit(X_train_lung_cancer, y_train_lung_cancer)
print("Train: ", model_random_forest_lung_cancer.score(X_train_lung_cancer, y_train_lung_cancer))
print("Test: ", model_random_forest_lung_cancer.score(X_test_lung_cancer, y_test_lung_cancer))

Train:  1.0
Test:  1.0


In [182]:
y_pred_random_forest_lung_cancer = model_random_forest_lung_cancer.predict(X_test_lung_cancer)

In [183]:
accuracy = accuracy_score(y_test_lung_cancer, y_pred_random_forest_lung_cancer)
conf_matrix = confusion_matrix(y_test_lung_cancer, y_pred_random_forest_lung_cancer)
f1 = f1_score(y_test_lung_cancer, y_pred_random_forest_lung_cancer, average='weighted')

In [184]:
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 1.0
F1 Score: 1.0
Confusion Matrix:
 [[ 2  0]
 [ 0 60]]


In [185]:
search_obj = RandomizedSearchCV(random_forest, param_distributions = random_forest_params, cv = 5, scoring = 'f1_macro')
fit_obj = search_obj.fit(X_train_obesity, y_train_obesity)
print(fit_obj.cv_results_['mean_test_score'])
model_random_forest_obesity = fit_obj.best_estimator_

[0.92849206 0.92849206 0.95230159 0.92849206 0.95230159 0.92849206
 0.95230159 0.92849206 0.92849206 0.92849206]


In [186]:
model_random_forest_obesity.fit(X_train_obesity, y_train_obesity)
print("Train: ", model_random_forest_obesity.score(X_train_obesity, y_train_obesity))
print("Test: ", model_random_forest_obesity.score(X_test_obesity, y_test_obesity))

Train:  0.9883720930232558
Test:  0.9090909090909091


In [187]:
y_pred_random_forest_obesity = model_random_forest_obesity.predict(X_test_obesity)

In [188]:
accuracy = accuracy_score(y_test_obesity, y_pred_random_forest_obesity)
conf_matrix = confusion_matrix(y_test_obesity, y_pred_random_forest_obesity)
f1 = f1_score(y_test_obesity, y_pred_random_forest_obesity, average='weighted')

In [189]:
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9090909090909091
F1 Score: 0.9090909090909091
Confusion Matrix:
 [[8 0 0 0]
 [0 4 2 0]
 [0 0 4 0]
 [0 0 0 4]]


In [209]:
import joblib

In [210]:
brain_stroke_file = 'brain_stroke.pkl'
joblib.dump(model_random_forest_brain_stroke, brain_stroke_file)

['brain_stroke.pkl']

In [211]:
cardiovascular_disease_file = 'cardiovascular_disease.pkl'
joblib.dump(model_random_forest_cardiovascular_disease, cardiovascular_disease_file)

['cardiovascular_disease.pkl']

In [212]:
lung_cancer_file = 'lung_cancer.pkl'
joblib.dump(model_random_forest_lung_cancer, lung_cancer_file)

['lung_cancer.pkl']

In [213]:
obesity_file = 'obesity.pkl'
joblib.dump(model_tree_obesity, obesity_file)

['obesity.pkl']