In [2]:
import pandas as pd

In [3]:
df_brain_stroke = pd.read_csv('./brain_stroke_cleaned.csv', sep = ';')

In [4]:
df_brain_stroke.head()

Unnamed: 0,gender,age,bmi,smoking_status,stroke
0,1,67,36.6,2,1
1,1,80,32.5,0,1
2,0,49,34.4,3,1
3,0,79,24.0,0,1
4,1,81,29.0,2,1


In [5]:
df_cardiovascular_disease = pd.read_csv('./cardiovascular_disease_cleaned.csv')

In [6]:
df_cardiovascular_disease.head()

Unnamed: 0,age,gender,height,weight,smoke,alco,active,cardio
0,50,1,168,62.0,0,0,1,0
1,55,0,156,85.0,0,0,1,1
2,51,0,165,64.0,0,0,0,1
3,48,1,169,82.0,0,0,1,1
4,47,0,156,56.0,0,0,0,0


In [7]:
df_cardiovascular_disease.tail()

Unnamed: 0,age,gender,height,weight,smoke,alco,active,cardio
69995,52,1,168,76.0,1,0,1,0
69996,61,0,158,126.0,0,0,1,1
69997,52,1,183,105.0,0,1,0,1
69998,61,0,163,72.0,0,0,0,1
69999,56,0,170,72.0,0,0,1,0


In [8]:
df_lung_cancer = pd.read_csv('./lung_cancer_cleaned.csv')

In [9]:
df_lung_cancer.head()

Unnamed: 0,GENDER,AGE,SMOKING,FATIGUE,ALLERGY,ALCOHOL CONSUMING,LUNG_CANCER
0,1,69,0,2,1,1,1
1,1,74,1,2,2,0,1
2,0,59,0,2,1,0,0
3,1,63,1,1,1,1,0
4,0,63,0,1,1,0,0


In [10]:
df_obesity = pd.read_csv('./obesity_cleaned.csv')

In [11]:
df_obesity.head()

Unnamed: 0,Age,Gender,Height,Weight,BMI,Label
0,25,1,175,80,25.3,1
1,30,0,160,60,22.5,1
2,35,1,180,90,27.3,2
3,40,0,150,50,20.0,0
4,45,1,190,100,31.2,3


In [12]:
from sklearn.model_selection import train_test_split

In [13]:
y_brain_stroke = df_brain_stroke['stroke']

X_brain_stroke = df_brain_stroke.drop('stroke', axis = 1)
print(y_brain_stroke.shape, X_brain_stroke.shape)

(4981,) (4981, 4)


In [14]:
X_train_brain_stroke, X_test_brain_stroke, y_train_brain_stroke, y_test_brain_stroke, = train_test_split(X_brain_stroke, y_brain_stroke, test_size = 0.20, random_state = 42)

In [15]:
y_cardiovascular_disease = df_cardiovascular_disease['cardio']

X_cardiovascular_disease = df_cardiovascular_disease.drop('cardio', axis = 1)
print(y_cardiovascular_disease.shape, X_cardiovascular_disease.shape)

(70000,) (70000, 7)


In [16]:
X_train_cardiovascular_disease, X_test_cardiovascular_disease, y_train_cardiovascular_disease, y_test_cardiovascular_disease, = train_test_split(X_cardiovascular_disease, y_cardiovascular_disease, test_size = 0.20, random_state = 42)

In [17]:
y_lung_cancer = df_lung_cancer['LUNG_CANCER']

X_lung_cancer = df_lung_cancer[['GENDER', 'SMOKING', 'ALCOHOL CONSUMING', 'LUNG_CANCER']]
print(y_lung_cancer.shape, X_lung_cancer.shape)

(309,) (309, 4)


In [18]:
X_train_lung_cancer, X_test_lung_cancer, y_train_lung_cancer, y_test_lung_cancer, = train_test_split(X_lung_cancer, y_lung_cancer, test_size = 0.20, random_state = 42)

In [19]:
y_obesity = df_obesity['Label']

X_obesity = df_obesity.drop('Label', axis = 1)
print(y_obesity.shape, X_obesity.shape)

(108,) (108, 5)


In [20]:
X_train_obesity, X_test_obesity, y_train_obesity, y_test_obesity, = train_test_split(X_obesity, y_obesity, test_size = 0.20, random_state = 42)

In [21]:
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
import warnings
warnings.filterwarnings('ignore')

## Decision Tree Classifier

In [22]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()

In [23]:
tree_params = {
    'max_depth': [None, 1, 2, 3, 4, 5],
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2', None],
    'class_weight': [None, 'balanced'],
    'random_state': [42]
}

In [148]:
search_obj = RandomizedSearchCV(tree, param_distributions = tree_params, cv = 5, scoring = 'f1_macro')
fit_obj = search_obj.fit(X_train_brain_stroke, y_train_brain_stroke)
print(fit_obj.cv_results_['mean_test_score'])
model_tree_brain_stroke = fit_obj.best_estimator_

[0.48752254 0.48752254 0.40982097 0.48725843 0.4967495  0.53934707
 0.4873244  0.56663872 0.51294593 0.55010611]


In [149]:
model_tree_brain_stroke.fit(X_train_brain_stroke, y_train_brain_stroke)
print("Train: ", model_tree_brain_stroke.score(X_train_brain_stroke, y_train_brain_stroke))
print("Test: ", model_tree_brain_stroke.score(X_test_brain_stroke, y_test_brain_stroke))

Train:  0.8727409638554217
Test:  0.8635907723169508


In [150]:
y_pred_tree_brain_stroke = model_tree_brain_stroke.predict(X_test_brain_stroke)

In [151]:
accuracy = accuracy_score(y_test_brain_stroke, y_pred_tree_brain_stroke)
conf_matrix = confusion_matrix(y_test_brain_stroke, y_pred_tree_brain_stroke)
f1 = f1_score(y_test_brain_stroke, y_pred_tree_brain_stroke, average='weighted')

In [152]:
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.8635907723169508
F1 Score: 0.8847930169289691
Confusion Matrix:
 [[847  96]
 [ 40  14]]


In [153]:
search_obj = RandomizedSearchCV(tree, param_distributions = tree_params, cv = 5, scoring = 'f1_macro')
fit_obj = search_obj.fit(X_train_cardiovascular_disease, y_train_cardiovascular_disease)
print(fit_obj.cv_results_['mean_test_score'])
model_tree_cardiovascular_disease = fit_obj.best_estimator_

[0.46311536 0.5896756  0.55135787 0.58958605 0.55338964 0.55960274
 0.5608426  0.50161138 0.46311536 0.54394027]


In [154]:
model_tree_cardiovascular_disease.fit(X_train_cardiovascular_disease, y_train_cardiovascular_disease)
print("Train: ", model_tree_cardiovascular_disease.score(X_train_cardiovascular_disease, y_train_cardiovascular_disease))
print("Test: ", model_tree_cardiovascular_disease.score(X_test_cardiovascular_disease, y_test_cardiovascular_disease))

Train:  0.5927142857142857
Test:  0.5928571428571429


In [155]:
search_obj = RandomizedSearchCV(tree, param_distributions = tree_params, cv = 5, scoring = 'f1_macro')
fit_obj = search_obj.fit(X_train_lung_cancer, y_train_lung_cancer)
print(fit_obj.cv_results_['mean_test_score'])
model_tree_lung_cancer = fit_obj.best_estimator_

[1.         1.         1.         1.         1.         1.
 0.45953177 1.         1.         1.        ]


In [156]:
y_pred_tree_cardiovascular_disease = model_tree_cardiovascular_disease.predict(X_test_cardiovascular_disease)

In [157]:
accuracy = accuracy_score(y_test_cardiovascular_disease, y_pred_tree_cardiovascular_disease)
conf_matrix = confusion_matrix(y_test_cardiovascular_disease, y_pred_tree_cardiovascular_disease)
f1 = f1_score(y_test_cardiovascular_disease, y_pred_tree_cardiovascular_disease, average='weighted')

In [158]:
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.5928571428571429
F1 Score: 0.59001397357275
Confusion Matrix:
 [[4727 2261]
 [3439 3573]]


In [159]:
model_tree_lung_cancer.fit(X_train_lung_cancer, y_train_lung_cancer)
print("Train: ", model_tree_lung_cancer.score(X_train_lung_cancer, y_train_lung_cancer))
print("Test: ", model_tree_lung_cancer.score(X_test_lung_cancer, y_test_lung_cancer))

Train:  1.0
Test:  1.0


In [160]:
y_pred_tree_lung_cancer = model_tree_lung_cancer.predict(X_test_lung_cancer)

In [161]:
accuracy = accuracy_score(y_test_lung_cancer, y_pred_tree_lung_cancer)
conf_matrix = confusion_matrix(y_test_lung_cancer, y_pred_tree_lung_cancer)
f1 = f1_score(y_test_lung_cancer, y_pred_tree_lung_cancer, average='weighted')

In [162]:
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 1.0
F1 Score: 1.0
Confusion Matrix:
 [[ 2  0]
 [ 0 60]]


In [24]:
search_obj = RandomizedSearchCV(tree, param_distributions = tree_params, cv = 5, scoring = 'f1_macro')
fit_obj = search_obj.fit(X_train_obesity, y_train_obesity)
print(fit_obj.cv_results_['mean_test_score'])
model_tree_obesity = fit_obj.best_estimator_

[0.65805556 0.66088819 0.7759127  0.29463203 0.67029295 0.92690458
        nan 0.29463203 0.81763709 0.81088638]


In [25]:
model_tree_obesity.fit(X_train_obesity, y_train_obesity)
print("Train: ", model_tree_obesity.score(X_train_obesity, y_train_obesity))
print("Test: ", model_tree_obesity.score(X_test_obesity, y_test_obesity))

Train:  0.9186046511627907
Test:  0.8181818181818182


In [26]:
y_pred_tree_obesity = model_tree_obesity.predict(X_test_obesity)

In [27]:
accuracy = accuracy_score(y_test_obesity, y_pred_tree_obesity)
conf_matrix = confusion_matrix(y_test_obesity, y_pred_tree_obesity)
f1 = f1_score(y_test_obesity, y_pred_tree_obesity, average='weighted')

In [28]:
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.8181818181818182
F1 Score: 0.8097784568372804
Confusion Matrix:
 [[8 0 0 0]
 [1 4 1 0]
 [0 2 2 0]
 [0 0 0 4]]


## Random Forest

In [29]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier()

In [30]:
random_forest_params = {
    'n_estimators': [50, 100, 200, 300],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False],
    'class_weight': [None, 'balanced'],
    'random_state': [42]
}

In [31]:
search_obj = RandomizedSearchCV(random_forest, param_distributions = random_forest_params, cv = 5, scoring = 'f1_macro')
fit_obj = search_obj.fit(X_train_brain_stroke, y_train_brain_stroke)
print(fit_obj.cv_results_['mean_test_score'])
model_random_forest_brain_stroke = fit_obj.best_estimator_

[0.49906822 0.49201719        nan        nan        nan 0.50142377
        nan 0.56910755        nan 0.49258528]


In [32]:
model_random_forest_brain_stroke.fit(X_train_brain_stroke, y_train_brain_stroke)
print("Train: ", model_random_forest_brain_stroke.score(X_train_brain_stroke, y_train_brain_stroke))
print("Test: ", model_random_forest_brain_stroke.score(X_test_brain_stroke, y_test_brain_stroke))

Train:  0.9723895582329317
Test:  0.9187562688064193


In [33]:
y_pred_random_forest_brain_stroke = model_random_forest_brain_stroke.predict(X_test_brain_stroke)

In [34]:
accuracy = accuracy_score(y_test_brain_stroke, y_pred_random_forest_brain_stroke)
conf_matrix = confusion_matrix(y_test_brain_stroke, y_pred_random_forest_brain_stroke)
f1 = f1_score(y_test_brain_stroke, y_pred_random_forest_brain_stroke, average='weighted')

In [35]:
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9187562688064193
F1 Score: 0.9143852166360555
Confusion Matrix:
 [[908  35]
 [ 46   8]]


In [36]:
search_obj = RandomizedSearchCV(random_forest, param_distributions = random_forest_params, cv = 5, scoring = 'f1_macro')
fit_obj = search_obj.fit(X_train_cardiovascular_disease, y_train_cardiovascular_disease)
print(fit_obj.cv_results_['mean_test_score'])
model_random_forest_cardiovascular_disease = fit_obj.best_estimator_

[       nan 0.6064683         nan 0.59877655        nan 0.59695888
 0.62232656 0.60644446 0.60063078        nan]


In [37]:
model_random_forest_cardiovascular_disease.fit(X_train_cardiovascular_disease, y_train_cardiovascular_disease)
print("Train: ", model_random_forest_cardiovascular_disease.score(X_train_cardiovascular_disease, y_train_cardiovascular_disease))
print("Test: ", model_random_forest_cardiovascular_disease.score(X_test_cardiovascular_disease, y_test_cardiovascular_disease))

Train:  0.6469464285714286
Test:  0.6304285714285714


In [38]:
y_pred_random_forest_cardiovascular_disease = model_random_forest_cardiovascular_disease.predict(X_test_cardiovascular_disease)

In [39]:
accuracy = accuracy_score(y_test_cardiovascular_disease, y_pred_random_forest_cardiovascular_disease)
conf_matrix = confusion_matrix(y_test_cardiovascular_disease, y_pred_random_forest_cardiovascular_disease)
f1 = f1_score(y_test_cardiovascular_disease, y_pred_random_forest_cardiovascular_disease, average='weighted')

In [40]:
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.6304285714285714
F1 Score: 0.6302745412000708
Confusion Matrix:
 [[4264 2724]
 [2450 4562]]


In [41]:
search_obj = RandomizedSearchCV(random_forest, param_distributions = random_forest_params, cv = 5, scoring = 'f1_macro')
fit_obj = search_obj.fit(X_train_lung_cancer, y_train_lung_cancer)
print(fit_obj.cv_results_['mean_test_score'])
model_random_forest_lung_cancer = fit_obj.best_estimator_

[nan  1. nan  1.  1.  1.  1.  1.  1. nan]


In [42]:
model_random_forest_lung_cancer.fit(X_train_lung_cancer, y_train_lung_cancer)
print("Train: ", model_random_forest_lung_cancer.score(X_train_lung_cancer, y_train_lung_cancer))
print("Test: ", model_random_forest_lung_cancer.score(X_test_lung_cancer, y_test_lung_cancer))

Train:  1.0
Test:  1.0


In [43]:
y_pred_random_forest_lung_cancer = model_random_forest_lung_cancer.predict(X_test_lung_cancer)

In [44]:
accuracy = accuracy_score(y_test_lung_cancer, y_pred_random_forest_lung_cancer)
conf_matrix = confusion_matrix(y_test_lung_cancer, y_pred_random_forest_lung_cancer)
f1 = f1_score(y_test_lung_cancer, y_pred_random_forest_lung_cancer, average='weighted')

In [45]:
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 1.0
F1 Score: 1.0
Confusion Matrix:
 [[ 2  0]
 [ 0 60]]


In [46]:
search_obj = RandomizedSearchCV(random_forest, param_distributions = random_forest_params, cv = 5, scoring = 'f1_macro')
fit_obj = search_obj.fit(X_train_obesity, y_train_obesity)
print(fit_obj.cv_results_['mean_test_score'])
model_random_forest_obesity = fit_obj.best_estimator_

[0.95230159 0.95230159        nan 0.95230159 0.95230159 0.92849206
        nan        nan 0.95230159 0.95230159]


In [47]:
model_random_forest_obesity.fit(X_train_obesity, y_train_obesity)
print("Train: ", model_random_forest_obesity.score(X_train_obesity, y_train_obesity))
print("Test: ", model_random_forest_obesity.score(X_test_obesity, y_test_obesity))

Train:  0.9883720930232558
Test:  0.9090909090909091


In [48]:
y_pred_random_forest_obesity = model_random_forest_obesity.predict(X_test_obesity)

In [49]:
accuracy = accuracy_score(y_test_obesity, y_pred_random_forest_obesity)
conf_matrix = confusion_matrix(y_test_obesity, y_pred_random_forest_obesity)
f1 = f1_score(y_test_obesity, y_pred_random_forest_obesity, average='weighted')

In [50]:
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9090909090909091
F1 Score: 0.9090909090909091
Confusion Matrix:
 [[8 0 0 0]
 [0 4 2 0]
 [0 0 4 0]
 [0 0 0 4]]


In [51]:
import joblib

In [52]:
brain_stroke_file = 'brain_stroke.pkl'
joblib.dump(model_random_forest_brain_stroke, brain_stroke_file)

['brain_stroke.pkl']

In [53]:
cardiovascular_disease_file = 'cardiovascular_disease.pkl'
joblib.dump(model_random_forest_cardiovascular_disease, cardiovascular_disease_file)

['cardiovascular_disease.pkl']

In [54]:
lung_cancer_file = 'lung_cancer.pkl'
joblib.dump(model_random_forest_lung_cancer, lung_cancer_file)

['lung_cancer.pkl']

In [55]:
obesity_file = 'obesity.pkl'
joblib.dump(model_tree_obesity, obesity_file)

['obesity.pkl']