In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
df1 = pd.read_csv("./data/cardio_set1.csv", index_col=0)
df2 = pd.read_csv("./data/cardio_set2.csv", index_col=0)

In [8]:
# randomforest
# logistic regression
# svm - långsam
# knn


df2.info

<bound method DataFrame.info of          age  ap_hi  ap_lo  cholesterol  gluc  smoke  alco  active  cardio  \
id                                                                           
0      18393    110     80            1     1      0     0       1       0   
1      20228    140     90            3     1      0     0       1       1   
2      18857    130     70            3     1      0     0       0       1   
3      17623    150    100            1     1      0     0       1       1   
4      17474    100     60            1     1      0     0       0       0   
...      ...    ...    ...          ...   ...    ...   ...     ...     ...   
99993  19240    120     80            1     1      1     0       1       0   
99995  22601    140     90            2     2      0     0       1       1   
99996  19066    180     90            3     1      0     1       0       1   
99998  22431    135     80            1     2      0     0       0       1   
99999  20540    120     80      

In [11]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
X1, y1 = df1.drop(columns=["cardio"]), df1["cardio"]
X2, y2 = df2.drop(columns=["cardio"]), df2["cardio"]

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

In [None]:



scaler_set1 = StandardScaler()
scaler_set2 = StandardScaler()
normalisation_set1 = MinMaxScaler()
normalisation_set2 = MinMaxScaler()

scaled_X1_train = scaler_set1.fit_transform(X1_train)
scaled_X2_train = scaler_set2.fit_transform(X2_train)
scaled_X1_test = scaler_set1.transform(X1_test)
scaled_X2_test = scaler_set2.transform(X2_test)

norm_X1_train = normalisation_set1.fit_transform(scaled_X1_train)
norm_X2_train = normalisation_set2.fit_transform(scaled_X2_train)
norm_X1_test = normalisation_set1.transform(scaled_X1_test)
norm_X2_test = normalisation_set2.transform(scaled_X2_test)

rf = RandomForestClassifier()
lr = LogisticRegression()
knn = KNeighborsClassifier()



In [None]:
rf_param_grid = {
    "n_estimators": [100, 150, 200],
    "max_features": ["auto"],
    "max_depth": [10,20,None],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
}

l1_ratio = np.linspace(0, 1, 20)

lr_param_grid = {
    "penalty": ["l1", "l2", "elasticnet"],
    "C": [0.01, 0.1, 1, 10, 100],
    "solver": ["saga"],
}

knn_param_grid = {
    "n_neighbors": [3, 5, 7, 9, 11, 15],
    "weights": ["uniform", "distance"],
}


In [None]:
def Grid_search_CV(X_train, y_train, X_test, y_test, model, param_grid, model_name):
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train, y_train)
    
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_
    
    y_pred = best_model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"{model_name} best Parameters: {best_params}")
    print(f"{model_name} performance:")
    print(f"accuracy: {accuracy:.4f}")
    print(f"precision: {precision:.4f}")
    print(f"recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    
    return {
        'model': best_model,
        'params': best_params,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }


results = {}

results['Random forest set1'] = Grid_search_CV(norm_X1_train, y1_train, norm_X1_test, y1_test, rf, rf_param_grid, "Random Forest")
results['random forest set2'] = Grid_search_CV(norm_X2_train, y2_train, norm_X2_test, y2_test, rf, rf_param_grid, "Random Forest")

results['Logistic regression set1'] = Grid_search_CV(norm_X1_train, y1_train, norm_X1_test, y1_test, lr, lr_param_grid, "Logistic Regression")
results['Logistic regression set2'] = Grid_search_CV(norm_X2_train, y2_train, norm_X2_test, y2_test, lr, lr_param_grid, "Logistic Regression")

results['KNN set1'] = Grid_search_CV(norm_X1_train, y1_train, norm_X1_test, y1_test, knn, knn_param_grid, "KNN")
results['KNN set2'] = Grid_search_CV(norm_X2_train, y2_train, norm_X2_test, y2_test, knn, knn_param_grid, "KNN")

print(results)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


KeyboardInterrupt: 