In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

from lab2 import Helper

In [2]:
df1 = pd.read_csv('Data/df1.csv')
df2 = pd.read_csv('Data/df2.csv')

display(df1.head(3), df2.head(3))

Unnamed: 0,id,age,cholesterol,gluc,smoke,alco,active,cardio,gender_2,bmi-feature_obese (class I),bmi-feature_obese (class II),bmi-feature_obese (class III),bmi-feature_overweight,bp-feature_healthy,bp-feature_hypertension crises,bp-feature_stage 1 hypertension,bp-feature_stage 2 hypertension
0,0,18393,1,1,0,0,1,0,1,0,0,0,0,0,0,1,0
1,1,20228,3,1,0,0,1,1,0,1,0,0,0,0,0,0,1
2,2,18857,3,1,0,0,0,1,0,0,0,0,0,0,0,1,0


Unnamed: 0,id,age,bmi,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,gender_2
0,0,18393,21,110,80,1,1,0,0,1,0,1
1,1,20228,34,140,90,3,1,0,0,1,1,0
2,2,18857,23,130,70,3,1,0,0,0,1,0


In [3]:
# Drop id as its not useful
df1, df2 = df1.drop('id', axis=1), df2.drop('id', axis=1)

### 2.4 - Välja modell

Chosen models:
* Logistic regression
* decision tree
* random forest

The reason for choosing the above models is quite simple. The problem where trying to solve is that of classification. That immediately limited the options for which models we could use. We can for example not use (multiple) linear regression, given that such a model wouldn't be able to do classification (e.g give 1's & 0's for if the patient has cardiovascular disease or not).

As for why i decided to use logistic regression, decision tree.. etc. over something like SVM or KNN all comes down to the datasets. The data simply looked to be more suited towards something like a decision tree given how categorical it was. Now where obviously using two datasets that are quite different, but i still figured that an algorithm like logistic regression was up for the task. Especially given that it uses a continuous S-curve for classifying.

In [4]:
df1_x, df1_y = df1.drop('cardio', axis=1), df1['cardio']
df2_x, df2_y = df2.drop('cardio', axis=1), df2['cardio']

display(
    df1_x.head(1),
    df2_x.head(1)
)

Unnamed: 0,age,cholesterol,gluc,smoke,alco,active,gender_2,bmi-feature_obese (class I),bmi-feature_obese (class II),bmi-feature_obese (class III),bmi-feature_overweight,bp-feature_healthy,bp-feature_hypertension crises,bp-feature_stage 1 hypertension,bp-feature_stage 2 hypertension
0,18393,1,1,0,0,1,1,0,0,0,0,0,0,1,0


Unnamed: 0,age,bmi,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,gender_2
0,18393,21,110,80,1,1,0,0,1,1


In [5]:
df1_x_train, df1_x_val, df1_x_test, df1_y_train, df1_y_val, df1_y_test = Helper.train_val_test_split(df1_x, df1_y, split_size=0.2, rand_state=42)
df2_x_train, df2_x_val, df2_x_test, df2_y_train, df2_y_val, df2_y_test = Helper.train_val_test_split(df2_x, df2_y, split_size=0.2, rand_state=42)

display(
    (df1_x_train.shape, df1_y_train.shape, df1_x_val.shape, df1_y_val.shape, df1_x_test.shape, df1_y_test.shape),
    (df2_x_train.shape, df2_y_train.shape, df2_x_val.shape, df2_y_val.shape, df2_x_test.shape, df2_y_test.shape),
)

((40918, 15), (40918,), (13640, 15), (13640,), (13640, 15), (13640,))

((40918, 10), (40918,), (13640, 10), (13640,), (13640, 10), (13640,))

In [6]:
df1_x_train, df1_x_val = Helper.scaler('minmax', df1_x_train, df1_x_val)
df1_x_train.mean(), df1_x_train.std(), df1_x_val.mean(), df1_x_val.std()

(0.24469073517266024,
 0.4111901139656925,
 0.24653791612942894,
 0.4122246262036306)

In [8]:
model_metrics = pd.DataFrame(columns=['Dataset', 'Scaling', 'Model', 'Hyper params', 'Accuracy'])
model_metrics

Unnamed: 0,Dataset,Scaling,Model,Hyper params,Accuracy


### All paraters and explanations as to why i made the choices i made.
###### NOTE: Obvious choices lack a "Reason" field.



#### Grid Search parameters
- Scoring: 'Accuracy'
    - Reason: Given that where using classification based models, accuracy seems like the only appropriate metric, given it directly tells us how good the model's doing at actually classifying things correctly.
- cv: 5
    - Reason: After some online research, it seems as though 5 is a generally good number of folds for avoiding things like overfitting.

#### LogisticRegression
- Chosen parameters
    - penalty: 'elasticnet'
        - Reason: We'd like to use both L1 and or L2 regularization in the grid search.
    
    - solver: saga

    - max_iter: 1000
        - Reason: Some models require quite a few iterations before they converge. Given that max_iter only specifies the maximum number of allowed iterations before "giving up" on a model converging, we won't actually be wasting any compute power on running a more iterations than will be needed.

- Hyperparameters

    - l1_ratio: [0.0, 0.1, ... 1.0]
        - Reason: Uses both solvers independently as well as in ratios with an increment resolution of 10%. This simply just seems like a resonable thing to do.
    
#### DecisionTreeClassifier
- Chosen parameters
    - max_depth: None (Default)
        - Reason: Not needed as we'd like expand the nodes until all leaves are pure.

    - min_samples_split: 2 (Default)
        - Reason: We'd like to only split if we've got 2 samples or more 

    - min_samples_leaf: 1 (Default)
        - Reason: We'd like our leaf nodes to only contain one sample

    - min_weight_fraction_leaf: 0 (Default)
        - Reason: We'd like to keep building the tree regardless of "amount of data" (so to say) available.

    - max_leaf_nodes: None (Default)
        - Reason: Generally not needed, as we'd simply like to build the tree until we've got pure leaves, not obsess about impurity decrease.

    - min_impurity_decrease: 0 (Default)
        - Reason: We'd like to keep splitting even if the impurity decrease is low. I dont think this will result in overfitting.

    - class_weight: None (Default)
        - Reason: We've got no clue about the importance of our classes.

    - ccp_alpha: 0 (Default)
        - Reason: Don't think we need this, as it won't meaningfully decrease the complexity of the tree.

- Hyperparameters
    - criterion ['gini', 'entropy', 'log_loss']
    - splitter ['best', 'random']
    - max_features: [None, 'sqrt', 'log2']


#### RandomForestClassifier
###### Note: Matching 'Chosen parameters' from above have been omitted, given the models both rely on decision trees. This is purely for aesthetics.
- Chosen parameters

    - bootstrap: True (Default)
        - Reason: Part of what sets random forest and decision tree apart. So it'd be dumb to not use it.
        
    - oob_score: False (Default)
        - Reason: Part of what sets random forest and decision tree apart. So it'd be dumb to not use it.

- Hyperparameters
    - n_estimators: [10, 20, ... 100]
        - Reason: Seemed like a resonable range. Random forest doesn't "slow down" unlike some other algorithms, and eventually stabalizes, meaning you you won't gain anything by adding more trees. Our dataset also isnt particularly large or complex.
    
    - criterion ['gini', 'entropy', 'log_loss']
    - max_features: [None, 'sqrt', 'log2']
    
    - max_samples: [None, 0.1, 0.2, ... 1.0] # would this actually work?
        - Reason: We'll try all ratios, see what works best. Simple as that.

In [7]:
# All the data needed for chosing the right model and doing hyperparameter tuning via GridSearchCV

datasets = {
    'df1': {
        'x_train': df1_x_train,
        'x_val': df1_x_val,
        'x_test': df1_x_test,
        'y_train': df1_y_train,
        'y_val': df1_y_val,
        'y_test': df1_y_test
    },
    
    'df2': {
        'x_train': df2_x_train,
        'x_val': df2_x_val,
        'x_test': df2_x_test,
        'y_train': df2_y_train,
        'y_val': df2_y_val,
        'y_val': df2_y_test
     }
}


model_data = {
    'LogisticRegression': {
        'model': LogisticRegression(),
        'search space': {
            # Chosen parameters (defaults excluded)
            'penalty': ['elasticnet'],
            'solver': ['saga'],
            'max_iter': [1000],
            
            # Hyperparamaters
            'l1_ratio' : [round(x * 0.1, 1) for x in range(11)]
        }
    },
    
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier(),
        'search space': {
            
            # Hyperparamaters
            'criterion': ['gini', 'entropy', 'log_loss'],
            'splitter': ['best', 'random'],
            'max_features' : [None, 'sqrt', 'log2'],
        }
    },
    
    'RandomForestClassifier': {
        'model': RandomForestClassifier(),
        'search space': {
            
            # Hyperparamaters
            'n_estimators': [x for x in range(10, 110, 10)],
            
            'criterion': ['gini', 'entropy', 'log_loss'],
            'max_features' : [None, 'sqrt', 'log2']
        }
    },
}

In [9]:
#import warnings
#warnings.filterwarnings('ignore')

for dataset_name, dataset in datasets.items():
    
    # Loop through and use the minmax and standard scaler
    for scaler_name in ['minmax', 'standard']:
        
        # Scale the data
        scaled_x_train, scaled_x_val = Helper.scaler(scaler_name, dataset['x_train'], dataset['x_val'])

        # loop through all models. data = dict with model object and parameter info
        for model_name, data in model_data.items():
            
            # Search # find the best hyperparamters
            GS = GridSearchCV(
                estimator = data['model'],
                param_grid = data['search space'],
                scoring = 'accuracy',
                #n_jobs=2, # add when running final
                cv = 5,
                verbose = 3 # remove when running final
            )
            
            GS.fit(scaled_x_train, dataset['y_train'])
            
            val_pred = GS.predict(scaled_x_val)
            
            score = accuracy_score(dataset['y_val'], val_pred)
            
            model_metrics.loc[len(model_metrics.index)] = [
                dataset_name,
                scaler_name,
                model_name,
                GS.best_params_,
                score
            ]

Fitting 5 folds for each of 11 candidates, totalling 55 fits
[CV 1/5] END l1_ratio=0.0, max_iter=1000, penalty=elasticnet, solver=saga;, score=0.696 total time=   0.4s
[CV 2/5] END l1_ratio=0.0, max_iter=1000, penalty=elasticnet, solver=saga;, score=0.693 total time=   0.2s
[CV 3/5] END l1_ratio=0.0, max_iter=1000, penalty=elasticnet, solver=saga;, score=0.692 total time=   0.1s
[CV 4/5] END l1_ratio=0.0, max_iter=1000, penalty=elasticnet, solver=saga;, score=0.700 total time=   0.2s
[CV 5/5] END l1_ratio=0.0, max_iter=1000, penalty=elasticnet, solver=saga;, score=0.710 total time=   0.2s
[CV 1/5] END l1_ratio=0.1, max_iter=1000, penalty=elasticnet, solver=saga;, score=0.696 total time=   0.2s
[CV 2/5] END l1_ratio=0.1, max_iter=1000, penalty=elasticnet, solver=saga;, score=0.693 total time=   0.2s
[CV 3/5] END l1_ratio=0.1, max_iter=1000, penalty=elasticnet, solver=saga;, score=0.692 total time=   0.2s
[CV 4/5] END l1_ratio=0.1, max_iter=1000, penalty=elasticnet, solver=saga;, score=0

Unnamed: 0,Dataset,Scaling,Model,Hyper params,Accuracy
0,df1,minmax,LogisticRegression,"{'l1_ratio': 0.9, 'max_iter': 1000, 'penalty':...",0.705205
1,df1,minmax,DecisionTreeClassifier,"{'criterion': 'entropy', 'max_features': 'log2...",0.612463
2,df1,minmax,RandomForestClassifier,"{'criterion': 'entropy', 'max_features': 'log2...",0.606965
3,df1,standard,LogisticRegression,"{'l1_ratio': 0.0, 'max_iter': 1000, 'penalty':...",0.705205
4,df1,standard,DecisionTreeClassifier,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.612243
5,df1,standard,RandomForestClassifier,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.602786
6,df2,minmax,LogisticRegression,"{'l1_ratio': 1.0, 'max_iter': 1000, 'penalty':...",0.503592
7,df2,minmax,DecisionTreeClassifier,"{'criterion': 'log_loss', 'max_features': 'log...",0.497654
8,df2,minmax,RandomForestClassifier,"{'criterion': 'gini', 'max_features': 'log2', ...",0.503739
9,df2,standard,LogisticRegression,"{'l1_ratio': 0.3, 'max_iter': 1000, 'penalty':...",0.503372


In [21]:
model_metrics = model_metrics.sort_values(by=['Accuracy'], ascending=False)
model_metrics



Unnamed: 0,Dataset,Scaling,Model,Hyper params,Accuracy
0,df1,minmax,LogisticRegression,"{'l1_ratio': 0.9, 'max_iter': 1000, 'penalty':...",0.705205
3,df1,standard,LogisticRegression,"{'l1_ratio': 0.0, 'max_iter': 1000, 'penalty':...",0.705205
1,df1,minmax,DecisionTreeClassifier,"{'criterion': 'entropy', 'max_features': 'log2...",0.612463
4,df1,standard,DecisionTreeClassifier,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.612243
2,df1,minmax,RandomForestClassifier,"{'criterion': 'entropy', 'max_features': 'log2...",0.606965
5,df1,standard,RandomForestClassifier,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.602786
8,df2,minmax,RandomForestClassifier,"{'criterion': 'gini', 'max_features': 'log2', ...",0.503739
6,df2,minmax,LogisticRegression,"{'l1_ratio': 1.0, 'max_iter': 1000, 'penalty':...",0.503592
9,df2,standard,LogisticRegression,"{'l1_ratio': 0.3, 'max_iter': 1000, 'penalty':...",0.503372
10,df2,standard,DecisionTreeClassifier,"{'criterion': 'log_loss', 'max_features': 'sqr...",0.497874


##### Based on the results, i'll be chosing [INSERT] dataset and [INSERT] model & parameters. 

In [52]:
test = model_metrics.groupby('Model')['Accuracy'].max()

# filter the rows that have the maximum accuracy
top_models = model_metrics.loc[model_metrics['Accuracy'].isin(test)]

pd.options.display.max_colwidth = 100

display(top_models)

pd.options.display.max_colwidth = 50

Unnamed: 0,Dataset,Scaling,Model,Hyper params,Accuracy
0,df1,minmax,LogisticRegression,"{'l1_ratio': 0.9, 'max_iter': 1000, 'penalty': 'elasticnet', 'solver': 'saga'}",0.705205
3,df1,standard,LogisticRegression,"{'l1_ratio': 0.0, 'max_iter': 1000, 'penalty': 'elasticnet', 'solver': 'saga'}",0.705205
1,df1,minmax,DecisionTreeClassifier,"{'criterion': 'entropy', 'max_features': 'log2', 'splitter': 'random'}",0.612463
2,df1,minmax,RandomForestClassifier,"{'criterion': 'entropy', 'max_features': 'log2', 'n_estimators': 2}",0.606965


In [11]:
clf = DecisionTreeClassifier()
clf.fit(df1_x_train, df1_y_train)
y_pred = clf.predict(df1_x_test)




# print(classification_report(df1_y_test, y_pred))
# cm = confusion_matrix(df1_y_test, y_pred)
# ConfusionMatrixDisplay(cm).plot();


