#### Solve a **Multivariate Classification** problem using the **RandomForestClassifier** algorithm

- Hyperparameters tuned using

    - **GridSearchCV**


> ### Dataset used: &emsp;[Pima Indians Diabetes Database](https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database)

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Column labels for the dataset; they are passed to read_csv below because the file is read with header = None
column_names: list[str] = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]

# Read the .csv file into a DataFrame, labelling the columns with the names above
df_patient_data = pd.read_csv(
    '../data/processed/pima-indians-diabetes.csv',
    names = column_names,
    header = None,
)

# Preview the first 10 rows (a bare .head() would show only the first 5)
df_patient_data.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [117]:
df_patient_data['Age'].value_counts()           # basic EDA: number of patients at each distinct "Age" value (one row per age, not per age group), most frequent first

Age
22    72
21    63
25    48
24    46
23    38
28    35
26    33
27    32
29    29
31    24
41    22
30    21
37    19
42    18
33    17
38    16
36    16
32    16
45    15
34    14
46    13
43    13
40    13
39    12
35    10
50     8
51     8
52     8
44     8
58     7
47     6
54     6
49     5
48     5
57     5
53     5
60     5
66     4
63     4
62     4
55     4
67     3
56     3
59     3
65     3
69     2
61     2
72     1
81     1
64     1
70     1
68     1
Name: count, dtype: int64

In [118]:
# show the column labels that were assigned when the CSV was loaded
df_patient_data.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [3]:
df_patient_data.info()              # per-column summary: non-null count and dtype (a non-null count below the row count would indicate missing values)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [120]:
df_patient_data.shape           # (rows, columns) --> (768, 9): 768 rows and 9 columns

(768, 9)

In [4]:
df_patient_data.isna().sum()              # number of null values per column; any count >= 1 means that column has missing entries

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [122]:
df_patient_data.isna().any()              # True / False per column: does the column contain at least one null value?

Pregnancies                 False
Glucose                     False
BloodPressure               False
SkinThickness               False
Insulin                     False
BMI                         False
DiabetesPedigreeFunction    False
Age                         False
Outcome                     False
dtype: bool

In [123]:
df_patient_data.eq(0).any()          # True / False per column: does the column contain at least one value equal to zero?

Pregnancies                  True
Glucose                      True
BloodPressure                True
SkinThickness                True
Insulin                      True
BMI                          True
DiabetesPedigreeFunction    False
Age                         False
Outcome                      True
dtype: bool

In [124]:
# Show the patients whose Insulin reading is 0. A zero here is treated as "not measured" (a missing value).
# Such columns need imputation: the missing values are replaced with a statistic
# (mean, median or mode) computed from the non-missing values.

df_patient_zero = df_patient_data.loc[df_patient_data['Insulin'].eq(0)]
df_patient_zero

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
5,5,116,74,0,0,25.6,0.201,30,0
7,10,115,0,0,0,35.3,0.134,29,0
...,...,...,...,...,...,...,...,...,...
761,9,170,74,31,0,44.0,0.403,43,1
762,9,89,62,0,0,22.5,0.142,33,0
764,2,122,70,27,0,36.8,0.340,27,0
766,1,126,60,0,0,30.1,0.349,47,1


> #### Simple Imputation

In [125]:
# Columns in which a value of zero does not make sense and is therefore treated as a missing measurement
cols_with_nonsenical_zeros: list[str] = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]

# Step 1: turn the zeros of those columns into NaN so that they count as missing
df_patient_data[cols_with_nonsenical_zeros] = (
    df_patient_data[cols_with_nonsenical_zeros].replace(to_replace = 0, value = np.nan)
)

# Step 2: fill each NaN with the mean of its own column (the frame is modified in place)
# NOTE(review): the means are taken over the full dataset, i.e. before the train/test split
# performed later in the notebook, so test rows contribute to the imputed values.
column_means = df_patient_data.mean()
df_patient_data.fillna(value = column_means, inplace = True)

# optional sanity check - total NaN's left per column
# print(df_patient_data.isna().sum())

# re-check which columns still contain zeros after the imputation
df_patient_data.eq(0).any()

Pregnancies                  True
Glucose                     False
BloodPressure               False
SkinThickness               False
Insulin                     False
BMI                         False
DiabetesPedigreeFunction    False
Age                         False
Outcome                      True
dtype: bool

> #### Feature Engineering

In [126]:
# Separate the features (X) from the target (y), then hold out 30% of the rows as a test set.
# DataFrame / Series are imported from the public `pandas` namespace: `pandas.core.*` is a
# private package whose layout is not guaranteed to stay stable between releases.
from pandas import DataFrame, Series
from sklearn.model_selection import train_test_split

X: DataFrame = df_patient_data.drop('Outcome', axis = 1)     # every column except the label
y: Series = df_patient_data['Outcome']                         # the label column

# random_state is fixed so that the same rows land in train / test on every run
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.3,
                                                    random_state = 42)

print(f'Features or input variables:\n, {X.head()}, \n\nand shape is, {X.shape}')

print(f'\nTarget or output variables:\n, {y.head()}, \nand shape is, {y.shape}')

Features or input variables:
,    Pregnancies  Glucose  BloodPressure  SkinThickness     Insulin   BMI  \
0            6    148.0           72.0       35.00000  155.548223  33.6   
1            1     85.0           66.0       29.00000  155.548223  26.6   
2            8    183.0           64.0       29.15342  155.548223  23.3   
3            1     89.0           66.0       23.00000   94.000000  28.1   
4            0    137.0           40.0       35.00000  168.000000  43.1   

   DiabetesPedigreeFunction  Age  
0                     0.627   50  
1                     0.351   31  
2                     0.672   32  
3                     0.167   21  
4                     2.288   33  , 

and shape is, (768, 8)

Target or output variables:
, 0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64, 
and shape is, (768,)


In [127]:
# Optional: standardise the features for better model performance.
# The scaler learns its statistics from the training split only and is then
# applied unchanged to both the training and the test split.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# first 5 scaled training rows (a 2-D array) and the overall shape
print(f'Scaled X_train:\n, {X_train[:5]}, \n\nand shape is, {X_train.shape}')

# first 5 scaled test rows and the overall shape
print(f'\nScaled X_test:\n, {X_test[:5]}, \n\nand shape is, {X_test.shape}')

Scaled X_train:
, [[-0.8362943  -0.89610788 -1.00440048 -1.27450178 -1.14686808 -1.20403257
  -0.61421636 -0.94861028]
 [ 0.39072767 -0.56399695 -0.02026586  0.02449184  1.99579164  0.66428525
  -0.90973787 -0.43466673]
 [-1.14304979  0.43233584 -0.34831073  1.55966612  1.11302206  1.44035573
  -0.30699103 -0.77729576]
 [ 0.08397217  0.29949146 -0.34831073 -0.92023079  0.12432012  0.11816158
  -0.90681191 -0.43466673]
 [-0.8362943  -0.63041914 -3.46473705  1.08730481 -0.85261155  1.58407249
  -0.83951493 -0.00638043]], 

and shape is, (537, 8)

Scaled X_test:
, [[ 6.97483158e-01 -7.96474605e-01 -1.16842292e+00  4.96853160e-01
   4.06806388e-01  2.47506663e-01 -1.16803926e-01  8.50192166e-01]
 [-5.29538810e-01 -3.31519303e-01  2.25767799e-01  3.78762831e-01
   1.29998139e-03  4.91825147e-01 -9.41923376e-01 -1.03426754e+00]
 [-5.29538810e-01 -4.64363675e-01 -6.76355609e-01  4.26092128e-02
   1.29998139e-03 -2.12386954e-01 -9.12663821e-01 -1.03426754e+00]
 [ 1.31099414e+00 -4.97574768e-01

> #### GridSearchCV
> #### Define the **hyperparameter space** / values

In [130]:
# Hyper-parameter search space for the random forest.
# The grid below has 4 * 3 * 3 * 3 * 2 * 2 = 432 candidates; with 10 folds that is
# 4320 fits --> estimated 16m 51s.

n_estimators: list[int] = [20, 60, 100, 120]        # 1> number of trees in the forest
max_features: list[float] = [0.2, 0.6, 1.0]         # 2> number of features to consider at every split
max_depth = [2, 8, None]                            # 3> maximum number of levels in a tree
max_samples: list[float] = [0.5, 0.75, 1.0]         # 4> number of samples
bootstrap: list[bool] = [True, False]               # 5> bootstrap samples (defined, but not part of the grid below)
min_samples_split = [2, 5]                          # 6> minimum number of samples required to split a node
min_samples_leaf = [1, 2]                           # 7> minimum number of samples required at each leaf node

# Parameter name --> list of values to try
hyper_params_grid: dict[str, object] = dict(
    n_estimators = n_estimators,
    max_features = max_features,
    max_depth = max_depth,
    max_samples = max_samples,
    min_samples_split = min_samples_split,
    min_samples_leaf = min_samples_leaf,
)
print(hyper_params_grid)

In [None]:
# # for SMALLER RUNS: - "Fitting 5 folds for each of 32 candidates, totalling 160 fits"
# # 1> Number of trees in random forest
# n_estimators: list[int] = [20]

# # 2> Number of features to consider at every split
# max_features: list[float] = [0.2, 0.4]

# # 3> Maximum number of levels in tree
# max_depth = [2, None]

# # 4> Number of samples
# max_samples: list[float] = [0.5, 0.6]

# # 5> Bootstrap samples
# bootstrap: list[bool] = [True, False]

# # 6> Minimum number of samples required to split a node
# min_samples_split = [2, 4]

# # 7> Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2]

# hyper_params_grid: dict[str, object] = {
#     'n_estimators': n_estimators,
#     'max_features': max_features,
#     'max_depth': max_depth,
#     'max_samples': max_samples,              # 'max_samples' only applies when bootstrap = True, so 'bootstrap' (which includes False) is left out of this grid
#     'min_samples_split':min_samples_split,
#     'min_samples_leaf':min_samples_leaf
# }

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Base estimator; the fixed seed keeps the forest reproducible between runs
model = RandomForestClassifier(random_state = 42)

# Exhaustive search over every combination in hyper_params_grid.
# Two scorers are tracked; `refit` names the one used to select and refit the best candidate.
grid_search = GridSearchCV(
    model,
    hyper_params_grid,
    scoring = ['accuracy', 'f1'],
    refit = 'f1',
    cv = 2,                             # cv = 2 from 10 to reduce load on local m/c
    return_train_score = True,          # keeps the mean_train_* entries in cv_results_
    verbose = 2,                        # n_jobs = -1 is left disabled
)

In [131]:
import mlflow as mfl
from sklearn.base import clone

# Point MLflow at the local tracking server unless a URI has already been configured
if not mfl.is_tracking_uri_set():
    mfl.set_tracking_uri(uri = 'http://127.0.0.1:5000')

# set the experiment name
mfl.set_experiment('RandomForest Grid-Search-CV HyperParameter Tuning')


# One parent run for the whole search, plus one nested child run per candidate
with mfl.start_run(run_name = 'series_of_runs') as parent:
    # Fit GridSearchCV to the training data
    grid_search.fit(X_train, y_train)

    results = grid_search.cv_results_
    print(f'The grid search cv tuning results are: {results}')

    for exp_num, params in enumerate(results['params']):
        # The nested run gets its own name (`child`) so it no longer shadows the outer `parent` handle
        with mfl.start_run(run_name = f'Experiment: {exp_num}', nested = True) as child:
            # Log the parameters
            mfl.log_params(params)

            # Log the cross-validation metrics of this candidate.
            # mean_train_* / mean_test_* are averages over the CV folds of the training data;
            # "test" here means the validation folds, not the held-out X_test.
            train_metrics = {
                'train_accuracy': results['mean_train_accuracy'][exp_num],
                'train_f1': results['mean_train_f1'][exp_num]
            }

            test_metrics = {
                'test_accuracy': results['mean_test_accuracy'][exp_num],
                'test_f1': results['mean_test_f1'][exp_num]
            }

            mfl.log_metrics(train_metrics)
            mfl.log_metrics(test_metrics)

            # Train a fresh copy of the base estimator with this candidate's hyper-parameters.
            # Cloning avoids mutating the shared `model` that was handed to GridSearchCV.
            candidate_model = clone(model).set_params(**params)
            candidate_model.fit(X_train, y_train)

            # Make predictions on the held-out test set
            y_pred = candidate_model.predict(X_test)

            # Calculate additional (class-weighted) metrics on the held-out test set
            precision = precision_score(y_test, y_pred,
                                        average = 'weighted')

            recall = recall_score(y_test, y_pred,
                                  average = 'weighted')

            f1 = f1_score(y_test, y_pred,
                          average = 'weighted')

            # Log additional metrics
            mfl.log_metric('precision', precision)
            mfl.log_metric('recall', recall)
            mfl.log_metric('f1_score', f1)

[I 2025-01-26 03:34:50,571] A new study created in memory with name: no-name-2d55485b-5b5a-438c-8126-47743534e24d


[I 2025-01-26 03:34:50,624] Trial 0 finished with value: 0.750465549348231 and parameters: {'n_estimators': 20, 'max_features': 0.6, 'max_depth': None, 'max_samples': 1.0, 'bootstrap': False, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.750465549348231.
[I 2025-01-26 03:34:50,897] Trial 1 finished with value: 0.7541899441340782 and parameters: {'n_estimators': 120, 'max_features': 1.0, 'max_depth': None, 'max_samples': 0.5, 'bootstrap': True, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 1 with value: 0.7541899441340782.
[I 2025-01-26 03:34:51,102] Trial 2 finished with value: 0.7579143389199254 and parameters: {'n_estimators': 120, 'max_features': 1.0, 'max_depth': 2, 'max_samples': 1.0, 'bootstrap': False, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 2 with value: 0.7579143389199254.
[I 2025-01-26 03:34:51,290] Trial 3 finished with value: 0.7932960893854748 and parameters: {'n_estimators': 120, 'max_features': 0.2, 'm

🏃 View run trial_0 at: http://127.0.0.1:5000/#/experiments/638940344264241299/runs/726c581b8bdf4d0a82e67b49800bd352
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/638940344264241299
🏃 View run trial_1 at: http://127.0.0.1:5000/#/experiments/638940344264241299/runs/5b045dd0e8a74253837854c5a5f6f38d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/638940344264241299
🏃 View run trial_2 at: http://127.0.0.1:5000/#/experiments/638940344264241299/runs/9a39e05e20a04a05bc7a127024fa6b76
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/638940344264241299
🏃 View run trial_3 at: http://127.0.0.1:5000/#/experiments/638940344264241299/runs/5067895965aa4ef990729f7e736aa7a5
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/638940344264241299
🏃 View run trial_4 at: http://127.0.0.1:5000/#/experiments/638940344264241299/runs/7bcce3bc6d014a938a404b8cd2c4d8ba
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/638940344264241299
🏃 View run series_of_runs at: http:

In [132]:
# Print the best results
best_params = grid_search.best_params_
print(f'Best hyperparameters: {best_params}')

best_model = grid_search.best_estimator_
print(f'Best model: {best_model}')

# The search was configured with refit = 'f1', so best_score_ is the mean cross-validated
# F1 of the winning candidate - it is NOT a training accuracy.
best_cv_f1_score: float = float(grid_search.best_score_)
print(f'Best cross-validated F1: {best_cv_f1_score:.2f}')

# Mean training accuracy of the winning candidate, read from cv_results_
# (available because the search was built with return_train_score = True and an 'accuracy' scorer).
best_train_accuracy_score: float = float(
    grid_search.cv_results_['mean_train_accuracy'][grid_search.best_index_]
)
print(f'Best training accuracy: {best_train_accuracy_score:.2f}')

# Accuracy of the refitted best model on the held-out test set
best_test_accuracy_score: float = accuracy_score(y_test, best_model.predict(X_test))
print(f'Test Accuracy with best hyperparameters: {best_test_accuracy_score:.2f}')

Best trial accuracy: 0.79
Best hyperparameters: {'n_estimators': 120, 'max_features': 0.2, 'max_depth': None, 'max_samples': 0.5, 'bootstrap': True, 'min_samples_split': 2, 'min_samples_leaf': 2}
Test Accuracy with best hyperparameters: 0.76
Best precision score: 0.65
Best confusion matrix: [[124  27]
 [ 29  51]]


In [134]:
# Infer the model signature from the training inputs and the predictions for those same inputs
signature = mfl.models.infer_signature(model_input = X_train,
                                       model_output = best_model.predict(X_train))

# Log the best model.
# The run handle is bound to `best_run`: binding it to `best_model` would replace the fitted
# estimator with the MLflow run object, which would then be what log_model receives.
with mfl.start_run(run_name = 'best_model') as best_run:
    mfl.log_params(best_params)
    mfl.log_metric('best_train_accuracy', best_train_accuracy_score)
    mfl.log_metric('best_test_accuracy', best_test_accuracy_score)

    # Log the fitted estimator and register it in the model registry
    mfl.sklearn.log_model(best_model,
                          artifact_path = 'final_model',
                          registered_model_name = 'RandomForestClassifier_model',
                          signature = signature)

Registered model 'RandomForestClassifier_model' already exists. Creating a new version of this model...
2025/01/26 03:34:52 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForestClassifier_model, version 4


🏃 View run best_model at: http://127.0.0.1:5000/#/experiments/638940344264241299/runs/e264b6a235084a799f08b10197c9cc49
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/638940344264241299


Created version '4' of model 'RandomForestClassifier_model'.
