# ML Assignment 1 - Canadian Hospital Re-admittance Challenge

*Harsh Kumar - IMT2021016* |
*Subhajeet Lahiri - IMT2021022* |
*Sai Madhavan G - IMT2021101*

**Note:** This file only contains clean code used for our final submission. To look at many of our attempts along the way, please look at *'./attempts.ipynb'*

Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.utils import class_weight, compute_class_weight
from sklearn.metrics import accuracy_score, f1_score
from joblib import dump, load
from icd9cms.icd9 import search as icdsearch
from sklearn.cluster import KMeans 
from tqdm import tqdm
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

Global variables

In [2]:
ENC_CAT_COLS = ['race', 'gender', 'age', 'weight','payer_code', 'medical_specialty','diag_1',
       'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed']
ENC_CAT_COLS_NO_DRUGS = ['race', 'gender', 'age','payer_code', 'medical_specialty','diag_1',
       'diag_2', 'diag_3', 'change', 'diabetesMed']
CAT_COLS = ['enc_id', 'patient_id', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'payer_code', 'medical_specialty', 
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmission_id']
CAT_COLS_WITH_DIAG = ['race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'payer_code', 'medical_specialty', 'diag_1', 'diag_2', 'diag_3',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmission_id']

## Pre-processing & Feature Extraction

Loading data

In [3]:
data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')

### `diag` columns

The 'diag' columns contain the ICD9 codes of the primary, secondary and additional secondary diagnoses. They each contain close to 700 unique values.

To reduce the number of unique values logically, we group them based on the types of disease they denote. We use the `icd9cms` library for the same.

In [4]:
def diag(df):
    # Initialize empty lists to store transformed values
    diag_1 = []
    diag_2 = []
    diag_3 = []
    
    # Iterate through the DataFrame to process the 'diag' columns
    for idx, row in df.iterrows():
        # Extract the values from 'diag_1', 'diag_2', and 'diag_3' columns
        d1 = str(row['diag_1'])
        d2 = str(row['diag_2'])
        d3 = str(row['diag_3'])
        
        # Handle missing values (NaN)
        if d1 == 'nan':
            diag_1.append(np.nan)
        else:
            # Process ICD9 codes that start with 'E'
            if d1[0] == 'E':
                d1 = d1[:4]
            # Truncate ICD9 codes to the first 3 characters for grouping
            elif len(d1) > 3:
                d1 = d1[:3]
            # Ensure a consistent format for ICD9 codes
            v1 = f'{int(d1):03d}' if d1.isnumeric() else d1
            # Use a function 'icdsearch' (not shown in this code) to obtain a parent node for the ICD9 code
            node = icdsearch(v1)
            if not node:
                print(v1, idx)
                break
            diag_1.append(str(node.parent))
        
        # Repeat the same process for 'diag_2' and 'diag_3'
        d2 = str(row['diag_2'])
        if d2 == 'nan':
            diag_2.append(np.nan)
        else:
            if d2[0=='E']:
                d2 = d2[:4]
            elif len(d2)>3:
                d2 = d2[:3]
            v2 = f'{int(d2):03d}' if d2.isnumeric() else d2
            node = icdsearch(v2)
            if not node:
                print(v2, idx)
                break
            diag_2.append(str(node.parent))
        d3 = str(row['diag_3'])
        if d3 == 'nan':
            diag_3.append(np.nan)
        else:
            if d3[0=='E']:
                d3 = d3[:4]
            elif len(d3)>3:
                d3 = d3[:3]
            v3 = f'{int(d3):03d}' if d3.isnumeric() else d3
            node = icdsearch(v3)
            if not node:
                print(v3, idx)
                break
            diag_3.append(str(node.parent))
        
    # Update the DataFrame with the transformed 'diag' columns
    df['diag_1'] = diag_1
    df['diag_2'] = diag_2
    df['diag_3'] = diag_3
    
    return df

In [5]:
data[['diag_1', 'diag_2', 'diag_3']].nunique()

diag_1    684
diag_2    691
diag_3    745
dtype: int64

In [6]:
data = diag(data)
data[['diag_1', 'diag_2', 'diag_3']].nunique()

diag_1    114
diag_2    128
diag_3    133
dtype: int64

In [7]:
data[['diag_1', 'diag_2', 'diag_3']].head()

Unnamed: 0,diag_1,diag_2,diag_3
0,410-414:Ischemic Heart Disease:None,420-429:Other Forms Of Heart Disease:None,410-414:Ischemic Heart Disease:None
1,410-414:Ischemic Heart Disease:None,420-429:Other Forms Of Heart Disease:None,420-429:Other Forms Of Heart Disease:None
2,510-519:Other Diseases Of Respiratory System:None,510-519:Other Diseases Of Respiratory System:None,996-999:Complications Of Surgical And Medical ...
3,590-599:Other Diseases Of Urinary System:None,590-599:Other Diseases Of Urinary System:None,249-259:Diseases Of Other Endocrine Glands:None
4,249-259:Diseases Of Other Endocrine Glands:None,710-719:Arthropathies And Related Disorders:None,700-709:Other Diseases Of Skin And Subcutaneou...


This reduces the number of unique values close to 120. 

We also empirically observed that this improves the performace by about 2%

### Drugs columns

There are 23 categorical columns mentioning whether that particular drug has been increased, decreased, not changed or not prescribed.

We consolidate the changes in drug dosages for a particular encounter in terms of number of drugs increased, decreased or not changed in dosage.

In [8]:
def drugs(data, tt):
    # Define a nested function 'drug_changes' to process drug columns
    def drug_changes(row):
        d = {"drug_up": 0, "drug_down": 0, "drug_steady": 0}
        for drug in row:
            if drug == 'Up':
                d['drug_up'] += 1
            elif drug == "Down":
                d['drug_down'] += 1
            elif drug == "Steady":
                d['drug_steady'] += 1
        return pd.Series(d)
    
    # Define a list of drug columns to be processed
    drugs = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone']
    
    # Extract only the relevant drug columns from the data
    drugs_data = data[drugs]
    
    # Apply the 'drug_changes' function to create a DataFrame with drug change information
    drug_change_df = drugs_data.apply(drug_changes, 1)
    
    # Remove the processed drug columns from the data
    data.drop(columns=drugs, inplace=True)
    
    # Handle data for training and testing separately
    if tt == 'train':
        # Extract the target variable 'readmission_id' for training data
        y = data['readmission_id']
        data.drop(columns=['readmission_id'], inplace=True)
        
        # Join the processed drug change information with the training data
        data = data.join(drug_change_df)
        data = data.join(y)
    elif tt == 'test':
        # Join the processed drug change information with the test data
        data = data.join(drug_change_df)
    
    return data


In [9]:
data.shape

(71236, 50)

In [10]:
data.head()

Unnamed: 0,enc_id,patient_id,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmission_id
0,88346340,2488608,Caucasian,Male,[60-70),,1,2,6,3,...,No,Steady,No,No,No,No,No,Ch,Yes,2
1,92001408,52133202,Caucasian,Male,[70-80),[100-125),2,6,1,7,...,No,No,No,No,No,No,No,No,Yes,1
2,169424316,40945509,Caucasian,Female,[70-80),,3,2,1,7,...,No,Up,No,No,No,No,No,Ch,Yes,1
3,272987082,38850777,Caucasian,Female,[50-60),,1,1,7,1,...,No,No,No,No,No,No,No,No,Yes,2
4,150600612,72738225,Caucasian,Female,[80-90),,1,6,7,6,...,No,Down,No,No,No,No,No,Ch,Yes,2


In [11]:
data = drugs(data, 'train')

data.shape

(71236, 30)

In [12]:
data.head()

Unnamed: 0,enc_id,patient_id,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,diag_3,number_diagnoses,max_glu_serum,A1Cresult,change,diabetesMed,drug_up,drug_down,drug_steady,readmission_id
0,88346340,2488608,Caucasian,Male,[60-70),,1,2,6,3,...,410-414:Ischemic Heart Disease:None,5,,,Ch,Yes,0,0,2,2
1,92001408,52133202,Caucasian,Male,[70-80),[100-125),2,6,1,7,...,420-429:Other Forms Of Heart Disease:None,9,,,No,Yes,0,0,1,1
2,169424316,40945509,Caucasian,Female,[70-80),,3,2,1,7,...,996-999:Complications Of Surgical And Medical ...,9,,,Ch,Yes,1,0,0,1
3,272987082,38850777,Caucasian,Female,[50-60),,1,1,7,1,...,249-259:Diseases Of Other Endocrine Glands:None,8,,,No,Yes,0,0,1,2
4,150600612,72738225,Caucasian,Female,[80-90),,1,6,7,6,...,700-709:Other Diseases Of Skin And Subcutaneou...,9,,,Ch,Yes,0,1,0,2


### Encoding

We encode categorical variables using `OrdinalEncoder` from sklearn.

In [13]:
def encode_cat(data, tt, cat_cols, enc):
    if tt == 'train':
        # Encode categorical variables for the training data
        encoded_data = pd.DataFrame(enc.fit_transform(data[cat_cols]), columns=cat_cols)
        for col in cat_cols:
            data[col] = encoded_data[col]
    elif tt == 'test':
        # Encode categorical variables for the test data using the same encoder
        encoded_data = pd.DataFrame(enc.transform(data[cat_cols]), columns=cat_cols)
        for col in cat_cols:
            data[col] = encoded_data[col]
    return data

In [14]:
enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)
data = encode_cat(data, 'train', ENC_CAT_COLS_NO_DRUGS, enc)

data.head()

Unnamed: 0,enc_id,patient_id,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,diag_3,number_diagnoses,max_glu_serum,A1Cresult,change,diabetesMed,drug_up,drug_down,drug_steady,readmission_id
0,88346340,2488608,2.0,1.0,6.0,,1,2,6,3,...,44.0,5,,,0.0,1.0,0,0,2,2
1,92001408,52133202,2.0,1.0,7.0,[100-125),2,6,1,7,...,46.0,9,,,1.0,1.0,0,0,1,1
2,169424316,40945509,2.0,0.0,7.0,,3,2,1,7,...,106.0,9,,,0.0,1.0,1,0,0,1
3,272987082,38850777,2.0,0.0,5.0,,1,1,7,1,...,26.0,8,,,1.0,1.0,0,0,1,2
4,150600612,72738225,2.0,0.0,8.0,,1,6,7,6,...,75.0,9,,,0.0,1.0,0,1,0,2


### Creating a new feature: `pat_cnt`

We create a new feature `pat_cnt` (patient count) for looking at how many times a patient has been admitted/re-admitted using duplicate values of `patient_id`. 

This turned out to be the single most useful feature in the dataset increasing the score by **15%**

In [15]:
def compute_pat_cnt(data, test_data, tt, scaler):
    # Calculate the count of admissions/re-admissions for each patient
    vc = pd.concat([data['patient_id'], test_data['patient_id']], axis=0).value_counts()
    pat_cnt = []
    for idx, row in data.iterrows():
        pat_cnt.append(vc[row['patient_id']])
    
    if tt == 'train':
        # Insert the 'pat_cnt' feature before the last column in the training data
        data.insert(data.shape[1] - 1, 'pat_cnt', pat_cnt)
        # Standardize the 'pat_cnt' feature using scaler
        data['pat_cnt'] = scaler.fit_transform(data['pat_cnt'].to_numpy().reshape(-1, 1))
    elif tt == 'test':
        # Insert the 'pat_cnt' feature at the end of the test data
        data.insert(data.shape[1], 'pat_cnt', pat_cnt)
        data['pat_cnt'] = scaler.transform(data['pat_cnt'].to_numpy().reshape(-1, 1))
    
    return data

In [16]:
ss1 = StandardScaler()
data = compute_pat_cnt(data, test_data, 'train', ss1)

data[['pat_cnt']].head()

Unnamed: 0,pat_cnt
0,-0.514042
1,-0.105474
2,2.345931
3,-0.514042
4,-0.514042


### Dealing with null values

Checking for null values

In [17]:
(data.isna().sum()/len(data)*100).sort_values(ascending=False)

weight                      96.841485
max_glu_serum               94.776517
A1Cresult                   83.323039
medical_specialty           49.034196
payer_code                  39.555843
race                         2.275535
diag_3                       1.388343
diag_2                       0.342523
diag_1                       0.021057
number_diagnoses             0.000000
enc_id                       0.000000
change                       0.000000
drug_up                      0.000000
drug_down                    0.000000
drug_steady                  0.000000
pat_cnt                      0.000000
diabetesMed                  0.000000
number_outpatient            0.000000
number_inpatient             0.000000
number_emergency             0.000000
patient_id                   0.000000
num_medications              0.000000
num_procedures               0.000000
num_lab_procedures           0.000000
time_in_hospital             0.000000
admission_source_id          0.000000
discharge_di

We see that several columns have a significant number of null values

We employ the following strategies for dealing with null values based on experimentation:
1. Dropping columns: `weight`, `max_glu_serum`, `A1Cresult`
2. Imputing constant value: `medical_specialty`, `payer_code`
3. Imputing the mode: `race`, `diag_*`

In [18]:
def removing_null(data, tt, imputer):
    # Drop columns with significant null values
    data.drop(columns=['weight', 'max_glu_serum', 'A1Cresult'], inplace=True)
    
    # Impute constant values for 'medical_specialty' and 'payer_code'
    data['medical_specialty'] = data['medical_specialty'].fillna(68)
    data['payer_code'] = data['payer_code'].fillna(17)
    
    if tt == "train":
        # Extract the target variable 'readmission_id' for training data
        y = data['readmission_id']
        
        # Remove the target variable column
        data = data.iloc[:, :data.shape[1] - 1]
        
        # Impute missing values for the remaining features using the provided imputer
        imputed_data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)
        
        # Rejoin the target variable for training data
        imputed_data = imputed_data.join(y)
    elif tt == "test":
        # Impute missing values for the test data using the same imputer
        imputed_data = pd.DataFrame(imputer.transform(data), columns=data.columns)
    
    return imputed_data

In [19]:
imputer = SimpleImputer(strategy='most_frequent')
data = removing_null(data, 'train', imputer)

(data.isna().sum()/len(data)*100).sort_values(ascending=False)

enc_id                      0.0
patient_id                  0.0
pat_cnt                     0.0
drug_steady                 0.0
drug_down                   0.0
drug_up                     0.0
diabetesMed                 0.0
change                      0.0
number_diagnoses            0.0
diag_3                      0.0
diag_2                      0.0
diag_1                      0.0
number_inpatient            0.0
number_emergency            0.0
number_outpatient           0.0
num_medications             0.0
num_procedures              0.0
num_lab_procedures          0.0
medical_specialty           0.0
payer_code                  0.0
time_in_hospital            0.0
admission_source_id         0.0
discharge_disposition_id    0.0
admission_type_id           0.0
age                         0.0
gender                      0.0
race                        0.0
readmission_id              0.0
dtype: float64

### The pecularity with `id`'s

We observed how `pat_cnt` feature drastically improved our results, we wondered if there was some implicit pattern with the id's which might help us improve our results.

This made us realize that normalizing the value of `enc_id` gives us a significant boost to the score

`patient_id` performed better without normalizing however.

In [20]:
def deal_with_ids(data, tt, scaler):
    if tt == 'train':
        # Normalize 'enc_id' for the training data using the provided scaler
        data['enc_id'] = scaler.fit_transform(data['enc_id'].to_numpy().reshape(-1, 1))
    elif tt == 'test':
        # Normalize 'enc_id' for the test data using the same scaler
        data['enc_id'] = scaler.transform(data['enc_id'].to_numpy().reshape(-1, 1))
    
    return data

## Helper functions

We defined a function to get training data as X & y numpy arrays

In [21]:
def get_X_y(data):
    # Extract features (X) and the target variable (y) from the given DataFrame
    X, y = data.iloc[:, :data.shape[1] - 1].to_numpy(), data.iloc[:, data.shape[1] - 1].to_numpy()
    return X, y


We also created a function to cross validate a model

In [22]:
def cv(model, X, y, params=None):
    # Perform cross-validation of the given model
    cv_results = cross_validate(model, X, y, scoring=['accuracy', 'f1_macro'], return_estimator=True, fit_params=params)
    return cv_results

And finally a function to generate a submission file in the required format

In [23]:
def gen_submission(data, model, enc_ids, fname, xg=False, np=True):
    # Make predictions using the given model
    if not xg:
        if np:
            x = data
        else:
            x = data.to_numpy()
        preds = model.predict(x)
    else:
        x = xgb.DMatrix(data)
        preds = model.predict(x)
    
    if np:
        # Create a DataFrame for submission
        d = {'enc_id': enc_ids, 'readmission_id': preds}
        submission = pd.DataFrame(d)
    else:
        # Update the 'readmission_id' column with predictions
        data['readmission_id'] = preds
        data['enc_id'] = enc_ids
        submission = data[['enc_id', 'readmission_id']]
    
    # Ensure data types of 'enc_id' and 'readmission_id'
    submission.loc[:, 'enc_id'] = submission['enc_id'].astype(int)
    submission.loc[:, 'readmission_id'] = submission['readmission_id'].astype(float)
    
    # Save the submission DataFrame to a CSV file with the provided filename
    submission.to_csv(fname, index=False)


We encapsulate all the above functons in a couple of functions

In [24]:
def load_data(data_dir):
    # Load the training and test data from the specified directory
    data = pd.read_csv(data_dir + '/train.csv')
    test_data = pd.read_csv(data_dir + '/test.csv')
    return data, test_data

In [25]:
def preprocessing_and_fe(data, test_data):
    # Extract 'enc_id' from test data for submission
    enc_ids = test_data['enc_id']
    
    # Process the 'diag' columns for both training and test data
    data = diag(data)
    test_data = diag(test_data)
    
    # Process the 'drugs' columns for training and test data
    data = drugs(data, 'train')
    test_data = drugs(test_data, 'test')
    
    # Encode categorical variables using OrdinalEncoder for training and test data
    enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)
    data = encode_cat(data, 'train', ENC_CAT_COLS_NO_DRUGS, enc)
    test_data = encode_cat(test_data, 'test', ENC_CAT_COLS_NO_DRUGS, enc)
    
    # Standardize the 'pat_cnt' feature for training and test data
    ss1 = StandardScaler()
    data = compute_pat_cnt(data, test_data, 'train', ss1)
    test_data = compute_pat_cnt(test_data, data, 'test', ss1)
    
    # Impute missing values and handle null values for training and test data
    imputer = SimpleImputer(strategy='most_frequent')
    data = removing_null(data, 'train', imputer)
    test_data = removing_null(test_data, 'test', imputer)
    
    # Standardize the 'enc_id' feature for training and test data
    ss2 = StandardScaler()
    data = deal_with_ids(data, 'train', ss2)
    test_data = deal_with_ids(test_data, 'test', ss2)

    # Create a list of categorical features based on the presence of 'diag' columns
    cat_feat = list(map(lambda x: x in CAT_COLS_WITH_DIAG, data.columns.tolist()[:-1]))
    
    # Extract X (input features) and y (target variable) for training data
    X, y = get_X_y(data)
    
    # Convert test data to a NumPy array
    x = test_data.to_numpy()
    
    return X, y, x, enc_ids, cat_feat

## Model used

We use the `HistGradientBoostClassifier` classifier from sklearn which is a variant of Gradient Boosting inspired by LightGBM.

In [26]:
data, test_data = load_data('data')

# Load and preprocess the data
X, y, x, enc_ids, cat_feat = preprocessing_and_fe(data, test_data)

# Initialize the HistGradientBoostingClassifier with categorical features
hgbc = HistGradientBoostingClassifier(categorical_features=cat_feat)

# Perform cross-validation to find the best f1 score
cv_res = cv(hgbc, X, y)
best_f1 = max(cv_res['test_f1_macro'])
print(f'Best f1: {best_f1}')

# Get the model with the best f1 score
model = cv_res['estimator'][cv_res['test_f1_macro'].argmax()]

# Generate the final submission
gen_submission(x, model, enc_ids, 'final_submission.csv')


Best f1: 0.5522265857269146


# Hyperparameter tuning

For tuning hyper parameters, we first do random search on several possible combinations

In [27]:
model = HistGradientBoostingClassifier(categorical_features=cat_feat)

# Define a dictionary of hyperparameter options to search
param_distributions = {
    'max_iter': [100, 1000, 3000],
    'max_leaf_nodes': [10, 50, 100],
    'min_samples_leaf': [20, 50, 10],
    'l2_regularization': [0, 1, 1.5],
    'scoring': ['f1_macro'],
}

# Create a RandomizedSearchCV object to search for the best hyperparameters
model_random_search = RandomizedSearchCV(
    model, param_distributions=param_distributions, n_jobs=-1, cv=3, scoring='f1_macro', n_iter=30
)

# Fit the RandomizedSearchCV object to the training data
model_random_search.fit(X, y)

In [28]:
# Retrieve the cross-validation results and sort them by mean test score
cv_results = pd.DataFrame(model_random_search.cv_results_).sort_values(
    "mean_test_score", ascending=False
)
cv_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_scoring,param_min_samples_leaf,param_max_leaf_nodes,param_max_iter,param_l2_regularization,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
9,32.979763,9.888138,1.213187,0.272197,f1_macro,50,100,3000,1.5,"{'scoring': 'f1_macro', 'min_samples_leaf': 50...",0.554039,0.553747,0.554298,0.554028,0.000225,1
6,28.103802,6.140377,1.103143,0.185629,f1_macro,10,100,1000,1.0,"{'scoring': 'f1_macro', 'min_samples_leaf': 10...",0.553418,0.55428,0.554095,0.553931,0.000371,2
22,36.374589,9.653315,1.005154,0.114661,f1_macro,50,50,1000,0.0,"{'scoring': 'f1_macro', 'min_samples_leaf': 50...",0.556574,0.551948,0.553028,0.55385,0.001976,3
18,31.820839,10.879679,1.174793,0.260457,f1_macro,50,100,100,1.5,"{'scoring': 'f1_macro', 'min_samples_leaf': 50...",0.556023,0.554635,0.550315,0.553658,0.002431,4
17,26.538475,4.989785,1.097434,0.243954,f1_macro,50,50,3000,1.0,"{'scoring': 'f1_macro', 'min_samples_leaf': 50...",0.557354,0.547305,0.555357,0.553339,0.004344,5


We perform a grid search on the resulting hyper parameters

In [30]:
param_grid = {
    'max_iter': [1000, 3000],
    'max_leaf_nodes': [100, 150],
    'min_samples_leaf': [50, 10],
    'l2_regularization': [1, 1.5],
    'scoring': ['f1_macro'],
}

# Create a GridSearchCV object to search for the best hyperparameters
model_grid_search = GridSearchCV(
    model, param_grid=param_grid, n_jobs=-1, cv=3, scoring='f1_macro'
)

# Fit the GridSearchCV object to the training data
model_grid_search.fit(X, y)

In [33]:
# Retrieve the cross-validation results and sort them by mean test score
cv_results_gs = pd.DataFrame(model_grid_search.cv_results_).sort_values(
    "mean_test_score", ascending=False
)
cv_results_gs.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_l2_regularization,param_max_iter,param_max_leaf_nodes,param_min_samples_leaf,param_scoring,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
14,33.150123,1.324454,0.763672,0.025422,1.5,3000,150,50,f1_macro,"{'l2_regularization': 1.5, 'max_iter': 3000, '...",0.556349,0.552617,0.559853,0.556273,0.002955,1
0,32.460158,7.595268,1.16539,0.240434,1.0,1000,100,50,f1_macro,"{'l2_regularization': 1, 'max_iter': 1000, 'ma...",0.557672,0.553547,0.555623,0.555614,0.001684,2
5,30.88289,6.033982,1.161958,0.174209,1.0,3000,100,10,f1_macro,"{'l2_regularization': 1, 'max_iter': 3000, 'ma...",0.5595,0.551789,0.554734,0.555341,0.003177,3
10,38.837281,4.321131,1.464876,0.275889,1.5,1000,150,50,f1_macro,"{'l2_regularization': 1.5, 'max_iter': 1000, '...",0.55679,0.551971,0.555961,0.554907,0.002103,4
6,37.176221,9.488804,1.541032,0.202772,1.0,3000,150,50,f1_macro,"{'l2_regularization': 1, 'max_iter': 3000, 'ma...",0.558064,0.550115,0.555947,0.554709,0.003361,5


In [34]:
# Get the best model with the optimized hyperparameters
best_model = model_grid_search.best_estimator_

# Generate a submission using the best model
gen_submission(x, best_model, enc_ids, 'final_hyp_submission.csv')