# TODO:
* Implement stacking (ensembling technique)
* Use xgboost, catboost, lightgbm and keras and logistic regression as base models
* Use linear regression with regularization (fine tune the regularization parameter with optuna) and tackle the task as regression with target values either being 0 or 1 and feature values of meta data being the probabilities predicted by our base models

# REMOVE ALL THE NEW ADDED FEATURES
## The keras tuned model achieved a score of 87.7 with no feature engineering

In [154]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path
import xgboost as xgb
import lightgbm as lgbm
import catboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.linear_model import LogisticRegression
from IPython.display import display
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import optuna
from optuna.samplers import TPESampler
from optuna.integration import XGBoostPruningCallback
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [119]:
from warnings import filterwarnings
filterwarnings('ignore')

# Loading Data

In [120]:
BASE_DIR = Path("/kaggle/input/playground-series-s3e2/")

train = pd.read_csv(BASE_DIR / "train.csv").drop(columns="id")
test = pd.read_csv(BASE_DIR / "test.csv").drop(columns="id")

# Preprocessing

In [121]:
df = pd.concat([train.drop(columns=["stroke"]), test], axis=0).reset_index(drop=True)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,Male,28.0,0,0,Yes,Private,Urban,79.53,31.1,never smoked
1,Male,33.0,0,0,Yes,Private,Rural,78.44,23.9,formerly smoked
2,Female,42.0,0,0,Yes,Private,Rural,103.0,40.3,Unknown
3,Male,56.0,0,0,Yes,Private,Urban,64.87,28.8,never smoked
4,Female,24.0,0,0,No,Private,Rural,73.36,28.8,never smoked


# Encoding - One hot

In [122]:
df = pd.get_dummies(df)
df.head(3)

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,28.0,0,0,79.53,31.1,0,1,0,0,1,...,0,1,0,0,0,1,0,0,1,0
1,33.0,0,0,78.44,23.9,0,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0
2,42.0,0,0,103.0,40.3,1,0,0,0,1,...,0,1,0,0,1,0,1,0,0,0


# Preprocessing v2

In [123]:
X = df.iloc[:-len(test), :]
test_new = df.iloc[-len(test):, :]
y = train.stroke

In [171]:
len(X) == len(train)

True

In [172]:
len(X), len(train)

(15304, 15304)

# Before we ensebmle, let's first finetune individual models and to get the optimized parameters for them

## Finetuning Logsitic Regression

In [125]:
# def objective(trial, X, y):
#     params = {
#         "solver": trial.suggest_categorical("solver", ["liblinear", "newton-cg", "saga", "lbfgs"]),
#         "C": trial.suggest_loguniform("C", 0.01, 100.0),
#         "tol": trial.suggest_loguniform("tol", 1e-6, 1e-2),
#         "max_iter": trial.suggest_int("max_iter", 100, 1000),
#     }
        
#     if params["solver"] in ["newton-cg", "lbfgs"]:
#         params["penalty"] = "l2"
        
#     elif params["solver"] == "liblinear":
#         params["penalty"] =  trial.suggest_categorical("penalty_liblinear", ["l1","l2"])
    
#     elif params["solver"] == "saga":
#         params["penalty"] =  trial.suggest_categorical("penalty_saga", ["l1","l2","elasticnet"])
        
#         # saga also needs anothe parameter l1_ratio which ranges in 0-1
#         params["l1_ratio"] = trial.suggest_loguniform("l1_ratio", 0.00001, 1)
        
#         # saga also likes it's features to be scaled to converge so let's scale our data
#         sc = StandardScaler()
#         X = sc.fit_transform(X)
        
#     # lets cross validate
#     cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=1337)
    
#     cv_scores = []
#     for fold, (train_idx, test_idx) in enumerate(cv.split(X, y)):
#         # since we scale our dataset also, that converts it into a numpy ndarray
#         # and hence it won't have iloc func that pandas dataframe has.
#         # so let's check for the datatype of X first and then slice
#         if type(X) == pd.core.frame.DataFrame:
#             X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
#             y_train, y_test = y[train_idx], y[test_idx]
#         else:
#             X_train, X_test = X[train_idx], X[test_idx]
#             y_train, y_test = y[train_idx], y[test_idx]
            
        
#         model = LogisticRegression(**params)       
#         model.fit(X_train, y_train)
        
#         y_pred = model.predict_proba(X_test)[:, 1]
#         cv_scores.append(roc_auc_score(y_test, y_pred))
    
#     auc = np.mean(cv_scores)
#     return auc

In [126]:
# lr_study = optuna.create_study(direction="maximize", study_name="linear regression tuning")
# func = lambda trial: objective(trial, X, y)
# lr_study.optimize(func, n_trials=100, show_progress_bar=True)

In [127]:
# lr_study.best_value

In [128]:
# lr_study.best_params

## Finetuning XGBoost

In [129]:
# # we already have fintuned params for xgboost as following
# xgb_params = {'n_estimators': 272,
#                  'max_depth': 4,
#                  'learning_rate': 0.07360332417334109,
#                  'min_child_weight': 1,
#                  'gamma': 0.05180472121817407,
#                  'subsample': 0.41981696676590474,
#                  'colsample_bytree': 0.730818203141452,
#                  'reg_alpha': 0.0005268315833160329,
#                  'reg_lambda': 0.0007997627184403383}

## Finetuning LightGBM

In [130]:
# from optuna.integration import LightGBMPruningCallback

# def objective(trial, X, y):
#     param_grid = {
# #         "device_type": trial.suggest_categorical("device_type", ['gpu']),
#         "is_unbalance": True,
#         "n_estimators": trial.suggest_int("n_estimators", 100, 2000),
#         "num_rounds": trial.suggest_int("num_rounds", 100, 500),
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
#         "num_leaves": trial.suggest_int("num_leaves", 20, 300),
#         "max_depth": trial.suggest_int("max_depth", 2, 12),
#         "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 1000),
#         "lambda_l1": trial.suggest_loguniform('lambda_l1', 0.00001, 1.0),
#         "lambda_l2": trial.suggest_loguniform('lambda_l2', 0.00001, 1.0),
#         "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
#         "bagging_fraction":  trial.suggest_loguniform('bagging_fraction', 0.2, 1.0),
#         "feature_fraction": trial.suggest_loguniform('feature_fraction', 0.2, 1.0),
#     }

#     cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=1337)

#     cv_scores = np.empty(8)
#     for fold, (train_idx, test_idx) in enumerate(cv.split(X, y)):
#         X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
#         y_train, y_test = y[train_idx], y[test_idx]

#         model = lgbm.LGBMClassifier(objective="binary", **param_grid)
#         model.fit(
#             X_train,
#             y_train,
#             eval_set=[(X_test, y_test)],
#             eval_metric="auc",
#             early_stopping_rounds=100,
#             verbose=0,
#             callbacks=[
#                 LightGBMPruningCallback(trial, "auc")
#             ],  # Add a pruning callback
#         )
#         y_preds = model.predict_proba(X_test)[:, 1]
#         cv_scores[fold] = roc_auc_score(y_test, y_preds)
    
#     auc = np.mean(cv_scores)
#     print(f"AVG CV AUC: \t {auc}")

#     return auc

In [131]:
# study = optuna.create_study(direction="maximize", study_name="LGBM Tuning")
# func = lambda trial: objective(trial, X, y)
# study.optimize(func, n_trials=100, show_progress_bar=True)

In [132]:
# study.best_value

In [133]:
# study.best_params

In [134]:
# lgbm_params = study.best_params

NameError: name 'study' is not defined

# CatBoost

In [135]:
# def objective(trial, X, y):
#     param = {
#         "loss_function": trial.suggest_categorical("loss_function", ["CrossEntropy"]),
#         "learning_rate": trial.suggest_loguniform("learning_rate", 1e-5, 1e0),
#         "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1e-2, 1e0),
#         "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
#         "depth": trial.suggest_int("depth", 1, 10),
#         "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
#         "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
#         "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 20),
#         "one_hot_max_size": trial.suggest_int("one_hot_max_size", 2, 20),  
#     }
#     # Conditional Hyper-Parameters
#     if param["bootstrap_type"] == "Bayesian":
#         param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
#     elif param["bootstrap_type"] == "Bernoulli":
#         param["subsample"] = trial.suggest_float("subsample", 0.1, 1)
    
#     cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=1337)

#     cv_scores = np.empty(8)
#     for fold, (train_idx, test_idx) in enumerate(cv.split(X, y)):
#         X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
#         y_train, y_test = y[train_idx], y[test_idx]

#         cat_model = catboost.CatBoostClassifier(**param)
#         cat_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=0, early_stopping_rounds=100)
        
#         y_preds = cat_model.predict_proba(X_test)[:, 1]
#         cv_scores[fold] = roc_auc_score(y_test, y_preds)
    
#     auc = np.mean(cv_scores)
#     print(f"AVG CV AUC: \t {auc}")
#     return auc

In [136]:
# cat_study = optuna.create_study(direction="maximize", study_name="catboost tuning")
# func = lambda trial: objective(trial, X, y)
# cat_study.optimize(func, n_trials=100, show_progress_bar=True)

In [137]:
# cat_study.best_value

In [138]:
# cat_study.best_params

### Before we implement stacking, let's reserve a validation set for validation/testing the meta leaners's predictions before we make the submission because the test set that comes with compeition has no y.

In [139]:
# X_train, X_valtest, y_train, y_valtest = train_test_split(X, y, shuffle=True, test_size=0.2, stratify=y)

# will do it later, maybe some day

In [140]:
# sanity check
# len(X_train) == len(y_train), len(X_train)

(True, 12243)

# Stacking (finally!)

### Let's set best parametes for each model here

In [141]:
# Tuned XGBoost Params
xgb_params = {'n_estimators': 272,
                 'max_depth': 4,
                 'learning_rate': 0.07360332417334109,
                 'min_child_weight': 1,
                 'gamma': 0.05180472121817407,
                 'subsample': 0.41981696676590474,
                 'colsample_bytree': 0.730818203141452,
                 'reg_alpha': 0.0005268315833160329,
                 'reg_lambda': 0.0007997627184403383}


# Tuned LightGBM Params
lgbm_params = {'n_estimators': 1942,
                 'num_rounds': 477,
                 'learning_rate': 0.2870761124159734,
                 'num_leaves': 37,
                 'max_depth': 6,
                 'min_data_in_leaf': 368,
                 'lambda_l1': 0.00017451291663562305,
                 'lambda_l2': 0.5090553595978456,
                 'min_gain_to_split': 0.02547126174774228,
                 'bagging_fraction': 0.20531850278394478,
                 'feature_fraction': 0.37437811030015083}


# Tuned Logistic Regression params
lr_params = {'solver': 'saga',
                 'C': 0.07655860626453208,
                 'tol': 0.00011268918749381429,
                 'max_iter': 484,
                 'penalty': 'l1',
                 'l1_ratio': 0.012007897837476486}


# Tuned CatBoost params
cat_params = {'loss_function': 'CrossEntropy',
                 'learning_rate': 0.7611165319863433,
                 'l2_leaf_reg': 0.014250721124770624,
                 'colsample_bylevel': 0.061218122768953775,
                 'depth': 1,
                 'boosting_type': 'Plain',
                 'bootstrap_type': 'Bernoulli',
                 'min_data_in_leaf': 18,
                 'one_hot_max_size': 14,
                 'subsample': 0.9972814280760031}

In [142]:
base_learners = []

xgb_model = xgb.XGBClassifier(**xgb_params)
base_learners.append(xgb_model)

lgbm_model = lgbm.LGBMClassifier(objective="binary", is_unbalance=True, **lgbm_params)
base_learners.append(lgbm_model)

cat_model = catboost.CatBoostClassifier(**cat_params)
base_learners.append(cat_model)

lr_model = LogisticRegression(**lr_params)
base_learners.append(lr_model)

### Let's add keras to the mix

In [143]:
inputs = layers.Input(shape=(21,))
x = layers.Dense(512, activation="relu")(inputs)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.3)(x)

x = layers.Dense(448, activation="relu")(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.2)(x)

x = layers.Dense(128, activation="relu")(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.1)(x)

x = layers.Dense(32, activation="relu")(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.7)(x)

outputs = layers.Dense(1, activation="sigmoid")(x)

keras_model = keras.Model(inputs=inputs, outputs=outputs)

optim = keras.optimizers.Adam(learning_rate= 0.0035)

keras_model.compile(optimizer=optim,
                   loss=keras.losses.binary_crossentropy,
                   metrics=[keras.metrics.AUC()])

In [144]:
base_learners.append(keras_model)

In [145]:
# random forest just finished tuning, so lets add it as well

rf_params = {'n_estimators': 390,
                 'max_features': 14,
                 'min_samples_split': 19,
                 'min_samples_leaf': 2,
                 'max_depth': 5}

rf_model = RandomForestClassifier(**rf_params)

base_learners.append(rf_model)

In [82]:
str(keras_model.__class__).__contains__("keras")

True

In [84]:
len(base_learners)

6

In [175]:
# variables to store meta data and their targets that we'll later use for meta-learner model
meta_data = np.zeros((len(base_learners), len(X)))
meta_targets = np.zeros(len(X))

# creating cross_validation
kf = StratifiedKFold(n_splits=8, shuffle=True, random_state=1337)
meta_index = 0

for train_idx, test_idx in kf.split(X, y):
    # train each learner on k-1 folds
    # and create meta data for Kth fold
    for i in range(len(base_learners)):
        learner = base_learners[i]
        if str(learner.__class__).__contains__("keras"):
            learner.fit(X.iloc[train_idx], y[train_idx], epochs=48)
            predictions = learner.predict(X.iloc[test_idx])[:, 0]
        else:
            learner.fit(X.iloc[train_idx], y[train_idx])
            predictions = learner.predict_proba(X.iloc[test_idx])[:, 1]
        meta_data[i][meta_index:meta_index+len(test_idx)] = predictions
    
    meta_targets[meta_index:meta_index+len(test_idx)] = y[test_idx]
    meta_index += len(test_idx)

Epoch 46/48
Epoch 47/48
Epoch 48/48


In [176]:
# transpose the meta data to give it features/target format
meta_data = meta_data.transpose()

In [177]:
meta_data.shape

(15304, 6)

In [178]:
# Creating meta data for test set
test_meta_data = np.zeros((len(base_learners), len(test_new)))

for i in range(len(base_learners)):
    learner = base_learners[i]
    if str(learner.__class__).__contains__("keras"):
        learner.fit(X, y, epochs=48)
        predictions = learner.predict(test_new)[:, 0]
    else:
        learner.fit(X, y)
        predictions = learner.predict_proba(test_new)[:, 1]
    
    test_meta_data[i] = predictions

0:	learn: 0.1824383	total: 2.66ms	remaining: 2.65s
1:	learn: 0.1693605	total: 6.02ms	remaining: 3s
2:	learn: 0.1686928	total: 8.15ms	remaining: 2.71s
3:	learn: 0.1686616	total: 9.79ms	remaining: 2.44s
4:	learn: 0.1434894	total: 12.1ms	remaining: 2.41s
5:	learn: 0.1433230	total: 14ms	remaining: 2.32s
6:	learn: 0.1433123	total: 15.8ms	remaining: 2.25s
7:	learn: 0.1433114	total: 17.6ms	remaining: 2.18s
8:	learn: 0.1433104	total: 19ms	remaining: 2.09s
9:	learn: 0.1433101	total: 20.4ms	remaining: 2.02s
10:	learn: 0.1403072	total: 22.6ms	remaining: 2.03s
11:	learn: 0.1399112	total: 24.7ms	remaining: 2.04s
12:	learn: 0.1399105	total: 26.5ms	remaining: 2.01s
13:	learn: 0.1399107	total: 28.1ms	remaining: 1.98s
14:	learn: 0.1384813	total: 30.2ms	remaining: 1.99s
15:	learn: 0.1375335	total: 32.3ms	remaining: 1.99s
16:	learn: 0.1372192	total: 34.3ms	remaining: 1.98s
17:	learn: 0.1372165	total: 36.3ms	remaining: 1.98s
18:	learn: 0.1371366	total: 38.3ms	remaining: 1.98s
19:	learn: 0.1371227	total: 4

In [179]:
test_meta_data = test_meta_data.transpose()

### Let's define a meta learner, for now let's choose linear regression with regularization and tune it using optuna

In [180]:
def cross_validate(X, y, model):
    kf = KFold(n_splits=8, shuffle=True, random_state=1337)
    
    cv_scores = []
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
                
        # training
        model.fit(X_train, y_train)

        # predicting
        y_pred = model.predict(X_val)
        
        mse = mean_squared_error(y_val, y_pred)
        
        print(f"Fold: {fold} \t mse: {mse}")
        
        cv_scores.append(mse)
    
    avg_mse = np.mean(cv_scores)
    print(f"Avg MSE: {avg_mse}")

In [182]:
from sklearn.linear_model import Ridge

In [166]:
sum(meta_targets)

474.0

In [167]:
474/12242

0.03871916353537004

In [168]:
sum(y)

632

In [170]:
train.stroke.value_counts()

0    14672
1      632
Name: stroke, dtype: int64

In [181]:
meta_df = pd.DataFrame(data=meta_data)
meta_df["target"] = meta_targets

meta_df

Unnamed: 0,0,1,2,3,4,5,target
0,0.009830,0.038934,0.006878,0.040524,0.001205,0.006718,0.0
1,0.003533,0.089134,0.002841,0.035697,0.005786,0.007981,0.0
2,0.003109,0.001710,0.004122,0.016452,0.003155,0.005529,0.0
3,0.001523,0.000294,0.000885,0.011100,0.000066,0.001851,0.0
4,0.011603,0.008849,0.024423,0.132350,0.087227,0.039727,0.0
...,...,...,...,...,...,...,...
15299,0.081750,0.626202,0.145936,0.100511,0.160455,0.202467,0.0
15300,0.033513,0.026310,0.091665,0.178568,0.092455,0.147739,1.0
15301,0.003765,0.003983,0.009716,0.038686,0.012770,0.006836,0.0
15302,0.026254,0.102597,0.027798,0.113211,0.051185,0.038007,0.0


In [92]:
# meta_data.shape
# meta_targets.

In [183]:
base_ridge = Ridge()

cross_validate(meta_data, meta_targets, base_ridge)

Fold: 0 	 mse: 0.03177328300433247
Fold: 1 	 mse: 0.03668730640409041
Fold: 2 	 mse: 0.029900593865189354
Fold: 3 	 mse: 0.02733016266403214
Fold: 4 	 mse: 0.03530602277423189
Fold: 5 	 mse: 0.03042343494577852
Fold: 6 	 mse: 0.03950934006536171
Fold: 7 	 mse: 0.03812394547370054
Avg MSE: 0.03363176114958963


In [158]:
# let's try XGBoost Regressor

In [184]:
xgb_meta_model = xgb.XGBRegressor()
cross_validate(meta_data, meta_targets, xgb_meta_model)

Fold: 0 	 mse: 0.03642209285456102
Fold: 1 	 mse: 0.04219839849561067
Fold: 2 	 mse: 0.036623613745933814
Fold: 3 	 mse: 0.03338562274782165
Fold: 4 	 mse: 0.040943988248746885
Fold: 5 	 mse: 0.034292240198994704
Fold: 6 	 mse: 0.04476905904034879
Fold: 7 	 mse: 0.04471126407155436
Avg MSE: 0.03916828492544648


In [185]:
xgb_meta_model.fit(meta_data, meta_targets)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [105]:
def objective_ridge(trial, X, y):
    params = {
    "solver": trial.suggest_categorical("solver", ["svd", "cholesky", "saga", "lbfgs"]),
    "alpha": trial.suggest_int("alpha", 0, 3000),
    "tol": trial.suggest_loguniform("tol", 1e-6, 1e-2)
}
        
    if params["solver"] == "saga":
        params["max_iter"] =  trial.suggest_int("max_iter_saga", 500, 4000)
                
        # saga also likes it's features to be scaled to converge so let's scale our data
        sc = StandardScaler()
        X = sc.fit_transform(X)
    
    elif params["solver"] == "lbfgs":
        params["positive"] = True

    else:
        params["max_iter"] = trial.suggest_int("max_iter", 1000, 30000)
        
    # lets cross validate
    cv = KFold(n_splits=8, shuffle=True, random_state=1337)
    
    cv_scores = []
    for fold, (train_idx, test_idx) in enumerate(cv.split(X)):
        # since we scale our dataset also, that converts it into a numpy ndarray
        # and hence it won't have iloc func that pandas dataframe has.
        # so let's check for the datatype of X first and then slice
        if type(X) == pd.core.frame.DataFrame:
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
        else:
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            
        
        model = Ridge(**params)       
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        cv_scores.append(mean_squared_error(y_test, y_pred))
    
    mse = np.mean(cv_scores)
    return mse

In [106]:
study_ridge = optuna.create_study(direction="minimize", study_name="ridge Tuning")
func = lambda trial: objective_ridge(trial, meta_data, meta_targets)
study_ridge.optimize(func, n_trials=50, show_progress_bar=True)

[32m[I 2023-01-15 19:05:25,835][0m A new study created in memory with name: ridge Tuning[0m


  0%|          | 0/50 [00:00<?, ?it/s]

[32m[I 2023-01-15 19:05:25,933][0m Trial 0 finished with value: 0.031798886895004305 and parameters: {'solver': 'svd', 'alpha': 118, 'tol': 2.004824258026079e-06, 'max_iter': 4519}. Best is trial 0 with value: 0.031798886895004305.[0m
[32m[I 2023-01-15 19:05:25,987][0m Trial 1 finished with value: 0.03488778600553169 and parameters: {'solver': 'lbfgs', 'alpha': 2488, 'tol': 0.001802658917103814}. Best is trial 0 with value: 0.031798886895004305.[0m
[32m[I 2023-01-15 19:05:26,354][0m Trial 2 finished with value: 0.03129855857845373 and parameters: {'solver': 'saga', 'alpha': 2689, 'tol': 1.439545894153173e-06, 'max_iter_saga': 1093}. Best is trial 2 with value: 0.03129855857845373.[0m
[32m[I 2023-01-15 19:05:26,397][0m Trial 3 finished with value: 0.034634538696331874 and parameters: {'solver': 'cholesky', 'alpha': 2097, 'tol': 0.00018413445969424818, 'max_iter': 21851}. Best is trial 2 with value: 0.03129855857845373.[0m
[32m[I 2023-01-15 19:05:26,451][0m Trial 4 finished

In [108]:
study_ridge.best_params

{'solver': 'saga',
 'alpha': 1184,
 'tol': 0.00964596906769185,
 'max_iter_saga': 500}

In [110]:
ridge_params = {'solver': 'saga',
                     'alpha': 1184,
                     'tol': 0.00964596906769185,
                     'max_iter': 500}

In [113]:
ridge_final = Ridge(**ridge_params)
ridge_final.fit(meta_data, meta_targets)

Ridge(alpha=1184, max_iter=500, solver='saga', tol=0.00964596906769185)

In [114]:
# y_final = ridge_final.predict(test_meta_data)

In [186]:
y_final = xgb_meta_model.predict(test_meta_data)

In [187]:
# since we dropped id column from our test dataframe earlier but we need it for submission file so we're reloading it
test_idx = pd.read_csv(BASE_DIR / "test.csv").id
test_idx

0        15304
1        15305
2        15306
3        15307
4        15308
         ...  
10199    25503
10200    25504
10201    25505
10202    25506
10203    25507
Name: id, Length: 10204, dtype: int64

In [188]:
submission = pd.DataFrame(data={"id": test_idx, "stroke": y_final})
submission.head()

Unnamed: 0,id,stroke
0,15304,0.012291
1,15305,0.142904
2,15306,0.001131
3,15307,-0.026331
4,15308,0.003519


In [116]:
# submission = pd.DataFrame(data={"id": test_idx, "stroke": y_final})
# submission.head()

Unnamed: 0,id,stroke
0,15304,0.039318
1,15305,0.096278
2,15306,0.020405
3,15307,0.066994
4,15308,0.022093


In [189]:
submission.to_csv("submission.csv", index=False)