# Hyperparameter tuning

## Train Val Test split

Role of Each Split
1. **Training Set:** Used to fit or train your model parameters (weights, decision tree splits, etc.).
2. **Validation Set:** Used to tune hyperparameters or compare different models. It provides feedback on how different settings affect performance before touching the test set.
3. **Test Set:** Used **once** at the end to get an unbiased estimate of the final chosen model’s performance.

The same way we need a dataset separate from the training set to tune hyperparameters, we need a separate dataset to evaluate the model’s final performance. This is the test set. 

### Data Prep

In [1]:
import os
import pandas as pd 
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [3]:
if "House_Rent_Dataset.csv" in os.listdir():
    path = "House_Rent_Dataset.csv"
else:
    import kagglehub
    path = kagglehub.dataset_download("iamsouravbanerjee/house-rent-prediction-dataset")
    path = os.path.join(path, "House_Rent_Dataset.csv")
    
df = pd.read_csv(path)

In [4]:
target = "Rent"

# Split into train val, test
train_size = 0.7

X = df.drop(columns=[target])
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    train_size=train_size, random_state=5090)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, 
                                                  test_size=0.5, random_state=5090)

print(f"Sizes: Train={len(X_train)}, Val={len(X_val)}, Test={len(X_test)}")

Sizes: Train=3322, Val=712, Test=712


Note: It would have been better to do cross-validation here

In [6]:
COLS_TO_DROP = ["Posted On", "Floor", "Area Locality"]

def basic_preprocessing(df):
    df = df.drop(columns=COLS_TO_DROP)
    df.rename(columns={"Furnishing Status": "Furnish"}, inplace=True)
    return df

In [7]:
X_train = basic_preprocessing(X_train)
X_val = basic_preprocessing(X_val)
X_test = basic_preprocessing(X_test)

X_val

Unnamed: 0,BHK,Size,Area Type,City,Furnish,Tenant Preferred,Bathroom,Point of Contact
284,2,900,Super Area,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner
257,2,450,Carpet Area,Kolkata,Unfurnished,Bachelors,1,Contact Owner
872,2,400,Carpet Area,Mumbai,Furnished,Bachelors/Family,1,Contact Owner
3749,3,1610,Super Area,Chennai,Semi-Furnished,Family,3,Contact Agent
2732,2,1000,Super Area,Delhi,Semi-Furnished,Family,1,Contact Owner
...,...,...,...,...,...,...,...,...
2357,2,850,Super Area,Bangalore,Semi-Furnished,Bachelors/Family,2,Contact Owner
3842,2,1000,Carpet Area,Chennai,Unfurnished,Bachelors,2,Contact Owner
1799,1,500,Super Area,Bangalore,Unfurnished,Bachelors/Family,1,Contact Owner
3931,2,1800,Super Area,Hyderabad,Furnished,Bachelors/Family,2,Contact Owner


In [8]:
COLS_OHE = ["City"]
COLS_ORDINAL = ["Furnish"]
COLS_STANDARD_SCALE = ["BHK", "Bathroom", "Size"]

In [9]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, \
    PolynomialFeatures

ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore", drop="first")
ordinal = OrdinalEncoder(categories=[["Unfurnished", "Semi-Furnished", "Furnished"]])
standard = StandardScaler()
poly = PolynomialFeatures(degree=3)

FIT ONLY ON TRAIN, OTHERWISE DATA LEAKAGE


In [10]:
ohe.fit(X_train[COLS_OHE])
ordinal.fit(X_train[COLS_ORDINAL])
poly.fit(X_train[COLS_STANDARD_SCALE])

X_train_poly = poly.transform(X_train[COLS_STANDARD_SCALE])

standard.fit(X_train_poly)

In [11]:
def preprocess_data(X, ohe, ordinal, standard):
    X_ohe = ohe.transform(X[COLS_OHE])
    X_ordinal = ordinal.transform(X[COLS_ORDINAL])
    x_poly = poly.transform(X[COLS_STANDARD_SCALE])
    X_poly_standard = standard.transform(x_poly)

    X_poly_standard = pd.DataFrame(X_poly_standard, columns=poly.get_feature_names_out())

    X_ohe = pd.DataFrame(X_ohe, columns=ohe.get_feature_names_out(COLS_OHE))
    X_ordinal = pd.DataFrame(X_ordinal, columns=COLS_ORDINAL)
    X_poly_standard = pd.DataFrame(X_poly_standard, columns=poly.get_feature_names_out(COLS_STANDARD_SCALE))
    
    X = pd.concat([X_poly_standard, X_ohe, X_ordinal], axis=1)
    return X

In [12]:
X_train = preprocess_data(X_train, ohe, ordinal, standard)
X_val = preprocess_data(X_val, ohe, ordinal, standard)
X_test = preprocess_data(X_test, ohe, ordinal, standard)

X_train

Unnamed: 0,1,BHK,Bathroom,Size,BHK^2,BHK Bathroom,BHK Size,Bathroom^2,Bathroom Size,Size^2,...,Bathroom^3,Bathroom^2 Size,Bathroom Size^2,Size^3,City_Chennai,City_Delhi,City_Hyderabad,City_Kolkata,City_Mumbai,Furnish
0,0.0,1.110492,1.189251,0.829344,0.999475,1.114653,0.806053,0.928818,0.707418,0.344902,...,0.506051,0.356248,0.175618,0.041691,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,-0.097846,-1.091749,-0.563145,-0.254682,-0.677640,-0.450178,-0.759804,-0.549198,-0.376971,...,-0.427136,-0.317066,-0.237390,-0.183322,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,-0.097846,0.048751,-0.408424,-0.254682,-0.165556,-0.374043,-0.126570,-0.291431,-0.327319,...,-0.175894,-0.202237,-0.197317,-0.174276,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,-0.097846,0.048751,0.133100,-0.254682,-0.165556,-0.107570,-0.126570,-0.065884,-0.093378,...,-0.175894,-0.129164,-0.118141,-0.116251,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,-0.097846,-1.091749,-0.021621,-0.254682,-0.677640,-0.183705,-0.759804,-0.436425,-0.169767,...,-0.427136,-0.298798,-0.202326,-0.137638,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3317,0.0,-0.097846,0.048751,0.055739,-0.254682,-0.165556,-0.145637,-0.126570,-0.098105,-0.132528,...,-0.175894,-0.139603,-0.131391,-0.127479,0.0,0.0,1.0,0.0,0.0,1.0
3318,0.0,-1.306183,-1.091749,-0.795227,-1.007177,-0.933682,-0.735686,-0.759804,-0.597530,-0.437127,...,-0.427136,-0.324895,-0.247570,-0.192217,0.0,1.0,0.0,0.0,0.0,0.0
3319,0.0,-0.097846,0.048751,0.055739,-0.254682,-0.165556,-0.145637,-0.126570,-0.098105,-0.132528,...,-0.175894,-0.139603,-0.131391,-0.127479,0.0,1.0,0.0,0.0,0.0,1.0
3320,0.0,-1.306183,0.048751,-0.717866,-1.007177,-0.677640,-0.716652,-0.126570,-0.420314,-0.418985,...,-0.175894,-0.243993,-0.228341,-0.189804,0.0,0.0,0.0,0.0,1.0,2.0


## Hyperparameter Tuning with Random Search and Grid Search


In [14]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import Lasso

# ConvergenceWarning
# ignore the warning
# from warnings import simplefilter
# simplefilter("ignore")


Notes:
- confusing terminology, the regularization parameter is called alpha in sklearn
- lambda (alpha) is usually searched on a log scale


In [16]:
alphas = np.logspace(1, 3, 20)
print(alphas)

[  10.           12.74274986   16.23776739   20.69138081   26.36650899
   33.59818286   42.81332399   54.55594781   69.51927962   88.58667904
  112.88378917  143.84498883  183.29807108  233.57214691  297.63514416
  379.26901907  483.29302386  615.84821107  784.75997035 1000.        ]


In [84]:
10 ** -1

0.1

### Grid Search

In [17]:
# extend the grid 
param_grid = {
    "alpha": [10 ** i for i in range(-3, 4)],
    'max_iter': [10, 100, 50_000]
}

# get Decartian product of all the hyperparameters
param_grid

{'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'max_iter': [10, 100, 50000]}

In [18]:

res_list = []
for lamb in param_grid['alpha']:
    for max_iter in param_grid['max_iter']:
        model = Lasso(alpha=lamb, max_iter=max_iter)
        model.fit(X_train, y_train)
        
        y_pred_train = model.predict(X_train)
        y_pred_val = model.predict(X_val)
        
        # r2
        r2_train = model.score(X_train, y_train)
        r2_val = model.score(X_val, y_val)
        
        res = {
            "lambda": lamb,
            "max_iter": max_iter,
            "r2_train": r2_train,
            "r2_val": r2_val
        }
        
        print(res)
        res_list.append(res)
        


{'lambda': 0.001, 'max_iter': 10, 'r2_train': 0.286300900903835, 'r2_val': 0.5469368390683846}
{'lambda': 0.001, 'max_iter': 100, 'r2_train': 0.2957594907787815, 'r2_val': 0.5626080457488305}
{'lambda': 0.001, 'max_iter': 50000, 'r2_train': 0.30240718968076064, 'r2_val': 0.5734121131426642}
{'lambda': 0.01, 'max_iter': 10, 'r2_train': 0.28630087985195296, 'r2_val': 0.5469369298770144}
{'lambda': 0.01, 'max_iter': 100, 'r2_train': 0.2957593913671396, 'r2_val': 0.5626080842528678}
{'lambda': 0.01, 'max_iter': 50000, 'r2_train': 0.3024071886295646, 'r2_val': 0.5734129905395922}
{'lambda': 0.1, 'max_iter': 10, 'r2_train': 0.28630066925527053, 'r2_val': 0.546937837867848}
{'lambda': 0.1, 'max_iter': 100, 'r2_train': 0.29575839666805526, 'r2_val': 0.5626084686353577}
{'lambda': 0.1, 'max_iter': 50000, 'r2_train': 0.3024070457697213, 'r2_val': 0.5734215832461952}
{'lambda': 1, 'max_iter': 10, 'r2_train': 0.2862985555024221, 'r2_val': 0.546946908229855}
{'lambda': 1, 'max_iter': 100, 'r2_train

In [19]:
res_df = pd.DataFrame(res_list)
res_df

Unnamed: 0,lambda,max_iter,r2_train,r2_val
0,0.001,10,0.286301,0.546937
1,0.001,100,0.295759,0.562608
2,0.001,50000,0.302407,0.573412
3,0.01,10,0.286301,0.546937
4,0.01,100,0.295759,0.562608
5,0.01,50000,0.302407,0.573413
6,0.1,10,0.286301,0.546938
7,0.1,100,0.295758,0.562608
8,0.1,50000,0.302407,0.573422
9,1.0,10,0.286299,0.546947


In [20]:
px.scatter(res_df, x="lambda", y="max_iter", log_x=True, log_y=True, color="r2_val", title="Grid Search")

## Grid Search with sklearn

In [21]:
param_grid = {
    "alpha": [10 ** i for i in range(-3, 2)],
    'max_iter': [10, 100, 50_000], 
    'tol': [1e-4,  1e-2],
    "fit_intercept": [True, False]  
}

In [22]:
grid_search = GridSearchCV(
    estimator=Lasso(),
    param_grid=param_grid,
    scoring='r2',       # Optimize R^2
    cv=3,               # 5-fold cross-validation
    n_jobs=-1,          # Use all CPU cores
    verbose=True        # Print more information  
)

X_use = pd.concat([X_train, X_val]).head(1_000)
y_use = pd.concat([y_train, y_val]).head(1_000)

grid_search.fit(X_use, y_use)


Fitting 3 folds for each of 60 candidates, totalling 180 fits


KeyboardInterrupt: 

In [102]:
print("\n== Grid Search Results ==")
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation R^2:", grid_search.best_score_)



== Grid Search Results ==
Best parameters found: {'alpha': 10, 'fit_intercept': True, 'max_iter': 50000, 'tol': 0.01}
Best cross-validation R^2: 0.5424913981816509


In [103]:
from sklearn.metrics import r2_score, root_mean_squared_error

best_lasso = grid_search.best_estimator_

y_train_pred = best_lasso.predict(X_train)
y_val_pred = best_lasso.predict(X_val)
y_test_pred = best_lasso.predict(X_test)

def evaluate_model(y_true, y_pred):
    mse = root_mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mse, r2

mse_train, r2_train = evaluate_model(y_train, y_train_pred)
mse_val, r2_val = evaluate_model(y_val, y_val_pred)
mse_test, r2_test = evaluate_model(y_test, y_test_pred)

print(f"Train: RMSE={mse_train:.2f}, R^2={r2_train:.2f}")
print(f"Val: RMSE={mse_val:.2f}, R^2={r2_val:.2f}")
print(f"Test: RMSE={mse_test:.2f}, R^2={r2_test:.2f}")


Train: RMSE=83108.20, R^2=0.02
Val: RMSE=111435.42, R^2=-0.12
Test: RMSE=37364.36, R^2=0.54


## Random Search

In [23]:
from scipy.stats import loguniform, randint


In [24]:
param_dist = {
    'alpha': loguniform(1e-3, 1e4),  # alpha in [1e-3, 1e4]
    'max_iter': randint(10, 5000)    # integer in [10, 5000)
}

print("Random Search will sample from these hyperparameter distributions:")
for p, dist in param_dist.items():
    print(f"  {p}: {dist}")

Random Search will sample from these hyperparameter distributions:
  alpha: <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x00000206FCD59C90>
  max_iter: <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x00000206A376B5E0>


In [108]:
random_search = RandomizedSearchCV(
    estimator=Lasso(),
    param_distributions=param_dist,
    n_iter=100,         # try 100 combinations of hyperparameters
    scoring='r2',       # optimize R^2
    cv=3,               # 3-fold cross-validation
    n_jobs=-1,          # use all CPU cores
    verbose=1,          # get more information
    random_state=509   # random seed for reproducibility
)

In [111]:
X_use = pd.concat([X_train, X_val]).head(1_000)
y_use = pd.concat([y_train, y_val]).head(1_000)


random_search.fit(X_use, y_use)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [112]:

print("\n=== RandomizedSearchCV Results ===")
print("Best params:", random_search.best_params_)
print("Best CV R^2:", random_search.best_score_)


=== RandomizedSearchCV Results ===
Best params: {'alpha': 11.87976242665855, 'max_iter': 4585}
Best CV R^2: 0.5420305853566706


In [113]:
random_search.cv_results_

{'mean_fit_time': array([0.00737635, 0.01639032, 0.01452788, 0.00501291, 0.04865607,
        0.01253215, 0.05014253, 0.00518378, 0.08894293, 0.02473195,
        0.05683327, 0.07208387, 0.00651677, 0.01185878, 0.00600743,
        0.03046282, 0.09061797, 0.00571314, 0.11936307, 0.05818748,
        0.06519278, 0.00818102, 0.01420371, 0.02640192, 0.02172613,
        0.10365264, 0.01105404, 0.00752838, 0.04767585, 0.0122145 ,
        0.04615887, 0.02392403, 0.06422218, 0.00534638, 0.00819031,
        0.02107159, 0.00620969, 0.02257133, 0.02215878, 0.0058589 ,
        0.03905479, 0.02523359, 0.00855168, 0.06150262, 0.00893354,
        0.04353158, 0.04662975, 0.04958463, 0.03075496, 0.04376769,
        0.00590595, 0.06301188, 0.03838595, 0.00551462, 0.00738827,
        0.03568268, 0.06727751, 0.06562376, 0.06309589, 0.01212605,
        0.0331049 , 0.00838892, 0.01657891, 0.01964808, 0.05005956,
        0.06235242, 0.02657437, 0.00701213, 0.0794069 , 0.00569105,
        0.00764918, 0.06112719,

In [115]:
import pandas as pd

# Suppose you already have a fitted RandomizedSearchCV object named random_search
random_results_df = pd.DataFrame(random_search.cv_results_)

# Each hyperparameter is in a column with 'param_' prefix:
# e.g., 'param_alpha', 'param_max_iter', etc.
# You can display them along with the mean CV score, etc.
df_random_search = random_results_df[
    ['param_alpha', 'param_max_iter', 'mean_test_score', 'rank_test_score']
].sort_values(by='mean_test_score', ascending=False)

df_random_search

Unnamed: 0,param_alpha,param_max_iter,mean_test_score,rank_test_score
97,11.879762,4585,0.542031,1
81,8.479559,4185,0.541872,2
76,5.404961,3828,0.541591,3
32,11.671067,3690,0.541486,4
16,2.321346,4282,0.541280,5
...,...,...,...,...
17,5252.818689,4238,0.450202,96
67,5642.591616,2679,0.438345,97
90,6091.199063,2999,0.423569,98
39,9017.145603,1777,0.342540,99


In [121]:
px.scatter(df_random_search, x="param_alpha", y="param_max_iter", color="mean_test_score", 
           title="Random Search", log_x=True, log_y=True, color_continuous_scale="viridis")	

## Bayesian Optimization

Sklearn 

`!pip install scikit-optimize`

`from skopt import BayesSearchCV`

Hyperopt

In [122]:
!pip install hyperopt

Collecting hyperopt
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting future (from hyperopt)
  Using cached future-1.0.0-py3-none-any.whl.metadata (4.0 kB)
Collecting cloudpickle (from hyperopt)
  Downloading cloudpickle-3.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting py4j (from hyperopt)
  Downloading py4j-0.10.9.9-py2.py3-none-any.whl.metadata (1.3 kB)
Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
   ---------------------------------------- 1.6/1.6 MB 2.3 MB/s eta 0:00:00
Downloading cloudpickle-3.1.1-py3-none-any.whl (20 kB)
Using cached future-1.0.0-py3-none-any.whl (491 kB)
Downloading py4j-0.10.9.9-py2.py3-none-any.whl (203 kB)
   ---------------------------------------- 203.0/203.0 kB 2.4 MB/s eta 0:00:00
Installing collected packages: py4j, future, cloudpickle, hyperopt
Successfully installed cloudpickle-3.1.1 future-1.0.0 hyperopt-0.2.7 py4j-0.10.9.9


In [124]:
from sklearn.model_selection import cross_val_score, train_test_split, KFold

In [123]:
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

In [126]:
# 1) Define the search space for alpha and max_iter
space = {
    'alpha': hp.loguniform('alpha', np.log(1e-3), np.log(1e3)),
    'max_iter': hp.randint('max_iter', 5000)  # integer in [0, 5000)
}

# 2) We’ll do 5-fold cross-validation to evaluate each set of params
cv = KFold(n_splits=5, shuffle=True, random_state=42)

def objective(params):
    # Extract hyperparameters
    alpha = params['alpha']
    max_iter = int(params['max_iter'])  # ensure integer
    
    # Create Lasso model
    model = Lasso(alpha=alpha, max_iter=max_iter, random_state=509)
    
    # Cross-validate (negative MSE, so we can minimize it)
    mse_scores = -cross_val_score(
        model, X_train, y_train,
        cv=cv, scoring='neg_mean_squared_error'
    )
    avg_mse = np.mean(mse_scores)
    
    # Hyperopt tries to minimize the returned value
    return {
        'loss': avg_mse,
        'status': STATUS_OK,
        # Optional: attach other info if you like
        'params': params
    }


In [127]:
trials = Trials()  # To store results

best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=20,       # Number of trials (increase for better search)
    trials=trials,
    rstate=np.random.default_rng(509)  # for reproducibility
)

print("\nBest hyperparameters found:")
print(best)


100%|██████████| 20/20 [00:06<00:00,  3.11trial/s, best loss: 5158148188.803587]

Best hyperparameters found:
{'alpha': 82.26205843443854, 'max_iter': 3194}


In [132]:
best_alpha = best['alpha']
best_max_iter = int(best['max_iter'])

model_best = Lasso(alpha=best_alpha, max_iter=best_max_iter, random_state=42)
model_best.fit(X_train, y_train)

# Evaluate on the test set
y_pred_test = model_best.predict(X_test)

rmse, r2 = evaluate_model(y_test, y_pred_test)
print(f"Test set: RMSE={rmse:.2f}, R^2={r2:.2f}")

Test set: RMSE=35930.87, R^2=0.58


In [136]:
results_df = pd.DataFrame(trials.results)
results_df = results_df.sort_values(by='loss')

results_df["alpha"] = results_df["params"].apply(lambda x: x["alpha"])
results_df["max_iter"] = results_df["params"].apply(lambda x: x["max_iter"])
results_df


Unnamed: 0,loss,status,params,alpha,max_iter
14,5158148000.0,ok,"{'alpha': 82.26205843443854, 'max_iter': 3194}",82.262058,3194
3,5167013000.0,ok,"{'alpha': 23.927000650080377, 'max_iter': 206}",23.927001,206
13,5290333000.0,ok,"{'alpha': 347.92671616536, 'max_iter': 1292}",347.926716,1292
11,5358360000.0,ok,"{'alpha': 28.505108573753763, 'max_iter': 2196}",28.505109,2196
1,5384187000.0,ok,"{'alpha': 0.013918988887753607, 'max_iter': 429}",0.013919,429
10,5839504000.0,ok,"{'alpha': 0.0027521976271328466, 'max_iter': 792}",0.002752,792
16,6432845000.0,ok,"{'alpha': 0.037966985847236734, 'max_iter': 1196}",0.037967,1196
8,6677457000.0,ok,"{'alpha': 0.0014052766974551073, 'max_iter': 1...",0.001405,1355
6,7403906000.0,ok,"{'alpha': 0.002452008538580964, 'max_iter': 1820}",0.002452,1820
18,7584504000.0,ok,"{'alpha': 9.753940120503824, 'max_iter': 2707}",9.75394,2707


In [140]:
px.scatter(results_df, x="alpha", y="max_iter", title="Hyperopt Search",
              log_x=True)