# importing and reading the data

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
tqdm.pandas()

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import LeaveOneGroupOut,train_test_split,KFold,StratifiedKFold,cross_val_predict,cross_val_score
from sklearn.metrics import mean_squared_error as metric
from sklearn.preprocessing import FunctionTransformer,OneHotEncoder,RobustScaler

from xgboost import XGBRegressor
#!pip install catboost
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor,early_stopping

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge,LinearRegression,Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

from sklearn.base import (BaseEstimator,TransformerMixin)

In [13]:
class PATH:
    main = 'Data/'
    train = main + 'processed_train.csv'
    test = main + 'processed_test.csv'
    ss = main + 'SampleSubmission.csv'

train_df = pd.read_csv(PATH.train).sort_values(by=['country','city','site_id','date','hour']).reset_index(drop=True)
test_df = pd.read_csv(PATH.test).sort_values(by=['country','city','site_id','date','hour']).reset_index(drop=True)

target=train_df.pm2_5
print(
    f'train shape :{train_df.shape}',
    f'test shape :{test_df.shape}',
    sep='\n'
)
selected_columns=test_df.select_dtypes(include=('number')).columns.tolist()
selected_columns

train shape :(7909, 35)
test shape :(2783, 34)


['site_latitude',
 'site_longitude',
 'city',
 'country',
 'date',
 'hour',
 'month',
 'carbonmonoxide_co_column_number_density',
 'carbonmonoxide_h2o_column_number_density',
 'carbonmonoxide_cloud_height',
 'formaldehyde_tropospheric_hcho_column_number_density',
 'formaldehyde_tropospheric_hcho_column_number_density_amf',
 'formaldehyde_hcho_slant_column_number_density',
 'ozone_o3_column_number_density',
 'ozone_o3_effective_temperature',
 'cloud_cloud_optical_depth',
 'cloud_surface_albedo',
 '2_groub_mean',
 '3_groub_mean',
 '4_groub_mean',
 '5_groub_mean',
 '6_groub_mean',
 '7_groub_mean',
 '8_groub_mean',
 '10_groub_mean',
 '11_groub_mean',
 '12_groub_mean',
 'date_month',
 'date_day',
 'date_quarter',
 'date_week',
 'date_year']

In [19]:
TRIALS=20
SAVE_PATH='/kaggle/working/'
MODEL='lgbm_2'  ###### cb or lgbm

# fine-tuning:

# XGBOOST

In [9]:
# Import libraries
import optuna
#from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
import sklearn.metrics

# Define objective function
def objective_xgb(trial):

    param={
        'booster' : trial.suggest_categorical('booster', ['gbtree', 'gblinear']),
        'lambda' : trial.suggest_float('lambda', 1e-8, 1.0,log = True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0,log = True),
        "eval_metric": trial.suggest_categorical('eval_metric',['logloss','auc','mae','rmse']),
        'n_estimators' :trial.suggest_int('n_estimators',50,300)

    }

    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=1)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
        param["subsample"] = trial.suggest_float("subsample", 0.2, 1.0)
        # sampling according to each tree.
        param["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.2, 1.0)

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)


      #create and fit ridge regression model
    model = XGBRegressor(**param)
    GROUPS=train.city
    rmse_average=[]
    skf = LeaveTwoGroupsOut(groups=GROUPS)
    X=train_base[selected_columns_base].copy()
    y=train_base.pm2_5.copy()
    oof=y.copy()
    #y=np.where(y >= y.quantile(0.97), y.quantile(0.97), y)
    for fold, (train_index, val_index) in enumerate(skf.split(X,GROUPS)):

      X_train, X_val = X.iloc[train_index], X.iloc[val_index]
      y_train, y_val = y[train_index], y[val_index]

      model.fit(X_train,y_train)

      val_preds = model.predict(X_val)
      oof.iloc[val_index]=val_preds
      rmse = np.sqrt(metric(y_val, val_preds))
      rmse_average.append(rmse)
      
        


    return weighted_mean(rmse_average)



## catboost

In [3]:
def weighted_mean(preds):
    mean =np.mean(preds)
    weights =np.abs([x-mean for x in preds])
    w_mean=np.average(preds,weights=weights)
    return w_mean


In [10]:
# Import libraries
import optuna
#from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
import sklearn.metrics

# Define objective function
def objective_cb(trial):

    params = {
        'iterations':trial.suggest_int("iterations", 100, 1000),
        'learning_rate':trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        'depth':trial.suggest_int("depth", 4, 10),
        'l2_leaf_reg':trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
        'bootstrap_type':trial.suggest_categorical("bootstrap_type", ["Bayesian"]),
        'random_strength':trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
        'bagging_temperature':trial.suggest_float("bagging_temperature", 0.0, 10.0),
        'od_type':trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        'od_wait':trial.suggest_int("od_wait", 10, 50),
        'rsm':trial.suggest_float('rsm',0.1,1,log=True)
    }

      #create and fit ridge regression model
    model = CatBoostRegressor(use_best_model=True,eval_metric='RMSE', random_seed=42,**params, silent=True)
    rmse_average=[]
    skf = LeaveTwoGroupsOut(groups=GROUPS)
    X=train[selected_columns].copy()
    y=train.pm2_5.copy()
    oof=y.copy()
    #y=np.where(y >= y.quantile(0.97), y.quantile(0.97), y)
    for fold, (train_index, val_index) in enumerate(skf.split(X,GROUPS)):

      X_train, X_val = X.iloc[train_index], X.iloc[val_index]
      y_train, y_val = y[train_index], y[val_index]

      model.fit(X_train,y_train,eval_set=[(X_val,y_val)], early_stopping_rounds = 250 )

      val_preds = model.predict(basepipe.fit_transform(X_val))
      oof.iloc[val_index]=val_preds
      rmse = np.sqrt(metric(y_val, val_preds))
      rmse_average.append(rmse)
      
        


    return weighted_mean(rmse_average)



## lightgbm

In [16]:
# Import libraries
import optuna
#from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
import sklearn.metrics

def objective_lgbm(trial,data=train_df,target=target):
    from lightgbm import early_stopping
    param = {
        'metric': 'rmse',
        'random_state': 42,
        'n_estimators': trial.suggest_int('n_estimators', 1000,30000),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0,log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0,log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_int('max_depth', 1 , 100),
        'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'cat_smooth' : trial.suggest_int('min_data_per_groups', 1, 100)
    }
    model = LGBMRegressor(**param,verbose=-1)

    rmse_average=[]
    skf = LeaveTwoGroupsOut(groups=GROUPS)
    X=train[selected_columns].copy()
    y=train.pm2_5.copy()
    oof=y.copy()
    #y=np.where(y >= y.quantile(0.97), y.quantile(0.97), y)
    for fold, (train_index, val_index) in enumerate(skf.split(X,GROUPS)):

      X_train, X_val = X.iloc[train_index], X.iloc[val_index]
      y_train, y_val = y[train_index], y[val_index]

      e_stopping=early_stopping(stopping_rounds=250,verbose=False)
      model.fit(X_train,y_train,eval_set=[(X_val,y_val)],callbacks=[e_stopping])

      val_preds = model.predict(basepipe.fit_transform(X_val))
      oof.iloc[val_index]=val_preds
      rmse = np.sqrt(metric(y_val, val_preds))
      rmse_average.append(rmse)


    return np.mean(rmse_average)

In [24]:
# Import libraries
import optuna
#from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
import sklearn.metrics

def objective_lgbm_2(trial,data=train_df,target=target):
    from lightgbm import early_stopping
    param = {
        'metric': 'rmse',
        'random_state': 42,
        'n_estimators': trial.suggest_int('n_estimators', 1000,30000),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0,log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0,log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_int('max_depth', 1 , 100),
        'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'cat_smooth' : trial.suggest_int('min_data_per_groups', 1, 100)
    }
    model = LGBMRegressor(**param,verbose=-1)

    rmse_average=[]
    weights=[]
    skf = LeaveOneGroupOut()
    X=train_df[selected_columns].copy()
    y=train_df.pm2_5.copy()
    oof=y.copy()
    #y=np.where(y >= y.quantile(0.97), y.quantile(0.97), y)
    for fold, (train_index, val_index) in enumerate(skf.split(X,y,X.city)):

      X_train, X_val = X.iloc[train_index], X.iloc[val_index]
      y_train, y_val = y[train_index], y[val_index]

      model.fit(X_train,y_train)
      weights.append(len(train_index)/len(train_df))
      val_preds = model.predict(X_val)
      oof.iloc[val_index]=val_preds
      rmse = np.sqrt(metric(y_val, val_preds))
      print(rmse)
      rmse_average.append(rmse)


    return np.average(rmse_average,weights=weights)

In [22]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
def  objective_knn(trial):
    # -- Instantiate scaler
    scalers = trial.suggest_categorical("scalers", ['minmax', 'standard', 'robust'])

    if scalers == "minmax":
        scaler = MinMaxScaler()
    elif scalers == "standard":
        scaler = StandardScaler()
    else:
        scaler = RobustScaler()
                
    # -- Tune estimator algorithm
    n_neighbors = trial.suggest_int("n_neighbors", 1, 30)
    weights = trial.suggest_categorical("weights", ['uniform', 'distance'])
    Metric = trial.suggest_categorical("metric", ['euclidean', 'manhattan', 'minkowski'])
    knn = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights, metric=Metric)
        
    # -- Make a pipeline
    pipeline = make_pipeline(scaler, knn)
    
    rmse_average=[]
    weights=[]
    skf = LeaveTwoGroupsOut(groups=GROUPS)
    X=train[selected_columns].bfill().copy()
    y=train.pm2_5.copy()
    oof=y.copy()
    #y=np.where(y >= y.quantile(0.97), y.quantile(0.97), y)
    for fold, (train_index, val_index) in enumerate(skf.split(X,GROUPS)):

      X_train, X_val = X.iloc[train_index], X.iloc[val_index]
      y_train, y_val = y[train_index], y[val_index]

      pipeline.fit(X_train,y_train)
      weights.append(len(train_index)/len(train))
      val_preds = pipeline.predict(X_val)
      oof.iloc[val_index]=val_preds
      rmse = np.sqrt(metric(y_val, val_preds))
      rmse_average.append(rmse)
    return np.average(rmse_average,weights=weights)


# results:

In [25]:
study = optuna.create_study(direction='minimize')

if MODEL.lower() in['cb','catboost']:
    study.optimize(objective_cb, n_trials=TRIALS)

if MODEL.lower() in['lgbm','lightgbm']:
    study.optimize(objective_lgbm, n_trials=TRIALS)

if MODEL.lower() in['lgbm_2','lightgbm_2']:
    study.optimize(objective_lgbm_2, n_trials=TRIALS)

if MODEL.lower() in['knn']:
    study.optimize(objective_knn, n_trials=TRIALS)

if MODEL.lower() in['xgb','xgboost']:
    study.optimize(objective_xgb, n_trials=TRIALS)


[I 2024-06-29 11:16:50,199] A new study created in memory with name: no-name-4e55b7e8-774f-4374-a60e-a3f9acf7377d


11.630279813984927
10.42200846962403
15.969084163148699


[I 2024-06-29 11:22:36,413] Trial 0 finished with value: 12.662994293438514 and parameters: {'n_estimators': 5353, 'reg_alpha': 0.2388094636924009, 'reg_lambda': 0.20702413699630365, 'colsample_bytree': 0.8, 'subsample': 1.0, 'learning_rate': 0.02, 'max_depth': 91, 'num_leaves': 407, 'min_child_samples': 1, 'min_data_per_groups': 73}. Best is trial 0 with value: 12.662994293438514.


12.150834588290792


[W 2024-06-29 11:23:49,093] Trial 1 failed with parameters: {'n_estimators': 28269, 'reg_alpha': 0.2709490897079377, 'reg_lambda': 0.09567526709720907, 'colsample_bytree': 0.3, 'subsample': 0.5, 'learning_rate': 0.008, 'max_depth': 59, 'num_leaves': 606, 'min_child_samples': 218, 'min_data_per_groups': 42} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\Administrator\AppData\Local\Temp\ipykernel_8000\2778715398.py", line 38, in objective_lgbm_2
    model.fit(X_train,y_train)
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\lightgbm\sklearn.py", line 1049, in fit
    super().fit(
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\lightgbm\sklearn.py

KeyboardInterrupt: 

In [16]:
# @title
def get_top_10_results(study):
  top_results = []
  for trial in study.trials:
    params = trial.params
    score = trial.value
    top_results.append((params, score))
  # Sort based on score (ascending order for minimization)
  sorted_results = sorted(top_results, key=lambda x: x[1])
  return sorted_results[:10]

top_10_params_scores = get_top_10_results(study)

# Access top 10 parameter sets and scores
top_10={}
for i,(params, score) in enumerate(top_10_params_scores):
  print(f"Parameters: {params}, Score: {score}")
  top_10[f'parameter_{i}']={'parameters' :params,'score' :score}

Parameters: {'booster': 'gbtree', 'lambda': 0.0035996888734008935, 'alpha': 0.3436704494035292, 'eval_metric': 'mae', 'n_estimators': 278, 'max_depth': 9, 'min_child_weight': 10, 'eta': 0.04468554904313802, 'gamma': 6.003358167076313e-07, 'grow_policy': 'lossguide', 'subsample': 0.8521416752421659, 'colsample_bytree': 0.24111222530283968}, Score: 12.581537210481812
Parameters: {'booster': 'gbtree', 'lambda': 0.0077201061087318066, 'alpha': 0.44706101522658637, 'eval_metric': 'mae', 'n_estimators': 271, 'max_depth': 9, 'min_child_weight': 10, 'eta': 0.035150585340927784, 'gamma': 2.2473095071167075e-06, 'grow_policy': 'lossguide', 'subsample': 0.6487890928765095, 'colsample_bytree': 0.29456477563904915}, Score: 12.594729487220878
Parameters: {'booster': 'gbtree', 'lambda': 0.003000518280895963, 'alpha': 0.9366814197837326, 'eval_metric': 'logloss', 'n_estimators': 272, 'max_depth': 9, 'min_child_weight': 10, 'eta': 0.048516208819314045, 'gamma': 1.4057650714831519e-05, 'grow_policy': 'l

In [17]:
import json
SAVE_PATH=''
with open(SAVE_PATH+'results.json','w') as file:
    json.dump(top_10,file,indent=4)