# Reading Data

In [1]:
import pandas as pd
# Load the four pre-split CSV files (features/targets for train and test).
# low_memory=False makes pandas parse each file in a single pass instead of
# in chunks, so column dtypes are inferred from the whole column.
x_train, x_test, y_train, y_test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        '../data/processed/x_train.csv',
        '../data/processed/x_test.csv',
        '../data/processed/y_train.csv',
        '../data/processed/y_test.csv',
    )
)



In [7]:
# Remove the "id" column from both feature frames so it is not used as a
# model feature.
# errors="ignore" keeps this cell idempotent: because the frames are
# reassigned in place, re-running the cell after "id" has already been
# dropped would otherwise raise a KeyError.
x_train = x_train.drop(columns=["id"], errors="ignore")
x_test = x_test.drop(columns=["id"], errors="ignore")

# Machine Learning

In [9]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error

# Base LightGBM regressor with library defaults; only the number of boosting
# rounds is tuned by the grid search below.
lgb_reg = lgb.LGBMRegressor()

# Candidate hyperparameter values, searched exhaustively by GridSearchCV.
param_dist = {
    "n_estimators": [250, 350, 450],
}

# 5-fold cross-validation with shuffling; the fixed random_state keeps the
# fold assignment reproducible between runs.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# y_train was loaded with read_csv and is therefore a DataFrame. Squeeze a
# single-column frame down to a Series so the estimator receives a 1-D target
# instead of a column vector (a frame with several columns is left unchanged).
y_train_target = y_train.squeeze(axis="columns")

# Grid search scored by negative RMSE; refit=True retrains the best
# configuration on the full training set once the search is done.
grid = GridSearchCV(
    lgb_reg,
    param_dist,
    cv=cv,
    refit=True,
    scoring='neg_root_mean_squared_error',
    verbose=2,
)
grid.fit(x_train, y_train_target)

# Print the best hyperparameters found
print("Best hyperparameters:", grid.best_params_)

# Best model, already refitted on the full training data
best_model = grid.best_estimator_

# Make predictions on the held-out test set
y_pred = best_model.predict(x_test)

# Test RMSE, computed as the square root of the MSE. The `squared=False`
# argument of mean_squared_error was deprecated in scikit-learn 1.4 and
# removed in 1.6, so taking the root explicitly works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print("Test RMSE:", rmse)


Fitting 5 folds for each of 3 candidates, totalling 15 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.531171 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1130
[LightGBM] [Info] Number of data points in the train set: 37685640, number of used features: 11
[LightGBM] [Info] Start training from score 3.032877
[CV] END ...................................n_estimators=250; total time= 1.0min
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.446280 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1125
[LightGBM] [Info] Number of data points in the train set: 37685640, number of used features: 11
[LightGBM] [Info] Start training from score 3.031349
[CV] END ................................

In [11]:
# Persist the best model found by the grid search to disk with joblib.
import os

from joblib import dump

model_path = '../models/lightgbm.joblib'

# Create the target directory first: dump() does not create missing parent
# directories and would fail on a fresh checkout without ../models.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

dump(best_model, model_path)

['../models/lightgbm.joblib']