In [1]:
import numpy as np 
import pandas as pd 

import optuna
import logging
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


# <span style="color:red"> Load and Preprocess Data </span>


In [2]:
# Load the dataset
file_path = '/kaggle/input/kidneystone/kidney-stone-dataset.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
data.head()

Unnamed: 0.1,Unnamed: 0,gravity,ph,osmo,cond,urea,calc,target
0,0,1.021,4.91,725,14.0,443,2.45,0
1,1,1.017,5.74,577,20.0,296,4.49,0
2,2,1.008,7.2,321,14.9,101,2.36,0
3,3,1.011,5.51,408,12.6,224,2.15,0
4,4,1.005,6.52,187,7.5,91,1.16,0


In [3]:
data = data.drop(columns=['Unnamed: 0'])
data.columns

Index(['gravity', 'ph', 'osmo', 'cond', 'urea', 'calc', 'target'], dtype='object')

In [4]:
# Handle missing values (if any)
data = data.dropna()

# Encode categorical variables (if any)
target = 'target'
X = data.drop(columns=[target])
y = data[target]

# Scale the features using MinMaxScaler
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [5]:
data.describe()

Unnamed: 0,gravity,ph,osmo,cond,urea,calc,target
count,90.0,90.0,90.0,90.0,90.0,90.0,90.0
mean,1.017952,6.036651,602.333333,20.621687,258.2,4.017788,0.5
std,0.00678,0.711801,238.459805,7.654448,135.381127,3.016273,0.502801
min,1.005,4.76,187.0,5.1,10.0,0.17,0.0
25%,1.012258,5.53652,411.5,14.15,148.25,1.4125,0.0
50%,1.018,5.936247,572.0,21.177172,231.5,3.23,0.5
75%,1.023,6.49,778.0,26.075,366.25,5.965127,1.0
max,1.034,7.94,1236.0,38.0,620.0,13.0,1.0


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   gravity  90 non-null     float64
 1   ph       90 non-null     float64
 2   osmo     90 non-null     int64  
 3   cond     90 non-null     float64
 4   urea     90 non-null     int64  
 5   calc     90 non-null     float64
 6   target   90 non-null     int64  
dtypes: float64(4), int64(3)
memory usage: 5.0 KB


In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)


# <span style="color:red"> RANDOM FOREST </span>


In [8]:
parameters = {
    'max_depth': 18,
    'n_estimators': 1000
}


rf_model = RandomForestRegressor(**parameters, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Calculate and print the RMSE and R^2 score
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R^2 Score: {r2}")

Root Mean Squared Error (RMSE): 0.3968421780578702
R^2 Score: 0.3569414999999997


# <span style="color:red"> XGBOOST Single Model </span>


In [9]:
parameters = {
    'n_estimators': 1000,
    'max_depth': 10,
    'learning_rate': 0.01,
    'subsample': 0.8,
    'colsample_bytree': 0.8
}

xgb_model = XGBRegressor(**parameters, random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test)

# Calculate and print the RMSE and R² score
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² Score: {r2}")

Root Mean Squared Error (RMSE): 0.39796235469367697
R² Score: 0.35330602067387806


# <span style="color:red"> XGBOOST GridSearch </span>


In [10]:
# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [500, 1000, 1500],
    'max_depth': [10, 15, 20],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Initialize the XGBoost model
xgb_model = XGBRegressor(random_state=42)

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='r2', cv=5, n_jobs=-1, verbose=0)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² Score: {r2}")

Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 500, 'subsample': 0.6}
Root Mean Squared Error (RMSE): 0.401880764973818
R² Score: 0.3405083905382401


# <span style="color:red"> LightGBM GridSearch </span>


In [None]:
param_grid = {
    'n_estimators': [500, 1000, 1500],
    'max_depth': [10, 15, 20],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

lgbm_model = LGBMRegressor(random_state=42)

grid_search = GridSearchCV(estimator=lgbm_model, param_grid=param_grid, scoring='r2', cv=5, n_jobs=-1, verbose=0)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² Score: {r2}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000026 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 121
[LightGBM] [Info] Number of data points in the train set: 61, number of used features: 6
[LightGBM] [Info] Start training from score 0.442623
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000034 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 122
[LightGBM] [Info] Number of data points in the train set: 61, number of used features: 6
[LightGBM] [Info] Start training from score 0.524590
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000040 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 119
[LightGBM] [Info] Number of data points in the train set: 61, number of used features: 6
[LightGBM] [Info] Start training from scor