In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, cross_val_score
from xgboost import XGBRegressor  
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.metrics import r2_score
import optuna
from sklearn.ensemble import GradientBoostingRegressor 

In [2]:
# Reading the file
df= pd.read_csv(r"D:\Intellipaat\kegal datasets\april -s4e4\train.csv")
df.drop(columns=['id'],inplace = True)
df

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,F,0.550,0.430,0.150,0.7715,0.3285,0.1465,0.2400,11
1,F,0.630,0.490,0.145,1.1300,0.4580,0.2765,0.3200,11
2,I,0.160,0.110,0.025,0.0210,0.0055,0.0030,0.0050,6
3,M,0.595,0.475,0.150,0.9145,0.3755,0.2055,0.2500,10
4,I,0.555,0.425,0.130,0.7820,0.3695,0.1600,0.1975,9
...,...,...,...,...,...,...,...,...,...
90610,M,0.335,0.235,0.075,0.1585,0.0685,0.0370,0.0450,6
90611,M,0.555,0.425,0.150,0.8790,0.3865,0.1815,0.2400,9
90612,I,0.435,0.330,0.095,0.3215,0.1510,0.0785,0.0815,6
90613,I,0.345,0.270,0.075,0.2000,0.0980,0.0490,0.0700,6


In [3]:
# Reading the orginal file directly from the kaggle 
path_to_input_folder = "D:\\Intellipaat\\kegal datasets\\april -s4e4\\abalone\\"
column_names = ['Sex','Length','Diameter','Height','Whole weight','Whole weight.1','Whole weight.2','Shell weight','Rings']
org_df = pd.read_csv(path_to_input_folder + "abalone.data", names=column_names)

In [4]:
# combine two dataset into one by using concat 
data = pd.concat([df,org_df])
data

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,F,0.550,0.430,0.150,0.7715,0.3285,0.1465,0.2400,11
1,F,0.630,0.490,0.145,1.1300,0.4580,0.2765,0.3200,11
2,I,0.160,0.110,0.025,0.0210,0.0055,0.0030,0.0050,6
3,M,0.595,0.475,0.150,0.9145,0.3755,0.2055,0.2500,10
4,I,0.555,0.425,0.130,0.7820,0.3695,0.1600,0.1975,9
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [6]:
#using label encoding 
encoder = LabelEncoder()

In [7]:
data['Sex'] = encoder.fit_transform(data['Sex'].values.reshape(-1,1))

  y = column_or_1d(y, warn=True)


In [8]:
data

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,0,0.550,0.430,0.150,0.7715,0.3285,0.1465,0.2400,11
1,0,0.630,0.490,0.145,1.1300,0.4580,0.2765,0.3200,11
2,1,0.160,0.110,0.025,0.0210,0.0055,0.0030,0.0050,6
3,2,0.595,0.475,0.150,0.9145,0.3755,0.2055,0.2500,10
4,1,0.555,0.425,0.130,0.7820,0.3695,0.1600,0.1975,9
...,...,...,...,...,...,...,...,...,...
4172,0,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,2,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,2,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,0,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [9]:
X = data.drop(columns=['Rings']) 
Y = data['Rings']

In [10]:
data.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,0,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,0,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,1,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,2,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,1,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


In [11]:
 # Define the objective function for optimization
def objective_gb(trial): 
    cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    params = dict( # training hyperparameters
        n_estimators=trial.suggest_int('n_estimators', 10, 500),
        max_depth=trial.suggest_int('max_depth', 2, 32),
        learning_rate=trial.suggest_float('learning_rate', 0.001, 0.3),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
        subsample=trial.suggest_float('subsample', 0.33, 0.85),
        max_features=trial.suggest_float('max_features', 0.33, 0.7),
    )

    gb_reg = GradientBoostingRegressor(random_state=0, **params)  # Use GradientBoostingRegressor

    # creating a pipeline
    pipe = Pipeline(
        steps=[
            ('scaler', StandardScaler()),
            ('model', gb_reg)
        ]
    )

    score = -np.mean(cross_val_score(pipe, X, Y, scoring='neg_root_mean_squared_log_error', cv=cvo)) 
    return score

# Create an Optuna study
study_gb = optuna.create_study(direction='minimize')  # Use 'minimize' for regression problems

# Optimize the objective function
study_gb.optimize(objective_gb, n_trials=600, n_jobs= 15 , show_progress_bar=True)

# Get the best parameters and score
best_params = study_gb.best_params
best_score = study_gb.best_value

print("Best Parameters:", best_params)
print("Best Score:", best_score)

# Train Gradient Boosting regressor with the best parameters
best_gb = GradientBoostingRegressor(random_state=0, **best_params)  # Use GradientBoostingRegressor

best_pipe = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('model', best_gb)
    ]
)

# Fit the pipeline to the entire dataset
best_pipe.fit(X, Y)

# Make predictions
predictions = best_pipe.predict(X)


[I 2024-04-13 18:30:00,377] A new study created in memory with name: no-name-78ff6579-04ad-4b87-b54b-9b81c0ec199d


  0%|          | 0/600 [00:00<?, ?it/s]

[I 2024-04-13 18:31:14,644] Trial 12 finished with value: 0.15270659778458434 and parameters: {'n_estimators': 143, 'max_depth': 4, 'learning_rate': 0.2644103716529022, 'min_samples_split': 10, 'min_samples_leaf': 5, 'subsample': 0.34698128990008015, 'max_features': 0.5373435533105851}. Best is trial 12 with value: 0.15270659778458434.
[I 2024-04-13 18:32:00,989] Trial 11 finished with value: 0.15316612029311272 and parameters: {'n_estimators': 362, 'max_depth': 2, 'learning_rate': 0.2823739391638427, 'min_samples_split': 4, 'min_samples_leaf': 10, 'subsample': 0.37543827949269537, 'max_features': 0.5772721596030586}. Best is trial 12 with value: 0.15270659778458434.
[I 2024-04-13 18:33:12,217] Trial 3 finished with value: 0.15111236572449965 and parameters: {'n_estimators': 290, 'max_depth': 4, 'learning_rate': 0.23027802740784928, 'min_samples_split': 4, 'min_samples_leaf': 4, 'subsample': 0.6832154580756282, 'max_features': 0.4060851518038221}. Best is trial 3 with value: 0.15111236

Traceback (most recent call last):
  File "C:\Users\mahes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\metrics\_scorer.py", line 137, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "C:\Users\mahes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\metrics\_scorer.py", line 350, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\mahes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\utils\_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\mahes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0

[I 2024-04-13 18:40:48,862] Trial 17 finished with value: 0.17271948478448887 and parameters: {'n_estimators': 223, 'max_depth': 24, 'learning_rate': 0.19248580955884118, 'min_samples_split': 3, 'min_samples_leaf': 10, 'subsample': 0.43313442371763056, 'max_features': 0.45418662565782275}. Best is trial 8 with value: 0.15019314142484883.
[W 2024-04-13 18:43:37,728] Trial 22 failed with parameters: {'n_estimators': 271, 'max_depth': 25, 'learning_rate': 0.2775770628987535, 'min_samples_split': 3, 'min_samples_leaf': 4, 'subsample': 0.33303985542686, 'max_features': 0.4571699983913362} because of the following error: The value nan is not acceptable.
[W 2024-04-13 18:43:37,738] Trial 22 failed with value nan.
[I 2024-04-13 18:43:50,164] Trial 9 finished with value: 0.17961395387401757 and parameters: {'n_estimators': 486, 'max_depth': 15, 'learning_rate': 0.2597910775220761, 'min_samples_split': 7, 'min_samples_leaf': 6, 'subsample': 0.5123769847587473, 'max_features': 0.42845043781067443

In [17]:
best_gb

In [13]:
best_params

{'n_estimators': 477,
 'max_depth': 9,
 'learning_rate': 0.02767276276006414,
 'min_samples_split': 4,
 'min_samples_leaf': 9,
 'subsample': 0.6383037201527186,
 'max_features': 0.5249527477151283}

In [14]:
best_score

0.1492079086382534

In [15]:
best_pipe

In [16]:
predictions

array([10.39885047, 10.40218555,  4.09980428, ..., 10.81485791,
        9.00976425, 11.73848659])

In [18]:
study_gb

<optuna.study.study.Study at 0x24102e1d710>

In [19]:
#Reading Test dataset
test_data= pd.read_csv(r"D:\Intellipaat\kegal datasets\april -s4e4\test.csv")
test_data

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight
0,90615,M,0.645,0.475,0.155,1.2380,0.6185,0.3125,0.3005
1,90616,M,0.580,0.460,0.160,0.9830,0.4785,0.2195,0.2750
2,90617,M,0.560,0.420,0.140,0.8395,0.3525,0.1845,0.2405
3,90618,M,0.570,0.490,0.145,0.8740,0.3525,0.1865,0.2350
4,90619,I,0.415,0.325,0.110,0.3580,0.1575,0.0670,0.1050
...,...,...,...,...,...,...,...,...,...
60406,151021,I,0.345,0.260,0.085,0.1775,0.0735,0.0265,0.0500
60407,151022,F,0.525,0.410,0.145,0.8445,0.3885,0.1670,0.2050
60408,151023,I,0.590,0.440,0.155,1.1220,0.3930,0.2000,0.2650
60409,151024,F,0.660,0.525,0.190,1.4935,0.5885,0.3575,0.4350


In [20]:
#droping id column in the test dataset
test = test_data.drop(columns=['id'])
test

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight
0,M,0.645,0.475,0.155,1.2380,0.6185,0.3125,0.3005
1,M,0.580,0.460,0.160,0.9830,0.4785,0.2195,0.2750
2,M,0.560,0.420,0.140,0.8395,0.3525,0.1845,0.2405
3,M,0.570,0.490,0.145,0.8740,0.3525,0.1865,0.2350
4,I,0.415,0.325,0.110,0.3580,0.1575,0.0670,0.1050
...,...,...,...,...,...,...,...,...
60406,I,0.345,0.260,0.085,0.1775,0.0735,0.0265,0.0500
60407,F,0.525,0.410,0.145,0.8445,0.3885,0.1670,0.2050
60408,I,0.590,0.440,0.155,1.1220,0.3930,0.2000,0.2650
60409,F,0.660,0.525,0.190,1.4935,0.5885,0.3575,0.4350


In [21]:
#encoding
test['Sex'] = encoder.fit_transform(test_data['Sex'].values.reshape(-1,1))

  y = column_or_1d(y, warn=True)


In [22]:
test

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight
0,2,0.645,0.475,0.155,1.2380,0.6185,0.3125,0.3005
1,2,0.580,0.460,0.160,0.9830,0.4785,0.2195,0.2750
2,2,0.560,0.420,0.140,0.8395,0.3525,0.1845,0.2405
3,2,0.570,0.490,0.145,0.8740,0.3525,0.1865,0.2350
4,1,0.415,0.325,0.110,0.3580,0.1575,0.0670,0.1050
...,...,...,...,...,...,...,...,...
60406,1,0.345,0.260,0.085,0.1775,0.0735,0.0265,0.0500
60407,0,0.525,0.410,0.145,0.8445,0.3885,0.1670,0.2050
60408,1,0.590,0.440,0.155,1.1220,0.3930,0.2000,0.2650
60409,0,0.660,0.525,0.190,1.4935,0.5885,0.3575,0.4350


In [23]:
predictions = best_pipe.predict(test)
predictions

array([ 9.85914095,  9.75436043, 10.04477575, ..., 12.61049199,
       13.37920452,  9.01246072])

In [24]:
submission_df = pd.DataFrame({'id': test_data['id'], 'Rings' : predictions})
submission_df

Unnamed: 0,id,Rings
0,90615,9.859141
1,90616,9.754360
2,90617,10.044776
3,90618,10.498776
4,90619,7.595366
...,...,...
60406,151021,6.271668
60407,151022,9.543325
60408,151023,12.610492
60409,151024,13.379205


In [25]:
submission_df.to_csv('submission_gradian_boosting.csv', index=False)