In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import optuna
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import TransformedTargetRegressor

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e4/sample_submission.csv
/kaggle/input/playground-series-s4e4/train.csv
/kaggle/input/playground-series-s4e4/test.csv


In [2]:
def rmsle_score(y_true, y_pred):
    
    msle = mean_squared_log_error(y_true, y_pred)
    rmsle = np.sqrt(msle)
    return rmsle

In [3]:
def add_feature(data):

    data['Volume'] = data['Length'] * data['Diameter'] * data['Height']
    return data

In [4]:
test = pd.read_csv('/kaggle/input/playground-series-s4e4/test.csv')
train = pd.read_csv('/kaggle/input/playground-series-s4e4/train.csv')

In [5]:
print('Train dataset shape: \n', train.shape)
print('Test dataset shape: \n', test.shape)

Train dataset shape: 
 (90615, 10)
Test dataset shape: 
 (60411, 9)


In [6]:
print('Train dataset columns: \n', train.columns)

Train dataset columns: 
 Index(['id', 'Sex', 'Length', 'Diameter', 'Height', 'Whole weight',
       'Whole weight.1', 'Whole weight.2', 'Shell weight', 'Rings'],
      dtype='object')


In [7]:
print('Test dataset columns: \n', test.columns)

Test dataset columns: 
 Index(['id', 'Sex', 'Length', 'Diameter', 'Height', 'Whole weight',
       'Whole weight.1', 'Whole weight.2', 'Shell weight'],
      dtype='object')


In [8]:
train.head(10)

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9
5,5,F,0.61,0.48,0.17,1.201,0.5335,0.3135,0.3085,10
6,6,M,0.415,0.325,0.11,0.3315,0.1655,0.0715,0.13,9
7,7,F,0.61,0.49,0.15,1.1165,0.4955,0.2945,0.295,9
8,8,I,0.205,0.15,0.04,0.046,0.0145,0.0105,0.01,4
9,9,I,0.565,0.425,0.125,0.651,0.3795,0.142,0.18,8


In [9]:
test.head(10)

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight
0,90615,M,0.645,0.475,0.155,1.238,0.6185,0.3125,0.3005
1,90616,M,0.58,0.46,0.16,0.983,0.4785,0.2195,0.275
2,90617,M,0.56,0.42,0.14,0.8395,0.3525,0.1845,0.2405
3,90618,M,0.57,0.49,0.145,0.874,0.3525,0.1865,0.235
4,90619,I,0.415,0.325,0.11,0.358,0.1575,0.067,0.105
5,90620,M,0.56,0.425,0.14,0.8105,0.3525,0.1915,0.215
6,90621,M,0.635,0.49,0.17,1.1835,0.4605,0.2445,0.355
7,90622,I,0.34,0.25,0.075,0.1675,0.075,0.033,0.048
8,90623,I,0.485,0.37,0.11,0.536,0.2565,0.098,0.149
9,90624,F,0.64,0.5,0.195,1.338,0.647,0.3175,0.3965


In [10]:
# train = add_feature(train)
# test = add_feature(test)

In [11]:
# train['Volume']

In [12]:
scaler = StandardScaler()

column_to_keep_unchanged_train = ['Sex', 'Rings']
columns_to_scale = [col for col in train.columns if col not in column_to_keep_unchanged_train]

scaled_features = scaler.fit_transform(train[columns_to_scale])

scaled_train = pd.DataFrame(scaled_features, columns=columns_to_scale)

for col in column_to_keep_unchanged_train:
    scaled_train[col] = train[col]

In [13]:
column_to_keep_unchanged_test = 'Sex'
columns_to_scale = [col for col in test.columns if col != column_to_keep_unchanged_test]

scaled_features = scaler.transform(test[columns_to_scale])

scaled_test = pd.DataFrame(scaled_features, columns=columns_to_scale)

scaled_test[column_to_keep_unchanged_test] = test[column_to_keep_unchanged_test]

In [14]:
scaled_train = pd.get_dummies(scaled_train, columns=['Sex'])
scaled_test = pd.get_dummies(scaled_test, columns=['Sex'])

In [15]:
X_train = scaled_train.drop(['Rings', 'id'], axis=1)
y_train = scaled_train['Rings']

X_test = scaled_test.drop('id', axis=1)

In [16]:
X_train

Unnamed: 0,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Sex_F,Sex_I,Sex_M
0,0.278317,0.288912,0.382451,-0.038314,-0.060061,-0.227155,0.108309,True,False,False
1,0.955044,0.900996,0.250897,0.745005,0.573416,1.061143,0.722736,True,False,False
2,-3.020727,-2.975535,-2.906386,-1.678148,-1.640084,-1.649238,-1.696570,False,True,False
3,0.658976,0.747975,0.382451,0.274140,0.169850,0.357534,0.185113,False,False,True
4,0.320613,0.237905,-0.143763,-0.015371,0.140499,-0.093370,-0.218105,False,True,False
...,...,...,...,...,...,...,...,...,...,...
90610,-1.540387,-1.700360,-1.590852,-1.377712,-1.331906,-1.312298,-1.389356,False,False,True
90611,0.320613,0.237905,0.382451,0.196573,0.223658,0.119694,0.108309,False,False,True
90612,-0.694478,-0.731228,-1.064638,-1.021559,-0.928340,-0.901034,-1.109024,False,True,False
90613,-1.455796,-1.343312,-1.590852,-1.287035,-1.187601,-1.193379,-1.197348,False,True,False


In [17]:
cb = CatBoostRegressor(learning_rate=0.1, iterations=1000, verbose=0)
xgb = XGBRegressor()
lgbm = LGBMRegressor()

models = [('cb', cb), ('xgb', xgb), ('lgbm', lgbm)]

ensemble = VotingRegressor(models)

In [18]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

model = TransformedTargetRegressor(
        regressor=ensemble,
        func=np.log1p,
        inverse_func=np.expm1
    )

rmsle_list = []
models = []
for train_index, val_index in kf.split(X_train, y_train):

    X_train_fold, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

    model.fit(X_train_fold, y_train_fold)
    y_pred_val = model.predict(X_val)
    models.append(model)
    rmsle = rmsle_score(y_val, y_pred_val)
    rmsle_list.append(rmsle)
    score = np.mean(rmsle_list)

y_preds = []
for model in models:
    y_pred = model.predict(X_test)
    y_preds.append(y_pred)
y_preds = np.array(y_preds)
y_preds_avg = np.mean(y_preds, axis=0)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013777 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1335
[LightGBM] [Info] Number of data points in the train set: 72492, number of used features: 10
[LightGBM] [Info] Start training from score 2.328973
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010818 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1333
[LightGBM] [Info] Number of data points in the train set: 72492, number of used features: 10
[LightGBM] [Info] Start training from score 2.328978
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010913 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1333
[LightGBM] [Info] Number of data points in the train set: 72492, number of used features: 10
[LightGBM] [Info] Start tra

In [19]:
print('The mean RMSLE is: ', score)

The mean RMSLE is:  0.14881100682991472


In [20]:
submission = pd.DataFrame({'id': test['id'], 'Rings': y_preds_avg})

submission

Unnamed: 0,id,Rings
0,90615,9.668948
1,90616,9.767095
2,90617,10.065735
3,90618,10.607554
4,90619,7.524032
...,...,...
60406,151021,6.237793
60407,151022,9.226562
60408,151023,12.189552
60409,151024,12.895833


In [21]:
submission.to_csv('submission.csv', index=False)