In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [2]:
train_pd = pd.read_csv('data/train.csv')
test_pd = pd.read_csv('data/test.csv')

In [4]:
X_train = np.load('data/X_train.npy')
y_train = np.load('data/y_train.npy')
X_val = np.load('data/X_val.npy')
y_val = np.load('data/y_val.npy')
X_test = np.load('data/X_test.npy')

In [9]:
X_train.shape

(72492, 10)

In [5]:
X_train

array([[0.65 , 0.505, 0.175, ..., 0.   , 0.   , 1.   ],
       [0.67 , 0.52 , 0.195, ..., 1.   , 0.   , 0.   ],
       [0.605, 0.495, 0.15 , ..., 0.   , 0.   , 1.   ],
       ...,
       [0.53 , 0.42 , 0.125, ..., 0.   , 1.   , 0.   ],
       [0.695, 0.525, 0.185, ..., 0.   , 0.   , 1.   ],
       [0.37 , 0.275, 0.095, ..., 0.   , 1.   , 0.   ]])

In [6]:
standard_scaler = StandardScaler()

In [12]:
standard_scaler.fit(X_train[:,:7])

In [13]:
X_train[:,:7] = standard_scaler.transform(X_train[:,:7])

In [14]:
X_val[:,:7] = standard_scaler.transform(X_val[:,:7])
X_test[:,:7] = standard_scaler.transform(X_test[:,:7])

In [16]:
np.save('data/scaled/X_train.npy', X_train)
np.save('data/scaled/X_val.npy', X_val)
np.save('data/scaled/X_test.npy', X_test)

In [20]:
def custom_root_mean_squared_log_error(y_true, y_pred):
    return np.sqrt(np.sum(np.square(np.log1p(1 + y_pred) - np.log1p(1 + y_true))) / len(y_true))

In [18]:
best_xgb_model = XGBRegressor(
    n_estimators=300,
    max_depth=None,
    learning_rate=0.1,
    min_samples_leaf=2,
    min_samples_split=5
)

In [19]:
best_xgb_model.fit(X_train, y_train)

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.



In [21]:
y_pred = best_xgb_model.predict(X_val)
y_pred = np.array(y_pred, dtype=np.int32)
custom_root_mean_squared_log_error(y_val, y_pred)

0.14566423870566664

In [22]:
y_pred = best_xgb_model.predict(X_test)
y_pred = np.array(y_pred, dtype=np.int32)

In [23]:
y_pred

array([ 9,  9, 10, ..., 12, 13,  8])

In [24]:
test_pd = pd.read_csv('data/test.csv')

In [25]:
test_pd.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight
0,90615,M,0.645,0.475,0.155,1.238,0.6185,0.3125,0.3005
1,90616,M,0.58,0.46,0.16,0.983,0.4785,0.2195,0.275
2,90617,M,0.56,0.42,0.14,0.8395,0.3525,0.1845,0.2405
3,90618,M,0.57,0.49,0.145,0.874,0.3525,0.1865,0.235
4,90619,I,0.415,0.325,0.11,0.358,0.1575,0.067,0.105


In [26]:
result = pd.concat([test_pd['id'], pd.DataFrame(y_pred, columns=["Rings"])], axis=1)

In [27]:
result

Unnamed: 0,id,Rings
0,90615,9
1,90616,9
2,90617,10
3,90618,10
4,90619,7
...,...,...
60406,151021,6
60407,151022,9
60408,151023,12
60409,151024,13


In [28]:
result.to_csv('data/scaled/result_with_scale.csv', index=False)