# Regression using the Abalone Dataset

This notebook provides and analysis and results for [this](https://www.kaggle.com/competitions/playground-series-s4e4/overview) Kaggle competition.

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import scipy.stats as stats
import seaborn as sns

In [2]:
# Loading the data
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

## Data cleanup

This dataset has 10 features and the goal is to predict the `Rings` number. The `id` column is not useful so we will drop it in our training datasets as it maps directly to the index.

In [3]:
train_df.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


There are no missing features in this dataset so the cleanup is relatively simple. All we have left to do is label the sex of the abalone has an integer.

In [4]:
train_df.isnull().sum()

id                0
Sex               0
Length            0
Diameter          0
Height            0
Whole weight      0
Whole weight.1    0
Whole weight.2    0
Shell weight      0
Rings             0
dtype: int64

In [5]:
def cleanup_df(df):
    sex_num = df["Sex"].map({"F": 1, "M": -1, "I": 0})
    sex_num = sex_num.rename("Sex_num")
    return pd.concat([df.drop(["Sex", "id"], axis=1), sex_num], axis=1)


train_clean_df = cleanup_df(train_df)
test_clean_df = cleanup_df(test_df)

In [6]:
# Function to convert the predicted array into a submission
def package_result(df, y_out):
    df_out = pd.Series(y_out)
    df_out.index = df["id"]
    df_out.index.name = "id"
    df_out = df_out.rename("Rings")
    return df_out

## Prediction

In [7]:
from sklearn.metrics import root_mean_squared_log_error, make_scorer
from sklearn.model_selection import train_test_split, ValidationCurveDisplay, validation_curve

rmsle_scorer = make_scorer(root_mean_squared_log_error, greater_is_better=False)

X_train = train_clean_df.drop("Rings", axis=1)
y_train = train_clean_df["Rings"]

X_test = test_clean_df

X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(X_train, y_train, train_size=0.3, random_state=42)

We start with a naive Logistic Regression 

In [8]:
from sklearn.linear_model import LogisticRegression

model_lr = LogisticRegression(C=100, max_iter=2_000)
model_lr.fit(X_train_1, y_train_1)
y_predict_2 = model_lr.predict(X_train_2)

print(root_mean_squared_log_error(y_train_2, y_predict_2))

0.1760267009568668


As expected the Random Forest Regressor works better

In [9]:
from sklearn.ensemble import RandomForestRegressor

model_rfr = RandomForestRegressor(n_jobs=-1)
model_rfr.fit(X_train_1, y_train_1)
y_predict_2 = model_rfr.predict(X_train_2)

print(root_mean_squared_log_error(y_train_2, y_predict_2))

0.15630470004149802


In [10]:
from xgboost import XGBRegressor

model_xgb = XGBRegressor(learning_rate=0.1, n_estimators=1000, early_stopping_rounds=5, eval_metric="rmsle")

model_xgb.fit(X_train_1, y_train_1, eval_set=[(X_train_2, y_train_2)])

y_predict_xgb = model_xgb.predict(X_train_2)

print(root_mean_squared_log_error(y_train_2, y_predict_xgb))

[0]	validation_0-rmsle:0.27054
[1]	validation_0-rmsle:0.25440
[2]	validation_0-rmsle:0.24028
[3]	validation_0-rmsle:0.22803
[4]	validation_0-rmsle:0.21737
[5]	validation_0-rmsle:0.20810
[6]	validation_0-rmsle:0.19988
[7]	validation_0-rmsle:0.19289
[8]	validation_0-rmsle:0.18695
[9]	validation_0-rmsle:0.18179
[10]	validation_0-rmsle:0.17741
[11]	validation_0-rmsle:0.17370
[12]	validation_0-rmsle:0.17051
[13]	validation_0-rmsle:0.16784
[14]	validation_0-rmsle:0.16546
[15]	validation_0-rmsle:0.16359
[16]	validation_0-rmsle:0.16199
[17]	validation_0-rmsle:0.16056
[18]	validation_0-rmsle:0.15940
[19]	validation_0-rmsle:0.15846
[20]	validation_0-rmsle:0.15763
[21]	validation_0-rmsle:0.15696
[22]	validation_0-rmsle:0.15634
[23]	validation_0-rmsle:0.15587
[24]	validation_0-rmsle:0.15543
[25]	validation_0-rmsle:0.15508
[26]	validation_0-rmsle:0.15479
[27]	validation_0-rmsle:0.15452
[28]	validation_0-rmsle:0.15425
[29]	validation_0-rmsle:0.15401
[30]	validation_0-rmsle:0.15385
[31]	validation_0-

In [22]:
# y_predict_xgb_out = model_xgb.predict(X_test)
# y_predict_xgb_out_df = package_result(test_df, y_predict_xgb_out)
# y_predict_xgb_out_df.round(0).astype("int").to_csv("submissions/xgb_naive_guess.csv")

Now what about with some scaling? Would that help the performance?

In [37]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

columns_to_rescale = X_train.select_dtypes("float").columns

preprocessor = ColumnTransformer(
    transformers=[("scale", StandardScaler(), columns_to_rescale)],
    remainder="passthrough",
    verbose_feature_names_out=False,
)
preprocessor.set_output(transform="pandas")

In [38]:
X_train_prep = preprocessor.fit_transform(X_train)

X_train_prep_1, X_train_prep_2, y_train_prep_1, y_train_prep_2 = train_test_split(
    X_train_prep, y_train, train_size=0.3, random_state=42
)

In [39]:
model_xgb_scaled = XGBRegressor(learning_rate=0.1, n_estimators=1000, early_stopping_rounds=5, eval_metric="rmsle")

model_xgb_scaled.fit(X_train_prep_1, y_train_prep_1, eval_set=[(X_train_prep_2, y_train_prep_2)])

y_predict_xgb_scaled = model_xgb_scaled.predict(X_train_prep_2)

print(root_mean_squared_log_error(y_train_prep_2, y_predict_xgb_scaled))

[0]	validation_0-rmsle:0.27054
[1]	validation_0-rmsle:0.25440
[2]	validation_0-rmsle:0.24028
[3]	validation_0-rmsle:0.22803
[4]	validation_0-rmsle:0.21737
[5]	validation_0-rmsle:0.20810
[6]	validation_0-rmsle:0.19988
[7]	validation_0-rmsle:0.19289
[8]	validation_0-rmsle:0.18695
[9]	validation_0-rmsle:0.18179
[10]	validation_0-rmsle:0.17741
[11]	validation_0-rmsle:0.17370
[12]	validation_0-rmsle:0.17051


[13]	validation_0-rmsle:0.16784
[14]	validation_0-rmsle:0.16546
[15]	validation_0-rmsle:0.16359
[16]	validation_0-rmsle:0.16199
[17]	validation_0-rmsle:0.16056
[18]	validation_0-rmsle:0.15940
[19]	validation_0-rmsle:0.15846
[20]	validation_0-rmsle:0.15763
[21]	validation_0-rmsle:0.15696
[22]	validation_0-rmsle:0.15634
[23]	validation_0-rmsle:0.15587
[24]	validation_0-rmsle:0.15543
[25]	validation_0-rmsle:0.15508
[26]	validation_0-rmsle:0.15479
[27]	validation_0-rmsle:0.15452
[28]	validation_0-rmsle:0.15425
[29]	validation_0-rmsle:0.15401
[30]	validation_0-rmsle:0.15385
[31]	validation_0-rmsle:0.15369
[32]	validation_0-rmsle:0.15358
[33]	validation_0-rmsle:0.15349
[34]	validation_0-rmsle:0.15339
[35]	validation_0-rmsle:0.15332
[36]	validation_0-rmsle:0.15319
[37]	validation_0-rmsle:0.15316
[38]	validation_0-rmsle:0.15308
[39]	validation_0-rmsle:0.15301
[40]	validation_0-rmsle:0.15296
[41]	validation_0-rmsle:0.15292
[42]	validation_0-rmsle:0.15289
[43]	validation_0-rmsle:0.15285
[44]	val

In [40]:
print(
    root_mean_squared_log_error(y_train_prep_2, y_predict_xgb_scaled)
    - root_mean_squared_log_error(y_train_2, y_predict_xgb)
)

0.0


The scaling seems to have had absolutely no effect!

Now let's try to treat this as a classification issue!

In [54]:
from sklearn.preprocessing import LabelEncoder

label_encoding = LabelEncoder()
label_encoding.fit(y_train)

y_train_1_p = label_encoding.transform(y_train_1)
y_train_2_p = label_encoding.transform(y_train_2)

from xgboost import XGBClassifier

model_xgb_classifier = XGBClassifier(
    objective="multi:softprob",
    num_class=len(y_train.unique()),
    learning_rate=0.1,
    subsample=0.3,
    n_estimators=1000,
    early_stopping_rounds=5,
)

model_xgb_classifier.fit(X_train_1, y_train_1_p, eval_set=[(X_train_2, y_train_2_p)])

y_predict_xgb_class = model_xgb_classifier.predict(X_train_2)

[0]	validation_0-mlogloss:3.05152
[1]	validation_0-mlogloss:2.86919
[2]	validation_0-mlogloss:2.73506
[3]	validation_0-mlogloss:2.62672
[4]	validation_0-mlogloss:2.53646
[5]	validation_0-mlogloss:2.46060
[6]	validation_0-mlogloss:2.39561
[7]	validation_0-mlogloss:2.33790
[8]	validation_0-mlogloss:2.28729
[9]	validation_0-mlogloss:2.24188
[10]	validation_0-mlogloss:2.20165
[11]	validation_0-mlogloss:2.16450
[12]	validation_0-mlogloss:2.13149
[13]	validation_0-mlogloss:2.10073
[14]	validation_0-mlogloss:2.07419
[15]	validation_0-mlogloss:2.04932
[16]	validation_0-mlogloss:2.02638
[17]	validation_0-mlogloss:2.00535
[18]	validation_0-mlogloss:1.98625
[19]	validation_0-mlogloss:1.96864
[20]	validation_0-mlogloss:1.95241
[21]	validation_0-mlogloss:1.93768
[22]	validation_0-mlogloss:1.92411
[23]	validation_0-mlogloss:1.91163
[24]	validation_0-mlogloss:1.89987
[25]	validation_0-mlogloss:1.88888
[26]	validation_0-mlogloss:1.87843
[27]	validation_0-mlogloss:1.86909
[28]	validation_0-mlogloss:1.8

In [55]:
print(root_mean_squared_log_error(y_train_2_p, y_predict_xgb_class))

0.190965149234135


The classification scheme is also not much more accurate!