## Imports

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

## Load Data

In [2]:
crab_data = pd.read_csv("../Exploratory_Data_Analysis/CrabAgePrediction.csv")

In [3]:
crab_data

Unnamed: 0,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,F,1.4375,1.1750,0.4125,24.635715,12.332033,5.584852,6.747181,9
1,M,0.8875,0.6500,0.2125,5.400580,2.296310,1.374951,1.559222,6
2,I,1.0375,0.7750,0.2500,7.952035,3.231843,1.601747,2.764076,6
3,F,1.1750,0.8875,0.2500,13.480187,4.748541,2.282135,5.244657,10
4,I,0.8875,0.6625,0.2125,6.903103,3.458639,1.488349,1.700970,6
...,...,...,...,...,...,...,...,...,...
3888,F,1.4625,1.1375,0.3250,24.819987,11.651644,5.854172,6.378637,8
3889,F,1.5500,1.2125,0.4375,34.458817,15.450477,7.172423,9.780577,10
3890,I,0.6250,0.4625,0.1625,2.012815,0.765436,0.524466,0.637864,5
3891,I,1.0625,0.7750,0.2625,10.347568,4.507570,2.338834,2.976698,6


In [4]:
crab_data.corr()

Unnamed: 0,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
Length,1.0,0.986653,0.823081,0.925374,0.898181,0.903253,0.897736,0.554973
Diameter,0.986653,1.0,0.829532,0.92577,0.893626,0.89981,0.905561,0.573844
Height,0.823081,0.829532,1.0,0.814405,0.770961,0.793272,0.81229,0.551956
Weight,0.925374,0.92577,0.814405,1.0,0.969077,0.965583,0.955269,0.538819
Shucked Weight,0.898181,0.893626,0.770961,0.969077,1.0,0.93128,0.882406,0.41876
Viscera Weight,0.903253,0.89981,0.793272,0.965583,0.93128,1.0,0.906105,0.501328
Shell Weight,0.897736,0.905561,0.81229,0.955269,0.882406,0.906105,1.0,0.625195
Age,0.554973,0.573844,0.551956,0.538819,0.41876,0.501328,0.625195,1.0


## Simple Version

Here we fit a linear regression model to one-hot encoding representing quantile bins in what is probably the most predictive feature (the feature with the highest pearson correlation), shell weight

In [5]:
def df_with_one_hot_quantile_bins_and_bin_ranges(df, column, num_bins):
    return_df = df.copy()
    bins, values = pd.qcut(return_df[column], num_bins, range(num_bins), retbins=True)
    ohe = OneHotEncoder(sparse=False)
    array = ohe.fit_transform(bins.to_numpy().reshape(-1, 1)).astype("int")
    for i, row in enumerate(array.T):
        column_name = column.replace(" ", "_") + "_" + str(i)
        return_df[column_name] = row
    return {"df": return_df, "bins": bins}

In [6]:
shell_weight_result = df_with_one_hot_quantile_bins_and_bin_ranges(crab_data, "Shell Weight", 10)
crab_data_1, shell_weight_bins = [shell_weight_result[_] for _ in ["df", "bins"]]

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
crab_data_1_train, crab_data_1_test = train_test_split(crab_data_1, train_size=0.85, random_state=1)

In [9]:
from sklearn.linear_model import LinearRegression

In [10]:
lr = LinearRegression()

In [11]:
lr.fit(crab_data_1_train[["Shell_Weight_" + str(i) for i in range(10)]], crab_data_1_train["Age"])

LinearRegression()

In [12]:
test_values = lr.predict(crab_data_1_test[["Shell_Weight_" + str(i) for i in range(10)]])
mse = ((test_values - crab_data_1_test["Age"])**2).sum()/len(crab_data_1_test)
mse

6.21363172139207

In [31]:
abs(lr.predict(crab_data_1_test[["Shell_Weight_" + str(i) for i in range(10)]]) - 
    crab_data_1_test["Age"]).sum() / len(crab_data_1_test)

1.8277370505136987

### Use 30 bins instead

In [13]:
shell_weight_result = df_with_one_hot_quantile_bins_and_bin_ranges(crab_data, "Shell Weight", 30)
crab_data_2, shell_weight_bins_2 = [shell_weight_result[_] for _ in ["df", "bins"]]
crab_data_2_train, crab_data_2_test = train_test_split(crab_data_2, train_size=0.85, random_state=1)

lr_2 = LinearRegression()
lr_2.fit(crab_data_2_train[["Shell_Weight_" + str(i) for i in range(30)]], crab_data_2_train["Age"])

test_values_2 = lr_2.predict(crab_data_2_test[["Shell_Weight_" + str(i) for i in range(30)]])
mse_2 = ((test_values_2 - crab_data_2_test["Age"])**2).sum()/len(crab_data_2_test)
mse_2

5.9515499546103285

In [33]:
abs(lr_2.predict(crab_data_2_test[["Shell_Weight_" + str(i) for i in range(30)]]) - 
    crab_data_2_test["Age"]).sum() / len(crab_data_2_test)

1.772517792166096

### Use 90 bins instead

In [14]:
shell_weight_result = df_with_one_hot_quantile_bins_and_bin_ranges(crab_data, "Shell Weight", 90)
crab_data_3, shell_weight_bins_3 = [shell_weight_result[_] for _ in ["df", "bins"]]
crab_data_3_train, crab_data_3_test = train_test_split(crab_data_3, train_size=0.85, random_state=1)

lr_3 = LinearRegression()
lr_3.fit(crab_data_3_train[["Shell_Weight_" + str(i) for i in range(90)]], crab_data_3_train["Age"])

test_values_3 = lr_3.predict(crab_data_3_test[["Shell_Weight_" + str(i) for i in range(90)]])
mse_3 = ((test_values_3 - crab_data_3_test["Age"])**2).sum()/len(crab_data_3_test)
mse_3

6.010734247834715

In [34]:
abs(lr_3.predict(crab_data_3_test[["Shell_Weight_" + str(i) for i in range(90)]]) - 
    crab_data_3_test["Age"]).sum() / len(crab_data_3_test)

1.7740328017979452

## More Complex Version

Here we combine one-hot encoding on sex with 10 quantile bins for shell weight and 3 quantile bins for height to make 90 one hot variables which we use to fit a linear regression model

In [15]:
crab_data_4 = crab_data_1.copy()

ohe_4 = OneHotEncoder(sparse=False)
sex_one_hot = ohe_4.fit_transform(crab_data["Sex"].to_numpy().reshape(-1, 1))

for column_name, values in zip(ohe_4.categories_[0], sex_one_hot.T):
    crab_data_4[column_name] = values.astype("int")

In [16]:
shell_weight_result = df_with_one_hot_quantile_bins_and_bin_ranges(crab_data_4, "Height", 3)
crab_data_5, shell_height_bins = [shell_weight_result[_] for _ in ["df", "bins"]]

In [18]:
counter = 0
for sex in ["M", "F", "I"]:
    for i in range(10):
        for j in range(3):
            column_name = "bin_" + str(counter)
            values = crab_data_5[sex] * crab_data_5["Shell_Weight_" + str(i)] * crab_data_5["Height_" + str(j)]
            crab_data_5[column_name] = values
            counter += 1

In [20]:
crab_data_5_train, crab_data_5_test = train_test_split(crab_data_5, train_size=0.85, random_state=1)

In [21]:
lr_5 = LinearRegression()

In [23]:
columns = (["F", "M", "I"] + ["Shell_Weight_" + str(i) for i in range(10)] + 
                          ["Height_" + str(i) for i in range(3)] + ["bin_" + str(i) for i in range(30)])

lr_5.fit(crab_data_5_train[columns], crab_data_5_train["Age"])

LinearRegression()

In [28]:
mse_5 = ((lr_5.predict(crab_data_5_test[columns]) - crab_data_5_test["Age"])**2).sum() / len(crab_data_5_test)
mse_5

6.0497937137133455

In [29]:
abs(lr_5.predict(crab_data_5_test[columns]) - crab_data_5_test["Age"]).sum() / len(crab_data_5_test)

1.7864271190068493

## Compare with Regression with Raw Features + One-Hot Encoded Sex

In [35]:
crab_data_6 = crab_data.copy()

ohe_6 = OneHotEncoder(sparse=False)
sex_one_hot = ohe_6.fit_transform(crab_data["Sex"].to_numpy().reshape(-1, 1))

for column_name, values in zip(ohe_6.categories_[0], sex_one_hot.T):
    crab_data_6[column_name] = values.astype("int")

In [37]:
crab_data_6_train, crab_data_6_test = train_test_split(crab_data_6, train_size=0.85, random_state=1)

In [42]:
lr_6 = LinearRegression()
training_columns = [c for c in crab_data_6.columns if c not in ["Sex", "Age"]]
lr_6.fit(crab_data_6_train[training_columns], crab_data_6_train["Age"])
predict_6 = lr_6.predict(crab_data_6_test[training_columns])
mse_6 = ((crab_data_6_test["Age"] - predict_6)**2).sum() / len(crab_data_6_test)
abs_6 = abs((crab_data_6_test["Age"] - predict_6)).sum() / len(crab_data_6_test)
mse_6, abs_6

(4.67141577587172, 1.5701804913947337)

## Compare with Regression with Raw Features + One-Hot Encoded Sex + interactions and 2nd degree features

In [44]:
crab_data_7 = crab_data_6.copy()
new_features = []
for i, feature in enumerate(training_columns):
    for feature_2 in training_columns[i:]:
        new_feature_name = feature + "_" + feature_2
        new_features.append(new_feature_name)
        crab_data_7[new_feature_name] = crab_data_7[feature] * crab_data_7[feature_2]
        
crab_data_7_train, crab_data_7_test = train_test_split(crab_data_7, train_size=0.85, random_state=1)

lr_7 = LinearRegression()
lr_7.fit(crab_data_7_train[training_columns + new_features], crab_data_7_train["Age"])
predict_7 = lr_7.predict(crab_data_7_test[training_columns + new_features])
mse_7 = ((crab_data_7_test["Age"] - predict_7)**2).sum() / len(crab_data_7_test)
abs_7 = abs((crab_data_7_test["Age"] - predict_7)).sum() / len(crab_data_7_test)
mse_7, abs_7

(4.42707691784464, 1.51518274722538)