# Feature Creation

## Imports and Reading in Data

In [79]:
import numpy as np
import pandas as pd 

# Read the reduced training set and the test set; the `id` column of each
# file becomes the DataFrame index.
train = pd.read_csv(filepath_or_buffer='data/reduced_new_train.csv', index_col='id')
test = pd.read_csv(filepath_or_buffer='data/test.csv', index_col='id')

In [72]:
# Show the first five training rows as a quick sanity check of the columns.
train.head(n=5)

Unnamed: 0_level_0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


In [73]:
# Approximate each abalone's footprint as an ellipse (oval).
# An ellipse's area is pi * a * b where a and b are the SEMI-axes, so with
# Length and Diameter taken as the full axes the area is
# pi * (Length / 2) * (Diameter / 2) = pi * Length * Diameter / 4.
# (The previous formula omitted the 1/4 factor and was 4x too large.)
# NOTE(review): assumes Length and Diameter are full extents, not radii - confirm.
train['Area'] = train['Length'] * train['Diameter'] * np.pi / 4
test['Area'] = test['Length'] * test['Diameter'] * np.pi / 4

In [74]:
# Shell weight divided by the 'Area' feature, computed the same way for both
# frames. Requires the 'Area' column to exist already.
train['Shell weight per mm^2'] = train['Shell weight'].div(train['Area'])
test['Shell weight per mm^2'] = test['Shell weight'].div(test['Area'])

In [75]:
# Weight-fraction features: each ratio divides one weight column by the
# whole weight of the same row.
#   'Meat ratio'  = 'Whole weight.1' / 'Whole weight'
#   'Shell ratio' = 'Shell weight'   / 'Whole weight'

# Training frame
train['Meat ratio'] = train['Whole weight.1'].div(train['Whole weight'])
train['Shell ratio'] = train['Shell weight'].div(train['Whole weight'])

# Test frame
test['Meat ratio'] = test['Whole weight.1'].div(test['Whole weight'])
test['Shell ratio'] = test['Shell weight'].div(test['Whole weight'])

In [76]:
# Binary flag (stored as 0/1 integers): 1 when 'Whole weight' is strictly
# greater than 'Whole weight.1' + 'Whole weight.2' + 'Shell weight'.
# The three weights are subtracted one after another, left to right.
# NOTE(review): the column is called an "error", yet it is 1 when the whole
# weight EXCEEDS the other three weights - confirm whether `< 0` was intended.
# The 'Weigth' spelling is the actual column name and is kept as-is.
train['Weigth error'] = (
    train['Whole weight']
    .sub(train['Whole weight.1'])
    .sub(train['Whole weight.2'])
    .sub(train['Shell weight'])
    .gt(0)
    .astype(int)
)
test['Weigth error'] = (
    test['Whole weight']
    .sub(test['Whole weight.1'])
    .sub(test['Whole weight.2'])
    .sub(test['Shell weight'])
    .gt(0)
    .astype(int)
)

In [80]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

# Define RMSLE as it's not a built-in metric in sklearn
def rmsle(y_true, y_pred):
    """
    Compute the Root Mean Squared Logarithmic Error (RMSLE).

    RMSLE = sqrt(mean((log(1 + y_pred) - log(1 + y_true)) ** 2))

    Parameters
    ----------
    y_true : array-like
        Ground-truth values (list, ndarray, pandas Series, ...).
    y_pred : array-like
        Predicted values, paired with ``y_true`` by position.

    Returns
    -------
    float
        The RMSLE of the two inputs.
    """
    # Convert both inputs to plain ndarrays so values are paired by POSITION.
    # Subtracting two pandas Series aligns them by index label instead, which
    # yields NaN for every non-shared label when the indexes differ.
    true_values = np.asarray(y_true, dtype=float)
    predicted_values = np.asarray(y_pred, dtype=float)

    # log1p(x) is log(1 + x); it is well defined at x == 0.
    log_difference = np.log1p(predicted_values) - np.log1p(true_values)
    return float(np.sqrt(np.mean(np.square(log_difference))))


# Target vector: the 'Rings' column.
y = train['Rings']
# Feature matrix: every other column, with non-numeric columns one-hot encoded.
# NOTE(review): X and y are not used by the code visible in this section -
# confirm whether a later cell needs them.
X = pd.get_dummies(train.drop(columns='Rings'))

from autogluon.tabular import TabularPredictor

# Hold out 20% of the labelled rows (fixed seed) for scoring.
train_data, test_data = train_test_split(train, test_size=0.2, random_state=42)

# Fit AutoGluon on the remaining rows, predicting 'Rings'.
# NOTE(review): no time limit is passed here and the recorded run of this cell
# ended in a KeyboardInterrupt - consider bounding the fit time.
predictor = TabularPredictor(label='Rings').fit(
    train_data,
    presets='best_quality',
)

# Separate the hold-out frame into features and target, then predict.
test_data_nolabel = test_data.drop(columns=['Rings'])
y_true = test_data['Rings']
y_pred = predictor.predict(test_data_nolabel)

# Score the hold-out predictions with RMSLE and report the result.
rmsle_score = rmsle(y_true, y_pred.to_numpy())
print(f"RMSLE on test data: {rmsle_score}")



No path specified. Models will be saved in: "AutogluonModels/ag-20240408_204153"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 3600 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: AutogluonModels/ag-20240408_204153/ds_sub_fit/sub_fit_ho.
Running the sub-fit in a ray process to avoid memory leakage.


KeyboardInterrupt: 

In [78]:
# Persist both frames, including the engineered columns, as CSV.
# Floating-point values are written with six digits after the decimal point.
train.to_csv(path_or_buf='data/feature_reduced_new_train.csv', float_format='%.6f')
test.to_csv(path_or_buf='data/feature_test.csv', float_format='%.6f')