<a id = "1"></a><br>
# Load Python Packages


In [None]:
#basics
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import os
import glob

import warnings
warnings.filterwarnings("ignore")


#preprocessing
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import QuantileTransformer, quantile_transform


#statistics
from scipy import stats
from scipy.stats import skew
from scipy.special import boxcox1p
from scipy.stats import randint

#feature engineering
from sklearn.feature_selection import mutual_info_regression


#transformers and pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn import set_config


#algorithms
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split


#model evaluation
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, make_scorer
import optuna
from optuna.samplers import TPESampler
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_slice


#stacking
from sklearn.ensemble import StackingRegressor




<a id = "2"></a><br>
# First look at the data

In [None]:
# Load the competition files and the original crab-age dataset.
train = pd.read_csv('/kaggle/input/playground-series-s3e16/train.csv', index_col=[0])
test = pd.read_csv('/kaggle/input/playground-series-s3e16/test.csv', index_col=[0])
original = pd.read_csv('/kaggle/input/crab-age-prediction/CrabAgePrediction.csv')
sample_submission = pd.read_csv("/kaggle/input/playground-series-s3e16/sample_submission.csv")

# Independent copies of the raw frames:
#   pipe_*  -> reserved for the modelling pipeline
#   *_df    -> scratch copies for the preliminary analysis
pipe_data, pipe_test, pipe_original = train.copy(), test.copy(), original.copy()
train_df, test_df, original_df = train.copy(), test.copy(), original.copy()

train_df.head()

In [None]:
# Preview the first rows of the analysis copy of the test frame.
test_df.head()

In [None]:
# Preview the first rows of the analysis copy of the original dataset.
original_df.head()

In [None]:
# Name the index of the original dataset 'id'
# (presumably to match the index name of the competition frames — confirm).
original_df.index.names = ['id']
original_df.head()

In [None]:
# Stack the original dataset underneath the competition training rows.
# concat is called without ignore_index, so each frame keeps its own index labels.
# NOTE(review): labels may therefore repeat across the two sources — confirm this is acceptable.
train_df = pd.concat([train_df, original_df])
train_df.head()

In [None]:
# Is there any missing value? One boolean per column: True if the column has at least one null.
train_df.isnull().any()

## Descriptive statistics

In [None]:
# Descriptive statistics of the numerical columns, transposed so that each feature is a row.
train_df.describe().T

## Grouping features for preprocessing purposes

In [None]:
# Number of distinct values per column, sorted ascending.
train_df.nunique().sort_values()

In [None]:
# Just bookkeeping: split the predictor columns (everything except the target "Age")
# into a categorical group and a numerical group.
feature_list = [feature for feature in train_df.columns if feature != "Age"]
categorical_features = ['Sex']
# Filter instead of using a set difference so the numerical columns keep the
# frame's column order (set ordering is arbitrary between runs).
numerical_features = [feature for feature in feature_list if feature not in categorical_features]

# Sanity check: the two groups together must cover exactly the predictor columns.
# sorted() is required here: list.sort() returns None, so comparing the results of
# two .sort() calls is always True and would never catch a mismatch.
assert sorted(feature_list) == sorted(numerical_features + categorical_features)

<a id = "3"></a><br>
# Exploratory Data Analysis

In [None]:
# fig, ax = plt.subplots(3, 3, figsize=(20, 20))
# for var, subplot in zip(numerical_features, ax.flatten()):
#     sns.scatterplot(x=var, y='Age',  data=train_df, ax=subplot, hue = 'Age' )
    

In [None]:
# # Display correlations between features and Age on heatmap.

# sns.set(font_scale=1.1)
# correlation_train = train_df.corr()
# mask = np.triu(correlation_train.corr())
# plt.figure(figsize=(15, 15))
# sns.heatmap(correlation_train,
#             annot=True,
#             fmt='.1f',
#             cmap='coolwarm',
#             square=True,
#             mask=mask,
#             linewidths=1,
#             cbar=False);

In [None]:
# Target column, passed as `y` to the mutual-information calls that follow.
y= train_df['Age']

In [None]:
# Score each numerical feature by its mutual information with the target
# and show the scores from highest to lowest.
mutual_df = train_df[numerical_features]

mutual_info = pd.Series(
    mutual_info_regression(mutual_df, y, random_state=1),
    index=mutual_df.columns,
)
pd.DataFrame(
    mutual_info.sort_values(ascending=False),
    columns=["MI_score"],
).style.background_gradient("cool")


In [None]:
# Categorical features must be integer-encoded to get mutual information.
# Take an explicit copy first: writing into a column selection of train_df is
# what pandas flags with SettingWithCopyWarning (silenced globally in this notebook).
mutual_df_categorical = train_df[categorical_features].copy()
for colname in mutual_df_categorical:
    # factorize() returns (codes, uniques); only the integer codes are needed.
    mutual_df_categorical[colname], _ = mutual_df_categorical[colname].factorize()

# The codes are arbitrary labels, not magnitudes. With the default
# discrete_features='auto' a dense frame is treated as continuous, so the
# features are declared discrete explicitly.
mutual_info = mutual_info_regression(
    mutual_df_categorical, y, discrete_features=True, random_state=1
)

mutual_info = pd.Series(mutual_info)
mutual_info.index = mutual_df_categorical.columns
pd.DataFrame(mutual_info.sort_values(ascending=False), columns = ["Categorical_Feature_MI"] ).style.background_gradient("cool")

<a id = "4"></a><br>
# Feature Engineering

In [None]:
import math

# Hand-crafted features added in place to the analysis frame.
# Products of the three size measurements (all three, then each pair).
train_df ["volume"] = train_df["Height"] * train_df["Diameter"] * train_df["Length"]
train_df ["dim1"] = train_df["Height"] * train_df["Diameter"] 
train_df ["dim2"] = train_df["Height"] * train_df["Length"] 
train_df ["dim3"] = train_df["Diameter"] * train_df["Length"]
# Sum of the three component weights.
train_df ["total_weight"] = train_df["Shell Weight"] + train_df["Viscera Weight"] + train_df["Shucked Weight"]
# NOTE(review): despite the name, this divides Weight by Diameter (not by a volume);
# the 1e-8 keeps the denominator away from zero.
train_df ["weight_volume_ratio"] = train_df["Weight"] / (train_df["Diameter"] + 1e-8 )
# Share of each component weight in the overall Weight column.
train_df ["shell_to_total_weight"] = train_df["Shell Weight"] / train_df["Weight"]
train_df ["viscera_to_total_weight"] = train_df["Viscera Weight"] / train_df["Weight"]
train_df ["shucked_to_total_weight"] = train_df["Shucked Weight"] / train_df["Weight"]

# Volume and surface area of a cylinder with diameter `Diameter` and height `Height`
# (pi * r^2 * h and 2 * pi * r * (h + r)).
train_df["volumeV2"] = math.pi*(train_df["Diameter"] / 2) ** 2 *  train_df["Height"]
train_df["surface"] = 2 * math.pi * (train_df["Diameter"] / 2) *  (train_df["Height"] + (train_df["Diameter"] / 2))
# BMI-style ratio: Weight divided by the square of Length.
train_df["bmi"] = train_df["Weight"]/(train_df["Length"] ** 2)                                                    
# Square and Cube: Squaring or cubing a feature can capture non-linear relationships in the data.
#     df["Length Squared"] = df["Length"] ** 2
#     df["Diameter Cubed"] = df["Diameter"] ** 3
# Logarithm: natural log of Weight, to compress its range.
# NOTE(review): np.log gives -inf for a zero Weight — confirm the data has none.
train_df["log_weight"] = np.log(train_df["Weight"])


# Additional features
# Shell Weight Squared: the shell weight multiplied by itself,
# to capture a possible non-linear relationship with the target variable.
train_df["shell_weight_squared"] = train_df["Shell Weight"] * train_df["Shell Weight"] 

# Shell Weight Cubed: the shell weight raised to the third power,
# for further non-linear relationships.
train_df["shell_weight_cubed"] = train_df["Shell Weight"] * train_df["Shell Weight"] * train_df["Shell Weight"] 



# Names of all engineered columns created above, in creation order.
new_features = ["volume", 'dim1', 'dim2', 'dim3', 'total_weight', 'weight_volume_ratio', 'shell_to_total_weight','viscera_to_total_weight','shucked_to_total_weight', 'volumeV2', 'surface', 'bmi', 'log_weight', 'shell_weight_squared', 'shell_weight_cubed']

# new_features = ["volume", 'dim1', 'dim2', 'dim3', 'total_weight', 'weight_volume_ratio', 'shell_to_total_weight','viscera_to_total_weight','shucked_to_total_weight']


Let's check new features mutual information scores...

In [None]:
# Score the engineered features by their mutual information with the target
# and show the scores from highest to lowest.
mutual_df = train_df[new_features]

mutual_info = pd.Series(
    mutual_info_regression(mutual_df, y, random_state=1),
    index=mutual_df.columns,
)
pd.DataFrame(
    mutual_info.sort_values(ascending=False),
    columns=["New_Feature_MI"],
).style.background_gradient("cool")

In [None]:
from sklearn.decomposition import PCA, NMF
def add_pca_features(X_train, n_components=4):
    """Append PCA components of the float64 columns to ``X_train``.

    The float64 columns are standardised and projected with PCA; the
    projection is written back as columns ``PCA_0``, ``PCA_1``, ...

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame to extend. It is modified in place.
    n_components : optional
        Forwarded to ``PCA(n_components=...)``. Defaults to 4.

    Returns
    -------
    pandas.DataFrame
        The same ``X_train`` object, with the PCA columns added.

    Notes
    -----
    The scaler and the PCA are fit on whatever frame is passed in, so
    separate calls on different frames yield projections that are not
    comparable with each other.
    """
    # Only float64 columns take part in the projection.
    pca_features = X_train.select_dtypes(include=['float64']).columns.tolist()

    # Standardise first so that no column dominates through its scale.
    pipeline = make_pipeline(StandardScaler(), PCA(n_components=n_components))
    pipeline.fit(X_train[pca_features])
    projected = pipeline.transform(X_train[pca_features])

    # Name the columns from the actual width of the projection, so that
    # n_components values other than a plain integer also work.
    pca_columns = [f'PCA_{i}' for i in range(projected.shape[1])]
    X_train[pca_columns] = projected

    return X_train
    
train_df = add_pca_features(train_df)

In [None]:
# Encoding for the categorical columns: one-hot, with categories not seen
# during fit ignored at transform time instead of raising an error.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# Preprocessor for tree-based models: the categorical columns go through
# categorical_transformer, every other column is passed through unchanged.
tree_preprocessor = ColumnTransformer(
    transformers=[
        ('categorical_transformer', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
)

tree_preprocessor

<a id = "7"></a><br>
## A custom pipeline for Feature Engineering

In [None]:
# Display the analysis frame as the cell output.
train_df

In [None]:
class FeatureCreator(BaseEstimator, TransformerMixin):
    """Pipeline step that appends engineered features and PCA components.

    Parameters
    ----------
    add_attributes : bool, default=True
        When True, ``transform`` returns a copy of the input extended with
        the hand-crafted features and the ``PCA_*`` columns. When False,
        ``transform`` returns an unmodified copy of the input.

    Attributes
    ----------
    pca_features_ : list of str
        Columns fed to the PCA step; set by ``fit`` when ``add_attributes``
        is True.
    pca_pipeline_ : sklearn Pipeline
        Fitted ``StandardScaler`` + ``PCA`` pair; set by ``fit`` when
        ``add_attributes`` is True.
    """

    def __init__(self, add_attributes=True):
        self.add_attributes = add_attributes

    @staticmethod
    def _add_engineered_features(X):
        """Return a copy of ``X`` with the hand-crafted columns appended."""
        X_copy = X.copy()

        # Products of the three size measurements.
        X_copy["volume"] = X_copy["Height"] * X_copy["Diameter"] * X_copy["Length"]
        X_copy["dim1"] = X_copy["Height"] * X_copy["Diameter"]
        X_copy["dim2"] = X_copy["Height"] * X_copy["Length"]
        X_copy["dim3"] = X_copy["Diameter"] * X_copy["Length"]
        # Sum of the three component weights.
        X_copy["total_weight"] = X_copy["Shell Weight"] + X_copy["Viscera Weight"] + X_copy["Shucked Weight"]
        # NOTE(review): despite the name, the denominator is Diameter, not a volume.
        X_copy["weight_volume_ratio"] = X_copy["Weight"] / (X_copy["Diameter"] + 1e-8)
        # Share of each component weight in the overall Weight column.
        X_copy["shell_to_total_weight"] = X_copy["Shell Weight"] / X_copy["Weight"]
        X_copy["viscera_to_total_weight"] = X_copy["Viscera Weight"] / X_copy["Weight"]
        X_copy["shucked_to_total_weight"] = X_copy["Shucked Weight"] / X_copy["Weight"]
        # Cylinder volume and surface area from Diameter and Height.
        X_copy["volumeV2"] = math.pi*(X_copy["Diameter"] / 2) ** 2 * X_copy["Height"]
        X_copy["surface"] = 2 * math.pi * (X_copy["Diameter"] / 2) * (X_copy["Height"] + (X_copy["Diameter"] / 2))
        # BMI-style ratio, log weight and powers of the shell weight.
        X_copy["bmi"] = X_copy["Weight"]/(X_copy["Length"] ** 2)
        X_copy["log_weight"] = np.log(X_copy["Weight"])
        X_copy["shell_weight_squared"] = X_copy["Shell Weight"] * X_copy["Shell Weight"]
        X_copy["shell_weight_cubed"] = X_copy["Shell Weight"] * X_copy["Shell Weight"] * X_copy["Shell Weight"]

        return X_copy

    def fit(self, X, y=None):
        """Learn the scaler and PCA projection from ``X``; ``y`` is ignored.

        Fitting here (not inside ``transform``) means every later
        ``transform`` call projects onto the same components, instead of
        re-fitting on whatever data it is given.
        """
        if self.add_attributes:
            engineered = self._add_engineered_features(X)
            self.pca_features_ = engineered.select_dtypes(include=['float64']).columns.tolist()
            # Same scaler + 4-component PCA that add_pca_features builds.
            self.pca_pipeline_ = make_pipeline(StandardScaler(), PCA(n_components=4))
            self.pca_pipeline_.fit(engineered[self.pca_features_])
        return self

    def transform(self, X):
        """Return a copy of ``X``, extended when ``add_attributes`` is True."""
        if not self.add_attributes:
            # Previously this branch returned a variable that was never
            # assigned (UnboundLocalError); hand back an untouched copy.
            return X.copy()

        X_copy = self._add_engineered_features(X)

        if hasattr(self, "pca_pipeline_"):
            # Apply the projection learned in fit().
            projected = self.pca_pipeline_.transform(X_copy[self.pca_features_])
            pca_columns = [f'PCA_{i}' for i in range(projected.shape[1])]
            X_copy[pca_columns] = projected
        else:
            # transform() called without fit(): keep the earlier behaviour
            # of fitting the projection on the data being transformed.
            X_copy = add_pca_features(X_copy)

        return X_copy

In [None]:
# Feature-engineering step with the extra attributes switched on; used as the first pipeline stage.
Creator = FeatureCreator(add_attributes = True)

In [None]:
# Display the transformer as the cell output.
Creator

<a id = "8"></a><br>
# Putting pieces together

In [None]:
# Same preparation as for the analysis frames, applied to the pipeline copies:
# name the original dataset's index 'id' ...
pipe_original.index.names = ['id']
pipe_original.head()


# ... and stack it underneath the competition training rows.
# concat is called without ignore_index, so each frame keeps its own index labels.
pipe_data = pd.concat([pipe_data, pipe_original])
pipe_data.info()



In [None]:
# Separate the target from the predictors used by the pipeline.
y = pipe_data['Age']
pipe_data = pipe_data.drop(columns=['Age'])
pipe_data.head()

In [None]:
pip install flaml

In [None]:
#flaml
from flaml import AutoML

In [None]:
pip install sklego

In [None]:
from sklego.linear_model import LADRegression

In [None]:
# End-to-end pipeline: feature creation -> tree preprocessing -> FLAML AutoML.
automl = AutoML()

automl_pipeline = Pipeline(
    steps=[
        ('Creator', Creator),
        ('tree_preprocessor', tree_preprocessor),
        ("automl", automl),
    ]
)
automl_pipeline

In [None]:
# Specify automl goal and constraint
automl_settings = {
    "time_budget": 10800,  # total running time in seconds
    "task": 'regression',  # task type
    "seed": 24545678,  # random seed
    "metric": 'mae',
    "eval_method": 'cv',
    "n_splits": 5,
    # "ensemble" must appear only once: a dict literal keeps just the last
    # value of a repeated key, so a preceding `"ensemble": True` entry is dead.
    "ensemble": {
        "final_estimator": LADRegression(),
        "passthrough": True,
    },
}

# Prefix every setting with the step name so that Pipeline.fit forwards it
# to the "automl" step's fit().
pipeline_settings = {f"automl__{key}": value for key, value in automl_settings.items()}

In [None]:
# Fit the whole pipeline; the automl__* keyword arguments are forwarded to the AutoML step.
automl_pipeline = automl_pipeline.fit(pipe_data, y, **pipeline_settings)

In [None]:
# Raw predictions for the test frame from the fitted pipeline.
preds_test =  automl_pipeline.predict(pipe_test)

In [None]:
# preds_test = [round(x) for x in preds_test]

<a id = "13"></a><br>
# Submission

In [None]:
# Sorted distinct target values seen in the training data.
unique_targets = np.unique(y)
def mattop_post_process(preds):
    """Snap each prediction to the nearest value in ``unique_targets``.

    Parameters
    ----------
    preds : 1-D array-like of numbers
        Raw model predictions.

    Returns
    -------
    numpy.ndarray
        One entry per prediction, each taken from ``unique_targets``.
        When a prediction is equally close to two targets, the smaller
        target wins (``unique_targets`` is sorted and ``argmin`` returns
        the first minimum).
    """
    preds = np.asarray(preds)
    # (n_preds, n_targets) table of absolute distances, computed in one
    # vectorised step instead of a Python-level scan per prediction.
    distances = np.abs(unique_targets[np.newaxis, :] - preds[:, np.newaxis])
    return unique_targets[distances.argmin(axis=1)]

In [None]:
# Mean of the raw test predictions, shown as the cell output.
np.mean(preds_test, axis=0)

In [None]:
# def oof_result(oof_preds, y_train, title):
#     plt.figure(figsize=(20, 6))
#     plt.subplot(1, 2, 1)
#     sns.histplot(oof_preds, kde=True, alpha=0.5, label='oof_preds')
#     sns.histplot(y_train.values, kde=True, alpha=0.5, label='y_train')
#     plt.title('Histogram of OOF Predictions and Train Values')
#     plt.xlabel('Value')
#     plt.ylabel('Frequency')
#     plt.legend()

#     plt.subplot(1, 2, 2)
#     sns.scatterplot(x=y_train.values, y=oof_preds, alpha=0.5)
#     plt.xlabel('Actual Values')
#     plt.ylabel('OOF Predicted Values')
#     plt.title('Actual vs. OOF Predicted Values')
#     plt.suptitle(f'{title}', fontweight='bold', fontsize=16)

#     plt.plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], color='red', alpha=0.5)
#     plt.show()
    
# # oof_result(np.mean(preds_test, axis=1), y, title='')    
# # oof_result(mattop_post_process(preds_test), y, title='After Mattop post process')

In [None]:
# Snap the raw predictions to the target values seen in training,
# pair them with the test index and write the submission file.
preds = mattop_post_process(preds_test)
output = pd.DataFrame({'id': pipe_test.index,
                       'Age': preds})
# index=False: the ids are already a regular column, so the frame's own index is not written.
output.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission frame.
output.head()