<a href="https://colab.research.google.com/github/ManullangJihan/100-Day-ML-Challenge/blob/main/04_Abalone.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Abalone

## Konten

1. Sex -- M, F, and I (infant)
2. Length / continuous / mm / Longest shell measurement
3. Diameter / continuous / mm / perpendicular to length
4. Height / continuous / mm / with meat in shell
5. Whole weight / continuous / grams / whole abalone
6. Shucked weight / continuous / grams / weight of meat
7. Viscera weight / continuous / grams / gut weight (after bleeding)
8. Shell weight / continuous / grams / after being dried
9. Rings / integer / -- / +1.5 gives the age in years  (Target)

[Sumber Data](https://archive.ics.uci.edu/ml/datasets/Abalone)

In [None]:
!pip install shap
!pip install dataprep
!pip install eli5
!pip install catboost
!pip install optuna

In [1]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import plotly.graph_objects as go
import plotly.express as px

import ipywidgets as widgets
from ipywidgets import fixed

In [2]:
import plotly.io as pio

pio.templates[pio.templates.default] = 'plotly_dark'
plt.rcParams['figure.figsize'] = [10, 10]
plt.rcParams.update({'font.size': 10})

In [3]:
# Set random default for reproducibility

global_seed = 42

def set_seed(seed):
    np.random.seed(seed)
    random.seed(seed)

set_seed(global_seed)

In [4]:
path = "/content/drive/MyDrive/abalone.data"
main_df = pd.read_csv(path, sep=',', header=None)
main_df.columns = ['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight', 'Rings']
n_samples, n_features = main_df.shape

print(f'Number of samples: {n_samples}')
print(f'Number of features: {n_features}\n')

main_df.head()

Number of samples: 4177
Number of features: 9



Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked Weight,Viscera Weight,Shell Weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [5]:
# Remove Duplicated Row if there is any

duplicate_idx = main_df.duplicated()
print(f'Number of samples before delete duplicated row: {n_samples}')

main_df = main_df.loc[~duplicate_idx, :]
n_samples = main_df.shape[0]
print(f'Number of samples before after delete duplicated row: {n_samples}')

Number of samples before delete duplicated row: 4177
Number of samples before after delete duplicated row: 4177


In [6]:
from plotly.subplots import make_subplots

## Visualize Data Distribution
def plot_distribution(df, feature):
    feature_mean = df[feature].mean()
    feature_std = df[feature].std()
    feature_max = df[feature].max()
    feature_min = df[feature].min()
    feature_skew =  df[feature].skew()
    print('')
    print(feature + ' ' +  'Insight')
    print(f'Mean     : {feature_mean:.2f}')
    print(f'Std      : {feature_std:.2f}')
    print(f'Max      : {feature_max:.2f}')
    print(f'Min      : {feature_min:.2f}')
    print(f'Skewness : {feature_skew:.3f}\n')

    fig = make_subplots(
        rows=1, cols=2,
        specs=[[{'type': 'xy'}, {'type': 'xy'}]])
    
    fig.add_trace(
        go.Histogram(
            x = df[feature],
            name = feature,
            ), row=1, col=1
        )

    fig.add_trace(
        go.Violin(
            y = df[feature],
            name = feature,
            box_visible = True,
            meanline_visible = True,
            fillcolor='lightseagreen',
        ),
        row=1, col=2
    )

    fig.update_layout(
        title = f"{feature} Distribution",
        width = 750,
        height = 500,
        showlegend = False
    )

    fig.show()

num_cols = main_df.columns[main_df.dtypes != 'object']
selected_feature = widgets.Dropdown(
    options = num_cols,
    value = num_cols[1],
    description = 'Feature',
    disabled = False)

widgets.interact(
    plot_distribution, 
    df = fixed(main_df),
    feature = selected_feature)

interactive(children=(Dropdown(description='Feature', index=1, options=('Length', 'Diameter', 'Height', 'Whole…

<function __main__.plot_distribution>

In [7]:
all_features = main_df.columns.values.tolist()

def plot_feature_interaction(df, x, y):
    fig = px.scatter(
        df, x=x, y=y, color="Rings",
        color_continuous_scale  = 'Viridis',
        trendline='ols',
        trendline_scope = 'overall',
        trendline_color_override = 'deeppink',
        opacity = 0.5
    )

    fig.update_layout(
        title=f'Interaction between {x} and {y}',
        height = 500,
        width = 600,
        showlegend=False)
    
    fig.show()

feature1 = widgets.Dropdown(
    options = all_features,
    value = all_features[1],
    description = 'X-axis'
)

feature2 = widgets.Dropdown(
    options = all_features,
    value = all_features[2],
    description = 'Y-axis'
)


widgets.interact(
    plot_feature_interaction,
    df = fixed(main_df),
    x = feature1,
    y = feature2
)

interactive(children=(Dropdown(description='X-axis', index=1, options=('Sex', 'Length', 'Diameter', 'Height', …

<function __main__.plot_feature_interaction>

In [8]:
# Plot Statistical Information

means = main_df.mean(numeric_only=True)
vars = main_df.var(numeric_only=True)
stds = main_df.std(numeric_only=True)
skews = main_df.skew(numeric_only=True)

df_stats = {
    'Mean': means,
    'Var': vars,
    'Std': stds,
    'Skew': skews
}

def plot_stats(stats=None):
    fig = px.bar(
        df_stats,
        y = stats,
    )

    fig.show()

stat_list = ['Mean', 'Var', 'Std', 'Skew']
stat_selected = widgets.Dropdown(
    options = stat_list,
    value = 'Mean',
    description = "Statistical Information"
)

widgets.interact(
    plot_stats,
    stats = stat_selected,
)

interactive(children=(Dropdown(description='Statistical Information', options=('Mean', 'Var', 'Std', 'Skew'), …

<function __main__.plot_stats>

In [None]:
# Remove Outliers

def clipping_outliers(df, quantile=0.9):
    numeric_cols = df.columns[df.dtypes != 'object'].values.tolist()

    for col in numeric_cols:
        q_val = df[col].quantile(quantile)
        idx = np.where(df[col] < q_val, True, False)
        df[col] = df.loc[idx, col]
    
    return df

removed_outlier_df = clipping_outliers(main_df)
removed_outlier_df.shape

(4177, 9)

In [None]:
# Remove Outliers for value more than 3x Standard Deviation

def remove_outliers(df, std_val=3):
    numeric_cols = df.columns[df.dtypes != 'object'].values.tolist()

    for col in numeric_cols:
        std_val = df[col].std() * 3
        df[col] = df.loc[(df.loc[:, col] > -std_val) & (df.loc[:, col] < std_val), col]
    
    return df

new_df = main_df.copy()
removed_outliers_df = remove_outliers(new_df)
removed_outliers_df.shape

(4177, 9)

In [12]:
main_df = main_df.loc[main_df.loc[:, 'Rings'] > 3, :]
main_df = main_df.loc[main_df.loc[:, 'Rings'] < 24, :]

main_df.shape

(4153, 9)

In [13]:
main_df = pd.get_dummies(main_df)
main_df.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked Weight,Viscera Weight,Shell Weight,Rings,Sex_F,Sex_I,Sex_M
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,0,0,1
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,0,0,1
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,1,0,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,0,0,1
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,0,1,0


In [14]:
new_df = main_df.copy()

new_df['Proportion_H_L'] = new_df['Height'] / new_df['Length']
new_df['Proportion_H_WW'] = new_df['Height'] / new_df['Whole weight']
new_df['Proportion_H_ShW'] = new_df['Height'] / new_df['Shucked Weight']
new_df['Proportion_H_VW'] = new_df['Height'] / new_df['Viscera Weight']
new_df['Proportion_H_Sw'] = new_df['Height'] / new_df['Shell Weight']

new_df['Proportion_L_WW'] = new_df['Length'] / new_df['Whole weight']
new_df['Proportion_L_ShW'] = new_df['Length'] / new_df['Shucked Weight']
new_df['Proportion_L_VW'] = new_df['Length'] / new_df['Viscera Weight']
new_df['Proportion_L_SW'] = new_df['Length'] / new_df['Shell Weight']

new_df['Proportion_D_L'] = new_df['Diameter'] / new_df['Length']
new_df['Proportion_D_H'] = new_df['Diameter'] / new_df['Height']
new_df['Proportion_D_WW'] = new_df['Diameter'] / new_df['Whole weight']
new_df['Proportion_D_SW'] = new_df['Diameter'] / new_df['Shell Weight']
new_df['Proportion_D_ShW'] = new_df['Diameter'] / new_df['Shucked Weight']
new_df['Proportion_D_VW'] = new_df['Diameter'] / new_df['Viscera Weight']

new_df['Proportion_VW_SW'] = new_df['Viscera Weight'] / new_df['Shell Weight']
new_df['Proportion_VW_WW'] = new_df['Viscera Weight'] / new_df['Whole weight']
new_df['Proportion_VW_ShW'] = new_df['Viscera Weight'] / new_df['Shucked Weight']

new_df['Proportion_ShW_WW'] = new_df['Shucked Weight'] / new_df['Whole weight']
new_df['Proportion_ShW_SW'] = new_df['Shucked Weight'] / new_df['Shell Weight']

new_df['Proportion_SW_WW'] = new_df['Shell Weight'] / new_df['Whole weight']

print(new_df.shape)

(4153, 32)


In [66]:
def clean_data(df):
    assert isinstance(df, pd.DataFrame)
    df.dropna()
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep]

In [62]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

def visualize_model(y_test, y_pred, score):
    fig = px.scatter(
        x=y_test, y=y_pred,
        trendline="ols", trendline_color_override="deeppink")
    fig.update_layout(
        title = f"MSE Score: {score:.3f}",
        height = 500,
        width = 600,
        xaxis_title = "Truth",
        yaxis_title = "Predictions")
    
    fig.show()

def simple_evaluate(X, y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = 0.2, shuffle=True
    )

    params = {'n_estimators': 118, 'max_depth': 4, 'learning_rate': 0.11954781075189252, 'min_child_weight': 56}
    reg = XGBRegressor(**params, objective="reg:squarederror").fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    y_pred = np.round(y_pred)
    mse = mean_squared_error(y_test, y_pred)

    visualize_model(y_test, y_pred, mse)

In [55]:
from sklearn.preprocessing import StandardScaler
from scipy.stats import yeojohnson

def normalize_data(df):
    mean = df.mean()
    std = df.std()
    return (df - mean) / std

def normalize_skewed_data(df, cols):
    for col in cols:
        df[col] = yeojohnson(df[col])[0]
    return df

In [63]:
# Evaluate raw Data

X = main_df.drop('Rings', axis=1)
y = main_df.Rings.values

X = normalize_data(X)
X = normalize_skewed_data(X, ['Height'])

simple_evaluate(X, y)

In [64]:
# Evaluate Feature-Enginnering Data

X = new_df.drop('Rings', axis=1)
y = new_df.Rings.values

X = normalize_data(X)
X = normalize_skewed_data(X, ['Height'])

simple_evaluate(X, y)

In [67]:
from imblearn.over_sampling import SMOTE

new_df = clean_data(new_df)
X_resampled = new_df.drop('Rings', axis=1)
y_resampled = new_df.Rings.values

X_resampled, y_resampled = SMOTE().fit_resample(X_resampled, y_resampled)

In [68]:
# Evaluate Resampled Data

X = normalize_data(X_resampled)
X = normalize_skewed_data(X_resampled, ['Height'])

simple_evaluate(X, y_resampled)

In [70]:
# Evaluate PCA transformed Data
from sklearn.decomposition import PCA

pca = PCA(n_components=20).fit(X_resampled)
X_pca = pca.transform(X_resampled)

simple_evaluate(X_pca, y_resampled)

# TODO

1. Transform the Target because Target skewed to the left
2. Evaluate results
3. Normalize Skewed Features
4. Evaluate results with normalized skewed features

In [None]:
from functools import partial
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

import optuna
from optuna import trial

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1500),
        'max_depth': trial.suggest_int('max_depth', 2, 13),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.05, 0.50),
        'min_child_weight':  trial.suggest_int('min_child_weight', 1, 100)
    }

    model = XGBRegressor(**params, objective='reg:squarederror')
    
    mses = cross_val_score(
        model, X_1, y_1, scoring='neg_mean_squared_error', cv=5
    )
       
    return -1.0 * np.mean(mses)

optimization_function = partial(
    objective)

study = optuna.create_study(direction='minimize')
study.optimize(optimization_function, n_trials=15)

[32m[I 2022-06-08 09:52:07,768][0m A new study created in memory with name: no-name-4b816aeb-6d8c-45c2-8930-c83349a67136[0m
[32m[I 2022-06-08 09:53:43,739][0m Trial 0 finished with value: 5.765892515596842 and parameters: {'n_estimators': 972, 'max_depth': 12, 'learning_rate': 0.3732847555343503, 'min_child_weight': 73}. Best is trial 0 with value: 5.765892515596842.[0m
[32m[I 2022-06-08 09:54:29,106][0m Trial 1 finished with value: 5.591109682957437 and parameters: {'n_estimators': 838, 'max_depth': 6, 'learning_rate': 0.21343312423030258, 'min_child_weight': 45}. Best is trial 1 with value: 5.591109682957437.[0m
[32m[I 2022-06-08 09:54:58,717][0m Trial 2 finished with value: 5.034621030915826 and parameters: {'n_estimators': 1160, 'max_depth': 2, 'learning_rate': 0.18486098071290924, 'min_child_weight': 68}. Best is trial 2 with value: 5.034621030915826.[0m
[32m[I 2022-06-08 09:56:04,144][0m Trial 3 finished with value: 5.678328687256606 and parameters: {'n_estimators':