### First glance

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from optuna.samplers import TPESampler
import pickle
from IPython.core.display import HTML
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_log_error

In [None]:
# Original UCI abalone data: read with header=None, so columns are positional
# until they are renamed further down.
original_train = pd.read_csv(
    '/kaggle/input/ps-4-e-2-abalone-dataset-from-uci/abalone.data',
    header=None,
)

# Playground Series S4E4 competition files
sample_submission = pd.read_csv('/kaggle/input/playground-series-s4e4/sample_submission.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e4/test.csv')
train = pd.read_csv('/kaggle/input/playground-series-s4e4/train.csv')

In [None]:
# Never truncate the column list when a frame is displayed
pd.options.display.max_columns = None

train.head()

In [None]:
# Shape and (still positional) column labels of the original dataset
print(original_train.shape)
list(original_train.columns)

In [None]:
# Give the original dataset the competition's column names and persist it.
# The first train column is skipped (presumably `id`; this cell must run
# before `id` is dropped from train).
feature_names = train.columns[1:]
original_train.columns = feature_names
original_train.to_csv('orig.csv', index=False)
original_train.tail()

In [None]:
# Keep the test ids before the column is removed from the feature frames.
submission_id = test['id']

# `columns=` already selects the axis, so the extra `axis=1` was redundant;
# rebinding instead of `inplace=True` keeps the cell free of hidden mutation.
train = train.drop(columns='id')
test = test.drop(columns='id')

In [None]:
# Remove fully duplicated training rows
train = train.drop_duplicates()

# Sanity check: the number of remaining duplicated rows should print as 0
train_duplicates = train.loc[train.duplicated()]
print(train_duplicates.shape[0])

In [None]:
# Report the size of each set and its share of all observations.
print(f'Train data: {train.shape}')
print(f'Test data: {test.shape}\n')

n_train, n_test = train.shape[0], test.shape[0]
train_data_percentage = np.round(n_train / (n_train + n_test), 4)

# Format with `%` instead of multiplying the rounded fraction by 100, which
# prints binary floating-point noise such as 59.989999999999995.
print(f'Train data consists of {train_data_percentage:.2%} of all observations')
print(f'Test data consists of {1 - train_data_percentage:.2%} of all observations')

In [None]:
# Summary statistics, one row per numeric column
train.describe().transpose()

In [None]:
# Count missing values per column in both sets
train_missing = train.isna().sum()
test_missing = test.isna().sum()

print('TRAIN data\n')
print(f'{train_missing}\n\n\n')

print('TEST data\n')
print(test_missing)

In [None]:
# Duplicates were already dropped in an earlier cell and `train` has not been
# modified since, so repeating `drop_duplicates()` here was a no-op. This cell
# now only verifies the invariant and fails loudly if it is ever broken.
duplicates = train[train.duplicated()]
assert duplicates.empty, f'{len(duplicates)} duplicated rows remain in train'
len(duplicates)

In [None]:
# One-hot encode the categorical columns of both sets.
X = pd.get_dummies(train, drop_first=True, dtype=int)
test = pd.get_dummies(test, drop_first=True, dtype=int)

# The two frames are encoded independently, so nothing guarantees that test
# ends up with the same dummy columns (or column order) as train. Align test
# to the training features explicitly; a dummy column absent from test is
# filled with 0.
# NOTE(review): this does not repair a different `drop_first` baseline if a
# category is missing from one of the sets — confirm both sets share the same
# category levels.
feature_columns = X.columns.drop('Rings', errors='ignore')
test = test.reindex(columns=feature_columns, fill_value=0)

In [None]:
# Large default figure size for every following plot, then one histogram per column
figure_defaults = {'figure.figsize': (20, 16)}
sns.set(rc=figure_defaults)

X.hist(color='orange');

In [None]:
# Absolute and relative frequency of every target value
ring_counts = train['Rings'].value_counts()

print(f'{ring_counts}\n\n')
print(ring_counts / len(train))

In [None]:
# Separate the target from the encoded feature matrix
y = train['Rings']
X = X.drop(columns=['Rings'])

# Disabled min-max scaling, kept for reference:
# for column in X.columns.tolist():
#     X[column] = X[column].apply(lambda x: (x - X[column].min()) / (X[column].max() - X[column].min()))

# # Transform test data
# for column in test.columns.tolist():
#     test[column] = test[column].apply(lambda x: (x - test[column].min()) / (test[column].max() - test[column].min()))

# X.hist(color='LightSeaGreen');

In [None]:
%%time
# I figured out best hyperparameters previously
best_forest = RandomForestRegressor(
    random_state=27,
)

best_forest.fit(X, y)
importance = best_forest.feature_importances_

feature_importance = pd.DataFrame(data=importance, index=X.columns, columns=['importance']) \
    .sort_values(ascending=True, by='importance')

feature_importance.plot(kind='barh', figsize=(12, 8), color='orange');

In [None]:
# Feature columns left after one-hot encoding and removing the `Rings` target
print(X.columns)

In [None]:
# Pairwise Pearson correlation of the numeric training features
numeric_columns_train = X.select_dtypes(include=np.number)
corr_train = numeric_columns_train.corr(method='pearson')

# Mask the upper triangle (diagonal included) so each pair is shown once
mask_train = np.triu(np.ones_like(corr_train))

heatmap_options = dict(annot=True, fmt='.2f', cmap='Spectral', cbar=None, linewidth=2)
sns.heatmap(corr_train, mask=mask_train, **heatmap_options)
plt.tight_layout()
plt.show()

In [None]:
# Remove the same two feature columns from the training and the test matrix
dropped_features = ['Diameter', 'Whole weight.2']

X = X.drop(columns=dropped_features)
test = test.drop(columns=dropped_features)

In [None]:
# Pearson correlation of the numeric training features after the column drop
numeric_columns_train = X.select_dtypes(include=np.number)
corr_train = numeric_columns_train.corr(method='pearson')

# Mask the upper triangle (diagonal included) so each pair is shown once
mask_train = np.triu(np.ones_like(corr_train))

heatmap_options = dict(annot=True, fmt='.2f', cmap='coolwarm', cbar=None, linewidth=2)
sns.heatmap(corr_train, mask=mask_train, **heatmap_options)
plt.tight_layout()
plt.show()

In [None]:
# Split data into train and val: 20% of the rows are held out for validation,
# with a fixed seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=27)

In [None]:
# %%time
# def objective(trial):
#     model = RandomForestRegressor(
#         n_estimators=trial.suggest_int("n_estimators", 100, 1000),
# #         criterion=trial.suggest_categorical("criterion", ['poisson', 'absolute_error', 'friedman_mse', 'squared_error']),
#         min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 100),
#         max_depth=trial.suggest_int("max_depth", 1, 100),
#         min_samples_split=trial.suggest_int("min_samples_split", 2, 100),
#         random_state=27
#     )
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     return np.sqrt(mean_squared_log_error(y_test, y_pred))


# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

# optuna.logging.set_verbosity(optuna.logging.WARNING)

# sampler = TPESampler(seed=27)
# NOTE(review): the objective returns RMSLE, where lower is better, so the
# study has to minimize. The run whose output is recorded in the string below
# used direction="maximize", so its "best trial" is the trial with the HIGHEST
# error. Re-run with "minimize" before trusting those parameters.
# study = optuna.create_study(study_name="random_forest", direction="minimize", sampler=sampler)
# study.optimize(objective, n_trials=10)

# print("Number of finished trials: ", len(study.trials))
# print("Best trial:")
# trial = study.best_trial
# print("  Value: ", trial.value)
# print("  Params: ")
# for key, value in trial.params.items():
#     print(f"    {key}: {value}")
# print()

"""
Number of finished trials:  10
Best trial:
  Value:  0.164014686713176
  Params:
    n_estimators: 544
    min_samples_leaf: 60
    max_depth: 8
    min_samples_split: 13

CPU times: user 6min 35s, sys: 276 ms, total: 6min 35s
Wall time: 6min 35s
"""

In [None]:
# %%time
# def objective(trial):
#     model = XGBRegressor(
#         max_depth=trial.suggest_int('max_depth', 1, 100),
#         learning_rate=trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
#         n_estimators=trial.suggest_int('n_estimators', 50, 1000),
#         min_child_weight=trial.suggest_int('min_child_weight', 1, 10),
#         gamma=trial.suggest_float('gamma', 1e-8, 1.0, log=True),
#         subsample=trial.suggest_float('subsample', 0.01, 1.0, log=True),
#         colsample_bytree=trial.suggest_float('colsample_bytree', 0.01, 1.0, log=True),
#         reg_alpha=trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
#         reg_lambda=trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
#         use_label_encoder=False,
#         random_state=27
#     )
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     try:
#         return np.sqrt(mean_squared_log_error(y_test, y_pred))
#     except Exception as e:
#         print(e)


# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

# optuna.logging.set_verbosity(optuna.logging.WARNING)

# sampler = TPESampler(seed=27)
# NOTE(review): the objective returns RMSLE, where lower is better, so the
# study has to minimize. The run whose output is recorded in the string below
# used direction="maximize", so its "best trial" is the trial with the HIGHEST
# error. Re-run with "minimize" before trusting those parameters.
# study = optuna.create_study(study_name="xgb", direction="minimize", sampler=sampler)
# study.optimize(objective, n_trials=20)

# print("Number of finished trials: ", len(study.trials))
# print("Best trial:")
# trial = study.best_trial
# print("  Value: ", trial.value)
# print("  Params: ")
# for key, value in trial.params.items():
#     print(f"    {key}: {value}")
# print()

"""
Number of finished trials:  1
Best trial:
  Value:  0.1775845058982026
  Params:
    max_depth: 43
    learning_rate: 0.42576257222865277
    n_estimators: 749
    min_child_weight: 9
    gamma: 1.1669337024772915e-05
    subsample: 0.9097315662154742
    colsample_bytree: 0.6114890625963008
    reg_alpha: 4.761254082318455e-07
    reg_lambda: 0.008602430632882225

CPU times: user 24.5 s, sys: 667 ms, total: 25.2 s
Wall time: 25.2 s
"""

In [None]:
# %%time
# def objective(trial):
#     model = CatBoostRegressor(
#         iterations=trial.suggest_int("iterations", 100, 1000),
#         learning_rate=trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
#         colsample_bylevel=trial.suggest_float("colsample_bylevel", 0.05, 1.0),
#         min_data_in_leaf=trial.suggest_int("min_data_in_leaf", 1, 100),
#         depth=trial.suggest_int("depth", 4, 16),
#         l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
#         verbose=False,
#         random_state=27
#     )
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     return np.sqrt(mean_squared_log_error(y_test, y_pred))


# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

# optuna.logging.set_verbosity(optuna.logging.WARNING)

# sampler = TPESampler(seed=27)
# NOTE(review): the objective returns RMSLE, where lower is better, so the
# study has to minimize. The run whose output is recorded in the string below
# used direction="maximize", so its "best trial" is the trial with the HIGHEST
# error. Re-run with "minimize" before trusting those parameters.
# study = optuna.create_study(study_name="catboost", direction="minimize", sampler=sampler)
# study.optimize(objective, n_trials=20)

# print("Number of finished trials: ", len(study.trials))
# print("Best trial:")
# trial = study.best_trial
# print("  Value: ", trial.value)
# print("  Params: ")

# for key, value in trial.params.items():
#     print(f"    {key}: {value}")
# print()

"""
Number of finished trials:  20
Best trial:
  Value:  0.27250015755480833
  Params:
    iterations: 101
    learning_rate: 0.0010172906333606835
    colsample_bylevel: 0.4796381789116622
    min_data_in_leaf: 42
    depth: 13
    l2_leaf_reg: 2.895211427077531e-08

CPU times: user 18min 10s, sys: 9min 21s, total: 27min 31s
Wall time: 13min 5s
"""


In [None]:
# %%time
# def objective(trial):
#     model = LGBMRegressor(
#         n_estimators=trial.suggest_int("n_estimators", 100, 1000),
#         max_depth=trial.suggest_int("max_depth", 1, 100),
#         learning_rate=trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
#         verbosity=-1,
#         boosting_type=trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
#         num_leaves=trial.suggest_int('num_leaves', 2, 256),
#         min_child_samples=trial.suggest_int('min_child_samples', 5, 100),
#         random_state=27
#     )
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     return np.sqrt(mean_squared_log_error(y_test, y_pred))


# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

# optuna.logging.set_verbosity(optuna.logging.WARNING)

# sampler = TPESampler(seed=27)
# NOTE(review): the objective returns RMSLE, where lower is better, so the
# study has to minimize. The run whose output is recorded in the string below
# used direction="maximize", so its "best trial" is the trial with the HIGHEST
# error. Re-run with "minimize" before trusting those parameters.
# study = optuna.create_study(study_name="lgbm", direction="minimize", sampler=sampler)
# study.optimize(objective, n_trials=20)

# print("Number of finished trials: ", len(study.trials))
# print("Best trial:")
# trial = study.best_trial
# print("  Value: ", trial.value)
# print("  Params: ")
# for key, value in trial.params.items():
#     print(f"    {key}: {value}")
# print()

"""
Number of finished trials:  20
Best trial:
  Value:  0.9971664373669932
  Params:
    n_estimators: 676
    max_depth: 100
    learning_rate: 0.0010257989336468524
    boosting_type: dart
    num_leaves: 37
    min_child_samples: 22

CPU times: user 38min 41s, sys: 5.25 s, total: 38min 47s
Wall time: 38min 50s
"""

In [None]:
# Level-0 estimators of the stack, each with a fixed seed.
# NOTE(review): the LightGBM, CatBoost and random-forest settings below equal
# the "best trial" values recorded by the commented-out Optuna studies, which
# were created with direction="maximize" on an RMSLE objective — i.e. they are
# the highest-error trials. Re-tune with direction="minimize".
xgb_base = XGBRegressor(
    n_estimators=395,
    max_depth=6,
    learning_rate=0.01,
    random_state=27,
)

lgbm_base = LGBMRegressor(
    n_estimators=676,
    max_depth=100,
    learning_rate=0.0010257989336468524,
    boosting_type='dart',
    num_leaves=37,
    min_child_samples=22,
    random_state=27,
)

catboost_base = CatBoostRegressor(
    iterations=101,
    learning_rate=0.0010172906333606835,
    colsample_bylevel=0.4796381789116622,
    min_data_in_leaf=42,
    depth=13,
    l2_leaf_reg=2.895211427077531e-08,
    random_state=27,
)

forest_base = RandomForestRegressor(
    n_estimators=544,
    min_samples_leaf=60,
    max_depth=8,
    min_samples_split=13,
    random_state=27,
)

# (name, estimator) pairs in the order StackingRegressor receives them
base_models = [
    ('XGBoost', xgb_base),
    ('LightGBM', lgbm_base),
    ('Catboost', catboost_base),
    ('Random_forest', forest_base),
]

In [None]:
# Final estimator of the stack: an XGBoost regressor with the same settings
# as the XGBoost base model.
meta_params = dict(
    n_estimators=395,
    max_depth=6,
    learning_rate=0.01,
    random_state=27,
)
meta_model = XGBRegressor(**meta_params)

In [None]:
%%time
RandomForestRegressor(
    n_estimators=544,
    min_samples_leaf=60,
    max_depth=8,
    min_samples_split=13,
    random_state=27
)
stacking_model = StackingRegressor(estimators=base_models, final_estimator=meta_model)
stacking_model.fit(X, y)

In [None]:
# `stacking_model` was fitted on ALL of X, which contains every row of X_val,
# so scoring it on X_val measures training error, not generalisation.
# Fit a separate stack on the training split only and score that one instead.
# StackingRegressor clones its estimators during fit, so reusing
# `base_models` / `meta_model` does not disturb the final model.
# Cost: this trains the whole stack a second time.
val_model = StackingRegressor(estimators=base_models, final_estimator=meta_model)
val_model.fit(X_train, y_train)
y_pred_val = val_model.predict(X_val)

rmsle_val = np.sqrt(mean_squared_log_error(y_val, y_pred_val))
print(f"Validation Root mean squared logarithmic error regression loss: {rmsle_val:.8f}")

In [None]:
# Predict the target for the encoded test set and preview the first ten values
y_pred_test = stacking_model.predict(test)
y_pred_test[:10]

In [None]:
# Pair the ids from the sample submission with the predictions and save them
submission = sample_submission[['id']].assign(Rings=y_pred_test)

submission.to_csv('Kapturov_S4E4_submission.csv', index=False)
submission.head(10)

In [None]:
# Persist the fitted stack. The context manager closes the file handle, which
# the bare `open(...)` passed straight to `pickle.dump` never did.
with open("Kapturov_stacking_model.pkl", "wb") as model_file:
    pickle.dump(stacking_model, model_file)

### Second approach

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn import preprocessing
from tensorflow import keras
import tensorflow_addons as tfa
from tensorflow.keras import layers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, Lambda, Concatenate, Add, BatchNormalization, LeakyReLU,ELU
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import classification_report

In [None]:
# Fresh, untouched copies of the competition files for the second approach
df_sub = pd.read_csv('/kaggle/input/playground-series-s4e4/sample_submission.csv')
df_test = pd.read_csv('/kaggle/input/playground-series-s4e4/test.csv')
df_train = pd.read_csv('/kaggle/input/playground-series-s4e4/train.csv')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the Sex -> integer mapping from the training data only.
label_encoder = LabelEncoder()
df_train['Sex_encoded'] = label_encoder.fit_transform(df_train['Sex'])

# Re-use the fitted mapping for test. The original called `fit_transform`
# again, which refits the encoder on test and can assign different codes
# whenever the two sets do not contain the same categories. `transform`
# raises a ValueError on a category unseen in train instead of silently
# mis-encoding it.
df_test['Sex_encoded'] = label_encoder.transform(df_test['Sex'])

df_train.drop(columns=['Sex'], inplace=True)
df_test.drop(columns=['Sex'], inplace=True)

In [None]:
# The row identifier carries no signal; remove it from both frames
df_train = df_train.drop(columns=['id'])
df_test = df_test.drop(columns=['id'])

In [None]:
# Scatter of the target against shell weight, coloured by sex.
# The `Sex` column was dropped from df_train right after label encoding, so
# `hue='Sex'` raised a KeyError. Rebuild the labels from `Sex_encoded` on a
# plotting copy; df_train itself stays purely numeric.
plot_df = df_train.assign(Sex=label_encoder.inverse_transform(df_train['Sex_encoded']))

sns.set(style="whitegrid")
plt.figure(figsize=(12, 5))
sns.scatterplot(data=plot_df, x='Shell weight', y='Rings', hue='Sex', palette='Set1')
plt.title('Rings vs. Shell Weight by Sex')
plt.xlabel('Shell Weight')
plt.ylabel('Rings')
plt.legend(title='Sex')
plt.gcf().set_facecolor('#DFFF00')

plt.show()

In [None]:
# Box plot of the target per sex.
# Two fixes: (1) `Sex` was dropped from df_train after label encoding, so
# `x='Sex'` raised a KeyError — the labels are rebuilt from `Sex_encoded`;
# (2) the tick labels were hard-coded as Male/Female/Infant for positions
# 0/1/2 without fixing the box order, so they were not tied to the boxes.
# The order is now explicit and the labels are derived from it.
plot_df = df_train.assign(Sex=label_encoder.inverse_transform(df_train['Sex_encoded']))
sex_order = [str(code) for code in label_encoder.classes_]
# assumes the dataset codes sex as M / F / I — unknown codes are shown as-is
sex_names = {'M': 'Male', 'F': 'Female', 'I': 'Infant'}

plt.figure(figsize=(12, 5))
sns.boxplot(data=plot_df, x='Sex', y='Rings', order=sex_order, palette='Set1')
plt.title('Age Distribution (Rings) by Sex')
plt.xlabel('Sex')
plt.ylabel('Rings')
plt.gcf().set_facecolor('#FF00FF')
plt.xticks(
    ticks=range(len(sex_order)),
    labels=[sex_names.get(code, code) for code in sex_order],
)
plt.show()

In [None]:
# Annotated correlation heatmap of every df_train column
corr_matrix = df_train.corr()

corr_figure = plt.figure(figsize=(12, 5))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='viridis')
corr_figure.set_facecolor('#00FFFF')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Histogram (with KDE) of every numerical feature on a fixed-height grid.
numerical_features = ['Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']
num_plots = len(numerical_features)
rows = 3
# Ceiling division. The original called `math.ceil`, but `math` is never
# imported in this notebook, so the cell failed with a NameError.
cols = -(-num_plots // rows)

# squeeze=False keeps `axes` two-dimensional even for a one-column grid
fig, axes = plt.subplots(rows, cols, figsize=(15, 10), squeeze=False)
grid = axes.ravel()  # row-major order: same slot as (i // cols, i % cols)

for ax, feature in zip(grid, numerical_features):
    sns.histplot(df_train[feature], kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')

# Remove the grid slots that received no feature
for unused_ax in grid[num_plots:]:
    fig.delaxes(unused_ax)

fig.set_facecolor('#FFF8DC')  # Set background color of the entire figure
plt.tight_layout()
plt.show()

In [None]:
# Horizontal box plots of a subset of the numerical features on one shared axis
boxplot_features = ['Length', 'Diameter', 'Height', 'Whole weight', 'Shell weight']

box_figure = plt.figure(figsize=(12, 5))
sns.boxplot(data=df_train[boxplot_features], orient='h', palette='Set3')
plt.title('Boxplot of Numerical Features')
box_figure.set_facecolor('#008080')
plt.show()

In [None]:
# Pie chart of the sex distribution in the training data.
# `Sex` was dropped from df_train after label encoding, so `df_train['Sex']`
# raised a KeyError; recover the original labels from `Sex_encoded`.
sex_labels = pd.Series(
    label_encoder.inverse_transform(df_train['Sex_encoded']),
    index=df_train.index,
    name='Sex',
)
gender_counts = sex_labels.value_counts()

plt.figure(figsize=(12, 5))
plt.pie(gender_counts.values, labels=gender_counts.index, autopct='%1.1f%%', startangle=140)
plt.axis('equal')
plt.title('Distribution of Gender')
plt.gcf().set_facecolor('#00FF00')
plt.show()