### First glance

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from optuna.samplers import TPESampler
import pickle
from IPython.core.display import HTML
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_log_error

In [None]:
  original_train = pd.read_csv('/kaggle/input/ps-4-e-2-abalone-dataset-from-uci/abalone.data', header=None)

train = pd.read_csv('/kaggle/input/playground-series-s4e4/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e4/test.csv')
sample_submission = pd.read_csv('/kaggle/input/playground-series-s4e4/sample_submission.csv')

In [None]:
# Show all properties on display
pd.set_option('display.max_columns', None)

train.head()

In [None]:
print(original_train.shape)
original_train.columns.tolist()

In [None]:
# Save original dataset into .csv
original_train.columns = train.columns[1:]
original_train.to_csv('orig.csv', index=False)
original_train.tail()

In [None]:
submission_id = test.id

train.drop(columns='id', axis=1, inplace=True)
test.drop(columns='id', axis=1, inplace=True)

In [None]:
train = train.drop_duplicates()

# Check whether all duplicates were removed
train_duplicates = train[train.duplicated()]
print(len(train_duplicates))

In [None]:
# Report the size of each split and its share of all observations.
print(f'Train data: {train.shape}')
print(f'Test data: {test.shape}\n')

n_train, n_test = train.shape[0], test.shape[0]
train_data_percentage = np.round(n_train / (n_train + n_test), 4)
# Use the percent format spec: multiplying the rounded fraction by 100 prints
# binary floating-point artefacts (e.g. 59.99999999999999%).
print(f'Train data consists of {train_data_percentage:.2%} of all observations')
print(f'Test data consists of {1 - train_data_percentage:.2%} of all observations')

In [None]:
train.describe().T

In [None]:
print('TRAIN data\n')
print(f'{train.isna().sum()}\n\n\n')

print('TEST data\n')
print(test.isna().sum())

In [None]:
train = train.drop_duplicates()

# Check whether all duplicates were removed
duplicates = train[train.duplicated()]
len(duplicates)

In [None]:
# One-hot encode the categorical columns with the level set taken from the
# training data. Encoding train and test independently can produce different
# dummy columns (or drop a different "first" level) whenever a category is
# missing from one of the frames.
categorical_columns = train.select_dtypes(include=['object', 'category']).columns
train_encoded, test_encoded = train.copy(), test.copy()
for column in categorical_columns:
    levels = sorted(train[column].dropna().unique())
    train_encoded[column] = pd.Categorical(train[column], categories=levels)
    test_encoded[column] = pd.Categorical(test[column], categories=levels)

X = pd.get_dummies(train_encoded, drop_first=True, dtype=int)
test = pd.get_dummies(test_encoded, drop_first=True, dtype=int)

In [None]:
sns.set(rc={'figure.figsize': (20, 16)})
X.hist(color='orange');

In [None]:
print(f'{train.Rings.value_counts()}\n\n')
print(train.Rings.value_counts() / train.shape[0])

In [None]:
# Split the train data into X and y
X = X.drop(['Rings'], axis=1)
y = train.Rings

# for column in X.columns.tolist():
#     X[column] = X[column].apply(lambda x: (x - X[column].min()) / (X[column].max() - X[column].min()))

# # Transform test data
# for column in test.columns.tolist():
#     test[column] = test[column].apply(lambda x: (x - test[column].min()) / (test[column].max() - test[column].min()))

# X.hist(color='LightSeaGreen');

In [None]:
%%time
# Fit a random forest on the full training data to rank the features by the
# forest's `feature_importances_`.
# NOTE(review): the original comment said the best hyperparameters had been
# found previously, but only `random_state` is passed here, so the estimator
# runs with the library defaults.
best_forest = RandomForestRegressor(
    random_state=27,
)

best_forest.fit(X, y)
importance = best_forest.feature_importances_

# One row per feature, sorted ascending so the most important bar is drawn on top.
feature_importance = pd.DataFrame(data=importance, index=X.columns, columns=['importance']) \
    .sort_values(ascending=True, by='importance')

feature_importance.plot(kind='barh', figsize=(12, 8), color='orange');

In [None]:
print(X.columns)

In [None]:
# Training dataset
numeric_columns_train = X.select_dtypes(include=np.number)
corr_train = numeric_columns_train.corr(method='pearson')
mask_train = np.triu(np.ones_like(corr_train))
sns.heatmap(corr_train, annot=True, fmt='.2f', mask=mask_train, cmap='Spectral', cbar=None, linewidth=2)
plt.tight_layout()
plt.show()

In [None]:
X = X.drop(['Diameter', 'Whole weight.2'], axis=1)
test = test.drop(['Diameter', 'Whole weight.2'], axis=1)

In [None]:
# Training dataset
numeric_columns_train = X.select_dtypes(include=np.number)
corr_train = numeric_columns_train.corr(method='pearson')
mask_train = np.triu(np.ones_like(corr_train))
sns.heatmap(corr_train, annot=True, fmt='.2f', mask=mask_train, cmap='coolwarm', cbar=None, linewidth=2)
plt.tight_layout()
plt.show()

In [None]:
# Split data into train and val
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=27)

In [None]:
# %%time
# def objective(trial):
#     model = RandomForestRegressor(
#         n_estimators=trial.suggest_int("n_estimators", 100, 1000),
# #         criterion=trial.suggest_categorical("criterion", ['poisson', 'absolute_error', 'friedman_mse', 'squared_error']),
#         min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 100),
#         max_depth=trial.suggest_int("max_depth", 1, 100),
#         min_samples_split=trial.suggest_int("min_samples_split", 2, 100),
#         random_state=27
#     )
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     return np.sqrt(mean_squared_log_error(y_test, y_pred))


# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

# optuna.logging.set_verbosity(optuna.logging.WARNING)

# sampler = TPESampler(seed=27)
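# NOTE: the objective returns RMSLE (lower is better), so the study has to minimize it;
# with direction="maximize" Optuna reports the worst trial as the "best" one.
# study = optuna.create_study(study_name="random_forest", direction="minimize", sampler=sampler)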
# study.optimize(objective, n_trials=10)

# print("Number of finished trials: ", len(study.trials))
# print("Best trial:")
# trial = study.best_trial
# print("  Value: ", trial.value)
# print("  Params: ")
# for key, value in trial.params.items():
#     print(f"    {key}: {value}")
# print()

"""
Number of finished trials:  10
Best trial:
  Value:  0.164014686713176
  Params:
    n_estimators: 544
    min_samples_leaf: 60
    max_depth: 8
    min_samples_split: 13

CPU times: user 6min 35s, sys: 276 ms, total: 6min 35s
Wall time: 6min 35s
"""

In [None]:
# %%time
# def objective(trial):
#     model = XGBRegressor(
#         max_depth=trial.suggest_int('max_depth', 1, 100),
#         learning_rate=trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
#         n_estimators=trial.suggest_int('n_estimators', 50, 1000),
#         min_child_weight=trial.suggest_int('min_child_weight', 1, 10),
#         gamma=trial.suggest_float('gamma', 1e-8, 1.0, log=True),
#         subsample=trial.suggest_float('subsample', 0.01, 1.0, log=True),
#         colsample_bytree=trial.suggest_float('colsample_bytree', 0.01, 1.0, log=True),
#         reg_alpha=trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
#         reg_lambda=trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
#         use_label_encoder=False,
#         random_state=27
#     )
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     try:
#         return np.sqrt(mean_squared_log_error(y_test, y_pred))
#     except Exception as e:
#         print(e)


# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

# optuna.logging.set_verbosity(optuna.logging.WARNING)

# sampler = TPESampler(seed=27)
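# NOTE: the objective returns RMSLE (lower is better), so the study has to minimize it;
# with direction="maximize" Optuna reports the worst trial as the "best" one.
# study = optuna.create_study(study_name="xgb", direction="minimize", sampler=sampler)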
# study.optimize(objective, n_trials=20)

# print("Number of finished trials: ", len(study.trials))
# print("Best trial:")
# trial = study.best_trial
# print("  Value: ", trial.value)
# print("  Params: ")
# for key, value in trial.params.items():
#     print(f"    {key}: {value}")
# print()

"""
Number of finished trials:  1
Best trial:
  Value:  0.1775845058982026
  Params:
    max_depth: 43
    learning_rate: 0.42576257222865277
    n_estimators: 749
    min_child_weight: 9
    gamma: 1.1669337024772915e-05
    subsample: 0.9097315662154742
    colsample_bytree: 0.6114890625963008
    reg_alpha: 4.761254082318455e-07
    reg_lambda: 0.008602430632882225

CPU times: user 24.5 s, sys: 667 ms, total: 25.2 s
Wall time: 25.2 s
"""

In [None]:
# %%time
# def objective(trial):
#     model = CatBoostRegressor(
#         iterations=trial.suggest_int("iterations", 100, 1000),
#         learning_rate=trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
#         colsample_bylevel=trial.suggest_float("colsample_bylevel", 0.05, 1.0),
#         min_data_in_leaf=trial.suggest_int("min_data_in_leaf", 1, 100),
#         depth=trial.suggest_int("depth", 4, 16),
#         l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
#         verbose=False,
#         random_state=27
#     )
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     return np.sqrt(mean_squared_log_error(y_test, y_pred))


# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

# optuna.logging.set_verbosity(optuna.logging.WARNING)

# sampler = TPESampler(seed=27)
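# NOTE: the objective returns RMSLE (lower is better), so the study has to minimize it;
# with direction="maximize" Optuna reports the worst trial as the "best" one.
# study = optuna.create_study(study_name="catboost", direction="minimize", sampler=sampler)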
# study.optimize(objective, n_trials=20)

# print("Number of finished trials: ", len(study.trials))
# print("Best trial:")
# trial = study.best_trial
# print("  Value: ", trial.value)
# print("  Params: ")

# for key, value in trial.params.items():
#     print(f"    {key}: {value}")
# print()

"""
Number of finished trials:  20
Best trial:
  Value:  0.27250015755480833
  Params:
    iterations: 101
    learning_rate: 0.0010172906333606835
    colsample_bylevel: 0.4796381789116622
    min_data_in_leaf: 42
    depth: 13
    l2_leaf_reg: 2.895211427077531e-08

CPU times: user 18min 10s, sys: 9min 21s, total: 27min 31s
Wall time: 13min 5s
"""


In [None]:
# %%time
# def objective(trial):
#     model = LGBMRegressor(
#         n_estimators=trial.suggest_int("n_estimators", 100, 1000),
#         max_depth=trial.suggest_int("max_depth", 1, 100),
#         learning_rate=trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
#         verbosity=-1,
#         boosting_type=trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
#         num_leaves=trial.suggest_int('num_leaves', 2, 256),
#         min_child_samples=trial.suggest_int('min_child_samples', 5, 100),
#         random_state=27
#     )
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     return np.sqrt(mean_squared_log_error(y_test, y_pred))


# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

# optuna.logging.set_verbosity(optuna.logging.WARNING)

# sampler = TPESampler(seed=27)
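# NOTE: the objective returns RMSLE (lower is better), so the study has to minimize it;
# with direction="maximize" Optuna reports the worst trial as the "best" one.
# study = optuna.create_study(study_name="lgbm", direction="minimize", sampler=sampler)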
# study.optimize(objective, n_trials=20)

# print("Number of finished trials: ", len(study.trials))
# print("Best trial:")
# trial = study.best_trial
# print("  Value: ", trial.value)
# print("  Params: ")
# for key, value in trial.params.items():
#     print(f"    {key}: {value}")
# print()

"""
Number of finished trials:  20
Best trial:
  Value:  0.9971664373669932
  Params:
    n_estimators: 676
    max_depth: 100
    learning_rate: 0.0010257989336468524
    boosting_type: dart
    num_leaves: 37
    min_child_samples: 22

CPU times: user 38min 41s, sys: 5.25 s, total: 38min 47s
Wall time: 38min 50s
"""

In [None]:
# Level-0 estimators for the stacking ensemble, as (name, estimator) pairs.
# NOTE(review): the LightGBM, CatBoost and random-forest settings below are the
# "best trial" values recorded in the Optuna cells above. Those studies were
# created with direction="maximize" while the objective returned RMSLE, so
# these are the highest-error trials, not the lowest -- re-run the studies
# with direction="minimize" before trusting them.
base_models = [
    # NOTE(review): these XGBoost values do not match the recorded XGBoost
    # study output above; their origin is not visible in this notebook.
    ('XGBoost', XGBRegressor(
        n_estimators=395,
        max_depth=6,
        learning_rate=0.01,
        random_state=27
    )),
    # Values copied from the recorded "lgbm" study output.
    ('LightGBM', LGBMRegressor(
        n_estimators=676,
        max_depth=100,
        learning_rate=0.0010257989336468524,
        boosting_type='dart',
        num_leaves=37,
        min_child_samples=22,
        random_state=27
    )),
    # Values copied from the recorded "catboost" study output.
    ('Catboost', CatBoostRegressor(
        iterations=101,
        learning_rate=0.0010172906333606835,
        colsample_bylevel=0.4796381789116622,
        min_data_in_leaf=42,
        depth=13,
        l2_leaf_reg=2.895211427077531e-08,
        random_state=27
    )),
    # Values copied from the recorded "random_forest" study output.
    ('Random_forest', RandomForestRegressor(
        n_estimators=544,
        min_samples_leaf=60,
        max_depth=8,
        min_samples_split=13,
        random_state=27
    ))
]

In [None]:
meta_model = XGBRegressor(
    n_estimators=395,
    max_depth=6,
    learning_rate=0.01,
    random_state=27
)

In [None]:
%%time
# Stack the base learners under the XGBoost meta-model and fit on the full
# training set. (A RandomForestRegressor that was constructed here and
# immediately discarded has been removed -- it had no effect.)
stacking_model = StackingRegressor(estimators=base_models, final_estimator=meta_model)
stacking_model.fit(X, y)

In [None]:
# `stacking_model` was fitted on all of X, which contains X_val, so scoring it
# on X_val would report training error. Fit an unfitted copy on the training
# split only to get an honest hold-out estimate (this costs one extra fit).
from sklearn.base import clone

validation_model = clone(stacking_model).fit(X_train, y_train)
y_pred_val = validation_model.predict(X_val)

rmsle_val = np.sqrt(mean_squared_log_error(y_val, y_pred_val))
print(f"Validation Root mean squared logarithmic error regression loss: {rmsle_val:.8f}")

In [None]:
y_pred_test = stacking_model.predict(test)
y_pred_test[:10]

In [None]:
submission = pd.DataFrame({
    'id': sample_submission.id,
    'Rings': y_pred_test
})

submission.to_csv('Kapturov_S4E4_submission.csv', index=False)
submission.head(10)

In [None]:
# Persist the fitted ensemble; the context manager guarantees the file handle
# is flushed and closed even if pickling fails.
with open("Kapturov_stacking_model.pkl", "wb") as model_file:
    pickle.dump(stacking_model, model_file)

### Second approach

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn import preprocessing
from tensorflow import keras
import tensorflow_addons as tfa
from tensorflow.keras import layers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, Lambda, Concatenate, Add, BatchNormalization, LeakyReLU,ELU
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import classification_report

In [None]:
df_train = pd.read_csv('/kaggle/input/playground-series-s4e4/train.csv')
df_test  = pd.read_csv('/kaggle/input/playground-series-s4e4/test.csv')
df_sub = pd.read_csv('/kaggle/input/playground-series-s4e4/sample_submission.csv')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training labels only and reuse that mapping for the
# test set; refitting on test can assign different integers to the same label.
label_encoder = LabelEncoder()
df_train['Sex_encoded'] = label_encoder.fit_transform(df_train['Sex'])
df_test['Sex_encoded'] = label_encoder.transform(df_test['Sex'])
# The raw `Sex` column is deliberately kept here: the plotting cells that
# follow still read it, and it is dropped in the encoding cell before modelling.

In [None]:
df_train.drop(columns=['id'], inplace=True)
df_test.drop(columns=['id'], inplace=True)

In [None]:
sns.set(style="whitegrid")
plt.figure(figsize=(12, 5))
sns.scatterplot(data=df_train, x='Shell weight', y='Rings', hue='Sex', palette='Set1')
plt.title('Rings vs. Shell Weight by Sex')
plt.xlabel('Shell Weight')
plt.ylabel('Rings')
plt.legend(title='Sex')
plt.gcf().set_facecolor('#DFFF00')

plt.show()

In [None]:
plt.figure(figsize=(12, 5))
# Pin the category order so the relabelled ticks are guaranteed to match the
# boxes; without `order`, seaborn places string categories in order of first
# appearance, which need not be Male/Female/Infant.
# Assumes `Sex` holds the codes 'M', 'F' and 'I' -- TODO confirm against the data.
sex_order = ['M', 'F', 'I']
sns.boxplot(data=df_train, x='Sex', y='Rings', order=sex_order, palette='Set1')
plt.title('Age Distribution (Rings) by Sex')
plt.xlabel('Sex')
plt.ylabel('Rings')
plt.gcf().set_facecolor('#FF00FF')
plt.xticks(ticks=range(len(sex_order)), labels=['Male', 'Female', 'Infant'])
plt.show()

In [None]:
# Correlate numeric columns only: `DataFrame.corr` cannot process string
# columns such as `Sex` (pandas >= 2.0 raises instead of skipping them).
corr_matrix = df_train.select_dtypes(include='number').corr()
plt.figure(figsize=(12,5))
sns.heatmap(corr_matrix, annot=True, cmap='viridis', fmt=".2f")
plt.gcf().set_facecolor('#00FFFF')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# `math` is not imported by any earlier cell of this notebook section.
import math

numerical_features = ['Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']
num_plots = len(numerical_features)
rows = 3
cols = math.ceil(num_plots / rows)

# squeeze=False keeps `axes` two-dimensional even for a single row or column.
fig, axes = plt.subplots(rows, cols, figsize=(15, 10), squeeze=False)

# Fill the grid row by row, one histogram (with KDE) per feature.
for ax, feature in zip(axes.flat, numerical_features):
    sns.histplot(df_train[feature], kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')

# Remove the grid cells that did not receive a plot.
for ax in axes.flat[num_plots:]:
    fig.delaxes(ax)

plt.gcf().set_facecolor('#FFF8DC')  # Set background color of the entire figure
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(12, 5))
sns.boxplot(data=df_train[['Length', 'Diameter', 'Height', 'Whole weight', 'Shell weight']], orient='h', palette='Set3')
plt.title('Boxplot of Numerical Features')
plt.gcf().set_facecolor('#008080')
plt.show()

In [None]:
gender_counts = df_train['Sex'].value_counts()
plt.figure(figsize=(12, 5))
plt.pie(gender_counts.values, labels=gender_counts.index, autopct='%1.1f%%', startangle=140)
plt.axis('equal')
plt.title('Distribution of Gender')
plt.gcf().set_facecolor('#00FF00')
plt.show()

In [None]:
numerical_val = df_train.select_dtypes(include=['int64', 'float64']).columns
num_cols = 2
# One grid row per numeric column. The previous hard-coded 8 rows raised an
# IndexError as soon as the frame held more than 8 numeric columns.
num_rows = len(numerical_val)
fig, axes = plt.subplots(num_rows, num_cols, figsize=(14, 10), squeeze=False)

for i, feature in enumerate(numerical_val):
    hist_ax, box_ax = axes[i]

    # Left column: histogram with KDE.
    sns.histplot(df_train[feature], kde=True, ax=hist_ax)
    hist_ax.set_title(f'{feature} Distribution')
    hist_ax.set_xlabel('')
    hist_ax.set_ylabel('')

    # Right column: boxplot for every feature except the last one (kept from
    # the original behaviour); the unused cell is removed instead of being
    # left as an empty frame.
    if i < len(numerical_val) - 1:
        sns.boxplot(x=df_train[feature], ax=box_ax)
        box_ax.set_title(f'{feature} Boxplot')
        box_ax.set_xlabel('')
        box_ax.set_ylabel('')
    else:
        fig.delaxes(box_ax)

fig.suptitle('Distribution of Features for Outliers', fontsize=16)
plt.tight_layout()
plt.gcf().set_facecolor('lightblue')
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Guarded so the cell can be re-run, or run after `Sex` has already been
# encoded and dropped, without raising a KeyError.
if 'Sex' in df_train.columns:
    label_encoder = LabelEncoder()
    df_train['Sex_encoded'] = label_encoder.fit_transform(df_train['Sex'])
    # Use the same column name as in train (it was misspelled 'Sex_encodd')
    # and the mapping fitted on train, so both frames share one feature layout.
    df_test['Sex_encoded'] = label_encoder.transform(df_test['Sex'])
    df_train.drop(columns=['Sex'], inplace=True)
    df_test.drop(columns=['Sex'], inplace=True)

In [None]:
y = df_train['Rings']
df_train = df_train.drop(['Rings'],axis=1)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_train, y, test_size=0.2, random_state=42)


In [None]:
def create_model(optimizer='adam', dropout_rate=0.0, learning_rate=0.001, activation='relu', hidden_layers=1):
    """Build and compile a fully connected regression network.

    The network is Dense(128) -> Dropout, followed by ``hidden_layers``
    repetitions of Dense(64) -> Dropout, and a single linear output unit.
    The input width is read from the module-level ``X_train``.

    Args:
        optimizer: ``'adam'``, ``'rmsprop'`` or ``'sgd'`` to build that
            optimizer with ``learning_rate``; any other value is handed to
            ``model.compile`` unchanged (``learning_rate`` is then unused).
        dropout_rate: Dropout rate applied after every dense hidden layer.
        learning_rate: Learning rate for the three named optimizers.
        activation: Activation function of the dense hidden layers.
        hidden_layers: Number of additional Dense(64) + Dropout pairs.

    Returns:
        The compiled model, using mean squared error as the loss.
    """
    model = Sequential()
    model.add(Dense(128, activation=activation, input_shape=(X_train.shape[1],)))
    model.add(Dropout(rate=dropout_rate))

    for _ in range(hidden_layers):
        model.add(Dense(64, activation=activation))
        model.add(Dropout(rate=dropout_rate))

    model.add(Dense(1))

    # Resolve the optimizer classes through the imported `keras` module: the
    # bare names Adam / RMSprop / SGD are never imported in this notebook.
    optimizer_classes = {
        'adam': keras.optimizers.Adam,
        'rmsprop': keras.optimizers.RMSprop,
        'sgd': keras.optimizers.SGD,
    }
    if isinstance(optimizer, str) and optimizer in optimizer_classes:
        optimizer = optimizer_classes[optimizer](learning_rate=learning_rate)

    model.compile(optimizer=optimizer, loss='mse')
    return model

# Randomized hyperparameter search over the network built by `create_model`,
# followed by an evaluation of the best estimator on the held-out split.
# NOTE(review): `KerasRegressor` is not imported anywhere in the visible
# notebook, and `RandomizedSearchCV` / `mean_squared_error` are imported only
# in a later cell -- add the imports before running this cell top to bottom.
model = KerasRegressor(build_fn=create_model, verbose=0)

# Candidate values sampled by the search; every key except `batch_size` is a
# keyword argument of `create_model`.
param_dist = {
    'optimizer': ['adam', 'rmsprop', 'sgd'],
    'dropout_rate': np.linspace(0.0, 0.5, 6),
    'learning_rate': [0.001, 0.01, 0.1],
    'batch_size': [16, 32, 64],
    'activation': ['relu', 'tanh'],
    'hidden_layers': [1, 2, 3]
}

# 20 sampled configurations, 3-fold CV each, scored by negative MSE.
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_iter=20, cv=3, scoring='neg_mean_squared_error', verbose=2)
random_search_result = random_search.fit(X_train, y_train)

# Report the best configuration, then the mean/std CV score of every candidate.
print("Best: %f using %s" % (random_search_result.best_score_, random_search_result.best_params_))
means = random_search_result.cv_results_['mean_test_score']
stds = random_search_result.cv_results_['std_test_score']
params = random_search_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Score the best estimator found by the search on the 20% hold-out split.
best_model = random_search_result.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Test MSE:", mse)


In [None]:
Final_Neur = best_model.predict(df_test)

In [None]:
df_public1 = pd.read_csv('/kaggle/input/ps4e4-prediction-generalization-regression/submission.csv')
df_public2 = pd.read_csv('/kaggle/input/k/jainsanyam10/ps4e4-prediction-generalization-regression/submission.csv')
df_public3 = pd.read_csv('/kaggle/input/galaxybillion/bestcnn.csv')
df_public4 = pd.read_csv('/kaggle/input/galaxybillion/bestnn.csv')
df_public5 = pd.read_csv('/kaggle/input/interspace/voyagerone2.csv')
df_public6 = pd.read_csv('/kaggle/input/neuralnetwo/voyageronefront.csv')

In [None]:
df_sub['Rings'] =  Final_Neur*0.001+df_public6['Rings']*0.5 +df_public2['Rings']*0.001+df_public5['Rings']*0.5

In [None]:
df_sub.to_csv('submission.csv', index=False)
df_sub['Rings'].hist()

### Third approach

In [None]:
import re
import sys

# Require Python >= 3.5 (`sys` was previously used without being imported).
assert sys.version_info >= (3, 5)

import sklearn
# Compare the version numerically: comparing version strings is lexicographic,
# so e.g. "0.3" >= "0.20" would wrongly pass.
_sklearn_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sklearn_version and tuple(map(int, _sklearn_version.groups())) >= (0, 20)
import numpy as np
import os

import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook
tqdm_notebook.get_lock().locks = []
from prettytable import PrettyTable
%matplotlib inline
import seaborn as sns
sns.set(style='darkgrid', font_scale=1.4)
from copy import deepcopy
from functools import partial
from itertools import combinations

from sklearn.cluster import KMeans
# !pip install yellowbrick
# from yellowbrick.cluster import KElbowVisualizer

import random
from random import uniform
import gc
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_squared_log_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler,PowerTransformer, FunctionTransformer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy import stats
import statsmodels.api as sm
import math
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.base import BaseEstimator, TransformerMixin
!pip install optuna
!pip install cmaes
import optuna
import xgboost as xgb
# !pip install catboost
!pip install lightgbm --install-option=--gpu --install-option="--boost-root=C:/local/boost_1_69_0" --install-option="--boost-librarydir=C:/local/boost_1_69_0/lib64-msvc-14.1"
import lightgbm as lgb
!pip install category_encoders
from category_encoders import OneHotEncoder, OrdinalEncoder, CountEncoder, CatBoostEncoder
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.linear_model import PassiveAggressiveRegressor, ARDRegression, RidgeCV, ElasticNetCV
from sklearn.linear_model import TheilSenRegressor, RANSACRegressor, HuberRegressor
from sklearn.ensemble import HistGradientBoostingRegressor,ExtraTreesRegressor,GradientBoostingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from catboost import CatBoost, CatBoostRegressor,CatBoostClassifier
from catboost import Pool
from sklearn.neighbors import KNeighborsRegressor
# Suppress warnings
import warnings
warnings.filterwarnings("ignore")
pd.pandas.set_option('display.max_columns',None)

In [None]:
train=pd.read_csv('/kaggle/input/playground-series-s4e4/train.csv')
test=pd.read_csv('/kaggle/input/playground-series-s4e4/test.csv')
submission=pd.read_csv("/kaggle/input/playground-series-s4e4/sample_submission.csv")

original=pd.read_csv("/kaggle/input/playgrounds4e04originaldata/Original.csv")

train_copy=train.copy()
test_copy=test.copy()

# Tag each row with its source: 1 = original dataset, 0 = competition data
original["original"]=1
train["original"]=0
test["original"]=0

train.drop(columns=["id"],inplace=True)
test.drop(columns=["id"],inplace=True)
original.drop(columns=["id"],inplace=True)

train=train.rename(columns={'Whole weight':'Whole_weight','Whole weight.1':'Shucked_weight', 'Whole weight.2':'Viscera_weight', 'Shell weight':'Shell_weight'})
test=test.rename(columns={'Whole weight':'Whole_weight','Whole weight.1':'Shucked_weight', 'Whole weight.2':'Viscera_weight', 'Shell weight':'Shell_weight'})

train=pd.concat([train,original],axis='rows')
train.reset_index(inplace=True,drop=True)

device='cpu'

target='Rings'

train.head()

In [None]:
# Summarise dtype and percentage of missing values per column for train and
# test. The target column does not exist in test, so it is reported as "NA".
table = PrettyTable()
table.field_names = ['Column Name', 'Data Type', 'Train Missing %', 'Test Missing %']

for column in train.columns:
    dtype_name = str(train[column].dtype)
    train_missing_pct = 100-train[column].count()/train.shape[0]*100
    test_missing_pct = "NA" if column == target else 100-test[column].count()/test.shape[0]*100
    table.add_row([column, dtype_name, train_missing_pct, test_missing_pct])

print(table)

In [None]:
class_0 = train[train['original'] == 0][target]
class_1 = train[train['original'] == 1][target]

mean_0 = np.mean(class_0)
median_0 = np.median(class_0)
mean_1 = np.mean(class_1)
median_1 = np.median(class_1)

fig, ax = plt.subplots(figsize=(12, 6))

ax.hist(class_0, bins=20, density=True, alpha=0.5, label='Original=0 Histogram')
ax.hist(class_1, bins=20, density=True, alpha=0.5, label='Original=1 Histogram')

x_values_0 = np.linspace(class_0.min(), class_0.max(), len(class_0))
density_values_0 = (1 / (np.sqrt(2 * np.pi) * np.std(class_0))) * np.exp(-0.5 * ((x_values_0 - mean_0) / np.std(class_0))**2)
ax.plot(x_values_0, density_values_0, color='red', label='Original=0 Density')

x_values_1 = np.linspace(class_1.min(), class_1.max(), len(class_1))
density_values_1 = (1 / (np.sqrt(2 * np.pi) * np.std(class_1))) * np.exp(-0.5 * ((x_values_1 - mean_1) / np.std(class_1))**2)
ax.plot(x_values_1, density_values_1, color='green', label='Original=1 Density')

ax.axvline(mean_0, color='blue', linestyle='dashed', linewidth=2, label='Mean (Original=0)')
ax.axvline(median_0, color='green', linestyle='dashed', linewidth=2, label='Median (Original=0)')
ax.axvline(mean_1, color='blue', linestyle='dashed', linewidth=2, label='Mean (Original=1)')
ax.axvline(median_1, color='red', linestyle='dashed', linewidth=2, label='Median (Original=1)')

ax.set_xlabel(target)
ax.set_ylabel('Frequency / Density')
ax.set_title('Histograms and Density Plots')

x_min = min(min(class_0), min(class_1))
x_max = max(max(class_0), max(class_1))
ax.set_xlim([x_min, x_max])

ax.legend(bbox_to_anchor=(1,1),fancybox=False,shadow=False, loc='upper left')

plt.tight_layout()
plt.show()

In [None]:
cont_cols=[f for f in train.columns if train[f].dtype in [float,int] and train[f].nunique()>2 and f not in [target]]

# Calculate the number of rows needed for the subplots
num_rows = (len(cont_cols) + 2) // 3

# Create subplots for each continuous column
fig, axs = plt.subplots(num_rows, 3, figsize=(15, num_rows*5))

# Loop through each continuous column and plot the histograms
for i, col in enumerate(cont_cols):
    # Determine the range of values to plot
    max_val = max(train[col].max(), test[col].max(), original[col].max())
    min_val = min(train[col].min(), test[col].min(), original[col].min())
    range_val = max_val - min_val

    # Determine the bin size and number of bins
    bin_size = range_val / 20
    num_bins_train = round(range_val / bin_size)
    num_bins_test = round(range_val / bin_size)
    num_bins_original = round(range_val / bin_size)

    # Calculate the subplot position
    row = i // 3
    col_pos = i % 3

    # Plot the histograms
    sns.histplot(train[col], ax=axs[row][col_pos], color='orange', kde=True, label='Train', bins=num_bins_train)
    sns.histplot(test[col], ax=axs[row][col_pos], color='green', kde=True, label='Test', bins=num_bins_test)
    sns.histplot(original[col], ax=axs[row][col_pos], color='blue', kde=True, label='Original', bins=num_bins_original)
    axs[row][col_pos].set_title(col)
    axs[row][col_pos].set_xlabel('Value')
    axs[row][col_pos].set_ylabel('Frequency')
    axs[row][col_pos].legend()

# Remove any empty subplots
if len(cont_cols) % 3 != 0:
    for col_pos in range(len(cont_cols) % 3, 3):
        axs[-1][col_pos].remove()

plt.tight_layout()
plt.show()

In [None]:
sns.pairplot(data=original, vars=cont_cols+[target], hue='Sex')
plt.show()

In [None]:
# Capture the figure/axes created here: the original discarded them and then
# called `tight_layout` on a stale `fig` left over from an earlier cell.
fig, ax = plt.subplots(figsize=(16, 5))
# Plot the target on the y-axis. The original used `col`, a leftover loop
# variable from a previous cell, which contradicted the title and axis label.
sns.violinplot(x='Sex', y=target, data=train, ax=ax)
plt.title('Rings Distribution by Sex', fontsize=14)
plt.xlabel('Sex', fontsize=12)
plt.ylabel('Rings', fontsize=12)
sns.despine()
fig.tight_layout()
plt.show()

In [None]:
features=[f for f in test.columns if f!='Sex']
corr = train[features].corr()
plt.figure(figsize = (8, 6), dpi = 300)
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr, mask = mask, cmap = 'magma', annot = True, annot_kws = {'size' : 6})
plt.title('Features Correlation Matrix\n', fontsize = 15, weight = 'bold')
plt.show()

In [None]:
def min_max_scaler(train, test, column):
    '''
    Min-max scale ``column`` in both frames using the overall min and max.

    Scaling with the training range alone can push extreme test values outside
    [0, 1], so the range is taken over train and test together.

    Args:
        train: Training DataFrame; ``column`` is overwritten in place.
        test: Test DataFrame; ``column`` is overwritten in place.
        column: Name of the column to scale.

    Returns:
        The (train, test) pair that was passed in, with ``column`` scaled.
    '''
    max_val=max(train[column].max(),test[column].max())
    min_val=min(train[column].min(),test[column].min())

    # A constant column has zero range; divide by 1 instead so it maps to 0
    # rather than NaN (0/0).
    value_range=max_val-min_val
    if value_range==0:
        value_range=1

    train[column]=(train[column]-min_val)/value_range
    test[column]=(test[column]-min_val)/value_range

    return train,test

def rmse(y1,y2):
    '''Return the root mean squared error between ``y1`` and ``y2``.'''
    first = np.array(y1)
    second = np.array(y2)
    squared_error = mean_squared_error(first, second)
    return np.sqrt(squared_error)

def nearest(y_predicted, y_original=None):
    '''Snap every prediction to the closest allowed target value.

    Args:
        y_predicted: Array-like of numeric predictions.
        y_original: Optional array-like of allowed values. Defaults to the
            module-level ``y_unique``.

    Returns:
        numpy.ndarray with the shape and dtype of ``y_predicted`` in which each
        element is replaced by the nearest value from ``y_original``. On ties
        the candidate that comes first in ``y_original`` wins.
    '''
    if y_original is None:
        y_original = y_unique

    predictions = np.asarray(y_predicted)
    candidates = np.asarray(y_original)

    # Distance of every prediction to every candidate in one vectorised step
    # (replaces a Python-level loop over all predictions). argmin returns the
    # first minimum, matching the tie-breaking of min(..., key=...).
    distances = np.abs(predictions[..., np.newaxis] - candidates)
    closest = distances.argmin(axis=-1)

    return candidates[closest].astype(predictions.dtype)

global y_unique
y_unique=train[target].unique()
y_unique_log=np.log1p(train[target]).unique()

xgb_params = {
            'n_estimators': 500,
            'max_depth': 6,
            'learning_rate': 0.0116,
            'colsample_bytree': 1,
            'subsample': 0.6085,
            'min_child_weight': 9,
            'reg_lambda': 4.879e-07,
            'max_bin': 431,
            'n_jobs': -1,
            'eval_metric': 'mae',
            'objective': "reg:absoluteerror",
            'tree_method': 'hist',
            'verbosity': 0,
            'random_state': 42,
        }

def fill_missing_numerical(train,test,target, max_iterations=10):
    '''Iteratively impute missing feature values with an XGBoost regressor.

    Train (without ``target``) and test are stacked. For every column that
    contains missing values, a model is fitted on the rows where that column
    is present and used to predict the rows where it is missing. The pass over
    all such columns is repeated ``max_iterations`` times, and the change
    between successive imputations (RMSE) is plotted per column.

    Args:
        train: Training DataFrame; the imputed columns are written back in place.
        test: Test DataFrame; the imputed columns are written back in place.
        target: Name of the target column, excluded from the feature set.
        max_iterations: Number of refinement passes.

    Returns:
        The (train, test) pair that was passed in. Both are returned untouched
        when no column has missing values.

    Note:
        Relies on the module-level ``xgb_params`` and on ``store_missing_rows``,
        which is defined outside this block (presumably it returns, per
        feature, the rows where that feature is missing -- verify).
    '''
    train_temp=train.copy()
    if target in train_temp.columns:
        train_temp=train_temp.drop(columns=target)

    df=pd.concat([train_temp,test],axis="rows")
    df=df.reset_index(drop=True)
    features=[f for f in df.columns if df[f].isna().sum()>0]
    if len(features)>0:
        # Step 1: Store the instances with missing values in each feature
        missing_rows = store_missing_rows(df, features)

        cat_features=[f for f in df.columns if not pd.api.types.is_numeric_dtype(df[f])]
        dictionary = {feature: [] for feature in features}

        for iteration in tqdm(range(max_iterations), desc="Iterations"):
            for feature in features:
                rows_miss = missing_rows[feature].index

                # Temporarily integer-encode the categorical columns and keep
                # one reverse mapping per column so each can be restored.
                rev_replace_dict={}
                for col in cat_features:
                    df[col]=df[col].astype(str)
                    levels=df[col].unique()
                    codes=np.arange(0, len(levels))
                    rev_replace_dict[col]=dict(zip(codes, levels))
                    df[col]=df[col].replace(dict(zip(levels, codes)))

                missing_temp = df.loc[rows_miss].copy()
                non_missing_temp = df.drop(index=rows_miss).copy()
                y_pred_prev=missing_temp[feature]
                missing_temp = missing_temp.drop(columns=[feature])

                # Step 2: Fit on the rows where the feature is observed
                X_train = non_missing_temp.drop(columns=[feature])
                y_train = non_missing_temp[[feature]]

                model2 = xgb.XGBRegressor(**xgb_params)
                model2.fit(X_train, y_train, verbose=False)

                # Step 3: Predict the missing values and write them back
                y_pred = np.array(model2.predict(missing_temp))
                df.loc[rows_miss, feature] = y_pred

                # Step 4: Track how much the imputation moved. On the first
                # pass the previous values are still NaN, so only positions
                # that already hold a number enter the RMSE.
                previous = np.asarray(y_pred_prev, dtype=float)
                comparable = ~np.isnan(previous)
                if comparable.any():
                    error_minimize = np.sqrt(np.mean((y_pred[comparable] - previous[comparable]) ** 2))
                else:
                    error_minimize = np.nan
                dictionary[feature].append(error_minimize)

                # Restore every categorical column with its OWN mapping (the
                # original reused the mapping of the last encoded column).
                for col in cat_features:
                    df[col]=df[col].replace(rev_replace_dict[col])

        for feature, values in dictionary.items():
            values=np.array(values)
            # Normalise to shares of the total change, ignoring NaN entries
            # and skipping the division when the total is zero.
            total=np.nansum(values)
            if total:
                values=values/total
            iterations = range(1, len(values) + 1)  # x-axis values (iterations)
            plt.plot(iterations, values, label=feature)  # plot the values
            plt.xlabel('Iterations')
            plt.ylabel('RMSE')
            plt.title('Minimization of RMSE with iterations')
            plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.show()
        train[features] = np.array(df.iloc[:train.shape[0]][features])
        test[features] = np.array(df.iloc[train.shape[0]:][features])

    return train,test

In [None]:
def new_features(data):
    """Return a copy of ``data`` extended with engineered size/weight features.

    The input frame is not modified. It must provide the columns ``Length``,
    ``Diameter``, ``Height``, ``Whole_weight``, ``Shucked_weight``,
    ``Viscera_weight`` and ``Shell_weight``.
    """
    df = data.copy()
    eps = 1e-5  # keeps every denominator away from zero
    whole = "Whole_weight"

    # A component heavier than the whole animal is capped at the whole weight.
    for part in ("Shell_weight", "Viscera_weight", "Shucked_weight"):
        df[part] = np.where(df[part] > df[whole], df[whole], df[part])

    # Footprint and bounding-box surface
    df["surface_area"] = df["Length"] * df["Diameter"]
    df["total_area"] = 2 * (
        df["surface_area"] + df["Height"] * df["Diameter"] + df["Length"] * df["Height"]
    )

    # Rough density and a BMI-like index
    df["approx_density"] = df[whole] / (df["surface_area"] * df["Height"] + eps)
    df["bmi"] = df[whole] / (df["Height"] ** 2 + eps)

    # Plain "numerator / (denominator + eps)" ratios, in output-column order
    simple_ratios = (
        ("length_dia_ratio", "Length", "Diameter"),
        ("length_height_ratio", "Length", "Height"),
        ("shell_shuck_ratio", "Shell_weight", "Shucked_weight"),
        ("shell_viscera_ratio", "Shell_weight", "Viscera_weight"),
        ("viscera_tot_ratio", "Viscera_weight", whole),
        ("shell_tot_ratio", "Shell_weight", whole),
        ("shuck_tot_ratio", "Shucked_weight", whole),
    )
    for name, numerator, denominator in simple_ratios:
        df[name] = df[numerator] / (df[denominator] + eps)

    # Shares where the component also appears in the denominator
    df["shell_body_ratio"] = df["Shell_weight"] / (df["Shell_weight"] + df[whole] + eps)
    df["flesh_ratio"] = df["Shucked_weight"] / (df[whole] + df["Shucked_weight"] + eps)

    # Whole weight relative to each component
    inverse_ratios = (
        ("inv_viscera_tot", "Viscera_weight"),
        ("inv_shell_tot", "Shell_weight"),
        ("inv_shuck_tot", "Shucked_weight"),
    )
    for name, component in inverse_ratios:
        df[name] = df[whole] / (df[component] + eps)

    # Weight not accounted for by the three components; negative values are
    # replaced by the smallest component weight found in this frame.
    unexplained = df[whole] - df["Shucked_weight"] - df["Viscera_weight"] - df["Shell_weight"]
    smallest_part = min(
        df["Shucked_weight"].min(),
        df["Viscera_weight"].min(),
        df["Shell_weight"].min(),
    )
    df["water_loss"] = np.where(unexplained < 0, smallest_part, unexplained)
    return df

# Derive the engineered columns for each of the three datasets.
train = new_features(data=train)
test = new_features(data=test)
original = new_features(data=original)

In [None]:
# Columns handled as continuous: non-object dtype with more than 10,000
# distinct values in the training frame.
cont_cols = [
    name
    for name in train.columns
    if train[name].dtype != 'O' and train[name].nunique() > 10000
]

sc = MinMaxScaler()

# Module-level bookkeeping that the feature-search functions update through
# their own `global` declarations (none is needed at module scope).
unimportant_features = []   # candidate columns that lost a per-feature comparison
overall_best_score = 1e5    # lowest single-feature CV RMSLE seen so far
overall_best_col = 'none'   # column that produced overall_best_score

def transformer(train, test,cont_cols, target):
    '''
    Build candidate transformations for each continuous column and rank them by
    the cross-validated RMSLE of a single-feature linear model.

    For every column in ``cont_cols`` the column is rescaled with
    ``min_max_scaler`` and seven derived columns are added: log1p, square root,
    Box-Cox, Yeo-Johnson, power 0.25, power 2 and log1p of the square root.
    A one-component TruncatedSVD of the column plus its derived columns is
    added as ``<col>_pca_comb``. Each candidate is scored with 5-fold CV, and
    every candidate except the best is appended to ``unimportant_features``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Source frames. Only copies are modified.
    cont_cols : iterable of str
        Names of the columns to transform.
    target : str
        Name of the target column in ``train``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_copy, test_copy)`` with all candidate columns still present.
        The test frame is returned with a fresh 0..n-1 index.

    Side effects
    ------------
    Updates the globals ``unimportant_features``, ``overall_best_score`` and
    ``overall_best_col``; prints a summary table and the best score.
    '''
    global unimportant_features
    global overall_best_score
    global overall_best_col
    train_copy = train.copy()
    test_copy = test.copy()
    table = PrettyTable()
    table.field_names = ['Feature', 'Original RMSLE', 'Transformation', 'Tranformed RMSLE']

    epsilon = 1e-5  # Box-Cox needs strictly positive input
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    for col in cont_cols:
        train_copy, test_copy = min_max_scaler(train_copy, test_copy, col)

        derived = ["log_"+col, "sqrt_"+col, "bx_cx_"+col, "y_J_"+col, "pow_"+col, "pow2_"+col, "log_sqrt"+col]
        pca_name = col + "_pca_comb"

        # Remove leftovers from a previous run from BOTH frames so train and
        # test keep the same set of columns.
        stale = derived + [pca_name]
        train_copy = train_copy.drop(columns=[c for c in stale if c in train_copy.columns])
        test_copy = test_copy.drop(columns=[c for c in stale if c in test_copy.columns])

        # Log transformation
        train_copy["log_"+col] = np.log1p(train_copy[col])
        test_copy["log_"+col] = np.log1p(test_copy[col])

        # Square root transformation
        train_copy["sqrt_"+col] = np.sqrt(train_copy[col])
        test_copy["sqrt_"+col] = np.sqrt(test_copy[col])

        # Box-Cox transformation (fitted on train, applied to test)
        box_cox = PowerTransformer(method='box-cox')
        train_copy["bx_cx_" + col] = box_cox.fit_transform(train_copy[[col]]+epsilon)
        test_copy["bx_cx_" + col] = box_cox.transform(test_copy[[col]]+epsilon)

        # Yeo-Johnson transformation (fitted on train, applied to test)
        yeo_johnson = PowerTransformer(method='yeo-johnson')
        train_copy["y_J_"+col] = yeo_johnson.fit_transform(train_copy[[col]])
        test_copy["y_J_"+col] = yeo_johnson.transform(test_copy[[col]])

        # Power transformations. The shift uses the TRAIN minimum for both
        # frames; a stateless FunctionTransformer would re-derive the minimum
        # from whichever frame it is applied to and shift test differently.
        col_min = train_copy[col].min()
        train_copy["pow_"+col] = np.power(train_copy[col] + 1 - col_min, 0.25)
        test_copy["pow_"+col] = np.power(test_copy[col] + 1 - col_min, 0.25)
        train_copy["pow2_"+col] = np.power(train_copy[col] + 1 - col_min, 2)
        test_copy["pow2_"+col] = np.power(test_copy[col] + 1 - col_min, 2)

        # Log of the square root
        train_copy["log_sqrt"+col] = np.log1p(train_copy["sqrt_"+col])
        test_copy["log_sqrt"+col] = np.log1p(test_copy["sqrt_"+col])

        temp_cols = [col] + derived

        train_copy, test_copy = fill_missing_numerical(train_copy, test_copy, target, 3)

        # One-component SVD over the column and its transformations. Assigning
        # the array positionally avoids the index alignment that pd.concat
        # performs, which would misplace rows when train has a non-range index.
        svd = TruncatedSVD(n_components=1)
        train_copy[pca_name] = svd.fit_transform(train_copy[temp_cols])[:, 0]
        test_copy = test_copy.reset_index(drop=True)
        test_copy[pca_name] = svd.transform(test_copy[temp_cols])[:, 0]
        temp_cols.append(pca_name)

        y = train_copy[target].values
        rmse_scores = []

        for f in temp_cols:
            X = train_copy[[f]].values

            rmses = []
            for train_idx, val_idx in kf.split(X, y):
                X_train, y_train = X[train_idx], y[train_idx]
                x_val, y_val = X[val_idx], y[val_idx]
                # Fit on log1p(target); predictions are mapped back and passed
                # through `nearest` before being scored in log space.
                model = LinearRegression()
                model.fit(X_train, np.log1p(y_train))
                y_pred = nearest(np.expm1(model.predict(x_val)))
                rmses.append(rmse(np.log1p(y_val), np.log1p(y_pred)))

            mean_rmse = np.mean(rmses)
            rmse_scores.append((f, mean_rmse))

            if overall_best_score > mean_rmse:
                overall_best_score = mean_rmse
                overall_best_col = f

            if f == col:
                orig_rmse = mean_rmse

        # Lowest RMSLE wins; every other candidate is recorded as unimportant.
        best_col, best_rmse = min(rmse_scores, key=lambda x: x[1])
        cols_to_drop = [f for f in temp_cols if f != best_col]

        if cols_to_drop:
            unimportant_features = unimportant_features+cols_to_drop
        table.add_row([col,orig_rmse,best_col ,best_rmse])
    print(table)
    print("overall best CV RMSLE score: ",overall_best_score)
    return train_copy, test_copy

# Add the candidate transformations, then run the model-based imputation.
train, test = transformer(train=train, test=test, cont_cols=cont_cols, target=target)
train, test = fill_missing_numerical(train, test, target, max_iterations=3)

In [None]:
# For each continuous column, cluster its rejected transformations into one
# integer-labelled feature and score that feature on its own.
table = PrettyTable()
table.field_names = ['Cluster WOE Feature', 'RMSLE (CV-TRAIN)']
kf=KFold(n_splits=5, shuffle=True, random_state=42)
for col in cont_cols:
    # Rejected candidates whose name contains this column's name.
    sub_set=[f for f in unimportant_features if col in f]
    if not sub_set:
        # Nothing to cluster for this column; the scaler would fail on zero columns.
        continue
    cluster_col=col+"_unimp_cluster_WOE"

    sc=StandardScaler()
    temp_train=sc.fit_transform(train[sub_set])
    temp_test=sc.transform(test[sub_set])

    # Seeded so the cluster labels are reproducible between runs.
    kmeans = KMeans(n_clusters=5, random_state=42)
    kmeans.fit(np.array(temp_train))
    labels_train = kmeans.labels_

    train[cluster_col] = labels_train
    test[cluster_col] = kmeans.predict(np.array(temp_test))

    X=train[[cluster_col]].values
    y=train[target].values

    best_rmse=[]
    for train_idx, val_idx in kf.split(X,y):
        X_train,y_train=X[train_idx],y[train_idx]
        x_val,y_val=X[val_idx],y[val_idx]
        model=LinearRegression()
        model.fit(X_train,np.log1p(y_train))
        y_pred=nearest(np.expm1(model.predict(x_val)))
        best_rmse.append(rmse(np.log1p(y_val),np.log1p(y_pred)))

    mean_rmse=np.mean(best_rmse)
    table.add_row([cluster_col,mean_rmse])
    # Lower RMSLE is better, matching the comparison used by the other
    # feature-search blocks.
    if overall_best_score>mean_rmse:
        overall_best_score=mean_rmse
        overall_best_col=cluster_col

print(table)

In [None]:
# Categorical-like columns: every object column, plus non-object columns with
# between 3 and 1,999 distinct training values whose name lacks "WOE".
cat_cols = []
for f in test.columns:
    if train[f].dtype == 'O':
        cat_cols.append(f)
        continue
    n_levels = train[f].nunique()
    if 2 < n_levels < 2000 and "WOE" not in f:
        cat_cols.append(f)
print(train[cat_cols].nunique())

In [None]:
def nearest_val(target):
    '''
    Return the element of the module-level sequence ``common`` that has the
    smallest absolute difference to ``target``.
    '''
    return min(common, key=lambda x: abs(x - target))


# Give every non-object column in cat_cols a categorical twin "<col>_cat".
# 'Sex' is listed up front and used as-is.
cat_cols_updated=['Sex']
for col in cat_cols:
    if train[col].dtype!="O":
        cat_name=f"{col}_cat"
        train[cat_name]=train[col]
        test[cat_name]=test[col]
        cat_cols_updated.append(cat_name)

        train_values=set(train[col].unique())
        test_values=set(test[col].unique())
        # Values that occur in only one of the two frames.
        uncommon=list(train_values ^ test_values)
        if uncommon:
            # nearest_val reads `common` from module scope. Sorting makes the
            # choice between two equally distant values deterministic.
            common=sorted(train_values & test_values)
            train[cat_name]=train[col].apply(nearest_val)
            test[cat_name]=test[col].apply(nearest_val)
print(train[cat_cols_updated].nunique())


# ---- encoders for the categorical columns ----

def OHE(train_df,test_df,cols,target):
    '''
    One-hot encode ``cols`` on the concatenation of train and test, so that no
    category is missed, and drop the least frequent category of each column
    because it is redundant.

    Returns ``(train_ohe, test_ohe)``. The encoded columns are named
    ``<category><col>_OHE``. The test frame gets a fresh 0..n-1 index and has
    the ``target`` column removed.
    '''
    combined = pd.concat([train_df, test_df], axis=0)
    for col in cols:
        one_hot = pd.get_dummies(combined[col])
        min_count_category = combined[col].value_counts().idxmin()
        one_hot = one_hot.drop(min_count_category, axis=1)
        one_hot.columns=[str(f)+col+"_OHE" for f in one_hot.columns]
        combined = pd.concat([combined, one_hot], axis="columns")
        combined = combined.loc[:, ~combined.columns.duplicated()]

    # Split back into train and test. Non-inplace calls avoid mutating a slice
    # of `combined`.
    n_train = len(train_df)
    train_ohe = combined.iloc[:n_train].copy()
    test_ohe = combined.iloc[n_train:].reset_index(drop=True).drop(columns=[target])
    return train_ohe, test_ohe

def high_freq_ohe(train, test, extra_cols, target, n_limit=50):
    '''
    One-hot encode columns with many unique values: keep the ``n_limit`` most
    frequent training categories and treat the rest as rare. Rare categories
    are mapped to 9999 and their indicator columns are dropped, together with
    any column that is constant in train.
    '''
    train_copy=train.copy()
    test_copy=test.copy()
    for col in extra_cols:
        # value_counts() is ordered by descending frequency, so everything
        # after the first n_limit entries is rare.
        rare_keys=list(train_copy[col].value_counts().index[n_limit:])
        rare_key_map=dict.fromkeys(rare_keys, 9999)

        train_copy[col]=train_copy[col].replace(rare_key_map)
        test_copy[col]=test_copy[col].replace(rare_key_map)
    train_copy, test_copy = OHE(train_copy, test_copy, extra_cols, target)
    drop_cols=[f for f in train_copy.columns if "9999" in f or train_copy[f].nunique()==1]
    train_copy=train_copy.drop(columns=drop_cols)
    test_copy=test_copy.drop(columns=drop_cols)

    return train_copy, test_copy

def cat_encoding(train, test,cat_cols_updated, target):
    '''
    Encode each column in ``cat_cols_updated`` and return the encoded copies.

    Per column: add ``<col>_count`` (training frequency) and
    ``<col>_count_label`` (frequency rank, 0 = most frequent), one-hot encode
    the column itself, and score the two count encodings with a single-feature
    linear model under 5-fold CV. A count encoding is dropped when its Pearson
    correlation with the better one exceeds 0.5.

    Side effects: updates the globals ``overall_best_score`` and
    ``overall_best_col``; prints each feature name, a summary table and the
    best score.
    '''
    global overall_best_score
    global overall_best_col
    table = PrettyTable()
    table.field_names = ['Feature', 'Encoded Features', 'RMSLE Score']
    train_copy=train.copy()
    test_copy=test.copy()
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    for feature in cat_cols_updated:
        counts = train[feature].value_counts()

        count_map = counts.to_dict()
        train_copy[feature + "_count"] = train[feature].map(count_map)
        test_copy[feature + "_count"] = test[feature].map(count_map)

        # Rank by frequency. `map` leaves categories unseen in train as NaN,
        # the same as the _count column, instead of keeping the raw value.
        rank_map = {level: rank for rank, level in enumerate(counts.index)}
        train_copy[feature+"_count_label"]=train[feature].map(rank_map).astype(float)
        test_copy[feature+"_count_label"]=test[feature].map(rank_map).astype(float)

        temp_cols = [feature + "_count", feature + "_count_label"]

        train_copy[feature]=train_copy[feature].astype(str)+"_"+feature
        test_copy[feature]=test_copy[feature].astype(str)+"_"+feature

        if train_copy[feature].nunique()<=15:
            # NOTE(review): the suffix is applied a second time here. Kept so
            # the generated one-hot column names stay exactly as before.
            train_copy[feature]=train_copy[feature].astype(str)+"_"+feature
            test_copy[feature]=test_copy[feature].astype(str)+"_"+feature
            train_copy, test_copy = OHE(train_copy, test_copy, [feature], target)
        else:
            train_copy,test_copy=high_freq_ohe(train_copy,test_copy,[feature], target, n_limit=15)

        train_copy=train_copy.drop(columns=[feature])
        test_copy=test_copy.drop(columns=[feature])

        y = train_copy[target].astype(int).values
        rmse_scores = []

        for f in temp_cols:
            X = train_copy[[f]].values

            rmses = []
            for train_idx, val_idx in kf.split(X, y):
                X_train, y_train = X[train_idx], y[train_idx]
                x_val, y_val = X[val_idx], y[val_idx]
                model=LinearRegression()
                model.fit(X_train,np.log1p(y_train))
                y_pred=nearest(np.expm1(model.predict(x_val)))
                rmses.append(rmse(np.log1p(y_val),np.log1p(y_pred)))

            mean_rmse = np.mean(rmses)
            rmse_scores.append((f,mean_rmse))
            if overall_best_score > mean_rmse:
                overall_best_score = mean_rmse
                overall_best_col = f
        best_col, best_rmse = min(rmse_scores, key=lambda x: x[1])

        # NOTE(review): the test below uses signed correlation, so a strongly
        # negative correlation never triggers a drop - confirm this is intended.
        corr_with_best_col = train_copy[temp_cols].corr(method='pearson')[best_col]
        cols_to_drop = [f for f in temp_cols if corr_with_best_col[f] > 0.5 and f != best_col]
        if cols_to_drop:
            train_copy = train_copy.drop(columns=cols_to_drop)
            test_copy = test_copy.drop(columns=cols_to_drop)

        table.add_row([feature, best_col, best_rmse])
        print(feature)
    print(table)
    print("overall best CV score: ", overall_best_score)
    return train_copy, test_copy

train, test= cat_encoding(train, test,cat_cols_updated, target)
train, test=fill_missing_numerical(train,test,target, max_iterations=3)

In [None]:
# Drop every rejected candidate that is still present in the training frame.
present_columns = set(train.columns)
first_drop = [name for name in unimportant_features if name in present_columns]
train = train.drop(columns=first_drop)
test = test.drop(columns=first_drop)

In [None]:
# For each continuous column: find the mutually correlated columns derived
# from it, add an SVD component and a target-mean-encoded cluster built from
# them, then keep whichever candidate has the best single-feature CV RMSLE.
final_drop_list=[]

table = PrettyTable()
table.field_names = ['Original', 'Final Transformation', "RMSLE(CV)- Regression"]
dt_params={'criterion': 'absolute_error'}
threshold=0.85
# Several parent features can share the same child features, so remember what
# was already selected to avoid selecting the same feature again.
best_cols=[]
kf=KFold(n_splits=5, shuffle=True, random_state=42)

for col in cont_cols:
    sub_set=[f for f in train.columns if col in f and train[f].nunique()>100]
    print(sub_set)
    if len(sub_set)<=2:
        continue

    # A column is "correlated" if an earlier column in sub_set has an absolute
    # correlation with it above the threshold.
    flagged=set()
    for i, feature in enumerate(sub_set):
        for other in sub_set[i+1:]:
            if np.abs(train[feature].corr(train[other])) > threshold:
                flagged.add(other)

    # Keep sub_set order so the result does not depend on set iteration order.
    correlated_features=[f for f in sub_set if f in flagged]
    print(correlated_features)
    if len(correlated_features)<2:
        continue

    # Scale before applying the decomposition.
    sc=StandardScaler()
    temp_train=sc.fit_transform(train[correlated_features])
    temp_test=sc.transform(test[correlated_features])

    # One-component SVD, assigned positionally so that row order is kept
    # whatever index the frames carry.
    pca_name=col+"_pca_comb_final"
    pca=TruncatedSVD(n_components=1)
    train[pca_name]=pca.fit_transform(temp_train)[:, 0]
    test[pca_name]=pca.transform(temp_test)[:, 0]

    # Clustering (seeded for reproducible labels).
    cluster_name=col+"_final_cluster"
    kmeans = KMeans(n_clusters=28, random_state=42)
    kmeans.fit(np.array(temp_train))

    train[cluster_name] = kmeans.labels_
    test[cluster_name] = kmeans.predict(np.array(temp_test))

    # Replace each cluster id by the mean target of its training rows.
    cat_labels2=train.groupby([cluster_name])[target].mean().to_dict()
    train[cluster_name]=train[cluster_name].map(cat_labels2)
    test[cluster_name]=test[cluster_name].map(cat_labels2)

    correlated_features=correlated_features+[pca_name,cluster_name]

    # See which candidate gives the best univariate fit with the target.
    y = train[target].astype(int).values
    rmse_scores = []

    for f in correlated_features:
        if f in best_cols:
            # Already selected for another parent column.
            continue
        X = train[[f]].values

        rmses = []
        for train_idx, val_idx in kf.split(X, y):
            X_train, y_train = X[train_idx], y[train_idx]
            x_val, y_val = X[val_idx], y[val_idx]
            model=LinearRegression()
            model.fit(X_train,np.log1p(y_train))
            y_pred=nearest(np.expm1(model.predict(x_val)))
            rmses.append(rmse(np.log1p(y_val),np.log1p(y_pred)))

        rmse_scores.append((f,np.mean(rmses)))

    if not rmse_scores:
        # Every candidate was already chosen elsewhere; nothing to select.
        continue

    best_col, best_rmse=min(rmse_scores, key=lambda x:x[1])
    best_cols.append(best_col)

    cols_to_drop = [f for f in correlated_features if  f not in best_cols]
    if cols_to_drop:
        final_drop_list=final_drop_list+cols_to_drop
    table.add_row([col,best_col ,best_rmse])

print(table)

In [None]:
# Every training column except the target, without duplicate names.
candidate_names = [name for name in train.columns if name != target]
final_features = list(set(candidate_names))

# Standardise with statistics learned on train and reuse them for test.
sc = StandardScaler()

train_scaled, test_scaled = train.copy(), test.copy()
train_scaled[final_features] = sc.fit_transform(train[final_features])
test_scaled[final_features] = sc.transform(test[final_features])
len(final_features)

In [None]:
def post_processor(train, test):
    """Drop feature columns that duplicate an earlier column in ``train``.

    Two columns count as duplicates when the sum of their absolute
    differences over the training rows is exactly zero. The later column is
    removed from copies of both frames. The column named by the module-level
    ``target`` is never compared. Prints the list of dropped columns and
    returns ``(train_cop, test_cop)``.
    """
    train_cop = train.copy()
    test_cop = test.copy()
    feature_names = train.drop(columns=[target]).columns

    drop_cols = []
    for pos, left in enumerate(feature_names):
        for right in feature_names[pos + 1:]:
            total_gap = sum(abs(train_cop[left] - train_cop[right]))
            if total_gap == 0 and right not in drop_cols:
                drop_cols.append(right)
    print(drop_cols)

    train_cop = train_cop.drop(columns=drop_cols)
    test_cop = test_cop.drop(columns=drop_cols)
    return train_cop, test_cop

# Remove duplicated feature columns from the scaled frames.
train_cop, test_cop = post_processor(train=train_scaled, test=test_scaled)

In [None]:
# Split the training rows by whether the target is one of 7, 8, 9, 10, 11.
in_band = train_cop[target].isin([7, 8, 9, 10, 11])
data = train_cop[in_band]
data_ = train_cop[~in_band]

from sklearn.ensemble import IsolationForest

def isolation_forest(data):
    """Return the rows of ``data`` that an Isolation Forest does not flag.

    The forest (contamination 0.01, ``random_state=0``) is fitted on every
    column except the module-level ``target``. Prints the number of flagged
    rows and the shape of the remaining frame. ``data`` is not modified.
    """
    # Match the target by exact name. `f in target` on a string is a
    # substring test and would also exclude e.g. a column named "R".
    target_cols = set(target) if isinstance(target, (list, tuple, set)) else {target}
    features = data.drop(columns=[f for f in data.columns if f in target_cols])

    model = IsolationForest(contamination=0.01, random_state=0)
    model.fit(features)

    # predict() returns -1 for anomalies and 1 for inliers.
    outliers = model.predict(features) == -1
    print(f"Number of detected Potential outliers: {int(outliers.sum())}")

    # Positional boolean mask, so it works for any index on `data`.
    data_clean = data[~outliers]
    print(data_clean.shape)

    return data_clean


# First outlier pass, then renumber the remaining rows from 0.
data_clean = isolation_forest(data)
data_clean = data_clean.reset_index(drop=True)

from sklearn.neighbors import LocalOutlierFactor

# params={'n_neighbors': 13, 'contamination': 0.01019797151100069}
# Keyword arguments passed to LocalOutlierFactor inside lof().
params = dict(n_neighbors=13, contamination=0.01)


def lof(data):
    """Return the rows of ``data`` that Local Outlier Factor does not flag.

    The detector is built from the module-level ``params`` and fitted on
    every column except the module-level ``target``. Prints the number of
    flagged rows and the shape of the remaining frame. ``data`` is not
    modified.
    """
    # Match the target by exact name; `f in target` on a string is a
    # substring test.
    target_cols = set(target) if isinstance(target, (list, tuple, set)) else {target}
    features = data.drop(columns=[f for f in data.columns if f in target_cols])

    # Named `detector` so the local does not shadow this function.
    detector = LocalOutlierFactor(**params)

    # fit_predict() returns -1 for outliers and 1 for inliers.
    outliers = detector.fit_predict(features) == -1
    print(f"Number of detected Potential outliers: {int(outliers.sum())}")

    # Positional boolean mask, so it works for any index on `data`.
    data_clean = data[~outliers]
    print(data_clean.shape)

    return data_clean

# Second outlier pass on the already filtered rows, then renumber from 0.
data_clean = lof(data_clean)
data_clean = data_clean.reset_index(drop=True)

def pca_anamolies(data):
    """Return the rows of ``data`` whose PCA reconstruction error is small.

    All columns except the module-level ``target`` are standardised and
    projected onto two principal components. A row is treated as a potential
    outlier when the mean squared difference between its scaled values and
    their reconstruction exceeds 3.5. Prints the outlier count and the
    resulting shape, and shows a scatter plot of the two components with the
    outliers in red. ``data`` is not modified.
    """
    # Match the target by exact name; `f in target` on a string is a
    # substring test.
    target_cols = set(target) if isinstance(target, (list, tuple, set)) else {target}
    features = data.drop(columns=[f for f in data.columns if f in target_cols])

    scaled_features = StandardScaler().fit_transform(features)

    pca = PCA(n_components=2)  # two components, also used for the plot
    principal_components = pca.fit_transform(scaled_features)

    # Reconstruction error (MSE) for each row
    reconstructed = pca.inverse_transform(principal_components)
    reconstruction_errors = ((scaled_features - reconstructed) ** 2).mean(axis=1)

    # Threshold for anomaly detection
    threshold = 3.5

    # Positional boolean mask. Using row positions as .loc labels would be
    # wrong whenever `data` does not carry a 0..n-1 index.
    outlier_mask = np.asarray(reconstruction_errors > threshold)

    print(f"Number of detected Potential outliers: {int(outlier_mask.sum())}")

    data_clean = data[~outlier_mask]
    print(data_clean.shape)

    # Plot all rows, then overplot the potential outliers
    plt.scatter(principal_components[:, 0], principal_components[:, 1], c='green', label='Normal Data')
    plt.scatter(principal_components[outlier_mask, 0], principal_components[outlier_mask, 1], c='red', label='Potential Outliers')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.legend()
    plt.title('PCA with Potential Outliers')
    plt.show()
    return data_clean

# Third outlier pass (PCA reconstruction error), then renumber from 0.
data_clean = pca_anamolies(data_clean)
data_clean = data_clean.reset_index(drop=True)

In [None]:
# Separate the target from the features; test has features only.
y_train = train_cop[target]
X_train = train_cop.drop(columns=[target])

X_test = test_cop.copy()

print(X_train.shape, X_test.shape)

In [None]:
def get_most_important_features(X_train, y_train, n,model_input):
    """Return the ``n`` features a gradient-boosting model ranks highest.

    The model is fitted on ``log1p(y_train)`` in each of 5 stratified folds
    and the mean validation RMSE (in log space) is printed. The ranking uses
    ``feature_importances_`` of the model as fitted on the last fold.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix; rows are selected positionally.
    y_train : pandas.Series
        Target values, also used as the stratification labels.
    n : int
        Number of feature names to return.
    model_input : str
        A string containing ``'xgb'`` selects XGBoost, one containing
        ``'cat'`` selects CatBoost, anything else selects LightGBM.

    Returns
    -------
    list of str
        Up to ``n`` column names ordered by decreasing importance.
    """

    lgb_params = {
            'n_estimators': 100,
            'max_depth': 6,
            "num_leaves": 16,
            'learning_rate': 0.05,
            'subsample': 0.7,
            'colsample_bytree': 0.8,
            #'reg_alpha': 0.25,
            'reg_lambda': 5e-07,
            'objective': 'regression_l2',
            'metric': 'mean_absolute_error',
            'boosting_type': 'gbdt',
            'random_state': 42,
            'verbose':-1
        }
    cb_params = {
            'iterations': 300,
            'depth': 6,
            'learning_rate': 0.01,
            'l2_leaf_reg': 0.5,
            'random_strength': 0.2,
            'max_bin': 150,
            'od_wait': 80,
            'one_hot_max_size': 70,
            'grow_policy': 'Depthwise',
            'bootstrap_type': 'Bayesian',
            'od_type': 'IncToDec',
            'eval_metric': 'MSLE',
            'loss_function': 'RMSE',
            'random_state': 42,
             'verbose':False
        }

    xgb_params = {
            'n_estimators': 500,
            'max_depth': 6,
            'learning_rate': 0.0116,
            'colsample_bytree': 1,
            'min_child_weight': 9,
            'n_jobs': -1,
            'eval_metric': 'rmsle',
            'objective': "reg:squarederror",
            'tree_method': 'hist',
            'verbosity': 0,
            'random_state': 42,
        }

    # Choose the model and its fit() keywords together, so the fit call always
    # matches the estimator that was actually built. LightGBM is silenced via
    # 'verbose': -1 in its parameters and gets no `verbose` argument at fit
    # time (recent LightGBM releases reject it - confirm for the pinned version).
    if 'xgb' in model_input:
        model = xgb.XGBRegressor(**xgb_params)
        fit_kwargs = {'verbose': False}
    elif 'cat' in model_input:
        model=CatBoostRegressor(**cb_params)
        fit_kwargs = {'verbose': False}
    else:
        model=lgb.LGBMRegressor(**lgb_params)
        fit_kwargs = {}

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    rmse_scores = []

    for train_idx, val_idx in kfold.split(X_train,y_train):
        X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model.fit(X_train_fold, np.log1p(y_train_fold), **fit_kwargs)

        # Predictions stay in log space, so this RMSE is an RMSLE on the
        # original target scale.
        y_pred = model.predict(X_val_fold)
        rmse_scores.append(np.sqrt(mean_squared_error(np.log1p(y_val_fold),y_pred)))

    avg_rmse = np.mean(rmse_scores)

    # Importances of the model as fitted on the last fold.
    ranked = sorted(
        zip(X_train.columns, model.feature_importances_),
        key=lambda pair: pair[1],
        reverse=True,
    )

    top_n_features = [name for name, _ in ranked[:n]]
    print(avg_rmse)
    return top_n_features


In [None]:
# Rank the features with each of the three boosters on the same re-indexed frame.
X_ranked = X_train.reset_index(drop=True)
n_imp_features_cat = get_most_important_features(X_ranked, y_train, 75, 'cat')
n_imp_features_xgb = get_most_important_features(X_ranked, y_train, 75, 'xgb')
n_imp_features_lgbm = get_most_important_features(X_ranked, y_train, 75, 'lgbm')

In [None]:
# Union of the LightGBM and CatBoost top features. dict.fromkeys removes
# duplicates while keeping first-seen order, so the column order is the same
# on every run (a set's order can vary between runs for strings).
# NOTE(review): n_imp_features_xgb is computed but not merged here - confirm
# that leaving it out is intended.
n_imp_features=list(dict.fromkeys(n_imp_features_lgbm+n_imp_features_cat))
print(f"{len(n_imp_features)} features have been selected from the LightGBM and CatBoost rankings for the final model")

X_train=X_train[n_imp_features]
X_test=X_test[n_imp_features]