In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from optuna.samplers import TPESampler
import pickle
from IPython.core.display import HTML
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_log_error

In [None]:
# Kaggle input locations, kept in one place so they can be repointed when the
# notebook is run outside Kaggle.
COMPETITION_DIR = '/kaggle/input/playground-series-s4e4'
ORIGINAL_DIR = '/kaggle/input/ps-4-e-2-abalone-dataset-from-uci'

# The original abalone file is read with header=None, so its columns are
# positional integers until they are renamed.
original_train = pd.read_csv(f'{ORIGINAL_DIR}/abalone.data', header=None)

train = pd.read_csv(f'{COMPETITION_DIR}/train.csv')
test = pd.read_csv(f'{COMPETITION_DIR}/test.csv')
sample_submission = pd.read_csv(f'{COMPETITION_DIR}/sample_submission.csv')

In [None]:
# Remove the cap on displayed columns so wide frames are rendered in full.
pd.options.display.max_columns = None

train.head()

In [None]:
# Shape of the original dataset, followed by its (still positional) column labels.
print(original_train.shape)
list(original_train.columns)

In [None]:
# Save original dataset into .csv
# The original file has no header, so it borrows the competition column names.
# `id` is filtered out by name (instead of slicing off the first column) so this
# cell still works when re-run after `id` has been dropped from `train`.
original_train.columns = [column for column in train.columns if column != 'id']
original_train.to_csv('orig.csv', index=False)
original_train.tail()

In [None]:
# Keep the test ids for later use, then remove `id` from both frames.
# The guard and `errors='ignore'` make the cell idempotent: on a re-run `id` is
# already gone, so the previously saved `submission_id` is left untouched instead
# of the cell failing with AttributeError / KeyError.
if 'id' in test.columns:
    submission_id = test['id']

train = train.drop(columns='id', errors='ignore')
test = test.drop(columns='id', errors='ignore')

In [None]:
# Remove training rows that are exact copies of an earlier row.
train = train.drop_duplicates()

# Sanity check: number of rows still flagged as duplicates after the removal.
train_duplicates = train.loc[train.duplicated()]
print(train_duplicates.shape[0])

In [None]:
# Row/column counts of both datasets.
print(f'Train data: {train.shape}')
print(f'Test data: {test.shape}\n')

# Fraction of all observations (train + test) that belongs to the training data.
train_data_percentage = np.round(train.shape[0] / (train.shape[0] + test.shape[0]), 4)

# The `.2%` format scales by 100 and fixes two decimals, which avoids the
# floating-point noise (e.g. 39.989999999999995) of multiplying by 100 by hand.
print(f'Train data consists of {train_data_percentage:.2%} of all observations')
print(f'Test data consists of {1 - train_data_percentage:.2%} of all observations')

In [None]:
# Summary statistics of the numeric columns, one row per column.
train.describe().transpose()

In [None]:
# Number of missing values per column in each dataset.
train_missing = train.isna().sum()
test_missing = test.isna().sum()

print('TRAIN data\n')
print(f'{train_missing}\n\n\n')

print('TEST data\n')
print(test_missing)

In [None]:
# De-duplicate the training rows.
train = train.drop_duplicates()

# Rows still flagged as duplicates; their count is shown as the cell output.
duplicates = train.loc[train.duplicated()]
duplicates.shape[0]

In [None]:
# One-hot encode the non-numeric columns (first level dropped, integer indicators).
# `X` still contains the `Rings` target at this point.
X = pd.get_dummies(train, drop_first=True, dtype=int)

# Encode the test set the same way, then align it to the training feature columns.
# The two frames are encoded independently, so a category level present in only
# one of them would otherwise give the frames different columns. After the
# reindex, an indicator missing from test is filled with 0 and an indicator
# unknown to train is discarded.
feature_columns = [column for column in X.columns if column != 'Rings']
test = (
    pd.get_dummies(test, drop_first=True, dtype=int)
    .reindex(columns=feature_columns, fill_value=0)
)

In [None]:
# Set a large default figure size, then draw one histogram per column of X.
figure_rc = {'figure.figsize': (20, 16)}
sns.set(rc=figure_rc)
X.hist(color='orange');

In [None]:
# Target distribution: absolute counts first, then each value's share of the rows.
ring_counts = train.Rings.value_counts()
print(f'{ring_counts}\n\n')
print(ring_counts / train.shape[0])

In [None]:
# Split the train data into X and y
# `errors='ignore'` keeps the cell re-runnable once `Rings` has already been
# removed from X; `y` is always taken from the untouched `train` frame.
X = X.drop(columns='Rings', errors='ignore')
y = train.Rings

# NOTE: a disabled min-max scaling experiment used to sit here. If scaling is
# reintroduced, compute the min/max on X only and apply those same statistics
# to `test`; scaling `test` with its own min/max puts the two frames on
# different scales.

In [None]:
%%time
# I figured out best hyperparameters previously
best_forest = RandomForestRegressor(
    random_state=27,
)

best_forest.fit(X, y)
importance = best_forest.feature_importances_

feature_importance = pd.DataFrame(data=importance, index=X.columns, columns=['importance']) \
    .sort_values(ascending=True, by='importance')

feature_importance.plot(kind='barh', figsize=(12, 8), color='orange');

In [None]:
# Show the current column labels of X.
print(X.columns)

In [None]:
# Training dataset: Pearson correlations between the numeric feature columns.
numeric_columns_train = X.select_dtypes(include=np.number)
corr_train = numeric_columns_train.corr(method='pearson')

# Mask the upper triangle (diagonal included) so each pair is drawn only once.
mask_train = np.triu(np.ones_like(corr_train))

heatmap_kwargs = dict(
    annot=True,
    fmt='.2f',
    mask=mask_train,
    cmap='Spectral',
    cbar=None,
    linewidth=2,
)
sns.heatmap(corr_train, **heatmap_kwargs)
plt.tight_layout()
plt.show()

In [None]:
# Drop the same two features from the training and test frames. A single list
# keeps both frames in sync, and `errors='ignore'` lets the cell be re-run after
# the columns are already gone instead of raising KeyError.
columns_to_drop = ['Diameter', 'Whole weight.2']
X = X.drop(columns=columns_to_drop, errors='ignore')
test = test.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Training dataset: Pearson correlations of the numeric columns currently in X.
numeric_columns_train = X.select_dtypes(include=np.number)
corr_train = numeric_columns_train.corr(method='pearson')

# Mask the upper triangle (diagonal included) so each pair is drawn only once.
mask_train = np.triu(np.ones_like(corr_train))

heatmap_kwargs = dict(
    annot=True,
    fmt='.2f',
    mask=mask_train,
    cmap='coolwarm',
    cbar=None,
    linewidth=2,
)
sns.heatmap(corr_train, **heatmap_kwargs)
plt.tight_layout()
plt.show()

In [None]:
# Hold out 20% of the rows for validation; the seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=27,
)