In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_squared_error
from math import sqrt
import lightgbm as lgb
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Read and view dataframes

In [None]:
# Locations of the competition's train and test CSV files.
train_csv = "/kaggle/input/playground-series-s3e8/train.csv"
test_csv = "/kaggle/input/playground-series-s3e8/test.csv"

# low_memory=False makes pandas read each file in one pass instead of in
# chunks, which avoids mixed-dtype guesses per column.
df_train = pd.read_csv(train_csv, low_memory=False)
df_test = pd.read_csv(test_csv, low_memory=False)

# Quick look at the first training rows.
df_train.head()

# Drop the id column and encode string values to integers

In [None]:
# Remove the 'id' column from both frames so it is not used as a feature.
# NOTE: this rebinds df_train/df_test, so re-running the cell without
# reloading the CSVs raises KeyError because 'id' is already gone.
df_train, df_test = (
    frame.drop(columns=['id']) for frame in (df_train, df_test)
)
df_train.head()

In [None]:
# Count the missing values in each training column.
missing_per_column = df_train.isna().sum()
missing_per_column

In [None]:
# Explicit category order: the integer codes 0..4 follow the order of this
# list (presumably lowest to highest cut grade -- confirm against the data card).
cut_values = [['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']]
cut_encoder = OrdinalEncoder(categories=cut_values)

# Fit on the training data only ...
df_train['cut'] = cut_encoder.fit_transform(df_train[['cut']])
# ... and reuse the fitted encoder for the test data. Calling transform()
# (instead of fit_transform()) guarantees train and test share one mapping
# and never lets test data influence a fitted preprocessing step.
df_test['cut'] = cut_encoder.transform(df_test[['cut']])

In [None]:
# Explicit category order: the integer codes 0..6 follow the order of this list.
color_values = [['D', 'E', 'F', 'G', 'H', 'I', 'J']]
color_encoder = OrdinalEncoder(categories=color_values)

# Fit on the training data only ...
df_train['color'] = color_encoder.fit_transform(df_train[['color']])
# ... and reuse the fitted encoder for the test data. Calling transform()
# (instead of fit_transform()) guarantees train and test share one mapping
# and never lets test data influence a fitted preprocessing step.
df_test['color'] = color_encoder.transform(df_test[['color']])

In [None]:
# Explicit category order: the integer codes 0..7 follow the order of this
# list (presumably lowest to highest clarity grade -- confirm against the data card).
clarity_values = [['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']]
clarity_encoder = OrdinalEncoder(categories=clarity_values)

# Fit on the training data only ...
df_train['clarity'] = clarity_encoder.fit_transform(df_train[['clarity']])
# ... and reuse the fitted encoder for the test data. Calling transform()
# (instead of fit_transform()) guarantees train and test share one mapping
# and never lets test data influence a fitted preprocessing step.
df_test['clarity'] = clarity_encoder.transform(df_test[['clarity']])

In [None]:
# Replace zero-valued dimensions with 1 in both frames. The engineered
# features computed later divide by products and sums of x, y and z, so a
# zero here would turn into a division by zero there.
for frame in (df_train, df_test):
    for dim in ('x', 'y', 'z'):
        frame[dim] = frame[dim].replace(0, 1)

In [None]:
# Preview the first training rows after the encoding and zero-replacement cells.
df_train.head()

# Engineer new features

In [None]:
df_train['volume'] = df_train['x'] * df_train['y'] * df_train['z']
df_train['density'] = df_train['carat'] / df_train['volume']
df_train['table_percentage'] = (df_train['table'] / ((df_train['x'] + df_train['y']) / 2)) * 100
df_train['depth_percentage'] = (df_train['depth'] / ((df_train['x'] + df_train['y']) / 2)) * 100
df_train['symmetry'] = (abs(df_train['x'] - df_train['z']) + abs(df_train['y'] - df_train['z'])) / (df_train['x'] + df_train['y'] + df_train['z'])
df_train['surface_area'] = 2 * ((df_train['x'] * df_train['y']) + (df_train['x'] * df_train['z']) + (df_train['y'] * df_train['z']))
df_train['depth_to_table_ratio'] = df_train['depth'] / df_train['table']

In [None]:
df_test['volume'] = df_test['x'] * df_test['y'] * df_test['z']
df_test['density'] = df_test['carat'] / df_test['volume']
df_test['table_percentage'] = (df_test['table'] / ((df_test['x'] + df_test['y']) / 2)) * 100
df_test['depth_percentage'] = (df_test['depth'] / ((df_test['x'] + df_test['y']) / 2)) * 100
df_test['symmetry'] = (abs(df_test['x'] - df_test['z']) + abs(df_test['y'] - df_test['z'])) / (df_test['x'] + df_test['y'] + df_test['z'])
df_test['surface_area'] = 2 * ((df_test['x'] * df_test['y']) + (df_test['x'] * df_test['z']) + (df_test['y'] * df_test['z']))
df_test['depth_to_table_ratio'] = df_test['depth'] / df_test['table']

In [None]:
# Preview the first training rows, now including the engineered feature columns.
df_train.head()

# Split data into x_train and y_train

In [None]:
# Features are every column except the target; the target is 'price'.
x_train = df_train.drop(columns='price')
y_train = df_train['price']
# The test frame has no target column, so it is used as-is
# (x_test is the same object as df_test, not a copy).
x_test = df_test

In [None]:
# Display the training feature frame for a visual check.
x_train

In [None]:
# Display the test feature frame for a visual check.
x_test

In [None]:
# Summary statistics for every training feature column.
x_train.describe()

# Splitting into training data and validation data

In [None]:
# Convert the pandas objects to plain NumPy arrays for modelling.
# NOTE: the names are rebound, so this cell cannot be re-run on its own
# (NumPy arrays have no .to_numpy()) and column names are lost from here on.
x_train, y_train, x_test = (
    data.to_numpy() for data in (x_train, y_train, x_test)
)

In [None]:
# Hold out 20% of the rows for validation. The fixed seed (the same value the
# KFold splitter uses) makes the split, and every score derived from it,
# reproducible across Restart & Run All.
# NOTE: x_train/y_train are rebound to the 80% subset, so re-running this
# cell on its own would shrink the training set again.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=8
)

# Creating regression model

In [None]:
# LightGBM settings: regression objective, MSE as the reported metric.
lgb_params = dict(objective='regression', metric='mse')

# 10 shuffled folds with a fixed seed so fold membership is repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=8)

# importance_type='gain' makes feature_importances_ report split gain
# rather than split counts.
model = lgb.LGBMRegressor(importance_type='gain', **lgb_params)

# return_estimator=True keeps the fitted model of every fold under
# cv_results['estimator']; later cells average those models' outputs.
cv_results = cross_validate(
    estimator=model,
    X=x_train,
    y=y_train,
    scoring='neg_root_mean_squared_error',
    cv=kf,
    return_estimator=True,
)

In [None]:
# Derive the feature names from the training frame instead of hard-coding
# them, so the labels always match the columns (and their order) that were
# fed to the model. x_train is already a NumPy array at this point, so the
# names are taken from df_train, which still carries them.
columns = df_train.drop(columns='price').columns.tolist()

In [None]:
# Collect the importance vector of every fold's fitted model ...
fold_importances = [
    fitted.feature_importances_ for fitted in cv_results["estimator"]
]
# ... and average them feature-wise across folds.
feature_importances = np.mean(fold_importances, axis=0)
feature_importances.shape

In [None]:
# Horizontal bar chart: one bar per feature, length = mean importance across folds.
plt.barh(y=columns, width=feature_importances)
plt.title("Feature importance using LGBM and cross validation")
plt.ylabel("Feature name")
plt.xlabel("Feature importance")
plt.show()

In [None]:
# Predict the hold-out set with every fold's model, then average the
# predictions element-wise to get the ensemble prediction.
fold_val_preds = [fitted.predict(x_val) for fitted in cv_results['estimator']]
y_pred = np.mean(fold_val_preds, axis=0)

# Compare a few predictions against the true values.
print(y_pred[:5], y_val[:5])

# RMSE of the averaged prediction on the hold-out set.
sqrt(mean_squared_error(y_val, y_pred))

# Make submission

In [None]:
# Shape (rows, features) of the test matrix, a NumPy array since the to_numpy() cell.
x_test.shape

In [None]:
# Load the sample submission; it provides the row ids and column layout to fill in.
sample_submission_csv = "/kaggle/input/playground-series-s3e8/sample_submission.csv"
submission = pd.read_csv(sample_submission_csv)
submission

In [None]:
# Predict the test set with every fold's model and average element-wise.
fold_test_preds = [fitted.predict(x_test) for fitted in cv_results['estimator']]
predictions = np.mean(fold_test_preds, axis=0)
predictions.shape

In [None]:
# Fill the price column with one vectorised assignment.
#
# The previous row-by-row `submission['price'][i] = ...` was chained
# assignment: slow, not guaranteed to write through to `submission`
# (it does not under pandas Copy-on-Write), and it needed
# `pandas.core.common.SettingWithCopyWarning`, an import that no longer
# exists in pandas >= 2.0. Assigning the whole column removes all three problems.
if len(predictions) != len(submission):
    raise ValueError(
        f"Got {len(predictions)} predictions for {len(submission)} submission rows"
    )
submission['price'] = predictions

submission

In [None]:
# Write the submission file to the current directory; index=False omits the row index column.
submission.to_csv('submission.csv', index=False)