In [None]:
import pandas as pd
import requests
import json
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import datasets
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import pickle
import joblib

from sklearn import linear_model
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder

In [None]:
# Training frame (its `price` column is used as the target further down) and
# the frame that the final predictions are generated for.
train_csv, test_csv = "../data/df_baseline.csv", "../data/tests/diamonds_test.csv"
df, test = pd.read_csv(train_csv), pd.read_csv(test_csv)

In [None]:
# Preview the training frame as loaded.
df

In [None]:
# Preview the test frame as loaded.
test

In [None]:
# Summary statistics of the training frame; used to spot implausible values
# before the filtering cells below.
df.describe()

In [None]:
df = df[df['x'] > 0.05]
df = df[df['x'] < 90]

In [None]:
df = df[df['y'] > 0.05]
df = df[df['y'] < 50]

In [None]:
df = df[df['z'] > 2]


In [None]:
df = df[df['depth'] > 50]
df = df[df['depth'] < 75]

In [None]:
df = df[df['table'] > 45]
df = df[df['table'] < 75]

In [None]:
# Ratio of `x` to `y`, stored as the 'L/W ratio' feature.
# `assign` returns a new frame instead of writing a column into the result of
# the boolean filters above, which would emit pandas' SettingWithCopyWarning.
df = df.assign(**{'L/W ratio': df['x'] / df['y']})

In [None]:
test.describe()

In [None]:
test.loc[test['y'] < 2, 'y'] = 5.739648
test.loc[test['y'] > 10, 'y'] = 10

In [None]:
test.loc[test['x'] < 2, 'x'] = 5.729978
test.loc[test['x'] > 10, 'x'] = 10

In [None]:
test.loc[test['z'] < 2, 'z'] = 3.538479

In [None]:
test.loc[test['table'] < 49, 'table'] = 57.490337
test.loc[test['table'] > 73, 'table'] = 57.490337

In [None]:
test.loc[test['depth'] < 52, 'depth'] = 61.753581
test.loc[test['depth'] > 73, 'depth'] = 61.753581

In [None]:
test['L/W ratio'] = test['x'] / test['y']
test

In [None]:
# Training frame after the outlier filters and with the 'L/W ratio' column added.
df

In [None]:
# Summary statistics after filtering, to check the value ranges.
df.describe()

In [None]:
# Pairwise correlations of the numeric columns only. At this point the frame
# still holds the un-encoded `cut`/`color`/`clarity` columns (they are only
# label-encoded further down), and pandas >= 2.0 raises on non-numeric columns
# in `corr()` instead of silently skipping them as older versions did.
df.select_dtypes(include='number').corr()

In [None]:
# One joint plot of `price` against each numeric feature. A single loop
# replaces six copy-pasted cells; the features are plotted in the same order
# as the original cells.
for feature in ["table", "depth", "x", "y", "carat", "z"]:
    sns.jointplot(data=df, x=feature, y="price")

In [None]:
# Summary statistics of the training frame (unchanged since the previous summary).
df.describe()

In [None]:
# Summary statistics of the test frame after its out-of-range values were patched.
test.describe()


In [None]:
# Full training frame.
df

In [None]:
# Full test frame.
test

In [None]:
# Distribution of `carat` in the test frame.
sns.histplot(data=test, x="carat")

In [None]:
# Test summary again; nothing has modified `test` since the summary above.
test.describe()

In [None]:
# Identical to the previous cell.
test.describe()

## Categorical data

In [None]:
#Extract City
df_cat = df[['cut','color','clarity']]
df_cat

In [None]:
cols = df_cat.columns

In [None]:
#Label encoding

le = LabelEncoder()
for i in cols:
    df_cat[i] = le.fit_transform(df_cat[i])
df_cat

In [None]:
test_cols = ['cut','color','clarity']
for i in test_cols:
    test[i] = le.fit_transform(test[i])
test

## Numerical variables


In [None]:
df_num = df[['price','carat','depth','table','x','y','z','L/W ratio']]
df_num

In [None]:
test.describe()

In [None]:
df_num.describe()

In [None]:
df = pd.concat([df_num, df_cat], axis=1)

In [None]:
df

In [None]:
df.corr()

In [None]:
df = df[['price','carat','depth','table','L/W ratio','cut','color','clarity']]
df

In [None]:
test = test[['carat','depth','table','L/W ratio','cut','color','clarity']]
test

In [None]:
X = df[['carat','depth','table','L/W ratio','cut','color','clarity']]
y = df['price']

## Scaling

In [None]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(X)
scaled_data

In [None]:
scaled_df = pd.DataFrame(scaled_data, columns=['price','carat','depth','table','L/W ratio','cut','color','clarity'])
scaled_df.describe()

In [None]:
scaled_test = scaler.fit_transform(test)
scaled_test

In [None]:
scaled_test = pd.DataFrame(scaled_test, columns=['carat','depth','table','L/W ratio','cut','color','clarity'])
scaled_test.describe()

In [None]:
X = scaled_df

In [None]:
# MODEL

#model = RandomForestRegressor()
model = GradientBoostingRegressor()

In [None]:
param_grid = {'n_estimators': [16, 32, 64, 128, 256, 512],
              'max_depth': [2, 4, 8, 16]}

In [None]:
grid_search = GridSearchCV(model,
                           param_grid,
                           cv=5,
                           verbose=3,
                           scoring='neg_root_mean_squared_error',
                           n_jobs=-1)

In [None]:
grid_search.fit(X,y)

In [None]:
print('\n')
print('Best hyperparameters: ', grid_search.best_params_, '\n')
print('Best score: ', -grid_search.best_score_, '\n')

In [None]:
final_model = grid_search.best_estimator_

In [None]:
#model = RandomForestRegressor(max_depth=16, n_estimators=128)
model = GradientBoostingRegressor(max_depth=8, n_estimators=256)

In [None]:
model.fit(X_train, y_train)

In [None]:
predictions = model.predict(X_test)

In [None]:
check = pd.DataFrame({'Ground truth':y_test, 'Predictions':predictions, 'Diff':y_test-predictions})
check

In [None]:
rmse = mean_squared_error(y_test, predictions, squared=False)
rmse

In [None]:
model = RandomForestRegressor(max_depth=8, n_estimators=256)

In [None]:
model.fit(X, y)

In [None]:
predictions = model.predict(test)

In [None]:
predictions

In [None]:
result = pd.DataFrame({'id': range(0, len(predictions)), 'price': predictions})
result

In [None]:
result.describe()

In [None]:
result.to_csv("../submissions/submision_no_z_enlabel_random_forest.csv", index=False)

In [None]:
X