In [19]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler

import tools_tb

import warnings

warnings.filterwarnings("ignore")

## Preprocessing

In [54]:
df = pd.read_csv("diamonds_train.csv", index_col = 0)

# Cleaning
df = tools_tb.ordinal_encoder(df)
df = tools_tb.nominal_encoder(df)
#df = tools_tb.outliers_remover(df)

df["length_width"] = df["x"] / df["y"]
#df = df[(df["length_width"] > 0.8) & (df["length_width"] < 1.2)]
df[df["length_width"].isna()] = 0

# Defining X and y
to_drop = ["cut", "clarity"]

X, y = tools_tb.variables_split(df, to_drop)

## Model

In [56]:
# Using Grid
split = .2
seed = 7881
count = 1

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = split, random_state = seed)

# Cross validation
skf = StratifiedKFold(n_splits = 4, shuffle = True, random_state = seed)

# Parameters for Random Forest
parameters = {
    "n_estimators" : [1000],
    "criterion" : ["mse"],
    "max_depth" : [X.shape[1]],
    "min_samples_split" : [2],
    "min_samples_leaf" : [2],
    "max_features" : [10],
    "n_jobs" : [-1],
    "random_state" : [seed],
    "warm_start" : [True]
}

# Model
rf = RandomForestRegressor()

grid = GridSearchCV(estimator = rf, param_grid = parameters, n_jobs = -1, cv = skf)

grid.fit(X_train, y_train)

# Predictions
y_pred_train = grid.predict(X_train)
y_pred_test = grid.predict(X_test)

# Scores
score_train = grid.score(X_train, y_train)
score_test = grid.score(X_test, y_test)

# RMSE
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"- Model {count} | seed = {seed} -")
print(f"Train data\nScore = {score_train}\nRMSE = {rmse_train}")
print("-" * 25)
print(f"Test data\nScore = {score_test}\nRMSE = {rmse_test}")
#print("#" * 50)
#print("grid.best_stimator_", grid.best_estimator_)
#print("-" * 25)
#print("grid.best_params_", grid.best_params_)
#print("-" * 25)
#print("grid.best_score", grid.best_score_)

- Model 1 | seed = 7881 -
Train data
Score = 0.9934837096209027
RMSE = 321.66151775702184
-------------------------
Test data
Score = 0.9825056651809854
RMSE = 524.5122380563685
##################################################
grid.best_stimator_ RandomForestRegressor(max_depth=16, max_features=10, min_samples_leaf=2,
                      n_estimators=1000, n_jobs=-1, random_state=7881,
                      warm_start=True)
-------------------------
grid.best_params_ {'criterion': 'mse', 'max_depth': 16, 'max_features': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000, 'n_jobs': -1, 'random_state': 7881, 'warm_start': True}
-------------------------
grid.best_score 0.9800092225709334


## Model and full data

In [57]:
grid.fit(X, y)

y_pred_final = grid.predict(X)

score_final = grid.score(X, y)
rmse_final = np.sqrt(mean_squared_error(y, y_pred_final))

print(f"Full data\nScore = {score_final}\nRMSE = {rmse_final}")

Full data
Score = 0.9933593265315354
RMSE = 324.40561953647705


## Save model

In [58]:
X_pred = pd.read_csv("diamonds_test.csv", index_col = 0)

X_pred = tools_tb.ordinal_encoder(X_pred)
X_pred = tools_tb.nominal_encoder(X_pred)

X_pred["length_width"] = X_pred["x"] / X_pred["y"]

# Delete the old categorical independent variables
X_pred = tools_tb.variables_split(X_pred, to_drop, False)

predictions_submit = grid.predict(X_pred)

submission = pd.DataFrame({"id": range(len(predictions_submit)), "price": predictions_submit})

tools_tb.chequeator(submission)

You're ready to submit!


In [12]:
submission

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
0,450.808116
1,1850.465643
2,9660.855531
3,536.093782
4,8836.371928
...,...
13444,4518.847658
13445,454.144140
13446,14684.673229
13447,9946.739563
