In [1]:
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import VotingRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR, LinearSVC
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor


import tools_tb

import warnings

warnings.filterwarnings("ignore")

## Preprocessing

In [2]:
df = pd.read_csv("diamonds_train.csv", index_col = 0)

# Cleaning
df = tools_tb.ordinal_encoder(df)
#df["color_2"] = df.color.map(color_dict)
df = tools_tb.nominal_encoder(df)
df = tools_tb.outliers_remover(df)

# Create the columns in the df for them


#df["length_width"] = df["x"] / df["y"]
#df = df[(df["length_width"] > 0.8) & (df["length_width"] < 1.2)]
#df[df["length_width"].isna()] = 0

#scaler = StandardScaler()
#df[["depth", "table", "x", "y", "z"]] = scaler.fit_transform(df[["depth", "table", "x", "y", "z"]])

# Defining X and y
to_drop = ["cut", "clarity"]

X, y = tools_tb.variables_split(df, to_drop)

## Baseline Model

In [3]:
# Split parameters
split = .2
seed = random.randint(0, 1000)

# Train-Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = split, random_state = 42)

# Model
rf = RandomForestRegressor(n_estimators = 500, criterion = "mse", random_state = 42, n_jobs = -1)

# Training
rf.fit(X_train, y_train)

# Predictions
y_pred_train = rf.predict(X_train)
y_pred_test = rf.predict(X_test)

# Scores
score_train = rf.score(X_train, y_train)
score_test = rf.score(X_test, y_test)

# RMSE
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"Train data\nScore = {score_train}\nRMSE = {rmse_train}")
print("#" * 50)
print(f"Test data\nScore = {score_test}\nRMSE = {rmse_test}")

Train data
Score = 0.9973096884961524
RMSE = 207.61223721551022
##################################################
Test data
Score = 0.9810513749181917
RMSE = 534.4145615987889


## New model

In [26]:
#### Split parameters
train_split = .2
val_split = .3

seed_ = 42
seeds = random.sample(range(0, 100000), 2)

model = 1
#################################

#### Train-Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = train_split, random_state = seed_)

#### Model
rf = RandomForestRegressor(n_estimators = 500, criterion = "mse", random_state = seed_, n_jobs = -1, max_depth = 15, max_features = 9, warm_start = True)

#### Cross-Validation
for seed in seeds:
    # Cross-validation split
    X_val_train, X_val_test, y_val_train, y_val_test = train_test_split(X_train, y_train, test_size = val_split, random_state = seed)

    # Training
    rf.fit(X_val_train, y_val_train)

    # Predictions
    y_pred_val_train = rf.predict(X_val_train)
    y_pred_val_test = rf.predict(X_val_test)

    # Scores
    score_val_train = rf.score(X_val_train, y_val_train)
    score_val_test = rf.score(X_val_test, y_val_test)

    # RMSE
    rmse_val_train = np.sqrt(mean_squared_error(y_val_train, y_pred_val_train))
    rmse_val_test = np.sqrt(mean_squared_error(y_val_test, y_pred_val_test))

    print(f"CV Model {model} | seed = {seed}")
    print(f"Val. Train data\nScore = {score_val_train}\nRMSE = {rmse_val_train}")
    print("-" * 25)
    print(f"Val. Test data\nScore = {score_val_test}\nRMSE = {rmse_val_test}")
    print("#" * 50)
    model += 1


#### Final model
rf.fit(X_train, y_train)

y_pred_train = rf.predict(X_train)
y_pred_test = rf.predict(X_test)

# Scores
score_train = rf.score(X_train, y_train)
score_test = rf.score(X_test, y_test)

# RMSE
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print("\n - Final Model - ")
print(f"Train data\nScore = {score_train}\nRMSE = {rmse_train}")
print("#" * 50)
print(f"Test data\nScore = {score_test}\nRMSE = {rmse_test}")


#### Model with full data
rf.fit(X, y)

y_predict = rf.predict(X)

score = rf.score(X, y)
rmse = np.sqrt(mean_squared_error(y, y_predict))
print("#" * 50)
print(f"\n - Full data model - \nScore = {score}\nRMSE = {rmse}")

CV Model 1 | seed = 9297
Val. Train data
Score = 0.9939758605257584
RMSE = 307.51440984645654
-------------------------
Val. Test data
Score = 0.9811316212124156
RMSE = 552.1354508485847
##################################################
CV Model 2 | seed = 5189
Val. Train data
Score = 0.9900811082445001
RMSE = 396.3377455304904
-------------------------
Val. Test data
Score = 0.9899586899747856
RMSE = 398.73008378161984
##################################################

 - Final Model - 
Train data
Score = 0.9900224333510529
RMSE = 399.81951021083523
##################################################
Test data
Score = 0.9901243385038909
RMSE = 385.8091554137571
##################################################

 - Full data model - 
Score = 0.9900444191594383
RMSE = 397.05699021223774


## New model 2

In [3]:
split = 0.1
seed = 42

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = split, random_state = seed)

# Cross validation
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = seed)

# Model parameters
parameters = {
    "n_estimators" : [500],
    "criterion" : ["mse"],
    "max_depth" : range(10, X.shape[1]),
    "min_samples_split" : range(2, 5),
    "min_samples_leaf" : range(1, 4),
    "min_weight_fraction_leaf" : np.linspace(0, 1, 3),
    "n_jobs" : [-1],
    "random_state" : random.sample(range(0, 10000), 5)
}

# Model
rf = RandomForestRegressor()

# Grid
grid = GridSearchCV(estimator = rf, param_grid = parameters, n_jobs = -1, cv = skf)

# Training
grid.fit(X_train, y_train)

# Predictions
y_pred_train = grid.predict(X_train)
y_pred_test = grid.predict(X_test)

# Scores
score_train = grid.score(X_train, y_train)
score_test = grid.score(X_test, y_test)

# RMSE
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"Train data\nScore = {score_train}\nRMSE = {rmse_train}")
print("-" * 25)
print(f"Test data\nScore = {score_test}\nRMSE = {rmse_test}")
print("#" * 50)
print("grid.best_stimator_", grid.best_estimator_)
print("-" * 25)
print("grid.best_params_", grid.best_params_)
print("-" * 25)
print("grid.best_score", grid.best_score_)

KeyboardInterrupt: 

## To save the model

In [17]:
X_pred = pd.read_csv("diamonds_test.csv", index_col = 0)


# Cleaning
X_pred = tools_tb.ordinal_encoder(X_pred)
X_pred = tools_tb.nominal_encoder(X_pred)


# Delete the old categorical independent variables
X_pred = tools_tb.variables_split(X_pred, to_drop, False)

predictions_submit = rf.predict(X_pred)

submission = pd.DataFrame({"id": range(len(predictions_submit)), "price": predictions_submit})

tools_tb.chequeator(submission)

You're ready to submit!
