In [2]:
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import VotingRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR, LinearSVC
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor


import tools_tb

import warnings

warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv("diamonds_train.csv", index_col = 0)

# Cleaning
df = tools_tb.ordinal_encoder(df)
df = tools_tb.nominal_encoder(df)
df = tools_tb.outliers_remover(df)

# Defining X and y
to_drop = ["cut", "clarity"]

X, y = tools_tb.variables_split(df, to_drop)

In [4]:
# Split parameters
split = .1
seed = 74909

# 74909, 

# Train-Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = split, random_state = seed)

# Model
rf = RandomForestRegressor(n_estimators = 500, criterion = "mse", random_state = 42, n_jobs = -1)

# Training
rf.fit(X_train, y_train)

# Predictions
y_pred_train = rf.predict(X_train)
y_pred_test = rf.predict(X_test)

# Scores
score_train = rf.score(X_train, y_train)
score_test = rf.score(X_test, y_test)

# RMSE
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"Train data\nScore = {score_train}\nRMSE = {rmse_train}")
print("#" * 50)
print(f"Test data\nScore = {score_test}\nRMSE = {rmse_test}")

Train data
Score = 0.9973793956677462
RMSE = 203.82192256740407
##################################################
Test data
Score = 0.9815142283927661
RMSE = 538.3337680160502


In [14]:
df = pd.read_csv("diamonds_train.csv", index_col = 0)

# Cleaning
df = tools_tb.ordinal_encoder(df)
#df = tools_tb.nominal_encoder(df)
#df = tools_tb.outliers_remover(df)

color_dict = {
        "D" : 0,
        "E" : 0,
        "F" : 0,
        "G" : 1,
        "H" : 1,
        "I" : 1,
        "J" : 1,
    }

# Create the columns in the df for them
df["color"] = df.color.map(color_dict)

df["length_width"] = df["x"] / df["y"]
#df = df[(df["length_width"] > 0.8) & (df["length_width"] < 1.2)]
df[df["length_width"].isna()] = 0

#scaler = StandardScaler()
#df[["depth", "table", "x", "y", "z"]] = scaler.fit_transform(df[["depth", "table", "x", "y", "z"]])

# Defining X and y
to_drop = ["cut", "clarity"]

X, y = tools_tb.variables_split(df, to_drop)

In [15]:
split = 0.2
seed = 42

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = split, random_state = seed)

# Models
rf = RandomForestRegressor(max_depth=16, max_features=X.shape[1], min_samples_leaf=2,n_estimators=1000, n_jobs=-1, random_state=7881, warm_start=True)

# Training
rf.fit(X_train, y_train)

# Predictions
y_pred_train = rf.predict(X_train)
y_pred_test = rf.predict(X_test)

# Scores
score_train = rf.score(X_train, y_train)
score_test = rf.score(X_test, y_test)

# RMSE
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"Train data\nScore = {score_train}\nRMSE = {rmse_train}")
print("-" * 25)
print(f"Test data\nScore = {score_test}\nRMSE = {rmse_test}")

Train data
Score = 0.9884670120634639
RMSE = 427.42428737480117
-------------------------
Test data
Score = 0.9612975249249112
RMSE = 783.8314921632134


In [23]:
split = 0.2
seed = 42

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = split, random_state = seed)

# Model
svr = SVR(C = 10000, gamma = 0.1, max_iter = 10000)

# Training
svr.fit(X_train, y_train)

# Predictions
y_pred_train = svr.predict(X_train)
y_pred_test = svr.predict(X_test)

# Scores
score_train = svr.score(X_train, y_train)
score_test = svr.score(X_test, y_test)

# RMSE
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"Train data\nScore = {score_train}\nRMSE = {rmse_train}")
print("#" * 50)
print(f"Test data\nScore = {score_test}\nRMSE = {rmse_test}")

Train data
Score = 0.9763436980357585
RMSE = 614.3881695701701
##################################################
Test data
Score = 0.9746863246780925
RMSE = 623.4184374794745


In [27]:
split = 0.2
seed = 42

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = split, random_state = seed)

# Cross validation
skf = StratifiedKFold(n_splits = 4, shuffle = True, random_state = seed)

# Model parameters
parameters = {
    "gamma" : [.1, .3, .6],
    "C" : range(1000, 11000, 3000),
    "max_iter": [10000]
}

# Model
svr = SVR()

# Grid
grid = GridSearchCV(estimator = svr, param_grid = parameters, n_jobs = -1, cv = skf)

# Training
grid.fit(X_train, y_train)

# Predictions
y_pred_train = grid.predict(X_train)
y_pred_test = grid.predict(X_test)

# Scores
score_train = grid.score(X_train, y_train)
score_test = grid.score(X_test, y_test)

# RMSE
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"Train data\nScore = {score_train}\nRMSE = {rmse_train}")
print("-" * 25)
print(f"Test data\nScore = {score_test}\nRMSE = {rmse_test}")
print("#" * 50)
print("grid.best_stimator_", grid.best_estimator_)
print("-" * 25)
print("grid.best_params_", grid.best_params_)
print("-" * 25)
print("grid.best_score", grid.best_score_)

Train data
Score = 0.9793698950212636
RMSE = 573.7466144470136
-------------------------
Test data
Score = 0.9790274101974886
RMSE = 567.4506148456112
##################################################
grid.best_stimator_ SVR(C=4000, gamma=0.1, max_iter=10000)
-------------------------
grid.best_params_ {'C': 4000, 'gamma': 0.1, 'max_iter': 10000}
-------------------------
grid.best_score 0.9756983035767868


In [15]:
split = 0.2
seed = 42

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = split, random_state = seed)

# Models
rf_ = RandomForestRegressor(max_depth=16, max_features=10, min_samples_leaf=2,n_estimators=1000, n_jobs=-1, random_state=7881, warm_start=True)

svr_ = SVR(C=4000, gamma=0.1, max_iter=10000)

models = [("rf", rf_), ("svr", svr_)]

# Ensemble
vr = VotingRegressor(models, weights = [5.5, 4.5], n_jobs = -1)

# Training
vr.fit(X_train, y_train)

# Predictions
y_pred_train = vr.predict(X_train)
y_pred_test = vr.predict(X_test)

# Scores
score_train = vr.score(X_train, y_train)
score_test = vr.score(X_test, y_test)

# RMSE
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"Train data\nScore = {score_train}\nRMSE = {rmse_train}")
print("-" * 25)
print(f"Test data\nScore = {score_test}\nRMSE = {rmse_test}")

KeyboardInterrupt: 

In [5]:
vr.fit(X, y)

y_pred_final = vr.predict(X)

score_final = vr.score(X, y)
rmse_final = np.sqrt(mean_squared_error(y, y_pred_final))

print(f"Full data\nScore = {score_final}\nRMSE = {rmse_final}")

Full data
Score = 0.9884387470831816
RMSE = 427.8938945729848


In [8]:
X_pred = pd.read_csv("diamonds_test.csv", index_col = 0)

# Cleaning
X_pred = tools_tb.ordinal_encoder(X_pred)
X_pred = tools_tb.nominal_encoder(X_pred)

X_pred["length_width"] = X_pred["x"] / X_pred["y"]

scaler = StandardScaler()
X_pred[["depth", "table", "x", "y", "z"]] = scaler.fit_transform(X_pred[["depth", "table", "x", "y", "z"]])

# Delete the old categorical independent variables
X_pred = tools_tb.variables_split(X_pred, to_drop, False)

predictions_submit = vr.predict(X_pred)

submission = pd.DataFrame({"id": range(len(predictions_submit)), "price": predictions_submit})

tools_tb.chequeator(submission)

You're ready to submit!
