In [1]:
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import VotingRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR, LinearSVC
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor


import tools_tb

import warnings

# Install a blanket "ignore" filter: with no category/message/module given, it
# applies to every warning category raised after this point in the kernel.
# NOTE(review): this also hides library deprecation warnings — consider
# narrowing the filter to specific categories or modules.
warnings.filterwarnings("ignore")

In [2]:
# Load the training set; the first CSV column is used as the DataFrame index.
df = pd.read_csv("diamonds_train.csv", index_col = 0)

# Cleaning (the helpers are defined in the project-local tools_tb module)
df = tools_tb.ordinal_encoder(df)
df = tools_tb.nominal_encoder(df)
df = tools_tb.outliers_remover(df)

# Feature engineering: ratio of the "x" column to the "y" column.
df["length_width"] = df["x"] / df["y"]
#df = df[(df["length_width"] > 0.8) & (df["length_width"] < 1.2)]

# Replace NaN ratios (e.g. 0 / 0) with 0 in the ratio column ONLY.
# The previous `df[mask] = 0` used a boolean row mask, which overwrote every
# column of the matching rows with 0 — and since X and y are both derived from
# df below, that turned those rows into all-zero samples.
df["length_width"] = df["length_width"].fillna(0)


# Defining X and y
to_drop = ["cut", "clarity"]

X, y = tools_tb.variables_split(df, to_drop)

In [18]:
# Fraction of rows held out for testing, and the seed shared by the split and the model.
split = 0.2
seed = 42

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = split, random_state = seed)

# Model
# `criterion` is intentionally left at its default (squared error). Spelling it
# as "mse" was deprecated in scikit-learn 1.0 and removed in 1.2, whereas the
# default selects the same loss on both old and new releases.
dt = DecisionTreeRegressor(max_depth = 10, max_features = 6, random_state = seed)

# Training
dt.fit(X_train, y_train)

# Predictions
y_pred_train = dt.predict(X_train)
y_pred_test = dt.predict(X_test)

# Scores (for a regressor, .score() is the coefficient of determination R^2)
score_train = dt.score(X_train, y_train)
score_test = dt.score(X_test, y_test)

# RMSE: square root of the mean squared error, in the units of the target
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"Train data\nScore = {score_train}\nRMSE = {rmse_train}")
print("#" * 50)
print(f"Test data\nScore = {score_test}\nRMSE = {rmse_test}")

Train data
Score = 0.9704216555650502
RMSE = 688.3964071586621
##################################################
Test data
Score = 0.9615950094559882
RMSE = 760.8223517300186
