In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import seaborn as sns
import pickle

# para calcular las métricas

from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn import tree

from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder  
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
plt.rcParams["figure.figsize"] = (10,8)

# Decision Tree

In [2]:
df_tt = pd.read_csv("../data/preproc.csv", index_col = 0)
df_tt.head(2)

Unnamed: 0_level_0,carat,cut,color,clarity,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,-1.128026,1,0,6,6.353
1,0.669489,0,1,2,9.183


In [3]:
X = df_tt.drop('price', axis =1)
y = df_tt['price']

In [4]:
# create a regressor object
regressor = DecisionTreeRegressor(random_state = 0) 
  
# fit the regressor with X and Y data
regressor.fit(X, y)

DecisionTreeRegressor(random_state=0)

In [5]:
max_features = np.sqrt(len(X.columns))
max_features

2.0

In [6]:
print(regressor.tree_.max_depth)

22


# Predicción¶

Nos traemos el CSV de test limpio

In [7]:
X_test = pd.read_csv("../data/test_limpio_.csv", index_col = 0)
X_test.head(2)

Unnamed: 0,carat,cut,color,clarity
0,-1.023184,0,5,5
1,0.94188,1,5,5


In [8]:
y_pred_test = regressor.predict(X_test)
y_pred_train = regressor.predict(X)

# Validación del Modelo

In [9]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y, y_pred_train))
print('Mean Squared Error:', metrics.mean_squared_error(y, y_pred_train))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y, y_pred_train)))
print("R2:",  metrics.r2_score(y, y_pred_train))

Mean Absolute Error: 0.08841003850598857
Mean Squared Error: 0.040876682777854015
Root Mean Squared Error: 0.20217982782130867
R2: 0.9604560958905882


In [10]:
def metricas(y, y_pred_test, y_pred_train, tipo_modelo):
    
    
    resultados = {'Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y, y_pred_train))}
    df = pd.DataFrame(resultados)
    df["modelo"] = tipo_modelo
    return df

In [11]:
dt_resultsI = metricas(y, y_pred_test, y_pred_train, "Decission Tree I")
dt_resultsI

Unnamed: 0,0,modelo
0,Root Mean Squared Error:,Decission Tree I
1,0.20218,Decission Tree I


# Probar el modelo en el CSV de test y exportar CSV

In [12]:
DT_TestValues = pd.DataFrame(y_pred_test)
DT_TestValues.head(3)

Unnamed: 0,0
0,6.244182
1,8.3964
2,9.287


In [13]:
DT_TestValues.reset_index()
DT_TestValues.columns = ["Price"]
DT_TestValues.head(2)

Unnamed: 0,Price
0,6.244182
1,8.3964


In [14]:
DT_TestValues.to_csv("../data/data predict test1.csv", index_label="id")

# GridSearch

In [15]:
# Generación del GridSearch
# ==============================================================================

# definimos un diccionario con los hiperparámetros que queremos testear. 
param = {"max_depth": [4, 5, 6, 7],
        "min_samples_split": [10, 50, 100],
        "max_features": [1,2,3,4,5,6]}

gs = GridSearchCV(
            estimator=DecisionTreeRegressor(),
            param_grid= param,
            cv=10,
            verbose=3,
            return_train_score = True,
            scoring="neg_mean_squared_error")

In [16]:
%time
gs.fit(X, y)

CPU times: user 2 µs, sys: 4 µs, total: 6 µs
Wall time: 5.96 µs
Fitting 10 folds for each of 72 candidates, totalling 720 fits
[CV 1/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.248, test=-0.255) total time=   0.0s
[CV 2/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.238, test=-0.240) total time=   0.0s
[CV 3/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.248, test=-0.257) total time=   0.0s
[CV 4/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.247, test=-0.241) total time=   0.0s
[CV 5/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.857, test=-0.843) total time=   0.0s
[CV 6/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.478, test=-0.480) total time=   0.0s
[CV 7/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.731, test=-0.755) total time=   0.0s
[CV 8/10] END max_depth=4, max_features=

[CV 2/10] END max_depth=4, max_features=3, min_samples_split=50;, score=(train=-0.136, test=-0.138) total time=   0.0s
[CV 3/10] END max_depth=4, max_features=3, min_samples_split=50;, score=(train=-0.122, test=-0.122) total time=   0.0s
[CV 4/10] END max_depth=4, max_features=3, min_samples_split=50;, score=(train=-0.121, test=-0.119) total time=   0.0s
[CV 5/10] END max_depth=4, max_features=3, min_samples_split=50;, score=(train=-0.117, test=-0.111) total time=   0.0s
[CV 6/10] END max_depth=4, max_features=3, min_samples_split=50;, score=(train=-0.132, test=-0.131) total time=   0.0s
[CV 7/10] END max_depth=4, max_features=3, min_samples_split=50;, score=(train=-0.211, test=-0.213) total time=   0.0s
[CV 8/10] END max_depth=4, max_features=3, min_samples_split=50;, score=(train=-0.112, test=-0.109) total time=   0.0s
[CV 9/10] END max_depth=4, max_features=3, min_samples_split=50;, score=(train=-0.164, test=-0.165) total time=   0.0s
[CV 10/10] END max_depth=4, max_features=3, min_

[CV 2/10] END max_depth=4, max_features=6, min_samples_split=50;, score=(train=nan, test=nan) total time=   0.0s
[CV 3/10] END max_depth=4, max_features=6, min_samples_split=50;, score=(train=nan, test=nan) total time=   0.0s
[CV 4/10] END max_depth=4, max_features=6, min_samples_split=50;, score=(train=nan, test=nan) total time=   0.0s
[CV 5/10] END max_depth=4, max_features=6, min_samples_split=50;, score=(train=nan, test=nan) total time=   0.0s
[CV 6/10] END max_depth=4, max_features=6, min_samples_split=50;, score=(train=nan, test=nan) total time=   0.0s
[CV 7/10] END max_depth=4, max_features=6, min_samples_split=50;, score=(train=nan, test=nan) total time=   0.0s
[CV 8/10] END max_depth=4, max_features=6, min_samples_split=50;, score=(train=nan, test=nan) total time=   0.0s
[CV 9/10] END max_depth=4, max_features=6, min_samples_split=50;, score=(train=nan, test=nan) total time=   0.0s
[CV 10/10] END max_depth=4, max_features=6, min_samples_split=50;, score=(train=nan, test=nan) t

[CV 5/10] END max_depth=5, max_features=2, min_samples_split=100;, score=(train=-0.138, test=-0.131) total time=   0.0s
[CV 6/10] END max_depth=5, max_features=2, min_samples_split=100;, score=(train=-0.106, test=-0.108) total time=   0.0s
[CV 7/10] END max_depth=5, max_features=2, min_samples_split=100;, score=(train=-0.099, test=-0.100) total time=   0.0s
[CV 8/10] END max_depth=5, max_features=2, min_samples_split=100;, score=(train=-0.516, test=-0.527) total time=   0.0s
[CV 9/10] END max_depth=5, max_features=2, min_samples_split=100;, score=(train=-0.122, test=-0.128) total time=   0.0s
[CV 10/10] END max_depth=5, max_features=2, min_samples_split=100;, score=(train=-0.124, test=-0.126) total time=   0.0s
[CV 1/10] END max_depth=5, max_features=3, min_samples_split=10;, score=(train=-0.126, test=-0.121) total time=   0.0s
[CV 2/10] END max_depth=5, max_features=3, min_samples_split=10;, score=(train=-0.088, test=-0.090) total time=   0.0s
[CV 3/10] END max_depth=5, max_features=3

[CV 6/10] END max_depth=5, max_features=6, min_samples_split=10;, score=(train=nan, test=nan) total time=   0.0s
[CV 7/10] END max_depth=5, max_features=6, min_samples_split=10;, score=(train=nan, test=nan) total time=   0.0s
[CV 8/10] END max_depth=5, max_features=6, min_samples_split=10;, score=(train=nan, test=nan) total time=   0.0s
[CV 9/10] END max_depth=5, max_features=6, min_samples_split=10;, score=(train=nan, test=nan) total time=   0.0s
[CV 10/10] END max_depth=5, max_features=6, min_samples_split=10;, score=(train=nan, test=nan) total time=   0.0s
[CV 1/10] END max_depth=5, max_features=6, min_samples_split=50;, score=(train=nan, test=nan) total time=   0.0s
[CV 2/10] END max_depth=5, max_features=6, min_samples_split=50;, score=(train=nan, test=nan) total time=   0.0s
[CV 3/10] END max_depth=5, max_features=6, min_samples_split=50;, score=(train=nan, test=nan) total time=   0.0s
[CV 4/10] END max_depth=5, max_features=6, min_samples_split=50;, score=(train=nan, test=nan) t

[CV 1/10] END max_depth=6, max_features=2, min_samples_split=100;, score=(train=-0.130, test=-0.126) total time=   0.0s
[CV 2/10] END max_depth=6, max_features=2, min_samples_split=100;, score=(train=-0.084, test=-0.084) total time=   0.0s
[CV 3/10] END max_depth=6, max_features=2, min_samples_split=100;, score=(train=-0.104, test=-0.106) total time=   0.0s
[CV 4/10] END max_depth=6, max_features=2, min_samples_split=100;, score=(train=-0.151, test=-0.155) total time=   0.0s
[CV 5/10] END max_depth=6, max_features=2, min_samples_split=100;, score=(train=-0.172, test=-0.163) total time=   0.0s
[CV 6/10] END max_depth=6, max_features=2, min_samples_split=100;, score=(train=-0.256, test=-0.263) total time=   0.0s
[CV 7/10] END max_depth=6, max_features=2, min_samples_split=100;, score=(train=-0.129, test=-0.139) total time=   0.0s
[CV 8/10] END max_depth=6, max_features=2, min_samples_split=100;, score=(train=-0.429, test=-0.433) total time=   0.0s
[CV 9/10] END max_depth=6, max_features=

[CV 4/10] END max_depth=6, max_features=5, min_samples_split=50;, score=(train=nan, test=nan) total time=   0.0s
[CV 5/10] END max_depth=6, max_features=5, min_samples_split=50;, score=(train=nan, test=nan) total time=   0.0s
[CV 6/10] END max_depth=6, max_features=5, min_samples_split=50;, score=(train=nan, test=nan) total time=   0.0s
[CV 7/10] END max_depth=6, max_features=5, min_samples_split=50;, score=(train=nan, test=nan) total time=   0.0s
[CV 8/10] END max_depth=6, max_features=5, min_samples_split=50;, score=(train=nan, test=nan) total time=   0.0s
[CV 9/10] END max_depth=6, max_features=5, min_samples_split=50;, score=(train=nan, test=nan) total time=   0.0s
[CV 10/10] END max_depth=6, max_features=5, min_samples_split=50;, score=(train=nan, test=nan) total time=   0.0s
[CV 1/10] END max_depth=6, max_features=5, min_samples_split=100;, score=(train=nan, test=nan) total time=   0.0s
[CV 2/10] END max_depth=6, max_features=5, min_samples_split=100;, score=(train=nan, test=nan)

[CV 9/10] END max_depth=7, max_features=1, min_samples_split=100;, score=(train=-0.161, test=-0.164) total time=   0.0s
[CV 10/10] END max_depth=7, max_features=1, min_samples_split=100;, score=(train=-0.126, test=-0.128) total time=   0.0s
[CV 1/10] END max_depth=7, max_features=2, min_samples_split=10;, score=(train=-0.072, test=-0.075) total time=   0.0s
[CV 2/10] END max_depth=7, max_features=2, min_samples_split=10;, score=(train=-0.088, test=-0.089) total time=   0.0s
[CV 3/10] END max_depth=7, max_features=2, min_samples_split=10;, score=(train=-0.106, test=-0.110) total time=   0.0s
[CV 4/10] END max_depth=7, max_features=2, min_samples_split=10;, score=(train=-0.100, test=-0.097) total time=   0.0s
[CV 5/10] END max_depth=7, max_features=2, min_samples_split=10;, score=(train=-0.106, test=-0.105) total time=   0.0s
[CV 6/10] END max_depth=7, max_features=2, min_samples_split=10;, score=(train=-0.072, test=-0.082) total time=   0.0s
[CV 7/10] END max_depth=7, max_features=2, mi

[CV 9/10] END max_depth=7, max_features=4, min_samples_split=10;, score=(train=-0.061, test=-0.059) total time=   0.0s
[CV 10/10] END max_depth=7, max_features=4, min_samples_split=10;, score=(train=-0.061, test=-0.065) total time=   0.0s
[CV 1/10] END max_depth=7, max_features=4, min_samples_split=50;, score=(train=-0.061, test=-0.060) total time=   0.0s
[CV 2/10] END max_depth=7, max_features=4, min_samples_split=50;, score=(train=-0.061, test=-0.062) total time=   0.0s
[CV 3/10] END max_depth=7, max_features=4, min_samples_split=50;, score=(train=-0.062, test=-0.057) total time=   0.0s
[CV 4/10] END max_depth=7, max_features=4, min_samples_split=50;, score=(train=-0.061, test=-0.063) total time=   0.0s
[CV 5/10] END max_depth=7, max_features=4, min_samples_split=50;, score=(train=-0.061, test=-0.061) total time=   0.0s
[CV 6/10] END max_depth=7, max_features=4, min_samples_split=50;, score=(train=-0.060, test=-0.068) total time=   0.0s
[CV 7/10] END max_depth=7, max_features=4, min_

[CV 5/10] END max_depth=7, max_features=6, min_samples_split=100;, score=(train=nan, test=nan) total time=   0.0s
[CV 6/10] END max_depth=7, max_features=6, min_samples_split=100;, score=(train=nan, test=nan) total time=   0.0s
[CV 7/10] END max_depth=7, max_features=6, min_samples_split=100;, score=(train=nan, test=nan) total time=   0.0s
[CV 8/10] END max_depth=7, max_features=6, min_samples_split=100;, score=(train=nan, test=nan) total time=   0.0s
[CV 9/10] END max_depth=7, max_features=6, min_samples_split=100;, score=(train=nan, test=nan) total time=   0.0s
[CV 10/10] END max_depth=7, max_features=6, min_samples_split=100;, score=(train=nan, test=nan) total time=   0.0s


240 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/Caskroom/miniconda/base/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/Caskroom/miniconda/base/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 1315, in fit
    super().fit(
  File "/usr/local/Caskroom/miniconda/base/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 308, in fit
    raise ValueError("max_features must be in (0, n_features]")
ValueError: max_features must be in (0, n_features]

 -0.12970367 -0.13515639 -0.

GridSearchCV(cv=10, estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': [4, 5, 6, 7],
                         'max_features': [1, 2, 3, 4, 5, 6],
                         'min_samples_split': [10, 50, 100]},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=3)

In [17]:
best_tree = gs.best_estimator_
best_tree

DecisionTreeRegressor(max_depth=7, max_features=4, min_samples_split=10)

In [18]:
y_pred_testII = best_tree.predict(X_test)
y_pred_trainII = best_tree.predict(X)

In [19]:
dt_II_results = metricas(y, y_pred_testII, y_pred_trainII, "Decision tree II")

In [20]:
dt_II_results

Unnamed: 0,0,modelo
0,Root Mean Squared Error:,Decision tree II
1,0.246883,Decision tree II
