In [123]:
import pandas as pd
import numpy as np

from tqdm import tqdm

import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib.ticker as ticker
import seaborn as sns
from sklearn.model_selection import train_test_split
plt.rcParams["figure.figsize"] = (10,8)

import pickle

import statsmodels.formula.api as smf


from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.model_selection import GridSearchCV
from sklearn import tree



from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn import metrics

from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_squared_error


# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')


In [31]:
train_data = pd.read_csv('../data/train_prep.csv', index_col=0)
train_data.drop(["id"], axis=1, inplace=True)
train_data.head(1)

Unnamed: 0,carat,x,y,z,price,cut_encoded,color_encoded,clarity_encoded
0,-1.127542,-1.266882,-1.304346,-1.241417,6.353,3,0,3


In [98]:
X = train_data.drop("price", axis = 1) # variables predictoras
y = train_data["price"] # variable respuesta 

In [67]:
X = train_data.drop('price', axis =1)
y = train_data['price']


X_train, X_test, y_train, y_test = train_test_split(X, y, train_size   = 0.8, random_state = 42)

### `Decision Tree`

In [68]:
regressor = DecisionTreeRegressor(random_state = 0) 

regressor.fit(X_train, y_train)

In [69]:
max_features = np.sqrt(len(X_train.columns))
max_features

2.6457513110645907

In [70]:
print(regressor.tree_.max_depth)

31


In [71]:
y_pred_test_dt = regressor.predict(X_test)
y_pred_train_dt = regressor.predict(X_train)

In [72]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    
    
    resultados = {'MAE': [metrics.mean_absolute_error(y_test, y_test_pred), metrics.mean_absolute_error(y_train, y_train_pred)],
                'MSE': [metrics.mean_squared_error(y_test, y_test_pred), metrics.mean_squared_error(y_train, y_train_pred)],
                'RMSE': [np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)), np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))],
                'R2':  [metrics.r2_score(y_test, y_test_pred), metrics.r2_score(y_train, y_train_pred)],
                 "set": ["test", "train"]}
    df = pd.DataFrame(resultados)
    df["modelo"] = tipo_modelo
    return df
    

In [73]:
dt_results1 = metricas(y_test, y_train, y_pred_test_dt, y_pred_train_dt, "Decission Tree I")
dt_results1

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.087138,0.018922,0.137557,0.981472,test,Decission Tree I
1,0.000375,3.3e-05,0.005704,0.999969,train,Decission Tree I


In [74]:
# Grid de hiperparámetros evaluados
# ==============================================================================
param = {"max_depth": [2,3, 4,6,8,10,12,14,16,18],
         "min_samples_split": [200, 400, 600, 800],
         "max_features": [1,2,3], 
         "min_samples_leaf": [200, 400, 600, 800]}

# Búsqueda por grid search con validación cruzada
# ==============================================================================
arbol2 = GridSearchCV(
        estimator = DecisionTreeRegressor(), 
        param_grid = param, 
        n_jobs = -1,
        verbose = 1, 
        cv = 10, 
        return_train_score = True, 
        scoring = "neg_mean_squared_error")

In [75]:
arbol2.fit(X_train, y_train)

Fitting 10 folds for each of 480 candidates, totalling 4800 fits


In [76]:
print("----------------------------------------")
print("Mejores hiperparámetros encontrados (cv)")
print("----------------------------------------")
arbol2.best_params_

----------------------------------------
Mejores hiperparámetros encontrados (cv)
----------------------------------------


{'max_depth': 18,
 'max_features': 3,
 'min_samples_leaf': 200,
 'min_samples_split': 400}

In [77]:
arbol3 = DecisionTreeRegressor( max_depth =  18, max_features=3, min_samples_split=400)
arbol3.fit(X_train,y_train)

In [78]:
y_pred_arbol_test= arbol3.predict(X_test)
y_pred_arbol_train= arbol3.predict(X_train)

In [79]:
results_decission_tree2 =  metricas(y_test, y_train, y_pred_arbol_test,  y_pred_arbol_train, "Decission Tree II")
results_decission_tree2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.121457,0.026257,0.162039,0.97429,test,Decission Tree II
1,0.121044,0.026485,0.162743,0.974461,train,Decission Tree II


In [80]:
df_all_results = pd.concat([dt_results1, results_decission_tree2], axis = 0)
df_all_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.087138,0.018922,0.137557,0.981472,test,Decission Tree I
1,0.000375,3.3e-05,0.005704,0.999969,train,Decission Tree I
0,0.121457,0.026257,0.162039,0.97429,test,Decission Tree II
1,0.121044,0.026485,0.162743,0.974461,train,Decission Tree II


### `Random forest`

In [114]:
# Grid de hiperparámetros evaluados
# ==============================================================================
param = {"max_depth": [2,3, 4,6,8,10],
         "min_samples_split": [50, 100, 150, 200, 250],
         "max_features": [1,2,3,4], 
         "min_samples_leaf": [50, 100, 150, 200, 250]}


# Búsqueda por grid search con validación cruzada (cross-validarion)
# ==============================================================================
random_forest = GridSearchCV(
        estimator = RandomForestRegressor(), 
        param_grid = param, 
        n_jobs = -1,
        verbose = 1,  
        cv = 10, 
        return_train_score = True, 
        scoring = "neg_mean_squared_error")

In [116]:
random_forest.fit(X_train, y_train)

Fitting 10 folds for each of 600 candidates, totalling 6000 fits


In [117]:
print("----------------------------------------")
print("Mejores hiperparámetros encontrados (cv)")
print("----------------------------------------")

random_forest.best_params_

----------------------------------------
Mejores hiperparámetros encontrados (cv)
----------------------------------------


{'max_depth': 10,
 'max_features': 4,
 'min_samples_leaf': 50,
 'min_samples_split': 100}

In [118]:
clf=RandomForestRegressor(min_samples_split= 100,
                           max_features=4,
                           max_depth=10)
clf.fit(X_train,y_train)

In [119]:
y_pred_clf_test= clf.predict(X_test)
y_pred_clf_train= clf.predict(X_train)

In [121]:
results_forest = metricas(y_test, y_train, y_pred_clf_test,  y_pred_clf_train, "Random Forest")
df_all_results = pd.concat([df_all_results, results_forest], axis = 0)
df_all_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.087138,0.018922,0.137557,0.981472,test,Decission Tree I
1,0.000375,3.3e-05,0.005704,0.999969,train,Decission Tree I
0,0.121457,0.026257,0.162039,0.97429,test,Decission Tree II
1,0.121044,0.026485,0.162743,0.974461,train,Decission Tree II
0,0.079406,0.014935,0.122209,0.985376,test,KNN2
1,0.042938,0.004607,0.067878,0.995557,train,KNN2
0,0.079406,0.014935,0.122209,0.985376,test,KNN2
1,0.042938,0.004607,0.067878,0.995557,train,KNN2
0,0.091223,0.01515,0.123086,0.985165,test,Random Forest
1,0.089039,0.014643,0.12101,0.98588,train,Random Forest


### `KNeighbors`

In [104]:
knn_scores = []

for k in range(1,21):
    # por defecto nos devuelve la precisión
    score=cross_val_score(KNeighborsRegressor(n_neighbors = k),
                          X = X,
                          y = y,
                          cv=10, 
                         scoring = "neg_mean_squared_error")
    knn_scores.append(score.mean())

In [105]:
knn = pd.DataFrame(knn_scores, range(1,21)).reset_index()
knn.columns = ["number_neighbors", "score"]
knn.sort_values(by = "score", ascending = False).head(3)

Unnamed: 0,number_neighbors,score
3,4,-0.013545
4,5,-0.01355
5,6,-0.01361


In [106]:
knn = KNeighborsRegressor(n_neighbors = 2)
knn.fit(X_train, y_train)

In [107]:
y_pred_knn_test = knn.predict(X_test)
y_pred_knn_train = knn.predict(X_train)

In [112]:
results_knn = metricas(y_test,y_train, y_pred_knn_test,  y_pred_knn_train, "KNN2")
df_all_results = pd.concat([df_all_results, results_knn])
df_all_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.087138,0.018922,0.137557,0.981472,test,Decission Tree I
1,0.000375,3.3e-05,0.005704,0.999969,train,Decission Tree I
0,0.121457,0.026257,0.162039,0.97429,test,Decission Tree II
1,0.121044,0.026485,0.162743,0.974461,train,Decission Tree II
0,0.079406,0.014935,0.122209,0.985376,test,KNN2
1,0.042938,0.004607,0.067878,0.995557,train,KNN2
0,0.079406,0.014935,0.122209,0.985376,test,KNN2
1,0.042938,0.004607,0.067878,0.995557,train,KNN2


### `Gradient Boosting`

In [124]:
# Grid de hiperparámetros evaluados
# ==============================================================================
param = {"max_depth": [2,3, 4,6,8,10],
         "min_samples_split": [50, 100, 150, 200, 250],
         "max_features": [1,2,3,4], 
         "min_samples_leaf": [50, 100, 150, 200, 250]}

# Búsqueda por grid search con validación cruzada
# ==============================================================================
grid = GridSearchCV(
        estimator  = GradientBoostingRegressor(),
        param_grid = param,
        n_jobs     = -1,
        verbose    = 1,
        cv         = 10, 
        return_train_score = True
       )

grid.fit(X = X_train, y = y_train)


Fitting 10 folds for each of 600 candidates, totalling 6000 fits


In [125]:
# Mejores hiperparámetros por validación cruzada
# ==============================================================================
print("----------------------------------------")
print("Mejores hiperparámetros encontrados (cv)")
print("----------------------------------------")
print(grid.best_params_)

----------------------------------------
Mejores hiperparámetros encontrados (cv)
----------------------------------------
{'max_depth': 10, 'max_features': 4, 'min_samples_leaf': 50, 'min_samples_split': 100}


In [126]:
y_pred_gb_test= grid.predict(X_test)
y_pred_gb_train= grid.predict(X_train)

In [127]:
results_gradient = metricas(y_test,y_train, y_pred_gb_test, y_pred_gb_train, "Gradient")
df_all_results = pd.concat([df_all_results, results_gradient], axis = 0)
df_all_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.087138,0.018922,0.137557,0.981472,test,Decission Tree I
1,0.000375,3.3e-05,0.005704,0.999969,train,Decission Tree I
0,0.121457,0.026257,0.162039,0.97429,test,Decission Tree II
1,0.121044,0.026485,0.162743,0.974461,train,Decission Tree II
0,0.079406,0.014935,0.122209,0.985376,test,KNN2
1,0.042938,0.004607,0.067878,0.995557,train,KNN2
0,0.079406,0.014935,0.122209,0.985376,test,KNN2
1,0.042938,0.004607,0.067878,0.995557,train,KNN2
0,0.091223,0.01515,0.123086,0.985165,test,Random Forest
1,0.089039,0.014643,0.12101,0.98588,train,Random Forest


GUARDAMOS EL MEJOR MODELO PARA USARLO LUEGO EN LA PREDICCIÓN

In [128]:
with open('../data/mejor_modelo.pkl', 'wb') as modelo:
        pickle.dump(grid, modelo)

### `Predicción`

In [129]:
# encoding Resting
with open('../data/encodingclarity.pkl', 'rb') as resting:
    encoding_resting = pickle.load(resting)

# encoding Slope
with open('../data/encodingcolor.pkl', 'rb') as slope:
    encoding_slope = pickle.load(slope)

# encoding Sex
with open('../data/encodingcut.pkl', 'rb') as sex:
    encoding_sex = pickle.load(sex)

# estandarización
with open('../data/estandarizacion.pkl', 'rb') as estandarizacion:
    estandarizacion = pickle.load(estandarizacion)

In [130]:
with open('../data/mejor_modelo.pkl', 'rb') as modelo:
    modelo = pickle.load(modelo)

In [131]:
df_final = pd.read_csv("../data/test_prep.csv", index_col=0)

In [132]:
df_final.head(1)

Unnamed: 0,carat,x,y,z,cut_encoded,color_encoded,clarity_encoded
0,-1.023184,-1.176305,-1.132481,-1.222407,2,5,2


In [133]:
arr = modelo.predict(df_final)

In [134]:
df = pd.DataFrame(arr, columns=['price'])
df.insert(0, 'id', range(0, len(df)))
df.head()

Unnamed: 0,id,price
0,0,6.082527
1,1,8.508292
2,2,9.504674
3,3,7.840583
4,4,8.900336


In [135]:
df.to_csv("../submissions/submission_3.csv", index=False)