In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm

import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib.ticker as ticker
import seaborn as sns
from sklearn.model_selection import train_test_split
plt.rcParams["figure.figsize"] = (10,8)


from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.model_selection import GridSearchCV
from sklearn import tree



from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn import metrics


# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')

In [32]:
df = pd.read_csv("../data/one.csv", index_col = 0)
df_test = pd.read_csv("../data/one_test.csv", index_col = 0)
df.head()

Unnamed: 0,id,carat,depth,table,x,y,z,price,cut_map,color_D,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0,0.3,62.4,58.0,4.31,4.28,2.68,6.353,3,1,...,0,0,0,0,0,1,0,0,0,0
1,1,1.01,62.7,56.0,6.42,6.46,4.04,9.183,4,0,...,0,0,0,0,0,0,0,0,0,1
2,2,0.72,61.8,59.0,5.71,5.74,3.54,7.983,4,0,...,0,0,0,0,0,0,0,1,0,0
3,3,1.08,63.2,57.0,6.54,6.5,4.12,8.371,2,0,...,0,0,0,0,0,1,0,0,0,0
4,4,0.36,62.3,59.0,4.5,4.55,2.82,6.588,3,0,...,0,0,0,0,0,0,1,0,0,0


In [3]:
df.isnull().sum()

id              0
carat           0
depth           0
table           0
x               0
y               0
z               0
price           0
cut_map         0
color_D         0
color_E         0
color_F         0
color_G         0
color_H         0
color_I         0
color_J         0
clarity_I1      0
clarity_IF      0
clarity_SI1     0
clarity_SI2     0
clarity_VS1     0
clarity_VS2     0
clarity_VVS1    0
clarity_VVS2    0
dtype: int64

In [4]:
class Ajuste_modelo_lineal:
    
    
    def __init__(self, dataframe, variable_respuesta):
        """ 
        inicializamos la clase con el dataframe y la variable respuesta
        """
        self.dataframe = dataframe # variable que contiene el dataframe
        self.variable_respuesta = variable_respuesta # variable que contiene la variable respuesta del modelo
        
    def separar_datos(self):

        """
        Esta función separa los datos en train y test y devuelve los 4 datasets listos para ser usados en el modelo
        Returns:
            _type_: X_train, X_test, y_train, y_test son las variables son las variables predicotroas y las variables respuesta necesarias para el entrenamiento del modelo
        """
        # lo primero que hacemos es definir cual es nuestra variable Y y nuestras variables X

        X = self.dataframe.drop(self.variable_respuesta, axis =1) # seleccion de variables predictoras
        y = self.dataframe[self.variable_respuesta]               # seleccion de la variable respuesta
        
        # dividimos los datos en train y test con un 80% de train y un 20% de test, Recordamos que el random state es una semilla que nos permite reproducir los resultados 
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size   = 0.8, random_state = 42)
        
        return X_train, X_test, y_train, y_test
    
    
    def gridsearch(self, tipo_modelo, X_test, X_train, y_test, y_train, modelo = DecisionTreeRegressor()):
        """
        Esta función realiza un gridsearch sobre el modelo que le pasemos y nos devuelve el mejor modelo con los mejores hiperparametros
        Args:
            tipo_modelo (_type_):  modelo que deseamos optimizar 
            X_test (_type_): variables predictoras de test
            X_train (_type_): variables predictoras de train
            y_test (_type_): variable respuesta de test
            y_train (_type_): variable respuesta de train
            modelo (_type_, optional): _description_. Defaults to DecisionTreeRegressor().

        Returns:
            df: df con las metricas del modelo
        """
        profundidad = int(input("Cual es la profundidad máxima que quieres"))
        features = int(input("¿Cual es el nº de features maximo que quieres?"))
        leaf = int(input("¿Cual es el min_sample_leaf que quieres?"))
        split = int(input("¿Cual es el min_samples_split que quieres?"))
        
        param = {"max_depth": range(1, profundidad +1, 2),
                "min_samples_split": range(1, split +1, 2),
                "min_samples_leaf": range(1, leaf +1, 2),
                "max_features": range(1, features + 1, 2)}

        gs = GridSearchCV(
                    estimator = modelo,
                    param_grid= param,
                    cv=10,
                    verbose = 0,
                    return_train_score = True,
                    scoring="neg_mean_squared_error")
        gs.fit(X_train, y_train)
        
        self.best_tree = gs.best_estimator_
        print(f"el mejor arbol es {self.best_tree}")
        
        y_pred_test_dt2 = self.best_tree.predict(X_test)
        y_pred_train_dt2 = self.best_tree.predict(X_train)
        dt_results2 = self.metricas(y_test, y_train, y_pred_test_dt2, y_pred_train_dt2, tipo_modelo)
        return dt_results2
    
    def ajuste_modelo(self, X_test, X_train, y_test, y_train):
        """
        Esta función realiza el ajuste del modelo y nos devuelve las metricas del modelo
        Args:
            X_test (_type_): variables predictoras de test
            X_train (_type_): variables predictoras de train
            y_test (_type_): variable respuesta de test
            y_train (_type_): variable respuesta de train
        """
        
        self.X_test = X_test
        self.X_train = X_train
        self.y_test = y_test
        self.y_train = y_train
        
        # iniciamos el método de Linear Regression
        
        tipo_modelo = input("Que modelo quieres hacer? 1: Regresion Lineal, 2: Decision Tree, 3: Random Forest")

        if tipo_modelo == "1":
            
            lr = LinearRegression()
            
            # fiteamos el modelo
            lr.fit(X_train, y_train)

            
            # hacemos las predicciones sobre los dos set de datos el X_test y el X_train
            y_pred_test = lr.predict(X_test)
            y_pred_train = lr.predict(X_train)
            
            lr_results = self.metricas(y_test, y_train, y_pred_test, y_pred_train, "Regresion lineal")
            
            return lr_results
            
            
        elif tipo_modelo == "2":
            # creamos el objeto del árbol
            regressor = DecisionTreeRegressor(random_state = 0) 
            
            # ajustamos el modelo
            regressor.fit(X_train, y_train)
            
            # hacemos las predicciones sobre los dos set de datos el X_test y el X_train
            y_pred_test = regressor.predict(X_test)
            y_pred_train = regressor.predict(X_train)
            
            dt_results = self.metricas(y_test, y_train, y_pred_test, y_pred_train, "Decision Tree")
            print("Las metricas del modelo son: ")
            display(dt_results)
            
            
            nuevo_modelo = input("¿quieres hacer un modelo nuevo: S/N?")
            
            if nuevo_modelo.upper() == "N":
                return dt_results
            else:
                parametros = regressor.get_params()
                claves_deseadas = ['max_depth', 'max_features', 'min_samples_leaf', 'min_samples_split' ]
                valores_deseados = {clave: parametros[clave] for clave in claves_deseadas}
                print(f"Los principales hiperparametros del modelo son: {valores_deseados}")

                nuevo_modelo = self.gridsearch("Decision Tree II",  X_test, X_train, y_test, y_train)
                print("Las nuevas metricas del modelo son: ")
                display(nuevo_modelo)

        elif tipo_modelo == "3":
            random_forest = self.gridsearch("Random Forest", X_test, X_train, y_test, y_train, RandomForestRegressor())
            display(random_forest)
        
    
    def metricas(self, y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
        """
        Esta función nos devuelve las metricas del modelo en un dataframe para poder compararlas con otros modelos

        Args:
            y_test (_type_): variable respuesta de test 
            y_train (_type_): variable respuesta de train
            y_test_pred (_type_): variable respuesta predicha de test
            y_train_pred (_type_): variable respuesta predicha de train
            tipo_modelo (_type_): tipo de modelo que estamos haciendo (regresion lineal, decision tree, random forest)

        Returns:
            _type_: df con las metricas del modelo en test y train
        """
    
    
        resultados = {'MAE': [metrics.mean_absolute_error(y_test, y_test_pred), metrics.mean_absolute_error(y_train, y_train_pred)],
                    'MSE': [metrics.mean_squared_error(y_test, y_test_pred), metrics.mean_squared_error(y_train, y_train_pred)],
                    'RMSE': [np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)), np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))],
                    'R2':  [metrics.r2_score(y_test, y_test_pred), metrics.r2_score(y_train, y_train_pred)],
                    "set": ["test", "train"]}
        df_metricas = pd.DataFrame(resultados)
        df_metricas["modelo"] = tipo_modelo
        return df_metricas
        

In [5]:
modelo = Ajuste_modelo_lineal(df, "price")

In [6]:
X_entrena, X_testear, y_entrena, y_testear = modelo.separar_datos()

In [7]:
metricas_regresion_lineal = modelo.ajuste_modelo(X_testear, X_entrena, y_testear, y_entrena)
metricas_regresion_lineal

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.112884,0.033559,0.183192,0.967139,test,Regresion lineal
1,0.113824,0.033014,0.181697,0.968166,train,Regresion lineal


In [33]:
metricas_regresion_lineal = modelo.ajuste_modelo(X_testear, X_entrena, y_testear, y_entrena)
metricas_regresion_lineal

Las metricas del modelo son: 


Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.08931591,0.0171493,0.1309553,0.983208,test,Decision Tree
1,2.055511e-17,2.3277780000000002e-32,1.525706e-16,1.0,train,Decision Tree


Los principales hiperparametros del modelo son: {'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
el mejor arbol es DecisionTreeRegressor(max_depth=11, max_features=21, min_samples_split=11)
Las nuevas metricas del modelo son: 


Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.100513,0.01889,0.137441,0.981503,test,Decision Tree II
1,0.090546,0.014781,0.121578,0.985747,train,Decision Tree II


In [8]:
metricas_regresion_lineal = modelo.ajuste_modelo(X_testear, X_entrena, y_testear, y_entrena)
metricas_regresion_lineal

Las metricas del modelo son: 


Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.08931591,0.0171493,0.1309553,0.983208,test,Decision Tree
1,2.055511e-17,2.3277780000000002e-32,1.525706e-16,1.0,train,Decision Tree


Los principales hiperparametros del modelo son: {'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
el mejor arbol es DecisionTreeRegressor(max_depth=11, max_features=21, min_samples_split=9)
Las nuevas metricas del modelo son: 


Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.09979,0.018002,0.13417,0.982373,test,Decision Tree II
1,0.08999,0.014699,0.12124,0.985826,train,Decision Tree II


In [15]:
# Reparto de datos en train y test
# ==============================================================================
# lo primero que hacemos es definir cual es nuestra variable Y y nuestras variables X
X = df.drop('price', axis =1)
y = df['price']


X_train, X_test, y_train, y_test = train_test_split(X, y, train_size   = 0.8, random_state = 42)

In [37]:
gs

In [16]:
# Generación del GridSearch
# ==============================================================================

# definimos un diccionario con los hiperparámetros que queremos testear. 
param = {"max_depth": [4, 5, 6, 7,11],
        "min_samples_split": [10, 21,50, 100],
        "max_features": [1,2,3,4,5,6,9]}

gs = GridSearchCV(
            estimator=DecisionTreeRegressor(),
            param_grid= param,
            cv=10,
            verbose=3,
            return_train_score = True,
            scoring="neg_mean_squared_error")


In [18]:
# create a regressor object
regressor = DecisionTreeRegressor(random_state = 0) 
  
# fit the regressor with X and Y data
regressor.fit(X_train, y_train)

In [19]:
# ajustamos el modelo de nuevo

%time
gs.fit(X_train, y_train)

CPU times: total: 0 ns
Wall time: 0 ns
Fitting 10 folds for each of 140 candidates, totalling 1400 fits
[CV 1/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.179, test=-0.177) total time=   0.0s
[CV 2/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.363, test=-0.366) total time=   0.0s
[CV 3/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.471, test=-0.484) total time=   0.0s
[CV 4/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.282, test=-0.287) total time=   0.0s
[CV 5/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.829, test=-0.817) total time=   0.0s
[CV 6/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.696, test=-0.697) total time=   0.0s
[CV 7/10] END max_depth=4, max_features=1, min_samples_split=10;, score=(train=-0.305, test=-0.310) total time=   0.0s
[CV 8/10] END max_depth=4, max_features=1, min_samples_split=10

In [24]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    resultados = {'MAE': [metrics.mean_absolute_error(y_test, y_test_pred), metrics.mean_absolute_error(y_train, y_train_pred)],
                'MSE': [metrics.mean_squared_error(y_test, y_test_pred), metrics.mean_squared_error(y_train, y_train_pred)],
                'RMSE': [np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)), np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))],
                'R2':  [metrics.r2_score(y_test, y_test_pred), metrics.r2_score(y_train, y_train_pred)],
                 "set": ["test", "train"]}
    df = pd.DataFrame(resultados)
    df["modelo"] = tipo_modelo
    return df

In [20]:
# sacamos cual es el mejor de todos los que hemos testeado usando el método best_estimator_

best_tree = gs.best_estimator_
best_tree

In [23]:
y_pred_test_dt2 = best_tree.predict(X_test)
y_pred_train_dt2 = best_tree.predict(X_train)


In [25]:
dt_results2 = metricas(y_test, y_train, y_pred_test_dt2, y_pred_train_dt2, "Decision tree II")

In [26]:
dt_results2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.117857,0.024971,0.158022,0.975549,test,Decision tree II
1,0.109189,0.020833,0.144337,0.979911,train,Decision tree II


In [33]:
df_test

Unnamed: 0,id,carat,depth,table,x,y,z,cut_map,color_D,color_E,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0,0.32,60.5,58.0,4.43,4.49,2.70,4,0,0,...,1,0,0,0,1,0,0,0,0,0
1,1,1.24,62.9,60.0,6.80,6.74,4.26,3,0,0,...,1,0,0,0,1,0,0,0,0,0
2,2,1.66,62.0,59.0,7.55,7.60,4.70,3,1,0,...,0,0,0,0,1,0,0,0,0,0
3,3,0.75,60.6,56.0,5.94,5.90,3.59,3,1,0,...,0,0,0,0,0,1,0,0,0,0
4,4,1.50,64.8,55.0,7.26,7.15,4.67,0,0,1,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,1.10,59.6,60.0,6.74,6.70,4.00,3,0,0,...,0,0,0,0,1,0,0,0,0,0
13481,13481,0.90,62.1,60.0,6.14,6.20,3.83,2,1,0,...,0,0,0,0,1,0,0,0,0,0
13482,13482,0.30,62.1,53.3,4.30,4.32,2.68,4,0,0,...,0,0,0,0,0,0,0,1,0,0
13483,13483,1.25,59.6,59.0,7.01,7.09,4.20,4,0,0,...,1,0,0,0,1,0,0,0,0,0


In [34]:
df

Unnamed: 0,id,carat,depth,table,x,y,z,price,cut_map,color_D,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0,0.30,62.4,58.0,4.31,4.28,2.68,6.353,3,1,...,0,0,0,0,0,1,0,0,0,0
1,1,1.01,62.7,56.0,6.42,6.46,4.04,9.183,4,0,...,0,0,0,0,0,0,0,0,0,1
2,2,0.72,61.8,59.0,5.71,5.74,3.54,7.983,4,0,...,0,0,0,0,0,0,0,1,0,0
3,3,1.08,63.2,57.0,6.54,6.50,4.12,8.371,2,0,...,0,0,0,0,0,1,0,0,0,0
4,4,0.36,62.3,59.0,4.50,4.55,2.82,6.588,3,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,0.42,62.1,59.0,4.78,4.82,2.98,6.551,3,1,...,0,0,0,0,0,1,0,0,0,0
40451,40451,0.53,62.0,58.0,5.21,5.18,3.22,7.382,3,0,...,0,0,0,0,0,0,0,1,0,0
40452,40452,0.80,62.8,58.0,5.86,5.90,3.69,7.768,1,0,...,0,0,0,0,0,1,0,0,0,0
40453,40453,1.01,61.5,57.0,6.40,6.48,3.96,8.726,2,0,...,0,0,0,0,0,0,0,1,0,0


In [35]:
X_reality_submision = best_tree.predict(df_test)
X_reality_submision

array([6.09311111, 8.66983997, 9.37206051, ..., 6.54318637, 8.58553659,
       7.87026984])

In [36]:
submision1 = pd.DataFrame(X_reality_submision)
submision1.reset_index(inplace=True)
submision1.rename(columns={"index":"id",0:"price"}, inplace =True)
submision1

Unnamed: 0,id,price
0,0,6.093111
1,1,8.669840
2,2,9.372061
3,3,7.855717
4,4,9.127919
...,...,...
13480,13480,8.669840
13481,13481,8.349554
13482,13482,6.543186
13483,13483,8.585537


In [40]:
submision1.to_csv("../data/submission1.csv", index= False)

In [39]:
pd.read_csv("../data/submission1.csv")

Unnamed: 0,id,carat,depth,table,x,y,z,price,cut_map,color_D,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0,0.30,62.4,58.0,4.31,4.28,2.68,6.353,3,1,...,0,0,0,0,0,1,0,0,0,0
1,1,1.01,62.7,56.0,6.42,6.46,4.04,9.183,4,0,...,0,0,0,0,0,0,0,0,0,1
2,2,0.72,61.8,59.0,5.71,5.74,3.54,7.983,4,0,...,0,0,0,0,0,0,0,1,0,0
3,3,1.08,63.2,57.0,6.54,6.50,4.12,8.371,2,0,...,0,0,0,0,0,1,0,0,0,0
4,4,0.36,62.3,59.0,4.50,4.55,2.82,6.588,3,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,0.42,62.1,59.0,4.78,4.82,2.98,6.551,3,1,...,0,0,0,0,0,1,0,0,0,0
40451,40451,0.53,62.0,58.0,5.21,5.18,3.22,7.382,3,0,...,0,0,0,0,0,0,0,1,0,0
40452,40452,0.80,62.8,58.0,5.86,5.90,3.69,7.768,1,0,...,0,0,0,0,0,1,0,0,0,0
40453,40453,1.01,61.5,57.0,6.40,6.48,3.96,8.726,2,0,...,0,0,0,0,0,0,0,1,0,0
