In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm

import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib.ticker as ticker
import seaborn as sns
from sklearn.model_selection import train_test_split
plt.rcParams["figure.figsize"] = (10,8)


from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.model_selection import GridSearchCV
from sklearn import tree



from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn import metrics


# Warnings configuration
# ==============================================================================
# NOTE: the 'ignore' filter below is global: it silences every warning category
# for the rest of the session, including warnings raised by the libraries
# imported above.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load insurance_limpio.csv, using the first CSV column as the dataframe index.
df = pd.read_csv("insurance_limpio.csv", index_col = 0)
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,bmi,children,smoker,charges,age_imputer,sex_Unknown,sex_female,sex_male,region_Unknown,region_northeast,region_northwest,region_southeast,region_southwest
0,0.43643,0,0,1725.5523,-1.559793,0,0,1,0,0,0,1,0
1,0.342298,1,0,4449.462,-0.814847,0,0,1,0,0,0,1,0
2,-0.161369,0,0,3866.8552,-0.516868,0,0,1,0,0,1,0,0
3,-0.545232,0,0,3756.6216,-0.591363,0,1,0,0,0,0,1,0
4,0.396088,0,0,8240.5896,0.526057,0,1,0,0,0,0,1,0


In [3]:
class Ajuste_modelo_lineal:
    """Interactive helper to fit and evaluate regression models on a dataframe.

    The class wraps three scikit-learn regressors (linear regression, decision
    tree and random forest). The model to fit and the hyper-parameter search
    bounds are requested from the user through ``input()``.

    Note: ``display`` is used but not imported here; an IPython/Jupyter
    session is assumed to provide it.

    Attributes set by the methods:
        dataframe, variable_respuesta: stored by ``__init__``.
        X_test, X_train, y_test, y_train: stored by ``ajuste_modelo``.
        best_tree: best estimator found by the last ``gridsearch`` call.
    """

    def __init__(self, dataframe, variable_respuesta):
        """Store the data and the name of the response column.

        Args:
            dataframe: pandas DataFrame holding predictors and response.
            variable_respuesta: name of the response (target) column.
        """
        self.dataframe = dataframe
        self.variable_respuesta = variable_respuesta

    def separar_datos(self):
        """Split the dataframe into train and test sets (80 % train, seed 42).

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)``.
        """
        # Every column except the response is a predictor.
        X = self.dataframe.drop(self.variable_respuesta, axis=1)
        y = self.dataframe[self.variable_respuesta]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=0.8, random_state=42
        )
        return X_train, X_test, y_train, y_test

    @staticmethod
    def _rejilla_parametros(profundidad, features, leaf, split):
        """Build the hyper-parameter grid from the user-supplied upper bounds.

        Each range advances in steps of 2 up to (and including) its bound.
        ``min_samples_split`` starts at 2 because scikit-learn rejects integer
        values below 2. Every bound is clamped to the smallest valid value so
        that no range is ever empty.
        """
        return {
            "max_depth": range(1, max(profundidad, 1) + 1, 2),
            "min_samples_split": range(2, max(split, 2) + 1, 2),
            "min_samples_leaf": range(1, max(leaf, 1) + 1, 2),
            "max_features": range(1, max(features, 1) + 1, 2),
        }

    def gridsearch(self, tipo_modelo, X_test, X_train, y_test, y_train, modelo=None):
        """Run a 10-fold grid search and report metrics of the best estimator.

        The upper bounds of the grid are read interactively with ``input()``.

        Args:
            tipo_modelo: label written to the ``modelo`` column of the result.
            X_test, X_train, y_test, y_train: data splits (note the order:
                test first, then train).
            modelo: estimator to tune. Defaults to a seeded
                ``DecisionTreeRegressor`` created on each call.

        Returns:
            DataFrame of metrics as produced by ``metricas``.

        Raises:
            ValueError: if any of the typed bounds is not an integer.
        """
        if modelo is None:
            # Created per call (not in the signature) so no instance is shared
            # between calls; seeded like the baseline tree for reproducibility.
            modelo = DecisionTreeRegressor(random_state=0)

        profundidad = int(input("Cual es la profundidad máxima que quieres"))
        features = int(input("¿Cual es el nº de features maximo que quieres?"))
        leaf = int(input("¿Cual es el min_sample_leaf que quieres?"))
        split = int(input("¿Cual es el min_samples_split que quieres?"))

        param = self._rejilla_parametros(profundidad, features, leaf, split)

        gs = GridSearchCV(
            estimator=modelo,
            param_grid=param,
            cv=10,
            verbose=0,
            return_train_score=True,
            scoring="neg_mean_squared_error",
        )
        gs.fit(X_train, y_train)

        self.best_tree = gs.best_estimator_
        print(f"el mejor arbol es {self.best_tree}")

        y_pred_test = self.best_tree.predict(X_test)
        y_pred_train = self.best_tree.predict(X_train)
        return self.metricas(y_test, y_train, y_pred_test, y_pred_train, tipo_modelo)

    def ajuste_modelo(self, X_test, X_train, y_test, y_train):
        """Fit the model chosen interactively and return its metrics.

        Options: ``1`` linear regression, ``2`` decision tree (optionally
        followed by a grid search), ``3`` random forest tuned by grid search.

        Args:
            X_test, X_train, y_test, y_train: data splits (note the order:
                test first, then train).

        Returns:
            DataFrame of metrics for the fitted model. For option ``2`` with a
            follow-up grid search, the metrics of the tuned tree are returned.

        Raises:
            ValueError: if the typed option is not ``1``, ``2`` or ``3``.
        """
        self.X_test = X_test
        self.X_train = X_train
        self.y_test = y_test
        self.y_train = y_train

        tipo_modelo = input(
            "Que modelo quieres hacer? 1: Regresion Lineal, 2: Decision Tree, 3: Random Forest"
        ).strip()

        if tipo_modelo == "1":
            lr = LinearRegression()
            lr.fit(X_train, y_train)

            # Predict on both sets so train and test metrics can be compared.
            y_pred_test = lr.predict(X_test)
            y_pred_train = lr.predict(X_train)

            return self.metricas(y_test, y_train, y_pred_test, y_pred_train, "Regresion lineal")

        if tipo_modelo == "2":
            # Baseline tree with default hyper-parameters.
            regressor = DecisionTreeRegressor(random_state=0)
            regressor.fit(X_train, y_train)

            y_pred_test = regressor.predict(X_test)
            y_pred_train = regressor.predict(X_train)

            dt_results = self.metricas(y_test, y_train, y_pred_test, y_pred_train, "Decision Tree")
            print("Las metricas del modelo son: ")
            display(dt_results)

            nuevo_modelo = input("¿quieres hacer un modelo nuevo: S/N?")
            if nuevo_modelo.strip().upper() == "N":
                return dt_results

            # Show the baseline hyper-parameters as a reference for the search.
            parametros = regressor.get_params()
            claves_deseadas = ['max_depth', 'max_features', 'min_samples_leaf', 'min_samples_split']
            valores_deseados = {clave: parametros[clave] for clave in claves_deseadas}
            print(f"Los principales hiperparametros del modelo son: {valores_deseados}")

            nuevo_modelo = self.gridsearch("Decision Tree II", X_test, X_train, y_test, y_train)
            print("Las nuevas metricas del modelo son: ")
            display(nuevo_modelo)
            # Return the tuned metrics so the caller does not receive None.
            return nuevo_modelo

        if tipo_modelo == "3":
            # Seeded so repeated runs give the same forest.
            random_forest = self.gridsearch(
                "Random Forest", X_test, X_train, y_test, y_train,
                RandomForestRegressor(random_state=0),
            )
            display(random_forest)
            return random_forest

        raise ValueError(
            f"Opcion de modelo no valida: {tipo_modelo!r}. Debe ser '1', '2' o '3'."
        )

    def metricas(self, y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
        """Compute MAE, MSE, RMSE and R2 for the test and train sets.

        Args:
            y_test, y_train: observed values.
            y_test_pred, y_train_pred: predicted values.
            tipo_modelo: label written to the ``modelo`` column.

        Returns:
            DataFrame with two rows (``set`` = "test", "train") and columns
            ``MAE``, ``MSE``, ``RMSE``, ``R2``, ``set`` and ``modelo``.
        """
        # Order matters: it must match the "set" column below.
        pares = [(y_test, y_test_pred), (y_train, y_train_pred)]
        mse = [metrics.mean_squared_error(real, pred) for real, pred in pares]

        resultados = {
            'MAE': [metrics.mean_absolute_error(real, pred) for real, pred in pares],
            'MSE': mse,
            'RMSE': [np.sqrt(valor) for valor in mse],
            'R2': [metrics.r2_score(real, pred) for real, pred in pares],
            "set": ["test", "train"],
        }
        df_metricas = pd.DataFrame(resultados)
        df_metricas["modelo"] = tipo_modelo
        return df_metricas
        

        

In [4]:
# Build the helper around the loaded dataframe, with "charges" as the response column.
modelo = Ajuste_modelo_lineal(df, "charges")

In [5]:
# 80/20 train/test split with a fixed seed; returned order is X_train, X_test, y_train, y_test.
X_entrena, X_testear, y_entrena, y_testear = modelo.separar_datos()

In [6]:
# ajuste_modelo expects the test sets before the train sets: (X_test, X_train, y_test, y_train).
# The model is chosen interactively via input(), so despite the variable name the
# result is not necessarily the linear-regression metrics.
metricas_regresion_lineal = modelo.ajuste_modelo(X_testear, X_entrena, y_testear, y_entrena)
metricas_regresion_lineal