In [None]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Visualizaciones
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import tree

# Para realizar la regresión lineal y la evaluación del modelo
# -----------------------------------------------------------------------
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from sklearn.model_selection import KFold,LeaveOneOut, cross_val_score

In [2]:
# Load the preprocessed dataset; the first CSV column is used as the row index.
ruta_datos = "../datos/output/df_sin_nulos_min_max_iterative_target.csv"
df = pd.read_csv(ruta_datos, index_col=0)

# Preview the first rows.
df.head()

Unnamed: 0,price,size,bathrooms,province,municipality,distance,district,size_MinMax,distance_MinMax,outliers_ifo_0.01_100,...,price_725.0,price_728.0,price_730.0,price_733.0,price_735.0,price_740.0,price_745.0,price_747.0,price_749.0,price_750.0
0,750.0,60.0,694.552381,692.932011,705.594114,7037.0,701.253737,0.215054,0.121497,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,750.0,70.0,694.552381,692.932011,701.418584,16145.0,700.208637,0.268817,0.282949,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,400.0,67.0,679.963693,692.932011,654.819074,55041.0,645.657771,0.252688,0.972435,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,590.0,70.0,679.963693,692.932011,679.539684,56596.0,645.657771,0.268817,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,684.0,45.0,694.552381,692.932011,705.594114,10656.0,645.657771,0.134409,0.185649,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Feature matrix: drop the target AND every "price_<number>" indicator column.
# Those indicators are a one-hot encoding of the target itself (in the preview
# of df, price_750.0 is 1.0 exactly on the rows whose price is 750.0), so using
# them as predictors leaks the answer into the model and inflates every metric.
# NOTE(review): only columns named "price_" followed by a number are removed;
# confirm no legitimate feature follows that naming pattern.
es_dummy_precio = df.columns.str.match(r"price_\d+(?:\.\d+)?$")
x = df.drop(columns=["price", *df.columns[es_dummy_precio]])

# Target kept as a one-column DataFrame (2-D), as before, so that the fitted
# LinearRegression exposes a 2-D coef_ array.
y = df[["price"]]

In [None]:
# Hold out 20 % of the rows for testing; shuffling is seeded for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    shuffle=True,
    random_state=42,
)

# Fit a linear regression on the training partition.
modelo_lineal = LinearRegression()
modelo_lineal.fit(x_train, y_train)

In [None]:
# Predictions of the linear model on both partitions (train first, then test).
y_train_pred = modelo_lineal.predict(x_train)
y_test_pred = modelo_lineal.predict(x_test)

In [None]:
def hacer_metrica(y_train, y_pred_train, y_test, y_pred_test):
    """Build a table of regression metrics for the train and test partitions.

    Parameters
    ----------
    y_train, y_test :
        True target values of the training and test partitions.
    y_pred_train, y_pred_test :
        Model predictions for the training and test partitions.

    Returns
    -------
    pandas.DataFrame
        One row per partition ('train', 'test') and one column per metric
        ('r2_score', 'MAE', 'MSE', 'RMSE').
    """

    def _metricas_de(y_real, y_estimado):
        # Metrics of a single partition; RMSE is the square root of the MSE.
        mse = mean_squared_error(y_real, y_estimado)
        return {
            'r2_score': r2_score(y_real, y_estimado),
            'MAE': mean_absolute_error(y_real, y_estimado),
            'MSE': mse,
            'RMSE': np.sqrt(mse),
        }

    particiones = {
        'train': (y_train, y_pred_train),
        'test': (y_test, y_pred_test),
    }
    metricas = {
        nombre: _metricas_de(y_real, y_estimado)
        for nombre, (y_real, y_estimado) in particiones.items()
    }

    # Transpose so that partitions become rows and metrics become columns.
    return pd.DataFrame(metricas).T

In [None]:
# 5-fold cross-validation with seeded shuffling.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# R² of the linear model on each fold, computed over the full x / y.
score_r2 = cross_val_score(modelo_lineal, x, y, scoring="r2", cv=kf)

# Average R² across the folds.
score_r2.mean()

In [None]:
# One row per feature holding its fitted linear coefficient (column 0).
df_coefs = pd.DataFrame(
    modelo_lineal.coef_,
    columns=x.columns,
).transpose()

# Show the coefficients ordered by absolute magnitude, largest first.
df_coefs.sort_values(by=0, ascending=False, key=abs)

In [None]:
# Baseline decision tree with default hyperparameters (no depth / leaf limits).
# random_state is fixed because scikit-learn permutes the candidate features at
# every split; without a seed the fitted tree, and therefore the metrics below,
# can change from one run to the next.
modelo_arbol_vacio = DecisionTreeRegressor(random_state=42)
modelo_arbol_vacio.fit(x_train, y_train)

# Predictions on both partitions.
y_pred_test_arb_vacio = modelo_arbol_vacio.predict(x_test)
y_pred_train_arb_vacio = modelo_arbol_vacio.predict(x_train)

# Train vs. test metrics for the unconstrained tree.
df_metricas_arbol_vacio = hacer_metrica(y_train, y_pred_train_arb_vacio, y_test, y_pred_test_arb_vacio)
df_metricas_arbol_vacio

In [None]:
# Hyperparameter grid explored for the decision tree.
params_arbol = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [10, 50, 100, 200],
    'min_samples_leaf': [10, 50, 100, 200],
    'max_leaf_nodes': [4, 6, 8, 10, 20, 30]
}

# Exhaustive search with 10-fold cross-validation, scored by negative MSE
# (higher is better) and run on all available cores.
# The base estimator is seeded: scikit-learn permutes the candidate features at
# every split, so without random_state the selected hyperparameters and the
# refitted tree could differ between runs.
grid_search_arbol = GridSearchCV(DecisionTreeRegressor(random_state=42),
                                 param_grid=params_arbol,
                                 cv=10,
                                 scoring="neg_mean_squared_error",
                                 n_jobs=-1)

grid_search_arbol.fit(x_train, y_train)

# Best tree found by the search (refit on the whole training partition by
# GridSearchCV's default refit=True).
modelo_final_arbol = grid_search_arbol.best_estimator_

In [None]:
# Hyperparameter combination with the best cross-validated score in grid_search_arbol.
grid_search_arbol.best_params_

In [None]:
# Feature importances of the tuned tree, indexed by feature name and ordered
# from most to least important.
df_importancia_arbol = (
    pd.DataFrame(
        modelo_final_arbol.feature_importances_,
        index=x.columns,
        columns=["importancia"],
    )
    .sort_values(by="importancia", ascending=False)
)

# Preview the top rows.
df_importancia_arbol.head()

In [None]:
# Horizontal bar chart of the importances, one bar (and one hue level) per feature.
sns.barplot(
    data=df_importancia_arbol,
    x="importancia",
    y=df_importancia_arbol.index,
    hue=df_importancia_arbol.index,
)

In [None]:
# Predictions of the tuned tree on both partitions.
y_pred_train_arbol = modelo_final_arbol.predict(x_train)
y_pred_test_arbol = modelo_final_arbol.predict(x_test)

# Train vs. test metrics for the tuned tree.
df_metricas_arbol = hacer_metrica(
    y_train,
    y_pred_train_arbol,
    y_test,
    y_pred_test_arbol,
)
df_metricas_arbol