In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import r2_score 
from sklearn.ensemble import RandomForestRegressor

In [2]:
df = pd.read_csv("C:/Users/Clarita/Desktop/HENRY/vibi/Vibi_model/ETL/dataset/data_casa_depto.csv")

In [3]:
df.columns

Index(['Unnamed: 0', 'price', 'area', 'bath', 'room', 'parking', 'year',
       'property_type', 'near_cc', 'near_school', 'near_parks', 'near_avenue',
       'security', 'elevator', 'rest_area', 'pool', 'ranking'],
      dtype='object')

In [4]:
df = df.drop(columns=['Unnamed: 0'])

In [5]:
X = df.drop(columns=['price'])
Y = df["price"]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=42)

# Random Forest Regressor

In [7]:
linear_model = RandomForestRegressor()

linear_model.fit(X_train, y_train)

In [8]:
y_pred = linear_model.predict(X_test)
rmse_test= (mean_squared_error(y_test, y_pred, squared = False))
print(f'Raíz del error cuadrático medio en Test x: {round(rmse_test)}')

Raíz del error cuadrático medio en Test x: 27156


In [9]:
r2 = r2_score(y_test, y_pred) 
print('(R^2):', r2)

(R^2): 0.7601130453069594


In [45]:
param_grid = {
    'n_estimators': [100, 200, 300],           # Número de árboles en el ensamble
    'max_depth': [None, 10, 20, 30],          # Profundidad máxima de los árboles
    'min_samples_split': [2, 5, 10],         # Mínimo de muestras para dividir un nodo
    'min_samples_leaf': [1, 2, 4],           # Mínimo de muestras en las hojas
    'max_features': ['sqrt', 'log2'] # Número de características a considerar en cada división
}


In [46]:
#Instancio
random = RandomForestRegressor()

# Le pasamos la grilla que creamos
model = GridSearchCV(random, param_grid=param_grid, cv=5)

# Entrenamos
model.fit(X_train, y_train)

print("Mejores hiperparámetros: "+str(model.best_params_))
print("Mejor Score: "+str(model.best_score_)+'\n')

Mejores hiperparámetros: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Mejor Score: 0.7493312601832667



# KNN

In [36]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

# Supongamos que tienes tus datos de entrenamiento y prueba en X_train, y_train, X_test, y_test

# Divide tus datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Crea el modelo KNN Regressor
knn_regressor = KNeighborsRegressor(n_neighbors=5)  # Puedes ajustar el valor de "n_neighbors"

# Entrena el modelo
knn_regressor.fit(X_train, y_train)

# Realiza predicciones en el conjunto de prueba
y_pred = knn_regressor.predict(X_test)


r2 = r2_score(y_test, y_pred)

print(f"R-squared (R^2): {r2}")


Mean Squared Error (MSE): 1324593765.1514826
Mean Absolute Error (MAE): 26418.090131259116
R-squared (R^2): 0.5670450984431532


In [37]:
param_grid = {
    'n_neighbors': [3, 5, 7, 9],            # Número de vecinos
    'weights': ['uniform', 'distance'],     # Peso de los vecinos (uniforme o por distancia)
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # Algoritmo de búsqueda de vecinos
    'leaf_size': [10, 20, 30],             # Tamaño de hoja para los algoritmos tree-based
    'p': [1, 2],                           # Parámetro para la métrica de distancia (1 para Manhattan, 2 para Euclidiana)
}


In [41]:
#Instancio
knn = KNeighborsRegressor()

# Le pasamos la grilla que creamos
model = GridSearchCV(knn, param_grid=param_grid, cv=5)

# Entrenamos
model.fit(X_train, y_train)

print("Mejores hiperparámetros: "+str(model.best_params_))
print("Mejor Score: "+str(model.best_score_)+'\n')

Mejores hiperparámetros: {'algorithm': 'brute', 'leaf_size': 10, 'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
Mejor Score: 0.626668542174819



# Gradient Boosting

In [17]:
from sklearn.ensemble import GradientBoostingRegressor

gb_regressor = GradientBoostingRegressor( loss= 'huber',n_estimators=300, learning_rate=0.1, max_depth=5, random_state=42,  min_samples_leaf= 4, min_samples_split= 5)

# Entrena el modelo
gb_regressor.fit(X_train, y_train)

# Realiza predicciones en el conjunto de prueba
y_pred = gb_regressor.predict(X_test)

r2 = r2_score(y_test, y_pred)

print(f"R-squared (R^2): {r2}")

R-squared (R^2): 0.7519951978861257


In [15]:
param_grid = {
    'n_estimators': [100, 200, 300],           # Número de estimadores (árboles) en el ensamble       
    'max_depth': [3, 4, 5, 6],               # Profundidad máxima de los árboles base
    'min_samples_split': [2, 5, 10],         # Mínimo de muestras para dividir un nodo interno
    'min_samples_leaf': [1, 2, 4],           # Mínimo de muestras en una hoja
    'loss': ['ls', 'lad', 'huber']         # Función de pérdida (least squares, least absolute deviation, huber)
}


In [16]:
#Instancio
gb = GradientBoostingRegressor()

# Le pasamos la grilla que creamos
model = GridSearchCV(gb, param_grid=param_grid, cv=5)

# Entrenamos
model.fit(X_train, y_train)

print("Mejores hiperparámetros: "+str(model.best_params_))
print("Mejor Score: "+str(model.best_score_)+'\n')

1080 fits failed out of a total of 1620.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Clarita\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Clarita\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\ensemble\_gb.py", line 420, in fit
    self._validate_params()
  File "c:\Users\Clarita\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Clarita\AppData\Local\Programs\Python\Pyt

Mejores hiperparámetros: {'loss': 'huber', 'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 300}
Mejor Score: 0.7514647105511532

