# **Preparación y Limpieza Inicial**
[**Primera Parte - Limpieza Inicial**](https://drive.google.com/file/d/1jDo2VDj_l2IlT2bit-UM9_QBNjp6P7U_/view?usp=sharing)

# **Importar Dataset modificado**

In [1]:
# Importar librería
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn as sb

from sklearn.linear_model import RidgeCV
from sklearn.model_selection import RepeatedKFold
from numpy import arange
import plotly.express as px

In [2]:
# Se importa el kaggle.json
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"lauramartinezortiz","key":"d4a3e9eb11c8f40076c15784c6b41132"}'}

In [3]:
# Se crea una carpeta .kaggle donde se guarda la llave
!mkdir -p ~/.kaggle

# Se mueve la llave a la carpeta
!cp kaggle.json ~/.kaggle/

# Se permite el acceso
!chmod 600 ~/.kaggle/kaggle.json

# Descarga el dataset específico 
!kaggle datasets download -d lauramartinezortiz/df-clean-final --force

# Descomprime el dataset
!unzip df-clean-final.zip

Downloading df-clean-final.zip to /content
 45% 3.00M/6.67M [00:00<00:00, 5.60MB/s]
100% 6.67M/6.67M [00:00<00:00, 9.20MB/s]
Archive:  df-clean-final.zip
  inflating: df_clean.csv            


In [4]:
df = pd.read_csv('/content/df_clean.csv')
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,start_date,end_date,created_on,departments,bedrooms,bathrooms,price,property_type,operation_type,city
0,2020-10-07,2021-10-09,2020-10-07,Valle del Cauca,6,7,1300000000,Casa,Venta,Calima
1,2020-10-07,2021-01-06,2020-10-07,Valle del Cauca,3,7,2800000000,Casa,Venta,Cali
2,2020-10-07,2020-10-07,2020-10-07,Valle del Cauca,3,7,2800000000,Casa,Venta,Cali
3,2020-10-07,2021-04-12,2020-10-07,Valle del Cauca,5,8,3500000000,Casa,Venta,Cali
4,2020-10-07,9999-12-31,2020-10-07,Valle del Cauca,8,9,480000000,Casa,Venta,Cali


In [5]:
df.shape

(737345, 10)

# **Preparación con Pipelines**



In [6]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin

df_clean = df.copy()

In [7]:
# Agrupar columnas cuyo size sea menor a 1000 en la categoría "Other"
class RenameValues(BaseEstimator, TransformerMixin):
    def __init__(self,columns):
        self.columns=columns

    def transform(self,X,y=None):
      group_sizes = X.groupby(self.columns).size()
      small_groups = group_sizes[group_sizes < 1000].index.tolist()
      X.loc[X[self.columns].isin(small_groups), self.columns] = "Other"
      return X

    def fit(self, X, y=None):
        return self

#df['departments'] = df['departments'].replace(df_depart[df_depart < 1000].index, 'Others')

In [8]:
# Eliminar columnas
class ColumnDropperTransformer(BaseEstimator, TransformerMixin):
    def __init__(self,columns):
        self.columns=columns

    def fit(self, X, y=None):
        return self

    def transform(self,X,y=None):
        X_dropped = X.drop(self.columns, axis = 1)
        self.columns = X_dropped.columns
        return X_dropped
        #return X.drop(self.columns,axis=1)


# Seleccionar los Features
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y = None):
        return self

    def transform(self, X):
        return X.loc[:,self.columns]

In [9]:
#One Hot Encoder personalizado
class OneHotEncoderCustom(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        self.ohe = OneHotEncoder(drop='first', handle_unknown = 'ignore')

    def fit(self, X, y = None):
        X_ = X.loc[:,self.columns]
        self.ohe.fit(X_)
        return self

    def transform(self, X):
        X_ = X.loc[:,self.columns]
        X_transformed = pd.DataFrame(self.ohe.transform(X_).toarray(), columns= self.ohe.get_feature_names_out())
        X.drop(self.columns, axis= 1, inplace=True)
        X[self.ohe.get_feature_names_out()] = X_transformed[self.ohe.get_feature_names_out()].values
        return X

In [10]:
drop_columns = ['start_date',	'end_date',	'created_on']
drop_features = ColumnDropperTransformer(columns = drop_columns)

drop_pipeline = Pipeline([
    ("dropColumns", drop_features),
])

df_clean = drop_pipeline.fit_transform(df_clean)

In [11]:
df_clean

Unnamed: 0,departments,bedrooms,bathrooms,price,property_type,operation_type,city
0,Valle del Cauca,6,7,1300000000,Casa,Venta,Calima
1,Valle del Cauca,3,7,2800000000,Casa,Venta,Cali
2,Valle del Cauca,3,7,2800000000,Casa,Venta,Cali
3,Valle del Cauca,5,8,3500000000,Casa,Venta,Cali
4,Valle del Cauca,8,9,480000000,Casa,Venta,Cali
...,...,...,...,...,...,...,...
737340,Antioquia,2,1,700000,Apartamento,Arriendo,Medellín
737341,Antioquia,2,1,700000,Apartamento,Arriendo,Medellín
737342,Antioquia,2,1,700000,Apartamento,Arriendo,Medellín
737343,Atlántico,3,4,700000000,Casa,Venta,Barranquilla


In [12]:
# Columnas númericas
num_cols = ['bedrooms', 'bathrooms']

# Columnas para One Hot Encoding 
ohe_col = ['property_type', 'operation_type']

# Columnas categóricas
cat_col = ohe_col + ['departments', 'city']

In [13]:
# Pasos de procesamiento para variables numéricas 
num_features = FeatureSelector(columns = num_cols)

# Se construyen los pasos
num_pipe = Pipeline(steps = [
    ('num_feature', num_features)
])

In [14]:
# Pasos de procesamiento para variables categóricas
groupingDepart = RenameValues(columns ='departments')
groupingCity = RenameValues(columns ='city')
ohe = OneHotEncoderCustom(columns = cat_col)
cat_feature = FeatureSelector(columns = cat_col)

#Se construyen los pasos
cat_pipe = Pipeline(steps = [
   ('cat_feature', cat_feature),
   ('group1', groupingDepart),
   ('group2', groupingCity),
   ('ohe', ohe)
])

In [15]:
# Se combinan los pasos de procesamiento de las variables numéricas y categóricas 
combined_preprocessing = FeatureUnion([
    ('numericals', num_pipe),
    ('categoricals', cat_pipe),
])

In [16]:
combined_preprocessing

# **Modelos usando la libreria Pycaret**

In [20]:
#!pip install pycaret
from pycaret.regression import *

In [26]:
r1 = setup(df_clean, target = 'price', session_id = 123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,price
2,Target type,Regression
3,Original data shape,"(737345, 7)"
4,Transformed data shape,"(737345, 17)"
5,Transformed train set shape,"(516141, 17)"
6,Transformed test set shape,"(221204, 17)"
7,Numeric features,2
8,Categorical features,4
9,Preprocess,True


In [23]:
best = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,307664844.4711,3.5854668445681567e+18,1872804946.0523,0.1404,1.2843,10.945,3.026
xgboost,Extreme Gradient Boosting,308364822.7019,3.5845149065617715e+18,1873688230.9065,0.1386,1.4773,10.3408,46.996
rf,Random Forest Regressor,294448039.9168,3.594057222446428e+18,1875764835.4434,0.1369,0.7558,2.7347,95.386
et,Extra Trees Regressor,293777224.035,3.6234457734194447e+18,1884366231.3736,0.128,0.7564,2.7412,67.7
gbr,Gradient Boosting Regressor,332939968.4054,3.672291579160869e+18,1896713986.2122,0.1173,1.8894,15.9567,38.731
dt,Decision Tree Regressor,295479566.874,3.672103390554177e+18,1897443566.6627,0.1157,0.759,2.7545,3.258
ridge,Ridge Regression,431169531.3975,3.842492665374201e+18,1940833603.3167,0.0753,3.0555,77.8596,0.907
llar,Lasso Least Angle Regression,431171002.5706,3.8424927205807534e+18,1940833609.2747,0.0753,3.0555,77.8603,0.902
lasso,Lasso Regression,431171010.9986,3.8424927216342487e+18,1940833609.6295,0.0753,3.0555,77.8603,18.203
lr,Linear Regression,431171013.2003,3.8424927214784686e+18,1940833609.5802,0.0753,3.0555,77.8603,4.067


Processing:   0%|          | 0/81 [00:00<?, ?it/s]

In [27]:
lightgbm = create_model('lightgbm')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,302325771.5433,2.831058465467848e+18,1682574950.9213,0.1413,1.2511,6.6339
1,305144690.6477,3.425235671282504e+18,1850739222.9276,0.1418,1.3339,4.4482
2,315608421.5403,4.773552433603849e+18,2184846089.2255,0.1066,1.3159,3.4942
3,321013166.5034,4.164328941702073e+18,2040668748.6464,0.1406,1.2793,4.0068
4,306469551.1384,2.781658025987963e+18,1667830334.8926,0.1628,1.2867,4.3377
5,303485770.0434,4.349706204273368e+18,2085594928.1376,0.1132,1.2622,66.1811
6,302125620.8454,2.2298756589752576e+18,1493276819.2721,0.1956,1.2908,4.0379
7,306025272.9103,2.88597418636165e+18,1698815524.5234,0.1231,1.289,3.8921
8,300859989.0271,2.6017251916368783e+18,1612986420.1651,0.1862,1.2565,3.5594
9,313590190.5113,5.811553666390175e+18,2410716421.8112,0.0924,1.2775,8.8587


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [28]:
save_model(lightgbm, model_name='prediction_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['bedrooms', 'bathrooms'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=['departments', 'property_type',
                                              'operation_type', 'city'],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('onehot_encoding',
                  TransformerW...operty_type', 'operation_type'],
                                     transformer=OneHotEncoder(cols=['property_type',
                                                                     'operation_type'],
                                                               handle_missing='return_nan',
                                                               use_cat_names=True))),
                 ('rest_encodin

# **Implementación**

In [17]:
# Se crean los conjunto de datos de train y de valid
X = df_clean.drop('price', axis = 1)
y = df_clean['price']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=123)

## **Valor de Alpha para Ridge**

In [None]:
# Creación y entrenamiento del modelo (con búsqueda por CV del valor óptimo alpha)

rid = RidgeCV(
    alphas = np.logspace(-10, 2, 200),
    fit_intercept = True,
    store_cv_values = True)

complete_pipeline = Pipeline([
        ('preprocessing', combined_preprocessing),
        ('StandardScaler', StandardScaler()),
        ('Model Training', rid)
    ])

_ = complete_pipeline.fit(X = X_train, y = y_train)

In [None]:
# modelo.cv_values almacena el mse de cv para cada valor de alpha. Tiene
# dimensiones (n_samples, n_targets, n_alphas)

mse_cv = rid.cv_values_.reshape((-1, 200)).mean(axis=0)
mse_sd = rid.cv_values_.reshape((-1, 200)).std(axis=0)

# Se aplica la raíz cuadrada para pasar de mse a rmse
rmse_cv = np.sqrt(mse_cv)
rmse_sd = np.sqrt(mse_sd)

# Se identifica el óptimo y el óptimo + 1std
min_rmse     = np.min(rmse_cv)
sd_min_rmse  = rmse_sd[np.argmin(rmse_cv)]
min_rsme_1sd = np.max(rmse_cv[rmse_cv <= min_rmse + sd_min_rmse])
optimo       = rid.alphas[np.argmin(rmse_cv)]
optimo_1sd   = rid.alphas[rmse_cv == min_rsme_1sd]

In [None]:
# Mejor valor alpha encontrado
print(f"Mejor valor de alpha encontrado: {rid.alpha_}")

## **Valor de Alpha para Lasso**

In [None]:
from sklearn.linear_model import LassoCV
import warnings
warnings.filterwarnings("ignore")

lss = LassoCV(
            alphas = np.logspace(-10, 3, 200),
            cv = 10,
         )

complete_pipeline = Pipeline([
        ('preprocessing', combined_preprocessing),
        ('StandardScaler', StandardScaler()),
        ('Model Training', lss)
    ])

_ = complete_pipeline.fit(X = X_train, y = y_train)

In [None]:
# modelo.mse_path_ almacena el mse de cv para cada valor de alpha. Tiene
# dimensiones (n_alphas, n_folds)
mse_cv = lss.mse_path_.mean(axis=1)
mse_sd = lss.mse_path_.std(axis=1)

# Se aplica la raíz cuadrada para pasar de mse a rmse
rmse_cv = np.sqrt(mse_cv)
rmse_sd = np.sqrt(mse_sd)

# Se identifica el óptimo y el óptimo + 1std
min_rmse     = np.min(rmse_cv)
sd_min_rmse  = rmse_sd[np.argmin(rmse_cv)]
min_rsme_1sd = np.max(rmse_cv[rmse_cv <= min_rmse + sd_min_rmse])
optimo       = lss.alphas_[np.argmin(rmse_cv)]
optimo_1sd   = lss.alphas_[rmse_cv == min_rsme_1sd]

In [None]:
print(f"Mejor valor de alpha encontrado: {lss.alpha_}")

## **Modelos de regresión**

In [21]:
import warnings 
warnings.filterwarnings("ignore")

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import KFold, cross_val_score

# Inicializar modelos de regresión 
lr = LinearRegression()
rf = RandomForestRegressor()
rid = Ridge(alpha=100)
lss = Lasso(alpha=1000)
gbr = GradientBoostingRegressor(random_state=0)


models ={"Linear Regression":lr,
    "Ridge Regression":rid,
    "Lasso Regression":lss,
    "Random Forest Regression":rf,
    "Gradient Boosting Regressor": gbr}

scaler = StandardScaler()


for name, model in models.items():
  
    complete_pipeline = Pipeline([
        ('preprocessing', combined_preprocessing),
        ('StandardScaler', scaler),
        ('Model Training', model)
    ])

    # Fit los modelos
    complete_pipeline.fit(X_train, y_train)

    # Scores
    train_pred = complete_pipeline.predict(X_train)
    valid_pred = complete_pipeline.predict(X_valid)
        
    # Evaluación de los modelos
    print('*' * 50)
    print(f'{name} Training score:')
    print(f'MAE: {round(mean_absolute_error(y_train, train_pred),4)} | RMSE: {round(mean_squared_error(y_train, train_pred, squared=False),4)} | R2: {round(r2_score(y_train, train_pred),4)}')
    print('-' * 20)
    print(f'{name} Validation score:')
    print(f'MAE: {round(mean_absolute_error(y_valid, valid_pred),4)} | RMSE: {round(mean_squared_error(y_valid, valid_pred, squared=False),4)} | R2: {round(r2_score(y_valid, valid_pred),4)}')

**************************************************
Linear Regression Training score:
MAE: 431784847.386 | RMSE: 2014284662.0801 | R2: 0.0682
--------------------
Linear Regression Validation score:
MAE: 432917278.243 | RMSE: 1703413234.9297 | R2: 0.0832
**************************************************
Ridge Regression Training score:
MAE: 431706695.7248 | RMSE: 2014284791.6403 | R2: 0.0682
--------------------
Ridge Regression Validation score:
MAE: 432764016.915 | RMSE: 1703385893.7234 | R2: 0.0832
**************************************************
Lasso Regression Training score:
MAE: 431782482.083 | RMSE: 2014284682.2516 | R2: 0.0682
--------------------
Lasso Regression Validation score:
MAE: 432903844.5006 | RMSE: 1703410519.8837 | R2: 0.0832
**************************************************
Random Forest Regression Training score:
MAE: 291316442.3893 | RMSE: 1916041014.9671 | R2: 0.1569
--------------------
Random Forest Regression Validation score:
MAE: 299364622.7867 | RMSE:

In [22]:
import plotly.express as px

scores = {"Linear Regression": {"R2": 0.0832}, "Ridge Regression": {"R2": 0.0832},
          "Lasso Regression": {"R2": 0.0832}, "Random Forest Regression": {"R2": 0.1223}, 
          "Gradient Boosting Regressor": {"R2": 0.1235}}

models = list(scores.keys()) # the names of the models 
metrics = list(scores[models[0]].keys()) # the names of the metrics 
x = np.arange(len(models)) # the label locations 
width = 0.2 # the width of the bars 
multiplier = 0 
fig = px.bar() 
for metric in metrics:
  offset = width * multiplier 
  values = [scores[model][metric] 
            for model in models]

colors = ['lightgreen' if x == max(values) else 'lightblue' for x in values]

fig.add_bar(name=metric, x=models, y=values,  text=values, textposition="auto", marker_color=colors)
multiplier += 1
fig.update_layout(yaxis_title='Scores', title='Model performance by R2', barmode='group') 
fig.show()


In [None]:

scores = {"Linear Regression": {"MAE": 432917278.243 , "RMSE": 1703413234.9297 }, "Ridge Regression": {"MAE": 432764016.915 , "RMSE": 1703385893.7234},
          "Lasso Regression": {"MAE": 432903844.5006 , "RMSE": 1703410519.8837}, "Random Forest Regression": {"MAE": 299600926.7617, "RMSE": 1666619300.6394}, 
          "Gradient Boosting Regressor": {"MAE": 332017387.5271, "RMSE": 1665517099.5419}}

models = list(scores.keys()) # the names of the models 
metrics = list(scores[models[0]].keys()) # the names of the metrics 

df = pd.DataFrame(scores) # convert the dictionary to a dataframe 
df = df.T # transpose the dataframe 
df = df.reset_index().rename(columns={'index': 'Model'}) # reset the index and rename the column 
fig = px.bar(df, x='Model', y=metrics, facet_col='variable', labels={'variable': 'Metric'}, title='Model performance by metric', facet_col_wrap=3)
fig.show()