## Entrenamiento del modelo

### Importación de librerías

Datos y gráficos

In [12]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle
import yaml
import os

Preprocesado y modelado

In [13]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score

### Configuración de warnings y visualización de los dataframes

In [15]:
import warnings

# Widen pandas' display limits so wide/long frames are shown with fewer
# truncations when a cell ends with a DataFrame.
for option_name, option_value in (
    ("max_colwidth", 50),
    ('display.max_columns', 500),
    ('display.max_rows', 500),
):
    pd.set_option(option_name, option_value)

# NOTE(review): this silences every warning for the rest of the session,
# including any emitted by pandas or scikit-learn.
warnings.filterwarnings("ignore")

### Importación de dataframes

In [16]:
# Current working directory of the kernel; the data file paths below are
# built relative to it.
directorio_actual = os.getcwd()

In [17]:
# Build the feature matrices and targets used below:
#   * X_train_NS / y_train_NS come from df_train_NS,
#   * X_test_NS / y_test_NS come from df_test_NS,
#   * X / y come from df_num_NS which, per the original author's note, holds
#     train and test together and is meant for fitting the final model so it
#     generalizes better to new inputs.
# In every frame 'price' is the target and the remaining columns are features.

# Full dataset
df_num_NS = pd.read_csv(
    os.path.join(directorio_actual, '..', 'data', 'processed', 'df_num_NS.csv')
)
X = df_num_NS.drop(columns=['price'])
y = df_num_NS['price']

# Train split
df_train_NS = pd.read_csv(
    os.path.join(directorio_actual, '..', 'data', 'train', 'df_train_NS.csv')
)
X_train_NS = df_train_NS.drop(columns=['price'])
y_train_NS = df_train_NS['price']

# Test split
df_test_NS = pd.read_csv(
    os.path.join(directorio_actual, '..', 'data', 'test', 'df_test_NS.csv')
)
X_test_NS = df_test_NS.drop(columns=['price'])
y_test_NS = df_test_NS['price']

### Entrenamiento

In [18]:
# Display the feature matrix: every column of df_num_NS except 'price'.
X

Unnamed: 0,size,precio_area / distrito,parking,codigo_tipo,piscina,total_rooms
0,66.0,8.200000,0,2,0,3
1,85.0,13.694444,0,2,0,4
2,41.0,9.947368,0,2,0,2
3,40.0,13.694444,0,2,0,2
4,256.0,7.666667,0,7,0,6
...,...,...,...,...,...,...
986,84.0,6.217391,1,2,0,5
987,83.0,6.217391,0,2,0,4
988,130.0,6.217391,0,3,0,4
989,102.0,6.217391,0,2,0,5


In [24]:
# Standardize the features before clustering: learn the scaling statistics
# from X, then apply them to the same data.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [72]:
# Partition the standardized features into 75 clusters and keep one label per
# row of X_scaled.
# random_state pins the centroid initialisation: without it the cluster ids
# (and every per-cluster statistic derived from them) change on each run,
# so the outputs saved in this notebook could not be reproduced.
kmeans = KMeans(n_clusters=75, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

In [65]:
# Sanity check: dimensions of the scaled feature matrix.
X_scaled.shape

(991, 6)

In [66]:
# Store each row's KMeans label alongside the original data.
df_num_NS['cluster'] = clusters

In [67]:
# Average 'price' of the rows in each cluster, one row per cluster with
# 'cluster' kept as a regular column.
precio_medio_por_cluster = (
    df_num_NS
    .groupby('cluster', as_index=False)['price']
    .mean()
)

In [68]:
# Lookup table: cluster label -> mean price of that cluster.
mapeo_cluster = precio_medio_por_cluster.set_index('cluster')['price'].to_dict()

In [69]:
# Predict each row's price as the mean price of the cluster it belongs to.
df_num_NS['prediccion'] = df_num_NS['cluster'].map(mapeo_cluster)

In [70]:
# Report MAE and MAPE of the cluster-mean prediction, one value per line.
# NOTE(review): the cluster means are computed from the same rows that are
# scored here, so both figures are in-sample errors.
print(
    mean_absolute_error(df_num_NS['price'], df_num_NS['prediccion']),
    mean_absolute_percentage_error(df_num_NS['price'], df_num_NS['prediccion']),
    sep='\n',
)

196.89358619389958
0.16935392432053117


In [71]:
# Inspect the rows assigned to cluster 3 together with their predicted price.
df_num_NS.loc[df_num_NS['cluster'].eq(3)]

Unnamed: 0,size,precio_area / distrito,parking,codigo_tipo,piscina,total_rooms,price,cluster,prediccion
22,123.0,10.666667,1,2,0,4,750.0,3,990.0
34,122.0,11.588235,1,2,0,5,1290.0,3,990.0
40,84.0,10.777778,1,2,0,3,750.0,3,990.0
55,115.0,11.035088,1,2,0,5,1180.0,3,990.0
58,80.0,11.588235,1,2,0,4,795.0,3,990.0
73,80.0,10.666667,1,2,0,4,750.0,3,990.0
162,110.0,9.947368,1,2,0,4,1200.0,3,990.0
175,90.0,11.588235,1,2,0,5,900.0,3,990.0
255,99.0,11.588235,1,2,0,5,1265.0,3,990.0
276,58.0,10.954545,1,2,0,4,950.0,3,990.0


In [42]:
# Clusters ordered from cheapest to most expensive mean price.
# NOTE(review): the saved output of this cell lists cluster ids above 74,
# which a 75-cluster KMeans cannot produce; it appears to come from an
# earlier run with a different cluster count. Re-run the notebook top to bottom.
precio_medio_por_cluster.sort_values('price', ascending=True)

Unnamed: 0,cluster,price
177,177,300.0
230,230,375.0
138,138,472.5
46,46,500.0
183,183,500.0
86,86,520.0
135,135,525.0
31,31,548.75
110,110,550.0
209,209,550.0
