# Scaler

### Importar las librerias necesarias

In [29]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler ,MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import LinearRegression

In [9]:
df = pd.read_csv('regression_data_clean.csv')
df.head()

Unnamed: 0,bedrooms,bathrooms,floors,view,grade,sqft_above,lat,price,sqft_total
0,3,1.0,1.0,0,7,1180,47.5112,221900,1340
1,3,2.25,2.0,0,7,2170,47.721,538000,2090
2,2,1.0,1.0,0,6,770,47.7379,180000,2720
3,4,3.0,1.0,0,7,1050,47.5208,604000,2270
4,3,2.0,1.0,0,8,1680,47.6168,510000,1800


El primer paso es separar los datos de train y test de nuestra variable dependiente de las independientes

In [13]:
X = df.drop('price',axis=1)
y = df['price']

X.head()

Unnamed: 0,bedrooms,bathrooms,floors,view,grade,sqft_above,lat,sqft_total
0,3,1.0,1.0,0,7,1180,47.5112,1340
1,3,2.25,2.0,0,7,2170,47.721,2090
2,2,1.0,1.0,0,6,770,47.7379,2720
3,4,3.0,1.0,0,7,1050,47.5208,2270
4,3,2.0,1.0,0,8,1680,47.6168,1800


In [14]:
y.head()

0    221900
1    538000
2    180000
3    604000
4    510000
Name: price, dtype: int64

Como podemos ver, han quedado separados la variable dependiente price de las independientes (que son el resto).

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=22)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((17277, 8), (4320, 8), (17277,), (4320,))

Hacemos el StandardScaler y el MinMaxScaler para poner todos los datos a la misma escala.

In [16]:
sc = StandardScaler().fit(X_train)
mm = MinMaxScaler().fit(X_train)

Transformamos nuestro dataset utilizando el Standard y el MinMax Scaler para ver con cual de los dos funciona mejor

In [18]:
X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)
X_train_mm = mm.transform(X_train)
X_test_mm = mm.transform(X_test)

# Modelo de Regresión Lineal

Utilizamos este modelo de regresión porque es el que se utiliza para la predicción de un número

In [19]:
ln = LinearRegression()
ln_sc = LinearRegression()
ln_mm = LinearRegression()

In [20]:
ln.fit(X_train,y_train)
ln_sc.fit(X_train_sc,y_train)
ln_mm.fit(X_train_mm,y_train)

Ya tenemos el modelo hecho, ahora van las predicciones

In [21]:
preds=ln.predict(X_test)
preds_sc=ln_sc.predict(X_test_sc)
preds_mm=ln_mm.predict(X_test_mm)

In [22]:
preds[:10]

array([ 357250.8293811 ,  517142.60426739,  430960.12314068,
        487292.28824419,  214207.49216063,  312569.03842538,
        470543.06071096, 1913626.22082937,  746891.98212442,
        130541.65818232])

In [23]:
preds_sc[:10]

array([ 357250.82938111,  517142.60426739,  430960.12314068,
        487292.28824419,  214207.49216063,  312569.03842538,
        470543.06071096, 1913626.22082937,  746891.98212441,
        130541.65818232])

In [24]:
preds_mm[:10]

array([ 357250.82938111,  517142.60426739,  430960.12314068,
        487292.28824419,  214207.49216063,  312569.03842538,
        470543.06071097, 1913626.22082937,  746891.98212441,
        130541.65818232])

Ahora vamos a evaluar nuestro modelo

In [25]:
ln.score(X_train,y_train),ln_sc.score(X_train_sc,y_train),ln_mm.score(X_train_mm,y_train)

(0.622558943413623, 0.6225589434136221, 0.6225589434136221)

In [27]:
ln.score(X_test,y_test),ln_sc.score(X_test_sc,y_test),ln_mm.score(X_test_mm,y_test)

(0.6368073828866516, 0.6368073828866503, 0.6368073828866505)

In [30]:
mse(y_train,ln.predict(X_train),squared=False)

225400.98674684382

In [31]:
mse(y_test,preds,squared=False)

222524.03510643146

Dado que nos da un error de más de 220000€ vamos a probar con otros modelo

In [32]:
from sklearn.ensemble import RandomForestRegressor

In [33]:
rf = RandomForestRegressor()
rf_sc = RandomForestRegressor()
rf_mm = RandomForestRegressor()

In [34]:
rf.fit(X_train,y_train)
rf_sc.fit(X_train_sc,y_train)
rf_mm.fit(X_train_mm,y_train)

In [36]:
predsrf=rf.predict(X_test)
predsrf_sc=rf_sc.predict(X_test_sc)
predsrf_mm=rf_mm.predict(X_test_mm)

In [38]:
rf.score(X_train,y_train),rf_sc.score(X_train_sc,y_train),rf_mm.score(X_train_mm,y_train)

(0.9704108035157478, 0.9697354005895716, 0.9677801584858423)

In [39]:
rf.score(X_test,y_test),rf_sc.score(X_test_sc,y_test),rf_mm.score(X_test_mm,y_test)

(0.7874453887874178, 0.7893313895557882, 0.7822075798899677)

Hemos visto que con el modelo de RandomForest hemos logrado con los Train aproximadamente un 97 % de acierto y con los Test aproximadamente un 78%. En el caso de Test (que es el que nos interesa) ha sido más efectiva en StandardScaler la transformacion en Test.

In [42]:
mse(y_train,rf_sc.predict(X_train_sc),squared=False)

63826.15877378667

In [43]:
mse(y_test,rf_sc.predict(X_test_sc),squared=False)

169476.05107448186

Hemos reducido el error con este nuevo modelo en 50000€ aproximadamente, pero el error sigue siendo de un poco menos de 170000€