In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.model_selection import train_test_split

#### Escalamiento de los datos

In [2]:
# Read the training dataset from disk and preview its first five rows.
x = pd.read_csv('dataEntreno.csv')
x.head(n=5)

Unnamed: 0,production_budget,worldwide_gross,title_year,aspect_ratio,duration,cast_total_facebook_likes,budget,imdb_score
0,425000000.0,2783919000.0,2009.0,1.78,178.0,4834.0,237000000.0,7.9
1,306000000.0,2058662000.0,2002.130733,2.126976,108.577186,143.0,40455390.0,7.1
2,300000000.0,963420400.0,2007.0,2.35,169.0,48350.0,300000000.0,7.1
3,300000000.0,879620900.0,2015.0,2.35,148.0,11700.0,245000000.0,6.8
4,275000000.0,1084439000.0,2012.0,2.35,164.0,106759.0,250000000.0,8.5


In [3]:
# Separate the target from the features: `pop` returns the column and removes
# it from `x` in one step, leaving only the predictor columns in `x`.
# Note: this mutates `x`, so running the cell a second time raises KeyError
# because the column is already gone.
y = x.pop('worldwide_gross')
x.tail()

Unnamed: 0,production_budget,title_year,aspect_ratio,duration,cast_total_facebook_likes,budget,imdb_score
4099,7000.0,2004.0,1.85,77.0,368.0,7000.0,7.0
4100,7000.0,2005.0,2.126976,80.0,0.0,7000.0,6.3
4101,7000.0,2005.0,2.126976,84.0,93.0,3250.0,7.8
4102,3967.0,2012.0,2.35,100.0,2386.0,40455390.0,6.3
4103,1100.0,2004.0,1.85,90.0,163.0,1100.0,6.6


In [5]:
# Split the feature table and the target into training and test sets.
# test_size=0.25 spells out the 75/25 split that was previously implicit, and
# random_state pins the shuffle so that re-running the notebook from a fresh
# kernel reproduces the same partition (and therefore the same scores).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

In [6]:
# Share of the rows that ended up in the training split (75% train, 25% test).
x_train.shape[0] / x.shape[0]

0.75

Existen distintas estrategias de escalamiento de las features, pero la más común es la estandarización: a cada variable se le resta su media y se divide por su desviación estándar, de modo que queda con media 0 y desviación estándar 1. La estandarización no cambia la forma de la distribución; el resultado solo es Gaussiano si la variable original ya lo era.

In [11]:
from sklearn.preprocessing import StandardScaler

# Create the scaler and learn its per-column statistics from the training
# split only; `fit` returns the estimator itself, so the calls are chained and
# the fitted scaler is shown as the cell output.
scaler = StandardScaler().fit(x_train)
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [12]:
# Per-feature means learned by `scaler.fit(x_train)`. This is a fitted
# attribute (estimated from the data), not a hyperparameter.
scaler.mean_

array([3.37146278e+07, 2.00211534e+03, 2.12332107e+00, 1.08324574e+02,
       1.00662131e+04, 4.14009312e+07, 6.44733593e+00])

In [15]:
# Per-feature scaling factors learned by `scaler.fit(x_train)`; each column is
# divided by its entry here after the mean has been subtracted.
scaler.scale_

array([4.08784900e+07, 1.19284123e+01, 7.20354005e-01, 2.30603995e+01,
       1.57512215e+04, 2.27446838e+08, 1.08019761e+00])

In [16]:
# Unscaled feature matrix as a NumPy array, shown for comparison with the
# standardized values produced by the scaler.
x.values

array([[4.25000000e+08, 2.00900000e+03, 1.78000000e+00, ...,
        4.83400000e+03, 2.37000000e+08, 7.90000000e+00],
       [3.06000000e+08, 2.00213073e+03, 2.12697615e+00, ...,
        1.43000000e+02, 4.04553863e+07, 7.10000000e+00],
       [3.00000000e+08, 2.00700000e+03, 2.35000000e+00, ...,
        4.83500000e+04, 3.00000000e+08, 7.10000000e+00],
       ...,
       [7.00000000e+03, 2.00500000e+03, 2.12697615e+00, ...,
        9.30000000e+01, 3.25000000e+03, 7.80000000e+00],
       [3.96700000e+03, 2.01200000e+03, 2.35000000e+00, ...,
        2.38600000e+03, 4.04553863e+07, 6.30000000e+00],
       [1.10000000e+03, 2.00400000e+03, 1.85000000e+00, ...,
        1.63000000e+02, 1.10000000e+03, 6.60000000e+00]])

In [17]:
# Apply the fitted standardization to the training features. The result is
# only displayed here; it is not stored in a variable.
scaler.transform(x_train)

array([[-0.81557875, -1.77017161, -1.04576509, ..., -0.58746003,
        -0.18037591,  1.06708629],
       [ 0.15375744, -0.26116945, -0.37942605, ..., -0.39649072,
         0.08177326, -0.50669982],
       [ 0.64301231, -0.3450029 ,  0.31467713, ..., -0.33986019,
         0.08177326,  0.0487541 ],
       ...,
       [ 0.64301231, -0.00966909,  0.31467713, ...,  2.72726701,
         0.08177326, -0.22897286],
       [-0.65351308,  0.57716509,  0.31467713, ..., -0.5491138 ,
        -0.00415721, -1.06215374],
       [-0.66819072,  0.40949818,  0.31467713, ..., -0.47311969,
        -0.16223981,  1.15966195]])

In [18]:
# Standardize both splits using the statistics learned from the training data.
x_train_scale = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [21]:
from sklearn.linear_model import Lasso

In [22]:
# Fit one Lasso on the raw features and a second one on the standardized
# features so that their test scores can be compared.
model = Lasso().fit(x_train, y_train)
model_scaled = Lasso().fit(x_train_scale, y_train)
model_scaled

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [24]:
# Compare the test-set score (R^2) of the unscaled and the scaled model.
# NOTE(review): the two scores recorded for this run are practically identical,
# but this must not be generalized to "regression is unaffected by scaling,
# classification is". Lasso's L1 penalty acts on the coefficient magnitudes,
# which depend on the feature scale, so regularized regression is in general
# sensitive to scaling. Presumably the default alpha is negligible relative to
# the magnitude of this target, which would explain the match - confirm.
print(model.score(x_test,y_test))
print(model_scaled.score(x_test_scaled,y_test))

0.5906687555817498
0.5906687550067926


#### Simplificar las transformaciones con pipelines


Para hacer el código más reproducible y evitar aplicar múltiples veces una misma transformación, es recomendable utilizar `sklearn.pipeline.make_pipeline`, que permite encadenar transformaciones y modelos en un único estimador.


In [25]:
from sklearn.pipeline import make_pipeline

In [26]:
# Bundle scaling and regression into a single estimator and fit it on the raw
# training features; the fitted pipeline is shown as the cell output.
model_scaled = make_pipeline(StandardScaler(), Lasso()).fit(x_train, y_train)
model_scaled

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('lasso',
                 Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                       max_iter=1000, normalize=False, positive=False,
                       precompute=False, random_state=None, selection='cyclic',
                       tol=0.0001, warm_start=False))],
         verbose=False)

In [27]:
# Score the pipeline on the raw test features; the scaling step is part of the
# pipeline, so no manual transform of x_test is needed here.
print(model_scaled.score(x_test,y_test))

0.5906687550067926


#### Crear nuevas features de forma automática

In [28]:
# Small 3x2 integer matrix (values 0..5, row-major) used to illustrate the
# polynomial feature expansion.
a = np.reshape(np.arange(6), (3, 2))
a

array([[0, 1],
       [2, 3],
       [4, 5]])

In [29]:
from sklearn.preprocessing import PolynomialFeatures

In [31]:
# Degree-2 polynomial feature generator; displayed to show its settings.
transformer = PolynomialFeatures(degree=2)
transformer

PolynomialFeatures(degree=2, include_bias=True, interaction_only=False,
                   order='C')

In [32]:
# Expand the toy matrix: for input columns (u, v) each output row is
# [1, u, v, u^2, u*v, v^2], e.g. the row [2, 3] becomes [1, 2, 3, 4, 6, 9].
transformer.fit_transform(a)

array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

In [33]:
# Size of the feature table before the polynomial expansion: (rows, columns).
x.shape

(4104, 7)

In [34]:
# Degree-2 expansion of the full feature table; only the resulting shape is
# shown (bias + original columns + squares and pairwise products).
transformer = PolynomialFeatures(degree=2)
transformer.fit_transform(x).shape

(4104, 36)

In [36]:
# Lasso on degree-2 polynomial features, evaluated on the held-out split.
# NOTE(review): the recorded output of this cell contains the tail of a
# warning; presumably a Lasso ConvergenceWarning caused by the very large,
# unscaled polynomial terms - confirm, and consider scaling the features
# and/or raising max_iter.
model_poly = make_pipeline(PolynomialFeatures(degree=2), Lasso()).fit(x_train, y_train)
model_poly.score(x_test, y_test)

  positive)


0.6711938203865291

In [38]:
# Baseline for comparison: plain Lasso on the original (non-expanded) features.
model = Lasso().fit(x_train, y_train)
model.score(x_test, y_test)

0.5906687555817498