# Escalamiento de los datos

In [4]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


Diversos algoritmos son sensibles a la escala en la que viene cada feature. Re -escalarios
puede traer significativas mejoras de rendimiento.

Existen distintas estrategias de escalamiento de tus features, pero las mas comun
es la estandarizacion donde convertimos la variable para que la distribucion de esta siga
una distribucion que es Gaussiana de media 0 y de desviacion estandar 1.

In [5]:
from sklearn.model_selection import train_test_split

X = pd.read_csv('/home/algoritmia/Dropbox/ml_intro/vol/intermediate_results/X.csv')
y = X['worldwide_gross']
x = X.drop('worldwide_gross',axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y)


In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [9]:
scaler.mean_

array([3.38261998e+07, 9.69089414e+07, 3.00278760e+05, 5.33081351e+05,
       3.64162547e+05, 1.01728181e+04, 3.91151722e+07, 6.44896036e+00,
       4.47312770e+07])

In [10]:
scaler.scale_

array([4.18357537e+07, 1.79061047e+08, 1.08360549e+07, 1.27428707e+07,
       1.25188005e+07, 1.63833228e+04, 2.29012615e+08, 1.07188836e+00,
       6.69530840e+07])

In [11]:
scaler.transform(X_train)

array([[-0.45000264, -0.2786619 , -0.0275254 , ..., -0.10530063,
         0.79396294, -0.34872149],
       [-0.37829365, -0.27783982, -0.02752549, ..., -0.09220091,
         0.60737634, -0.01311146],
       [ 0.05196035, -0.17279689, -0.02752559, ..., -0.01360262,
        -0.04567674,  0.08188334],
       ...,
       [-0.78703494, -0.53889559, -0.0275254 , ..., -0.16686929,
        -0.04567674, -0.66799961],
       [ 3.37447728,  1.10285427, -0.02752503, ...,  0.59335084,
         0.42078975,  1.73787224],
       [-0.45000264,  0.17897224, -0.0275254 , ..., -0.09656748,
        -0.04567674,  0.1432495 ]])

In [12]:
X_train_scaled, X_test_scaled = (scaler.transform(X_train), scaler.transform(X_test))


In [13]:
from sklearn.linear_model import Lasso
model  = Lasso()
model_scaled = Lasso()
model.fit(X_train, y_train)
model_scaled.fit(X_train_scaled,y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [14]:
print(model.score(X_test,y_test))
print(model_scaled.score(X_test_scaled, y_test))

0.9999999999999892
0.9999999134142821


Los modelos  de regresion no se ven afectados por el escalamiento de las features. Los de clasificacion si.


# Simplificar las transformaciones con pipelines

Para hacer tu codgio mas reproducible,y para evitar tener que aplicar multiples veces unas misma transformacion te recomendamos 
utilizar sklearn.pipeline.make_pipeline que permite encadenar transformaciones a tus modelos.

In [18]:
from sklearn.pipeline import make_pipeline
model_scaled = make_pipeline(StandardScaler(),Lasso())

model_scaled.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lasso', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))])

In [19]:
print(model_scaled.score(X_test,y_test))

0.9999999134142821


Crear nuevas features de forma automatica

In [20]:
A = np.arange(6).reshape(3,2)
A

array([[0, 1],
       [2, 3],
       [4, 5]])

In [22]:
from sklearn.preprocessing import PolynomialFeatures

transformer = PolynomialFeatures(2)
transformer.fit(A)
transformer.transform(A)

array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

In [23]:
X.shape

(4104, 9)

In [24]:
transformer = PolynomialFeatures(2)
transformer.fit_transform(X).shape

(4104, 55)

In [26]:
model_poly = make_pipeline(PolynomialFeatures(2),Lasso())
model_poly.fit(X_train,y_train)
model.score(X_test,y_test)

0.9999999999999892

In [27]:
model = Lasso()
model.fit(X_train,y_train)
model.score(X_test, y_test)

0.9999999999999892

# Encoding binario

In [56]:
categoricals = pd.read_csv('/home/algoritmia/Dropbox/ml_intro/vol/intermediate_results/intermediate_results/categoricals.csv').set_index('Unnamed: 0')

In [57]:
categoricals.head(2)

Unnamed: 0_level_0,actor_1_name,director_name
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,CCH Pounder,James Cameron
1,Doug Walker,Doug Walker


In [58]:
categoricals = categoricals.reset_index(drop=True).fillna(0)

In [59]:
X_binenc = pd.concat([X,categoricals],axis=1)

In [60]:
X_binenc.head()

Unnamed: 0,production_budget,worldwide_gross,title_year,aspect_ratio,duration.1,cast_total_facebook_likes,budget,imdb_score,gross,actor_1_name,director_name
0,425000000.0,2783919000.0,2009.0,1.78,178.0,4834.0,237000000.0,7.9,760505800.0,CCH Pounder,James Cameron
1,306000000.0,2058662000.0,591165600.0,591165600.0,591165600.0,143.0,591165600.0,7.1,591165600.0,Doug Walker,Doug Walker
2,300000000.0,963420400.0,2007.0,2.35,169.0,48350.0,300000000.0,7.1,309404200.0,Johnny Depp,Gore Verbinski
3,300000000.0,879620900.0,2015.0,2.35,148.0,11700.0,245000000.0,6.8,200074200.0,Christoph Waltz,Sam Mendes
4,275000000.0,1084439000.0,2012.0,2.35,164.0,106759.0,250000000.0,8.5,448130600.0,Tom Hardy,Christopher Nolan


In [61]:
import category_encoders as ce
encoder = ce.BinaryEncoder(cols=['actor_1_name','director_name'])


In [62]:
encoder.fit_transform(X_binenc).shape

(4104, 33)

In [63]:
X_binenc = encoder.fit_transform(X_binenc)

In [64]:
Xb_train, Xb_test, y_train,y_test = train_test_split(X_binenc,y)

In [65]:
X_train,  X_teset = (Xb_train[X.columns], Xb_test[X.columns])

In [66]:
model_binenc = Lasso()
model = Lasso()

In [67]:
model_binenc.fit(Xb_train,y_train)
model.fit(X_train,y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [68]:
print(model_binenc.score(Xb_test,y_test))
print(model.score(X_test,y_test))

0.9999999999999553
-0.8401418070142138


# Seleccion de features y la maldicion de la dimensionalidad

In [69]:
pd.read_csv('/home/algoritmia/Dropbox/ml_intro/vol/intermediate_results/intermediate_results/X_opening.csv').head()

Unnamed: 0,production_budget,worldwide_gross,title_year,aspect_ratio,duration.1,cast_total_facebook_likes,budget,imdb_score,opening_gross,screens
0,425000000.0,2783919000.0,2009.0,1.78,178.0,4834.0,237000000.0,7.9,77025481.0,3452.0
1,300000000.0,963420400.0,2007.0,2.35,169.0,48350.0,300000000.0,7.1,139802190.0,4362.0
2,300000000.0,879620900.0,2015.0,2.35,148.0,11700.0,245000000.0,6.8,70403148.0,3929.0
3,275000000.0,1084439000.0,2012.0,2.35,164.0,106759.0,250000000.0,8.5,160887295.0,4404.0
4,275000000.0,260002100.0,2013.0,2.35,150.0,45757.0,215000000.0,6.5,29210849.0,3904.0
