In [0]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline

In [0]:
# Load the feature table, then separate the target column from the predictors.
x = pd.read_csv('x.csv')
y = x.loc[:, 'worldwide_gross']          # target: worldwide gross
x = x.drop(columns=['worldwide_gross'])  # predictors: every remaining column

In [0]:
# Hold out a test split. A fixed random_state makes the shuffle deterministic,
# so the scores computed further down are reproducible on Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

# **Escalamiento de datos**

---



In [0]:
# StandardScaler standardizes each column as z = (x - u) / s, where
#   x is the current value, u is the column mean and s is the column
#   standard deviation.
scaler = StandardScaler()

In [7]:
# Learn the per-column statistics from the training split only.
scaler.fit(x_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [8]:
# Per-column means learned by the scaler
scaler.mean_

array([3.34502979e+07, 2.00196531e+03, 2.11915565e+00, 1.08655634e+02,
       1.04581361e+04, 4.06340720e+07, 6.46062378e+00])

In [9]:
# Per-column standard deviations learned by the scaler
scaler.scale_

array([4.08276544e+07, 1.21316097e+01, 6.74860523e-01, 2.30916137e+01,
       1.95464649e+04, 2.26977035e+08, 1.07740809e+00])

In [10]:
# x_train standardized with the mean and standard deviation learned from the training split
scaler.transform(x_train)

array([[ 0.65028722,  0.25014739,  0.34206231, ...,  0.50596688,
         0.08532109, -0.98442159],
       [-0.06001564, -0.57414553,  0.34206231, ..., -0.42131077,
        -0.15038558,  0.77906991],
       [-0.35393407, -1.39843845, -1.39162926, ..., -0.3697925 ,
        -0.09531392,  1.89285401],
       ...,
       [-0.6600991 ,  0.99201102,  0.34206231, ..., -0.41619475,
        -0.15038558, -0.98442159],
       [ 0.22165618,  0.74472315,  0.34206231, ...,  0.88895174,
        -0.00279355,  0.12936251],
       [-0.39067387,  0.49743527, -0.39883152, ..., -0.49329309,
        -0.077691  , -0.24189885]])

In [0]:
# Standardize both splits with the statistics learned from x_train
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [0]:
# Fit two Lasso models for comparison: one on the raw features and one on the
# standardized features. alpha=1.0 is Lasso's default, spelled out for clarity.
model = Lasso(alpha=1.0).fit(x_train, y_train)
model_scaled = Lasso(alpha=1.0).fit(x_train_scaled, y_train)

In [13]:
# Compare the test scores: raw-feature model first, standardized-feature model second
print(
    model.score(x_test, y_test),
    model_scaled.score(x_test_scaled, y_test),
    sep='\n',
)

0.6126007313059247
0.6126007303506361


# **Pipelines**

---



In [0]:
# A pipeline chains several estimators; here a StandardScaler feeds a Lasso.
# NOTE: this rebinds model_scaled, which until now held the Lasso fitted on x_train_scaled.
model_scaled = make_pipeline(StandardScaler(), Lasso())

In [15]:
# Fitting the pipeline fits every chained estimator in a single call
model_scaled.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('lasso',
                 Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                       max_iter=1000, normalize=False, positive=False,
                       precompute=False, random_state=None, selection='cyclic',
                       tol=0.0001, warm_start=False))],
         verbose=False)

In [16]:
# Score the pipeline on the raw test split; the scaling step runs inside the pipeline
model_scaled.score(x_test, y_test)

0.6126007303506361

# **Polynomial features**
---

In [0]:
from sklearn.preprocessing import PolynomialFeatures

In [28]:
# Degree-2 polynomial expansion of a tiny 3x2 example:
# each row [a, b] becomes [1, a, b, a^2, a*b, b^2].
transformer = PolynomialFeatures(degree=2)
transformer.fit_transform(np.array([[0, 1], [2, 3], [4, 5]]))

array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

In [29]:
# Careful: this transformer increases the number of columns
transformer = PolynomialFeatures(degree=2).fit(x)
transformer.transform(x).shape

(4104, 36)

In [32]:
# PolynomialFeatures defaults to degree 2, so no argument is needed here.
# NOTE(review): the recorded output starts with a truncated warning, presumably
# a Lasso convergence warning on the unscaled polynomial features (confirm).
model_poly = make_pipeline(PolynomialFeatures(), Lasso()).fit(x_train, y_train)
model_poly.score(x_test, y_test)

  positive)


0.7126575918078968

# **Crear features categóricas**

---



**One-hot encoding**

In [36]:
# Toy frame with two categorical columns, used to demonstrate one-hot encoding
a_df = pd.DataFrame({
    'country': ['mx', 'co', 'co', 'br', 'mx'],
    'genre': ['male', 'female', 'male', 'female', 'male'],
})
a_df

Unnamed: 0,country,genre
0,mx,male
1,co,female
2,co,male
3,br,female
4,mx,male


In [37]:
# One-hot encode: one indicator column per distinct value of each categorical column
pd.get_dummies(a_df)

Unnamed: 0,country_br,country_co,country_mx,genre_female,genre_male
0,0,0,1,0,1
1,0,1,0,1,0
2,0,1,0,0,1
3,1,0,0,1,0
4,0,0,1,0,1


**Encoding binario**

In [41]:
# Libreria con encoding binario (aun no disponible nativamente, es state of the art)
!pip install category_encoders 

Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/a0/52/c54191ad3782de633ea3d6ee3bb2837bda0cf3bc97644bb6375cf14150a0/category_encoders-2.1.0-py2.py3-none-any.whl (100kB)
[K     |███▎                            | 10kB 20.7MB/s eta 0:00:01[K     |██████▌                         | 20kB 1.7MB/s eta 0:00:01[K     |█████████▉                      | 30kB 2.3MB/s eta 0:00:01[K     |█████████████                   | 40kB 1.7MB/s eta 0:00:01[K     |████████████████▍               | 51kB 1.9MB/s eta 0:00:01[K     |███████████████████▋            | 61kB 2.2MB/s eta 0:00:01[K     |██████████████████████▉         | 71kB 2.4MB/s eta 0:00:01[K     |██████████████████████████▏     | 81kB 2.6MB/s eta 0:00:01[K     |█████████████████████████████▍  | 92kB 2.9MB/s eta 0:00:01[K     |████████████████████████████████| 102kB 2.4MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.1.0


In [42]:
# Columns to encode: actor_1_name and director_name.
# The CSV's 'Unnamed: 0' column becomes the index.
categoricals = pd.read_csv('categoricals.csv')
categoricals = categoricals.set_index('Unnamed: 0')
categoricals.head()

Unnamed: 0_level_0,actor_1_name,director_name
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,CCH Pounder,James Cameron
1,Doug Walker,Doug Walker
2,Johnny Depp,Gore Verbinski
3,Christoph Waltz,Sam Mendes
4,Tom Hardy,Christopher Nolan


In [43]:
# Replace missing names with 0 and renumber the rows from 0
# (presumably so they line up with x by position in the later concat; confirm).
categoricals = categoricals.fillna(0).reset_index(drop=True)
categoricals.head()

Unnamed: 0,actor_1_name,director_name
0,CCH Pounder,James Cameron
1,Doug Walker,Doug Walker
2,Johnny Depp,Gore Verbinski
3,Christoph Waltz,Sam Mendes
4,Tom Hardy,Christopher Nolan


In [44]:
# Append the two categorical columns to the numeric features, side by side
x_binenc = pd.concat([x, categoricals], axis='columns')
x_binenc.head()

Unnamed: 0,production_budget,title_year,aspect_ratio,duration.1,cast_total_facebook_likes,budget,imdb_score,actor_1_name,director_name
0,425000000.0,2009.0,1.78,178.0,4834.0,237000000.0,7.9,CCH Pounder,James Cameron
1,306000000.0,2002.130733,2.126976,108.577186,143.0,40455390.0,7.1,Doug Walker,Doug Walker
2,300000000.0,2007.0,2.35,169.0,48350.0,300000000.0,7.1,Johnny Depp,Gore Verbinski
3,300000000.0,2015.0,2.35,148.0,11700.0,245000000.0,6.8,Christoph Waltz,Sam Mendes
4,275000000.0,2012.0,2.35,164.0,106759.0,250000000.0,8.5,Tom Hardy,Christopher Nolan


In [0]:
import category_encoders as ce

In [0]:
# Encoder estimator; only the two categorical columns are binary-encoded
encoder = ce.BinaryEncoder(cols=['actor_1_name','director_name'])

In [49]:
# Encoding increases the number of columns
encoder.fit_transform(x_binenc).shape

(4104, 31)

In [0]:
# Encoded dataset.
# NOTE(review): this overwrites x_binenc with its encoded version, so the cell is
# not idempotent. Re-running it feeds the already-encoded frame back into the
# encoder. It also fits the encoder on the full dataset, before the train/test split.
x_binenc = encoder.fit_transform(x_binenc)

Score con el nuevo encoding

In [0]:
# Split the encoded dataset (this rebinds y_train and y_test). A fixed
# random_state makes the split deterministic, so the score comparison below is
# reproducible on Restart & Run All.
xb_train, xb_test, y_train, y_test = train_test_split(x_binenc, y, random_state=42)

In [0]:
# Same rows as the encoded split, restricted to the original numeric columns
x_train = xb_train[x.columns]
x_test = xb_test[x.columns]

In [0]:
# Fit one Lasso with the binary-encoded columns and one without, on the same rows.
# alpha=1.0 is Lasso's default, spelled out for clarity.
model_binenc = Lasso(alpha=1.0).fit(xb_train, y_train)
model = Lasso(alpha=1.0).fit(x_train, y_train)

In [57]:
# Test scores: with binary encoding first, without it second.
# Binary encoding did not help here: the two scores are very close.
print(
    model_binenc.score(xb_test, y_test),
    model.score(x_test, y_test),
    sep='\n',
)

0.584304143727147
0.5820756396078482


# **Conocimiento experto**

---
Una gran parte del diseño de las features pasa por un **conocimiento específico del dominio en el que se está trabajando**.
Por ejemplo, para analizar una imagen nuestro cerebro no se concentra en los millones de píxeles de la imagen, sino sólo en algunos relevantes, como los de los contornos. Durante un buen tiempo **los sistemas de visión por computador encodeaban features que traducían este conocimiento experto (contornos).**
Una de las únicas formas de obtener este conocimiento de forma sistemática es ir a bucear en repositorios de papers de Machine Learning como arXiv, y estudiar la investigación que se ha hecho sobre el dominio específico.

Pirámide de Maslow del ML:

^

| Algoritmo

| Features

| Buena data (lo mas importante)

--> Probaremos con agregar el ingreso al estreno y en cuantas pantallas se estrenó