In [2]:
# Core data-handling, plotting and model-selection imports for the notebook.
import pandas as pd
import numpy as np
# `plt` is the conventional alias for the pyplot interface. Aliasing the
# top-level `matplotlib` package instead would leave plotting functions such
# as `plt.plot` undefined.
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

#### Escalamiento de los datos

In [3]:
# Load the training dataset and preview its first rows.
x = pd.read_csv('dataEntreno.csv')
x.head()

Unnamed: 0,production_budget,worldwide_gross,title_year,aspect_ratio,duration,cast_total_facebook_likes,budget,imdb_score,gross
0,425000000.0,2783919000.0,2009.0,1.78,178.0,4834.0,237000000.0,7.9,760505800.0
1,306000000.0,2058662000.0,2002.102628,2.126683,108.552703,143.0,37123390.0,7.1,48190010.0
2,300000000.0,963420400.0,2007.0,2.35,169.0,48350.0,300000000.0,7.1,309404200.0
3,300000000.0,879620900.0,2015.0,2.35,148.0,11700.0,245000000.0,6.8,200074200.0
4,275000000.0,1084439000.0,2012.0,2.35,164.0,106759.0,250000000.0,8.5,448130600.0


In [4]:
# Pull the target column out of the feature table: `pop` returns the column
# and removes it from `x` in place, so `x` keeps only the predictors.
y = x.pop('worldwide_gross')
x.tail()

Unnamed: 0,production_budget,title_year,aspect_ratio,duration,cast_total_facebook_likes,budget,imdb_score,gross
4056,7000.0,2004.0,1.85,77.0,368.0,7000.0,7.0,424760.0
4057,7000.0,2005.0,2.126683,80.0,0.0,7000.0,6.3,70071.0
4058,7000.0,2005.0,2.126683,84.0,93.0,3250.0,7.8,48190010.0
4059,3967.0,2012.0,2.35,100.0,2386.0,37123390.0,6.3,10443.0
4060,1100.0,2004.0,1.85,90.0,163.0,1100.0,6.6,85222.0


In [5]:
# Split the feature table and the target into training and test sets.
# test_size=0.25 spells out the library default (75% train / 25% test), and the
# fixed random_state makes the shuffle, and every score computed from these
# splits, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

In [6]:
# Fraction of rows assigned to the training split: about 75% train, 25% test.
len(x_train)/len(x)

0.7498153164245259

Existen distintas estrategias de escalamiento de tus features, pero la más común es la estandarización, donde a cada variable se le resta su media y se divide por su desviación estándar, de modo que quede con media 0 y desviación estándar 1. Esto no convierte la distribución en Gaussiana: solo la centra y la reescala.

In [7]:
from sklearn.preprocessing import StandardScaler
# Instantiate the estimator
scaler = StandardScaler()
# Fit the estimator on the training split only, so the statistics used for
# scaling are learned without looking at the test rows
scaler.fit(x_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [8]:
# Per-feature means learned by fit() on x_train. These are fitted parameters
# of the scaler, not hyperparameters.
scaler.mean_

array([3.34547374e+07, 2.00209762e+03, 2.13191743e+00, 1.08914606e+02,
       1.01700056e+04, 3.68931467e+07, 6.47665025e+00, 4.87418045e+07])

In [9]:
# Per-feature scaling factors learned by fit() on x_train.
scaler.scale_

array([4.15348276e+07, 1.21539416e+01, 7.66392784e-01, 2.35801926e+01,
       1.58619667e+04, 6.13814409e+07, 1.06834891e+00, 6.51414719e+07])

In [10]:
# Raw (unscaled) feature values of the full table as a NumPy array.
x.values

array([[4.25000000e+08, 2.00900000e+03, 1.78000000e+00, ...,
        2.37000000e+08, 7.90000000e+00, 7.60505847e+08],
       [3.06000000e+08, 2.00210263e+03, 2.12668290e+00, ...,
        3.71233913e+07, 7.10000000e+00, 4.81900113e+07],
       [3.00000000e+08, 2.00700000e+03, 2.35000000e+00, ...,
        3.00000000e+08, 7.10000000e+00, 3.09404152e+08],
       ...,
       [7.00000000e+03, 2.00500000e+03, 2.12668290e+00, ...,
        3.25000000e+03, 7.80000000e+00, 4.81900113e+07],
       [3.96700000e+03, 2.01200000e+03, 2.35000000e+00, ...,
        3.71233913e+07, 6.30000000e+00, 1.04430000e+04],
       [1.10000000e+03, 2.00400000e+03, 1.85000000e+00, ...,
        1.10000000e+03, 6.60000000e+00, 8.52220000e+04]])

In [11]:
# Apply the fitted scaler to the training features; this only previews the
# result, nothing is stored.
scaler.transform(x_train)

array([[-0.14336733, -1.57131096,  0.28455718, ..., -0.1530291 ,
         0.1154583 ,  0.29410136],
       [-0.61285285, -0.2548656 , -0.3678498 , ..., -0.4707147 ,
        -0.72696311, -0.47432802],
       [-0.70915757, -2.06497797, -0.9941605 , ..., -0.53588098,
         1.51949399, -0.14647818],
       ...,
       [-0.34801487,  0.73246842, -0.3678498 , ..., -0.27521587,
        -0.72696311,  0.79137069],
       [ 0.87987033,  0.81474625,  0.28455718, ...,  0.62082044,
        -0.53975835, -0.15708062],
       [ 0.27796582, -0.41942127,  0.28455718, ...,  0.13207336,
        -1.47578214, -0.71517058]])

In [12]:
# Standardize both splits with the scaler that was fitted on x_train.
# NOTE(review): the two names are inconsistent (`_scale` vs `_scaled`); they are
# kept as-is because other cells refer to them.
x_train_scale = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [13]:
from sklearn.linear_model import Lasso

In [14]:
# Baseline: Lasso fitted on the raw (unscaled) training features.
model = Lasso()
model.fit(x_train, y_train)

# Same estimator type fitted on the standardized training features.
model_scaled = Lasso()
model_scaled.fit(x_train_scale, y_train)

  positive)


Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [15]:
# Test-set score of the model fitted on raw features, then of the one fitted on
# standardized features; in this run the two values are almost identical.
# NOTE(review): the original remark claimed regression models are unaffected by
# feature scaling while classifiers are. That does not hold in general: Lasso's
# penalty acts on coefficient magnitudes, which depend on feature scale, so
# treat the match here as specific to this data and alpha. TODO confirm.
print(model.score(x_test,y_test))
print(model_scaled.score(x_test_scaled,y_test))

0.8500724882603596
0.8500724903025733


#### Simplificar las transformaciones con pipelines


Para hacer el código más reproducible, y para evitar tener que aplicar múltiples veces una misma transformación, es recomendable utilizar sklearn.pipeline.make_pipeline, que permite encadenar transformaciones con los modelos.


In [16]:
from sklearn.pipeline import make_pipeline

In [17]:
# Chain the scaler and the Lasso model in one pipeline, so fitting on the raw
# training features scales them first. This rebinds `model_scaled`.
model_scaled = make_pipeline(StandardScaler(), Lasso())
model_scaled.fit(x_train,y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('lasso',
                 Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                       max_iter=1000, normalize=False, positive=False,
                       precompute=False, random_state=None, selection='cyclic',
                       tol=0.0001, warm_start=False))],
         verbose=False)

In [18]:
# Score the pipeline on the raw test features; its scaler step is applied
# before the Lasso step.
print(model_scaled.score(x_test,y_test))

0.8500724903025733


#### Crear nuevas features de forma automática

In [19]:
# Small 3x2 toy matrix holding the integers 0..5.
a = np.reshape(np.arange(6), (3, 2))
a

array([[0, 1],
       [2, 3],
       [4, 5]])

In [20]:
from sklearn.preprocessing import PolynomialFeatures

In [21]:
# Degree-2 polynomial feature generator (degree passed by keyword for clarity).
transformer = PolynomialFeatures(degree=2)
transformer

PolynomialFeatures(degree=2, include_bias=True, interaction_only=False,
                   order='C')

In [22]:
# Degree-2 expansion of each row [a0, a1] into [1, a0, a1, a0^2, a0*a1, a1^2].
transformer.fit_transform(a)

array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

In [23]:
# Number of rows and columns in the feature table.
x.shape

(4061, 8)

Fit y transform se hacen al mismo tiempo con fit_transform. fit_transform puede generar más features, lo que puede ser riesgoso.

In [24]:
# Degree-2 expansion of the full feature table; only the resulting shape is
# shown, the expanded matrix itself is not stored.
transformer = PolynomialFeatures(degree=2)
transformer.fit_transform(x).shape

(4061, 45)

In [25]:
# Pipeline: degree-2 polynomial expansion followed by Lasso.
model_poly = make_pipeline(PolynomialFeatures(degree=2), Lasso())
# `fit` returns the fitted pipeline, so the test score can be chained onto it.
model_poly.fit(x_train, y_train).score(x_test, y_test)

  positive)


0.8096398406060525

In [26]:
# Plain Lasso on the original features, as the reference for the polynomial
# pipeline. `fit` returns the fitted estimator, so it can be bound directly.
model = Lasso().fit(x_train, y_train)
model.score(x_test, y_test)

  positive)


0.8500724882603596

# Creando features categóricas

Encoding one-hot

In [27]:
# Toy categorical table. The two source rows have different lengths on purpose,
# so the shorter one ('pais') ends up with a missing value in the last row.
# Labelling the rows before transposing makes them the column names directly.
data = pd.DataFrame(
    [
        ['Chile', 'Colombia', 'Colombia'],
        ['Hombre', 'Mujer', 'Hombre', 'Mujer'],
    ],
    index=pd.Index(['pais', 'genero']),
).T
data

Unnamed: 0,pais,genero
0,Chile,Hombre
1,Colombia,Mujer
2,Colombia,Hombre
3,,Mujer


In [28]:
# One-hot encode every column: each distinct value becomes its own 0/1 column.
# The row whose `pais` is missing gets 0 in all `pais_*` columns.
pd.get_dummies(data)

Unnamed: 0,pais_Chile,pais_Colombia,genero_Hombre,genero_Mujer
0,1,0,1,0
1,0,1,0,1
2,0,1,1,0
3,0,0,0,1


In [29]:
# Load the table of categorical (object-typed) movie attributes.
movies_obj = pd.read_csv('./movies_obj.csv')
# Preview only the first rows instead of rendering the whole table.
movies_obj.head(10)

Unnamed: 0,movie_title,color,language,country,genres,content_rating,plot_keywords,director_name,actor_1_name,actor_2_name,actor_3_name
0,Avatar,Color,English,USA,Action|Adventure|Fantasy|Sci-Fi,PG-13,avatar|future|marine|native|paraplegic,James Cameron,CCH Pounder,Joel David Moore,Wes Studi
1,Pirates of the Caribbean: At World's End,Color,English,USA,Action|Adventure|Fantasy,PG-13,goddess|marriage ceremony|marriage proposal|pi...,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport
2,Spectre,Color,English,UK,Action|Adventure|Thriller,PG-13,bomb|espionage|sequel|spy|terrorist,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman
3,The Dark Knight Rises,Color,English,USA,Action|Thriller,PG-13,deception|imprisonment|lawlessness|police offi...,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt
4,Star Wars: Episode VII - The Force Awakens ...,,,,Documentary,,,Doug Walker,Doug Walker,Rob Walker,
5,John Carter,Color,English,USA,Action|Adventure|Sci-Fi,PG-13,alien|american civil war|male nipple|mars|prin...,Andrew Stanton,Daryl Sabara,Samantha Morton,Polly Walker
6,Spider-Man 3,Color,English,USA,Action|Adventure|Romance,PG-13,sandman|spider man|symbiote|venom|villain,Sam Raimi,J.K. Simmons,James Franco,Kirsten Dunst
7,Tangled,Color,English,USA,Adventure|Animation|Comedy|Family|Fantasy|Musi...,PG,17th century|based on fairy tale|disney|flower...,Nathan Greno,Brad Garrett,Donna Murphy,M.C. Gainey
8,Avengers: Age of Ultron,Color,English,USA,Action|Adventure|Sci-Fi,PG-13,artificial intelligence|based on comic book|ca...,Joss Whedon,Chris Hemsworth,Robert Downey Jr.,Scarlett Johansson
9,Harry Potter and the Half-Blood Prince,Color,English,UK,Adventure|Family|Fantasy|Mystery,PG,blood|book|love|potion|professor,David Yates,Alan Rickman,Daniel Radcliffe,Rupert Grint


#### Es útil si el top 20 se encuentra en gran parte de los datos.

In [30]:
# Number of distinct values per column, from lowest to highest cardinality.
movies_obj.nunique().sort_values()

color                2
content_rating      18
language            47
country             65
genres             914
actor_1_name      2097
director_name     2398
actor_2_name      3032
actor_3_name      3521
plot_keywords     4760
movie_title       4917
dtype: int64

# Encoding Binario

In [31]:
# One-time setup if the package is missing: %pip install category_encoders

In [32]:
# Load the categorical columns, using the file's 'Unnamed: 0' column as the index.
categorias = pd.read_csv('./categoricals.csv').set_index('Unnamed: 0')
categorias.head()

Unnamed: 0_level_0,actor_1_name,director_name
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,CCH Pounder,James Cameron
1,Doug Walker,Doug Walker
2,Johnny Depp,Gore Verbinski
3,Christoph Waltz,Sam Mendes
4,Tom Hardy,Christopher Nolan


In [33]:
# Discard the stored index in favor of a fresh 0..n-1 index and replace missing
# names with 0.
categorias = categorias.reset_index(drop=True).fillna(0)
# Preview only the first rows instead of rendering the whole table.
categorias.head(10)

Unnamed: 0,actor_1_name,director_name
0,CCH Pounder,James Cameron
1,Doug Walker,Doug Walker
2,Johnny Depp,Gore Verbinski
3,Christoph Waltz,Sam Mendes
4,Tom Hardy,Christopher Nolan
5,Johnny Depp,Gore Verbinski
6,Daryl Sabara,Andrew Stanton
7,Brad Garrett,Nathan Greno
8,J.K. Simmons,Sam Raimi
9,J.K. Simmons,Sam Raimi


In [34]:
# Put the numeric features and the categorical name columns side by side.
# concat(axis=1) aligns on index labels; `x` has 4061 rows and `categorias`
# 4104, so the unmatched trailing rows get NaN in the numeric columns.
# NOTE(review): this assumes row i of both frames describes the same movie,
# even though `categorias` dropped its stored index — TODO confirm.
X_binenc = pd.concat([x, categorias], axis=1)

In [35]:
# Shape after the concat; the row count follows the longer of the two frames.
X_binenc.shape

(4104, 10)

In [36]:
import category_encoders as ce
# Configure a binary encoder for the two name columns only.
encoder = ce.BinaryEncoder(cols=['actor_1_name','director_name'])
encoder

BinaryEncoder(cols=['actor_1_name', 'director_name'], drop_invariant=False,
              handle_missing='value', handle_unknown='value', mapping=None,
              return_df=True, verbose=0)

In [37]:
# Shape check only (the result is not stored): encoding turns the 10 input
# columns into 32.
encoder.fit_transform(X_binenc).shape

(4104, 32)

In [38]:
# Replace X_binenc with its binary-encoded version and preview it.
# NOTE(review): this rebinds the input name; re-running the cell presumably
# fails because the two name columns no longer exist afterwards — confirm.
X_binenc = encoder.fit_transform(X_binenc)
X_binenc.head()

Unnamed: 0,production_budget,title_year,aspect_ratio,duration,cast_total_facebook_likes,budget,imdb_score,gross,actor_1_name_0,actor_1_name_1,...,director_name_2,director_name_3,director_name_4,director_name_5,director_name_6,director_name_7,director_name_8,director_name_9,director_name_10,director_name_11
0,425000000.0,2009.0,1.78,178.0,4834.0,237000000.0,7.9,760505800.0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,306000000.0,2002.102628,2.126683,108.552703,143.0,37123390.0,7.1,48190010.0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,300000000.0,2007.0,2.35,169.0,48350.0,300000000.0,7.1,309404200.0,0,0,...,0,0,0,0,0,0,0,0,1,1
3,300000000.0,2015.0,2.35,148.0,11700.0,245000000.0,6.8,200074200.0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,275000000.0,2012.0,2.35,164.0,106759.0,250000000.0,8.5,448130600.0,0,0,...,0,0,0,0,0,0,0,1,0,1


In [39]:
# Number of target values.
y.shape

(4061,)

In [40]:
# Feature table size at this point; it has more rows than `y` (4104 vs 4061).
X_binenc.shape

(4104, 32)

In [41]:
# Keep only the rows that have a production budget, which brings the feature
# table back to the same number of rows as the target.
X_binenc = X_binenc.dropna(subset=['production_budget'])
X_binenc.shape

(4061, 32)

In [42]:
# Split the binary-encoded table and the target into train/test sets.
# The fixed random_state makes the split, and the scores reported from it,
# reproducible across kernel restarts. Note that y_train / y_test from the
# earlier split are overwritten here.
Xb_train, Xb_test, y_train, y_test = train_test_split(X_binenc, y, random_state=42)

In [43]:
# Numeric-only views of the same split: keep just the columns present in `x`.
# NOTE(review): `X_train` / `X_test` differ from `x_train` / `x_test` only by case.
X_train = Xb_train[x.columns]
X_test = Xb_test[x.columns]

In [44]:
# Two independent Lasso estimators: one for the table with the encoded name
# columns, one for the numeric-only baseline. `model` is rebound here.
model_binenc = Lasso()
model = Lasso()

In [45]:
# Fit one Lasso on the binary-encoded table and one on the numeric columns only.
model_binenc.fit(Xb_train, y_train)
model.fit(X_train, y_train)

  positive)


Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [46]:
# Test-set scores: encoded-features model first, numeric-only baseline second.
print(model_binenc.score(Xb_test, y_test))
print(model.score(X_test, y_test))

0.782778571196947
0.7803605049879538


# Conocimiento experto