In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [5]:
X = pd.read_csv('Data/intermediate_resutls.csv')
X = X.drop(X.columns[0], axis = 1)
X = X.drop('gross', axis = 1)
y = X['worldwide_gross']
X = X.drop('worldwide_gross', axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [6]:
scaler = StandardScaler()
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [8]:
print(scaler.mean_)
print(scaler.scale_)

[3.16129216e+07 2.65458587e+05 4.17577507e+05 2.54803599e+05
 1.01725444e+04 3.62793681e+07 6.40687348e+00]
[4.05394857e+07 1.04235846e+07 1.11570695e+07 1.09829742e+07
 1.98365640e+04 2.21352519e+08 1.10828892e+00]


In [10]:
X_train_scaled, X_test_scaled = (scaler.transform(X_train), scaler.transform(X_test)) # acabamos de entrenar los datos

In [12]:
from sklearn.linear_model import Lasso

model = Lasso()
model_scaled = Lasso()

model.fit(X_train, y_train)
model_scaled.fit(X_train_scaled, y_train)



Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [14]:
print(model.score(X_test, y_test))
print(model_scaled.score(X_test_scaled, y_test)) # las regresiones son inmunes a los reescalamientosm, pero es ilustrativo

0.5706629317553091
0.5706629296343213


## Simplificar las transformaciones con Pipeline

In [16]:
from sklearn.pipeline import make_pipeline

model_scaled = make_pipeline(StandardScaler(), 
                             Lasso())
model_scaled.fit(X_train, y_train)

model.score(X_test, y_test)

0.5706629317553091

## Crear automáticamente nuevas features

In [18]:
A = np.arange(6).reshape(3, 2)
A

array([[0, 1],
       [2, 3],
       [4, 5]])

In [21]:
from sklearn.preprocessing import PolynomialFeatures

transformer = PolynomialFeatures()
transformer.fit(A)
transformer.transform(A)

array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

In [23]:
X.shape

(4385, 7)

In [25]:
transformer = PolynomialFeatures(2)
transformer.fit_transform(X).shape

(4385, 36)

In [35]:
polynomial_transformer = PolynomialFeatures()
polynomial_transformer.fit(X_train)
X_train_polynomial, X_test_polynomial = (polynomial_transformer.transform(X_train), 
                                         polynomial_transformer.transform(X_test))

model_poly = Lasso()
model_poly.fit(X_train_polynomial, y_train)
print(model_poly.score(X_test_polynomial, y_test))

0.620900283328502




# Creación de Features Categóricas

## One hot encodding

In [39]:
df = pd.DataFrame([['Chile', 'Colombia', 'España', 'Venezuela', 'Ecuador'], ['A', 'B', 'A', 'B', 'C']])
df = df.T
df.columns = pd.Index(['Paises', 'Letra'])
df

Unnamed: 0,Paises,Letra
0,Chile,A
1,Colombia,B
2,España,A
3,Venezuela,B
4,Ecuador,C


In [41]:
pd.get_dummies(df)

Unnamed: 0,Paises_Chile,Paises_Colombia,Paises_Ecuador,Paises_España,Paises_Venezuela,Letra_A,Letra_B,Letra_C
0,1,0,0,0,0,1,0,0
1,0,1,0,0,0,0,1,0
2,0,0,0,1,0,1,0,0
3,0,0,0,0,1,0,1,0
4,0,0,1,0,0,0,0,1


In [43]:
movies = pd.read_csv('Data/peliculas.csv')

In [44]:
movies.apply(pd.Series.nunique).sort_values() # por defecto iteramos por columnas, contando el número de valores únicos

color                           2
content_rating                 18
aspect_ratio                   22
language                       47
country                        65
imdb_score                     78
title_year                     91
duration                      191
duration.1                    191
budget                        439
genres                        914
actor_1_name                 2097
director_name                2398
actor_2_name                 3032
actor_3_name                 3521
cast_total_facebook_likes    3978
gross                        4035
plot_keywords                4760
movie_title                  4917
dtype: int64

In [59]:
categoricals = movies[['actor_1_name','director_name']]
print(categoricals.head(2))
categoricals.to_csv('Data/categoricals.csv')

  actor_1_name   director_name
0  CCH Pounder   James Cameron
1  Johnny Depp  Gore Verbinski


In [60]:
categoricals.index

RangeIndex(start=0, stop=5043, step=1)

In [61]:
categoricals.reset_index(drop=True).fillna(0);

In [54]:
!pip install category_encoders

Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/a0/52/c54191ad3782de633ea3d6ee3bb2837bda0cf3bc97644bb6375cf14150a0/category_encoders-2.1.0-py2.py3-none-any.whl (100kB)
[K    100% |████████████████████████████████| 102kB 2.2MB/s a 0:00:011
Collecting scikit-learn>=0.20.0 (from category_encoders)
  Using cached https://files.pythonhosted.org/packages/cf/b8/706e496d8b1207c1da154a7fe82753a2385edc1435ec524afa6c1baafed6/scikit_learn-0.21.3-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl
Collecting joblib>=0.11 (from scikit-learn>=0.20.0->category_encoders)
[?25l  Downloading https://files.pythonhosted.org/packages/8f/42/155696f85f344c066e17af287359c9786b436b1bf86029bb3411283274f3/joblib-0.14.0-py2.py3-none-any.whl (294kB)
[K    100% |████████████████████████████████| 296kB 6.5MB/s ta 0:00:011
Installing collected packages: joblib, scikit-learn, category-encoders
  Found existing installat

In [55]:
!pip install --upgrade pip

Collecting pip
[?25l  Downloading https://files.pythonhosted.org/packages/4a/08/6ca123073af4ebc4c5488a5bc8a010ac57aa39ce4d3c8a931ad504de4185/pip-19.3-py2.py3-none-any.whl (1.4MB)
[K    100% |████████████████████████████████| 1.4MB 6.3MB/s eta 0:00:01
[?25hInstalling collected packages: pip
  Found existing installation: pip 19.0.1
    Uninstalling pip-19.0.1:
      Successfully uninstalled pip-19.0.1
Successfully installed pip-19.3


In [57]:
X_binary_encoding = pd.concat([X, categoricals])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  """Entry point for launching an IPython kernel.


In [67]:
import category_encoders as ce

encoder = ce.BinaryEncoder(cols=['actor_1_name', 'director_name'])
X_binary_encoding = encoder.fit_transform(X_binary_encoding)

In [68]:
X_binary_encoding.shape

(9428, 33)

In [69]:
Xb_train, Xb_test, y_train, y_test = train_test_split(X_binary_encoding, y)

ValueError: Found input variables with inconsistent numbers of samples: [9428, 4385]

In [None]:
X_train, X_test = (Xb_train[X.columns], Xb_test[X.columns])
# y ahora podemos ajustar el modelo


## me queda pendiente arrelgar este entuerto