## Data Preparation

In [1]:
import pandas as pd

In [2]:
movies_data_path = '../dataset/movies.csv'
finantial_data_path = '../dataset/finantials.csv'
opening_data_path = '../dataset/opening_gross.csv'

In [3]:
fin_data = pd.read_csv(finantial_data_path)
movie_data = pd.read_csv(movies_data_path)
opening_data = pd.read_csv(opening_data_path)

In [9]:
fin_data.head()

Unnamed: 0,movie_title,production_budget,worldwide_gross
0,Avatar,425000000,2783918982
1,Star Wars: Episode VII - The Force Awakens ...,306000000,2058662225
2,Pirates of the Caribbean: At World's End,300000000,963420425
3,Spectre,300000000,879620923
4,The Dark Knight Rises,275000000,1084439099


In [10]:
movie_data.head()

Unnamed: 0,title_year,aspect_ratio,duration,budget,imdb_score,gross,movie_title
0,2009.0,1.78,178.0,237000000.0,7.9,760505847.0,Avatar
1,2007.0,2.35,169.0,300000000.0,7.1,309404152.0,Pirates of the Caribbean: At World's End
2,2015.0,2.35,148.0,245000000.0,6.8,200074175.0,Spectre
3,2012.0,2.35,164.0,250000000.0,8.5,448130642.0,The Dark Knight Rises
4,,,,,7.1,,Star Wars: Episode VII - The Force Awakens ...


In [13]:
movie_data.columns

Index(['title_year', 'aspect_ratio', 'duration', 'budget', 'imdb_score',
       'gross', 'movie_title'],
      dtype='object')

In [11]:
opening_data.head()

Unnamed: 0,movie_title,opening_gross,screens
0,10 Days in a Madhouse,2451.0,10.0
1,10 Things I Hate About You,8330681.0,2271.0
2,102 Dalmatians,19883351.0,2704.0
3,12 Rounds,5329240.0,2331.0
4,12 Years a Slave,923715.0,19.0


In [16]:
numeric_columns_mask = (movie_data.dtypes == float) | (movie_data.dtypes == int)
numeric_columns = [column for column in numeric_columns_mask.index if numeric_columns_mask[column]]
movie_data = movie_data[numeric_columns+['movie_title']]

In [17]:
movie_data.head()


Unnamed: 0,title_year,aspect_ratio,duration,budget,imdb_score,gross,movie_title
0,2009.0,1.78,178.0,237000000.0,7.9,760505847.0,Avatar
1,2007.0,2.35,169.0,300000000.0,7.1,309404152.0,Pirates of the Caribbean: At World's End
2,2015.0,2.35,148.0,245000000.0,6.8,200074175.0,Spectre
3,2012.0,2.35,164.0,250000000.0,8.5,448130642.0,The Dark Knight Rises
4,,,,,7.1,,Star Wars: Episode VII - The Force Awakens ...


In [7]:
numeric_columns

['title_year', 'aspect_ratio', 'duration', 'budget', 'imdb_score', 'gross']

In [5]:
fin_data = fin_data[['movie_title','production_budget','worldwide_gross']]

In [20]:
fin_movie_data = pd.merge(fin_data, movie_data, on= 'movie_title', how='left')
# fin_movie_data = pd.merge(fin_data, movie_data, on='movie_title', how='inner')


In [25]:
fin_movie_data.head(2)

Unnamed: 0,movie_title,production_budget,worldwide_gross,title_year,aspect_ratio,duration,budget,imdb_score,gross
0,Avatar,425000000,2783918982,2009.0,1.78,178.0,237000000.0,7.9,760505847.0
1,Star Wars: Episode VII - The Force Awakens ...,306000000,2058662225,,,,,7.1,


In [26]:
fin_movie_data.shape

(4385, 9)

In [22]:
full_movie_data = pd.merge(opening_data,fin_movie_data, on = 'movie_title', how='left')

In [24]:
full_movie_data.head(2)

Unnamed: 0,movie_title,opening_gross,screens,production_budget,worldwide_gross,title_year,aspect_ratio,duration,budget,imdb_score,gross
0,10 Days in a Madhouse,2451.0,10.0,12000000,14616,2015.0,1.85,111.0,12000000.0,7.5,14616.0
1,10 Things I Hate About You,8330681.0,2271.0,13000000,60414025,1999.0,1.85,97.0,16000000.0,7.2,38176108.0


In [27]:
full_movie_data.shape

(2304, 11)

In [8]:
full_movie_data[(full_movie_data.worldwide_gross != 0) & (full_movie_data.worldwide_gross.notnull())].shape

(2304, 12)

In [28]:
full_movie_data = full_movie_data.drop(['movie_title','gross'],axis=1) #remove columns that means axis=1

In [29]:
full_movie_data.columns #check

Index(['opening_gross', 'screens', 'production_budget', 'worldwide_gross',
       'title_year', 'aspect_ratio', 'duration', 'budget', 'imdb_score'],
      dtype='object')

## Modeling

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_validate
import numpy as np

In [32]:
X = full_movie_data.drop(['worldwide_gross'], axis = 1)
y = full_movie_data['worldwide_gross']

In [33]:
pipeline = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('core_model', GradientBoostingRegressor()) #necesitamos una regresión para predecir nuestro target $$$$, that will be how much $.$
]) #La primera etapa del pipeline es un SimpleImputer, que se utiliza para rellenar los valores faltantes (representados por np.nan), no os preocupeís, hijos míos

In [35]:
pipeline

Pipeline(steps=[('imputer', SimpleImputer()),
                ('core_model', GradientBoostingRegressor())])

In [34]:
results = cross_validate(pipeline ,X,y,return_train_score=True,cv=10)
results # por defecto se usa utiliza el score R^2 para evaluar su desempeño en la tarea de regresión. Usted puede cambiar a la métrica según desee
# sea uun f1 score, sea un accuracy, recall, etc etc. Todo dependerá del tipo de métrica que desee monitorear y obvi dependerá de 1. sus datos, 2. su aplicación y
# 3. su clasificación, no es lo mismo las métricas de una regresión a las métricas de una clasificación por ejemplo

# R2, o R-squared, es una medida de la bondad de ajuste para modelos de regresión. Es una estadística que indica qué tan bien los valores observados se 
# ajustan a los valores predichos por el modelo. 
# El valor de R2 se encuentra entre 0 y 1, donde 0 indica que el modelo no ajusta los datos en absoluto y 1 indica un ajuste perfecto :D

{'fit_time': array([0.36270928, 0.261585  , 0.25275445, 0.27090216, 0.25029945,
        0.24367714, 0.24576497, 0.25102448, 0.23590374, 0.24129224]),
 'score_time': array([0.00299835, 0.0011158 , 0.00200057, 0.00400043, 0.00299931,
        0.00199962, 0.00300217, 0.00199914, 0.0019567 , 0.00225258]),
 'test_score': array([0.65921863, 0.84938212, 0.64125353, 0.77902154, 0.77457965,
        0.85500452, 0.76327521, 0.85640853, 0.66934473, 0.65200824]),
 'train_score': array([0.91105346, 0.9140962 , 0.9183163 , 0.91613755, 0.91832576,
        0.91492969, 0.91866643, 0.91378583, 0.9201035 , 0.91384172])}

In [36]:
train_score = np.mean(results['train_score'])
test_score = np.mean(results['test_score'])
assert train_score > 0.7
assert test_score > 0.65
print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')

Train Score: 0.9159256441060745
Test Score: 0.7499496681557053


## Hyperparameter tunning

In [37]:
from sklearn.model_selection import GridSearchCV

In [54]:
#param_tunning = {'core_model__n_estimators': range(20,501,20)} 

param_tunning = {
    'core_model__n_estimators': range(20, 501, 20),
    'core_model__learning_rate': [0.001, 0.01, 0.1],
    'core_model__max_depth': [3, 5, 7],
    'core_model__min_samples_split': [2, 5, 10],
    'core_model__min_samples_leaf': [1, 2, 4],
    'core_model__max_features': ['auto', 'sqrt', 'log2'],
    'imputer__strategy': ['mean', 'median', 'most_frequent']
}

estimator = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('core_model', GradientBoostingRegressor())
]) 
#GradientBoostingRegressor, algunos otros hiperparámetros que podrían considerar para la búsqueda en cuadrícula incluyen:
# learning_rate: la tasa de aprendizaje utilizada por el modelo, que controla la contribución de cada árbol al modelo final.
# max_depth: la profundidad máxima de los árboles utilizados por el modelo.
# min_samples_split: el número mínimo de muestras requeridas para dividir un nodo interno.
# min_samples_leaf: el número mínimo de muestras requeridas en cada hoja del árbol.

In [55]:
grid_search= GridSearchCV(estimator,
                       param_grid = param_tunning,
                       scoring='r2',
                       cv=5) 

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.35,random_state= 42)

In [57]:
grid_search.fit(X_train, y_train)

In [None]:
final_result = cross_validate(grid_search.best_estimator_,X_train,y_train,return_train_score=True,cv=7)

In [None]:
train_score = np.mean(final_result['train_score'])
test_score = np.mean(final_result['test_score'])
assert train_score > 0.7
assert test_score > 0.65
print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')

Train Score: 0.9657619269135428
Test Score: 0.7678304526025582


In [46]:
grid_search.best_estimator_.get_params()

{'memory': None,
 'steps': [('imputer', SimpleImputer()),
  ('core_model', GradientBoostingRegressor(n_estimators=220))],
 'verbose': False,
 'imputer': SimpleImputer(),
 'core_model': GradientBoostingRegressor(n_estimators=220),
 'imputer__add_indicator': False,
 'imputer__copy': True,
 'imputer__fill_value': None,
 'imputer__missing_values': nan,
 'imputer__strategy': 'mean',
 'imputer__verbose': 0,
 'core_model__alpha': 0.9,
 'core_model__ccp_alpha': 0.0,
 'core_model__criterion': 'friedman_mse',
 'core_model__init': None,
 'core_model__learning_rate': 0.1,
 'core_model__loss': 'squared_error',
 'core_model__max_depth': 3,
 'core_model__max_features': None,
 'core_model__max_leaf_nodes': None,
 'core_model__min_impurity_decrease': 0.0,
 'core_model__min_samples_leaf': 1,
 'core_model__min_samples_split': 2,
 'core_model__min_weight_fraction_leaf': 0.0,
 'core_model__n_estimators': 220,
 'core_model__n_iter_no_change': None,
 'core_model__random_state': None,
 'core_model__subsample'

In [47]:
estimator = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('core_model', GradientBoostingRegressor(n_estimators=220,
                                             alpha=0.9,
                                             ccp_alpha=0.0,
                                             criterion='friedman_mse',
                                             init=None,
                                             learning_rate=0.1,
                                             loss='squared_error',
                                             max_depth=3,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_iter_no_change=None,
                                             random_state=None,
                                             subsample=1.0,
                                             tol=0.0001,
                                             validation_fraction=0.1,
                                             verbose=0,
                                             warm_start=False))
])

In [48]:
estimator.fit(X_train,y_train)

Pipeline(steps=[('imputer', SimpleImputer()),
                ('core_model', GradientBoostingRegressor(n_estimators=220))])

In [49]:
estimator.score(X_test, y_test)

0.7393156171926245

## Saving model

In [29]:
from joblib import dump

In [30]:
dump(estimator, '../model/model.pkl')

['../model/model.pkl']

In [31]:
X_train.columns

Index(['opening_gross', 'screens', 'production_budget', 'title_year',
       'aspect_ratio', 'duration', 'cast_total_facebook_likes', 'budget',
       'imdb_score'],
      dtype='object')