## Data Preparation

In [1]:
import pandas as pd

In [2]:
movies_data_path = '../dataset/movies.csv'
finantial_data_path = '../dataset/finantials.csv'
opening_data_path = '../dataset/opening_gross.csv'

In [3]:
fin_data = pd.read_csv(finantial_data_path)
movie_data = pd.read_csv(movies_data_path)
opening_data = pd.read_csv(opening_data_path)

In [4]:
numeric_columns_mask = (movie_data.dtypes == float) | (movie_data.dtypes == int)
numeric_columns = [column for column in numeric_columns_mask.index if numeric_columns_mask[column]]
movie_data = movie_data[numeric_columns+['movie_title']]

In [5]:
fin_data = fin_data[['movie_title','production_budget','worldwide_gross']]

In [6]:
fin_movie_data = pd.merge(fin_data, movie_data, on= 'movie_title', how='left')

In [7]:
full_movie_data = pd.merge( opening_data,fin_movie_data, on = 'movie_title', how='left')

In [8]:
full_movie_data[(full_movie_data.worldwide_gross != 0) & (full_movie_data.worldwide_gross.notnull())].shape

(2304, 12)

In [9]:
full_movie_data = full_movie_data.drop(['movie_title','gross'],axis=1)

In [10]:
full_movie_data.columns

Index(['opening_gross', 'screens', 'production_budget', 'worldwide_gross',
       'title_year', 'aspect_ratio', 'duration', 'cast_total_facebook_likes',
       'budget', 'imdb_score'],
      dtype='object')

## Modeling

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_validate
import numpy as np

In [12]:
X = full_movie_data.drop(['worldwide_gross'], axis = 1) #features
y = full_movie_data['worldwide_gross'] #target 

In [13]:
pipeline = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')), #replace missing values by mean
    ('core_model', GradientBoostingRegressor()) #predictor 
])

In [14]:
results = cross_validate(pipeline ,X,y,return_train_score=True,cv=10)
results

{'fit_time': array([0.24035621, 0.23592114, 0.23371911, 0.23473215, 0.23472214,
        0.23327398, 0.23654914, 0.23504424, 0.23386312, 0.23308325]),
 'score_time': array([0.00163579, 0.0014298 , 0.00145888, 0.00158095, 0.00156808,
        0.00145292, 0.00157905, 0.00156903, 0.00148511, 0.0012989 ]),
 'test_score': array([0.67507711, 0.85333532, 0.64312267, 0.77644952, 0.78430318,
        0.86473545, 0.74957352, 0.87581079, 0.67566384, 0.65825827]),
 'train_score': array([0.91673951, 0.91581777, 0.9228721 , 0.91654412, 0.92172829,
        0.91476722, 0.92151444, 0.91734995, 0.92320705, 0.91766026])}

In [15]:
train_score = np.mean(results['train_score'])
test_score = np.mean(results['test_score'])
assert train_score > 0.7
assert test_score > 0.65
print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')

Train Score: 0.918820072167686
Test Score: 0.7556329678622858


**¿Tenés dudas respecto a cross validation?**

La validación cruzada (cross-validation) es una técnica utilizada en el aprendizaje automático para evaluar el rendimiento de un modelo. Consiste en dividir los datos en conjuntos de entrenamiento y prueba de manera iterativa, de modo que cada instancia se utiliza tanto para entrenar como para probar el modelo.

Es importante porque nos permite obtener una estimación más robusta y confiable del rendimiento del modelo, ya que evita la dependencia de una única división de los datos. Al utilizar múltiples divisiones, se reduce la posibilidad de que el resultado se vea afectado por una partición de datos específica.

En resumen, la validación cruzada nos ayuda a evaluar y seleccionar modelos de manera más precisa, proporcionando una medida más realista de su rendimiento en datos no vistos. Esto es esencial para tomar decisiones informadas sobre la capacidad de generalización del modelo y su aplicación en situaciones del mundo real.


## Hyperparameter tunning

In [16]:
from sklearn.model_selection import GridSearchCV

In [17]:
param_tunning = {'core_model__n_estimators': range(20,501,20)} 

In [18]:
estimator = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('core_model', GradientBoostingRegressor())
])

In [19]:
grid_search= GridSearchCV(estimator,
                       param_grid = param_tunning,
                       scoring='r2',
                       cv=5) 

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.35,random_state= 42)

In [21]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('imputer', SimpleImputer()),
                                       ('core_model',
                                        GradientBoostingRegressor())]),
             param_grid={'core_model__n_estimators': range(20, 501, 20)},
             scoring='r2')

In [22]:
final_result = cross_validate(grid_search.best_estimator_,X_train,y_train,return_train_score=True,cv=7)

In [23]:
train_score = np.mean(final_result['train_score'])
test_score = np.mean(final_result['test_score'])
assert train_score > 0.7
assert test_score > 0.65
print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')

Train Score: 0.9637696047888953
Test Score: 0.7612770083750863


In [24]:
grid_search.best_estimator_.get_params()

{'memory': None,
 'steps': [('imputer', SimpleImputer()),
  ('core_model', GradientBoostingRegressor(n_estimators=200))],
 'verbose': False,
 'imputer': SimpleImputer(),
 'core_model': GradientBoostingRegressor(n_estimators=200),
 'imputer__add_indicator': False,
 'imputer__copy': True,
 'imputer__fill_value': None,
 'imputer__missing_values': nan,
 'imputer__strategy': 'mean',
 'imputer__verbose': 0,
 'core_model__alpha': 0.9,
 'core_model__ccp_alpha': 0.0,
 'core_model__criterion': 'friedman_mse',
 'core_model__init': None,
 'core_model__learning_rate': 0.1,
 'core_model__loss': 'squared_error',
 'core_model__max_depth': 3,
 'core_model__max_features': None,
 'core_model__max_leaf_nodes': None,
 'core_model__min_impurity_decrease': 0.0,
 'core_model__min_samples_leaf': 1,
 'core_model__min_samples_split': 2,
 'core_model__min_weight_fraction_leaf': 0.0,
 'core_model__n_estimators': 200,
 'core_model__n_iter_no_change': None,
 'core_model__random_state': None,
 'core_model__subsample'

In [25]:
estimator = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('core_model', GradientBoostingRegressor(n_estimators=220,
                                             alpha=0.9,
                                             ccp_alpha=0.0,
                                             criterion='friedman_mse',
                                             init=None,
                                             learning_rate=0.1,
                                             loss='squared_error',
                                             max_depth=3,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_iter_no_change=None,
                                             random_state=None,
                                             subsample=1.0,
                                             tol=0.0001,
                                             validation_fraction=0.1,
                                             verbose=0,
                                             warm_start=False))
])

In [26]:
estimator.fit(X_train,y_train)

Pipeline(steps=[('imputer', SimpleImputer()),
                ('core_model', GradientBoostingRegressor(n_estimators=220))])

In [27]:
estimator.score(X_test, y_test)

0.7342214863431248

## Saving model

In [29]:
from joblib import dump

In [30]:
dump(estimator, '../model/model.pkl') #save model to pickle 

['../model/model.pkl']

In [29]:
X_train.columns # ten esto en cuenta para la creación del pipeline de procesamiento! con las mismas features que uses en train, deben ser
# las mismas cuando vayamos a hacer el deploy. es decir, sean el input para el request vía api - model-as-a-service. 

Index(['opening_gross', 'screens', 'production_budget', 'title_year',
       'aspect_ratio', 'duration', 'cast_total_facebook_likes', 'budget',
       'imdb_score'],
      dtype='object')