# Refazendo imports e manipulação dos dados

In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier


dataset = pd.read_csv('db/movie_dataset.csv', sep=',',  index_col=0, na_values='?')

dataset = dataset.loc[:, ~dataset.columns.isin(['original_title', 'overview', 'tagline', 'title', 'homepage', 'crew', 'keywords', 'original_language', 'id'])]
dataset = dataset.loc[dataset['status'] == 'Released']

dataset_base = dataset.loc[:, dataset.columns.isin(['budget', 'revenue', 'popularity', 'runtime', 'vote_average', 'vote_count'])]
noise = np.random.randint(0, 101, size=dataset_base['revenue'].shape[0])
dataset_base['revenue'] = dataset_base['revenue'] + noise

dataset_base_rev_class = pd.qcut(dataset_base['revenue'], q=4, labels=['very low', 'low', 'medium', 'high'])
dataset_base['revenue'] = dataset_base_rev_class

dataset_dummies = dataset
dataset_dummies = pd.get_dummies(dataset_dummies)
dataset_dummies['revenue'] = dataset_dummies['revenue'] +noise
dataset_dummies_rev = pd.qcut(dataset_dummies['revenue'], q=4, labels=['very low', 'low', 'medium', 'high'])
dataset_dummies['revenue'] = dataset_dummies_rev

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_base['revenue'] = dataset_base['revenue'] + noise
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_base['revenue'] = dataset_base_rev_class


# Multi-layer perceptron

### Dataset base

In [2]:
dataset_base = dataset_base.loc[:, dataset_base.columns != 'vote_average']
dataset_base = dataset_base.loc[:, dataset_base.columns != 'runtime']

rev_pred_paramX = dataset_base.loc[:, dataset_base.columns != 'revenue'] 
rev_pred_paramY = np.array(dataset_base.loc[:, dataset_base.columns == 'revenue']).ravel()

X_train, X_test, y_train, y_test = train_test_split(rev_pred_paramX, rev_pred_paramY, test_size=0.10, stratify=rev_pred_paramY, random_state=42)

rev_pred_paramX.fillna(1, inplace=True)
rev_pred_paramY = np.nan_to_num(rev_pred_paramY, nan=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rev_pred_paramX.fillna(1, inplace=True)


In [3]:
# reescala os valores entre 0 e 1 utilizando o valor minimo e máximo de acada atributo
rev_pred_paramX_scaled = pd.DataFrame(MinMaxScaler().fit_transform(rev_pred_paramX), columns=rev_pred_paramX.columns)

In [4]:
parameters = {'hidden_layer_sizes' : [(5), (8), (15), (5, 3), (8, 5), (10, 5)],
              'max_iter' : [3000], 'random_state' : [42]}

mlp = MLPClassifier()
gs_mlp = GridSearchCV(mlp, parameters, cv=10, scoring='accuracy')

gs_mlp.fit(rev_pred_paramX_scaled, rev_pred_paramY)

view = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
results = pd.DataFrame(gs_mlp.cv_results_)
results[view].sort_values(by='rank_test_score')

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
4,"{'hidden_layer_sizes': (8, 5), 'max_iter': 300...",0.564957,0.102198,1
2,"{'hidden_layer_sizes': 15, 'max_iter': 3000, '...",0.555986,0.103633,2
1,"{'hidden_layer_sizes': 8, 'max_iter': 3000, 'r...",0.552623,0.112907,3
0,"{'hidden_layer_sizes': 5, 'max_iter': 3000, 'r...",0.551124,0.130479,4
5,"{'hidden_layer_sizes': (10, 5), 'max_iter': 30...",0.547865,0.106999,5
3,"{'hidden_layer_sizes': (5, 3), 'max_iter': 300...",0.255475,0.001971,6


Esse algoritmo demora cerca de 2 minutos para concluir o treinamento e teste.

Ele possui uma acurácia que gira em torno dos 56%

### Dataset com dummies

In [5]:
dataset_dummies = dataset_dummies.loc[:, dataset_dummies.columns != 'vote_average']
dataset_dummies = dataset_dummies.loc[:, dataset_dummies.columns != 'runtime']

rev_pred_paramX = dataset_dummies.loc[:, dataset_dummies.columns != 'revenue'] 
rev_pred_paramY = np.array(dataset_dummies.loc[:, dataset_dummies.columns == 'revenue']).ravel()


rev_pred_paramX.fillna(0, inplace=True)
rev_pred_paramY = np.nan_to_num(rev_pred_paramY, nan=0)

rev_pred_paramX_scaled = pd.DataFrame(MinMaxScaler().fit_transform(rev_pred_paramX), columns=rev_pred_paramX.columns)

parameters = {'hidden_layer_sizes' : [(5), (8), (15), (5, 3), (8, 5), (10, 5)],
              'max_iter' : [100], 'random_state' : [42]}

mlp = MLPClassifier()
gs_mlp = GridSearchCV(mlp, parameters, cv=10, scoring='accuracy')

gs_mlp.fit(rev_pred_paramX_scaled, rev_pred_paramY)

view = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
results = pd.DataFrame(gs_mlp.cv_results_)
results[view].sort_values(by='rank_test_score')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rev_pred_paramX.fillna(0, inplace=True)




Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
2,"{'hidden_layer_sizes': 15, 'max_iter': 1, 'ran...",0.275721,0.021257,1
1,"{'hidden_layer_sizes': 8, 'max_iter': 1, 'rand...",0.251096,0.001154,2
0,"{'hidden_layer_sizes': 5, 'max_iter': 1, 'rand...",0.250052,0.000593,3
4,"{'hidden_layer_sizes': (8, 5), 'max_iter': 1, ...",0.250052,0.000593,3
3,"{'hidden_layer_sizes': (5, 3), 'max_iter': 1, ...",0.249843,0.00074,5
5,"{'hidden_layer_sizes': (10, 5), 'max_iter': 1,...",0.249843,0.00074,5


O dataset com dummies não roda em um tempo útil (demora tempo superior a 1h), então ele não passou no teste para obter uma score significativa.