# Refazendo imports e manipulação dos dados

In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import seaborn as sns

dataset = pd.read_csv('db/movie_dataset.csv', sep=',',  index_col=0, na_values='?')

dataset = dataset.loc[:, ~dataset.columns.isin(['original_title', 'overview', 'tagline', 'title', 'homepage', 'crew', 'keywords', 'original_language', 'id'])]
dataset = dataset.loc[dataset['status'] == 'Released']
#dataset = dataset.loc[dataset['revenue'] != 0]

dataset_base = dataset.loc[:, dataset.columns.isin(['budget', 'revenue', 'popularity', 'runtime', 'vote_average', 'vote_count'])]
noise = np.random.randint(0, 101, size=dataset_base['revenue'].shape[0])
dataset_base['revenue'] = dataset_base['revenue'] + noise

dataset_base_rev_class = pd.qcut(dataset_base['revenue'], q=4, labels=['very low', 'low', 'medium', 'high'])
dataset_base['revenue'] = dataset_base_rev_class

dataset_dummies = dataset
dataset_dummies = pd.get_dummies(dataset_dummies)
dataset_dummies['revenue'] = dataset_dummies['revenue'] +noise
dataset_dummies_rev = pd.qcut(dataset_dummies['revenue'], q=4, labels=['very low', 'low', 'medium', 'high'])
dataset_dummies['revenue'] = dataset_dummies_rev

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_base['revenue'] = dataset_base['revenue'] + noise
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_base['revenue'] = dataset_base_rev_class


# Arvore de decisão

Pode-se observar que alguns dados possuem maior correlação com a receita do filme, como por exemplo vote_count e popularity. E outros não apresentam correlação tão forte como vote_average ( exceto em casos extremos )

In [2]:
dataset_base = dataset_base.loc[:, dataset_base.columns != 'vote_average']
dataset_base = dataset_base.loc[:, dataset_base.columns != 'runtime']

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

rev_pred_paramX = dataset_base.loc[:, dataset_base.columns != 'revenue'] 
rev_pred_paramY = np.array(dataset_base.loc[:, dataset_base.columns == 'revenue']).ravel()

X_train, X_test, y_train, y_test = train_test_split(rev_pred_paramX, rev_pred_paramY, test_size=0.10, stratify=rev_pred_paramY, random_state=42)

clf = DecisionTreeClassifier(criterion='gini', random_state=42, min_samples_split=40)

clf.fit(X_train, y_train)
predict = clf.predict(X_test)

acc = accuracy_score(y_test, predict)
print(f'A acurácia do Classificador é: {acc :.2%}')

A acurácia do Classificador é: 54.37%


A remoção de vote_average e runtime, juntamente com o aumento do número mínimo de amostras necessário para o split, melhoraram a acurácia do algoritmo de 50% para 54%-60%.

Podemos também testar o classificador com as opções de diretores, atores e gêneros.

In [4]:
dataset_dummies

Unnamed: 0_level_0,budget,popularity,revenue,runtime,vote_average,vote_count,genres_Action,genres_Action Adventure,genres_Action Adventure Animation Comedy Family,genres_Action Adventure Animation Comedy Science Fiction,...,director_Zackary Adler,director_Zak Penn,director_Zal Batmanglij,director_Zhang Yimou,director_Zoran Lisinac,director_\u00c0lex Pastor,director_\u00c1lex de la Iglesia,director_\u00c9mile Gaudreault,director_\u00c9ric Tessier,director_\u00c9tienne Faure
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,237000000,150.437577,high,162.0,7.2,11800,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,300000000,139.082615,high,169.0,6.9,4500,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,245000000,107.376788,high,148.0,6.3,4466,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,250000000,112.312950,high,165.0,7.6,9106,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,260000000,43.926995,high,132.0,6.1,2124,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4798,220000,14.269792,low,81.0,6.6,238,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4799,9000,0.642552,low,85.0,5.9,5,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4800,0,1.444476,very low,120.0,7.0,6,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4801,0,0.857008,very low,98.0,5.7,7,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [5]:
dataset_dummies = dataset_dummies.loc[:, dataset_dummies.columns != 'vote_average']
dataset_dummies = dataset_dummies.loc[:, dataset_dummies.columns != 'runtime']

rev_pred_paramX = dataset_dummies.loc[:, dataset_dummies.columns != 'revenue']
rev_pred_paramY = np.array(dataset_dummies.loc[:, dataset_dummies.columns == 'revenue']).ravel()

rev_pred_paramX.fillna(1, inplace=True)
rev_pred_paramY = np.nan_to_num(rev_pred_paramY, nan=1)

X_train, X_test, y_train, y_test = train_test_split(rev_pred_paramX, rev_pred_paramY, test_size=0.10, stratify=rev_pred_paramY, random_state=42)

clf = DecisionTreeClassifier(criterion='gini', random_state=42, min_impurity_decrease = 0.007)

clf.fit(X_train, y_train)
predict = clf.predict(X_test)

acc = accuracy_score(y_test, predict)
print(f'A acurácia do Classificador é: {acc :.2%}')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rev_pred_paramX.fillna(1, inplace=True)


A acurácia do Classificador é: 60.62%


Com as novas mudanças, a acurácia com os diretores atores e gêneros subiu de 57%-59% para 60%-63%