In [None]:
#=============================================
#Author:Vladimir Kurnosov
#Create date:8Feb19
#Description: Examines which characteristics are shared by
#the films that bring in the most revenue.
#=============================================

In [None]:
import os
import psycopg2
import numpy as np
import pandas as pd
import random
import sklearn
from sklearn import tree
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import pydotplus
#import StringIO
import pyodbc
import io

In [None]:
#Load the film revenue table through the DVDRental ODBC data source.
#try/finally guarantees the connection is closed even if the query fails.
conn = pyodbc.connect("DSN=DVDRental")
try:
    films_all_numbers = pd.read_sql("SELECT * FROM total_revs_by_film_numbers_only", conn)
finally:
    conn.close()

In [None]:
#Exploratory Data Analysis for films_all_numbers
films_all_numbers.dtypes #data type of each column

In [None]:
films_all_numbers.describe() #summary statistics per column (pandas defaults)

In [None]:
films_all_numbers[["total_revs", "length"]].corr() #correlation between total_revs and length (pandas default method: Pearson)

In [None]:
films_all_numbers.corr() #pairwise correlation matrix over all columns

In [None]:
films_all_numbers[["total_revs", "length"]].cov() #covariance matrix of total_revs and length

In [None]:
films_all_numbers.cov() #pairwise covariance matrix over all columns

In [None]:
#Histogram of the total_revs column.
#Title and axis labels let the figure stand alone; the trailing ';' keeps
#the return value of ax.set from being echoed as cell output.
ax = films_all_numbers["total_revs"].hist()
ax.set(title="Distribution of total_revs", xlabel="total_revs", ylabel="Frequency");

In [None]:
### train on whole data

#Necessary for reproducibility.
#random.seed only seeds Python's built-in generator, which scikit-learn does
#not use; the estimator's own randomness is pinned through random_state.
random.seed(5)

#train model
film_tree = tree.DecisionTreeRegressor(min_samples_leaf=10, random_state=5)
#features are every column except the target, total_revs
X = films_all_numbers.drop("total_revs", axis=1)
y = films_all_numbers["total_revs"]
film_tree = film_tree.fit(X, y)

In [None]:
#get predictions and RMSE
#X and y are whatever the preceding training cell assigned, so when the
#notebook is run in order this is the error on the data the tree was fitted on
predictions = film_tree.predict(X)
squared_residuals = (y - predictions) ** 2
rmse = (np.sum(squared_residuals) / len(predictions)) ** 0.5
print("Error = {}".format(rmse))

In [None]:
#save graphical representation to file
#this is optional
#To run this block of code, install Graphviz and add its executable
#to your PATH variable.
#On Windows, the EXE installs in
#C:\Program Files (x86)\Graphviz2.38\bin
#Restart Jupyter Lab afterwards.
dotfile = io.StringIO()
#feature_names is passed as a plain list: some scikit-learn releases validate
#this argument strictly and reject a pandas Index
tree.export_graphviz(
    film_tree,
    out_file=dotfile,
    feature_names=list(X.columns),
    rounded=True,
    proportion=False,
    precision=2,
    filled=True,
)
#render the DOT text held in memory to a PNG in the working directory
graph = pydotplus.graph_from_dot_data(dotfile.getvalue())
graph.write_png("film_tree_plot_python.png")

In [None]:
### split to train/test dataset

#Necessary for reproducibility.
#The mask below is drawn with np.random.rand, which ignores random.seed,
#so NumPy's global generator has to be seeded as well.
random.seed(5)
np.random.seed(5)

#split dataset
#each row is sent to the test set independently with probability size_test,
#so the test share is only approximately size_test
size_test = 0.2
is_test = np.random.rand(len(films_all_numbers)) < size_test
films_test = films_all_numbers[is_test]
films_train = films_all_numbers[~is_test]

In [None]:
#train model
#random_state pins the estimator's own randomness so that refitting on the
#same training rows yields the same tree
film_tree = tree.DecisionTreeRegressor(min_samples_leaf=20, random_state=5)
#features are every column except the target, total_revs
X = films_train.drop("total_revs", axis=1)
y = films_train["total_revs"]
film_tree = film_tree.fit(X, y)

In [None]:
#get predictions and RMSE
#train error
X = films_train.drop("total_revs", axis=1)
y = films_train["total_revs"]
predictions = film_tree.predict(X)
print("Train error = {}".format((np.sum((y-predictions)**2)/len(predictions))**0.5))
#test error (rows held out of fitting); the label now says "Test" so the two
#printed numbers can be told apart
X = films_test.drop("total_revs", axis=1)
y = films_test["total_revs"]
predictions = film_tree.predict(X)
print("Test error = {}".format((np.sum((y-predictions)**2)/len(predictions))**0.5))