In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('data/movie_metadata.csv')

In [3]:
for column in df.select_dtypes(['float64','int64']).columns:
    df[column] = df[column].fillna(value=np.mean(df[column]))

In [4]:
df.dropna(inplace=True)

In [5]:
df.drop(columns=['aspect_ratio','title_year','facenumber_in_poster','director_facebook_likes','actor_3_facebook_likes','actor_1_facebook_likes','gross','cast_total_facebook_likes','actor_2_facebook_likes', 'budget'],inplace=True)

In [6]:
for column in ['genres','color','content_rating']:
    dummies = pd.DataFrame(pd.Series(df[column]).str.get_dummies())
    df = pd.concat([df,dummies], axis = 1)
    df.drop([column],axis=1,inplace=True)

In [7]:
from sklearn import preprocessing
le_actors = preprocessing.LabelEncoder()

In [8]:
all_actors = pd.concat([df['actor_1_name'], df['actor_2_name'], df['actor_3_name']], ignore_index=True)


le_actors.fit(all_actors)
le_actors.classes_

numeric_actor_1 = le_actors.transform(df['actor_1_name'])
df['actor_1_name'] = numeric_actor_1


numeric_actor_2 = le_actors.transform(df['actor_2_name'])
df['actor_2_name'] = numeric_actor_2

numeric_actor_3 = le_actors.transform(df['actor_3_name'])
df['actor_3_name'] = numeric_actor_3

In [9]:
le_director_name = preprocessing.LabelEncoder()
le_director_name.fit(df['director_name'])
le_director_name.classes_

numeric_director = le_director_name.transform(df['director_name'])
df['director_name'] = numeric_director

In [10]:
df.drop(['movie_imdb_link', 'movie_title'], axis=1,inplace=True)

In [11]:
le_language = preprocessing.LabelEncoder()
le_language.fit(df['language'])
le_language.classes_

numeric_language = le_language.transform(df['language'])
df['language'] = numeric_language

In [12]:
le_country = preprocessing.LabelEncoder()
le_country.fit(df['country'])
le_country.classes_

numeric_country = le_country.transform(df['country'])
df['country'] = numeric_country

# SVR

SVR é um método de regressão supervisionado de machine learning. Ele é derivado do Support Vector Machine. Alguns elementos básicos do SVM ainda estão presentes como **maximun margin**. O SVR utiliza a função epsilon para ignorar os errors das obsevações que estão presentes dentro da **maximun margin**, também chamada de epsilon _intensive band_. Na execução do SVR as features são mapeadas para um espaço dimensional de _m_ dimensões usando um mapeamento não linear, então o modelo é construido utilizando esse espaço dimensional.


Em conjunto com a epsilon _intensive loss function_ é possível obter uma boa generalização dos dados
Referência: < http://kernelsvm.tripod.com/ >

In [13]:
df_test = df.drop(['plot_keywords'], axis=1)
X = df_test.drop(['imdb_score'],axis=1)
y = df_test['imdb_score']

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

from  sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3)

from sklearn.linear_model import LinearRegression

from sklearn.svm import SVR



In [18]:
results = []
for x in np.arange(0.1,11,0.1):
    svr = SVR(kernel='linear', C=x)
    svr.fit(X_train,y_train)
    results.append(svr.score(X_test,y_test))

In [19]:
results

[0.38243633536862476,
 0.39126498714326824,
 0.3958653488147389,
 0.3982238244543465,
 0.3985274894659182,
 0.39986122941450775,
 0.39987467750722755,
 0.40036642391465305,
 0.40067220183206703,
 0.4007659114553798,
 0.4010479789533794,
 0.40125704934440387,
 0.4013191189167947,
 0.40130149019483746,
 0.4013810685520306,
 0.401407531276199,
 0.40178321476991796,
 0.4019272208137654,
 0.4020993599433518,
 0.4023386046470203,
 0.40263474275624167,
 0.402958362944286,
 0.40316702217542877,
 0.40323539776580875,
 0.4033813617585343,
 0.40353832764423514,
 0.40345271106255837,
 0.4031813829993308,
 0.40315265050765914,
 0.4030974320872531,
 0.4032530259304956,
 0.40306433361558786,
 0.4030219367928397,
 0.40303915442630556,
 0.4030108513346887,
 0.40316041987255136,
 0.40327580541187524,
 0.40341740568779527,
 0.40343106268063905,
 0.4033718660526795,
 0.4033503905269419,
 0.403331374143563,
 0.4033917127590144,
 0.40335856070939796,
 0.4032041865456101,
 0.40318298178713263,
 0.40323870971

In [31]:
svr = SVR(kernel='rbf')
svr.fit(X_train,y_train)
svr.score(X_test,y_test)

0.017358448162723583

In [30]:
from sklearn.svm import LinearSVR
svrl = LinearSVR()
svrl.fit(X_train,y_train)
svrl.score(X_test,y_test)

-4911.292485030349

In [28]:
df = df[['imdb_score','director_facebook_likes','duration','actor_1_facebook_likes','actor_2_facebook_likes','actor_3_facebook_likes','facenumber_in_poster','budget']]
df.dropna(inplace=True)
X = df.drop(['imdb_score'],axis=1)
y = df['imdb_score']
from  sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [29]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rfr.fit(X_train,y_train)
rfr.score(X_test,y_test)

0.1583088666507414

# K-fold

## Linear Regression

In [16]:
from sklearn.model_selection import cross_val_score
cv_result = cross_val_score(LinearRegression(),X,y,cv=5)
print(np.mean(cv_result))

0.39817386175261066


## SVR Linear

In [22]:
cv_result = cross_val_score(SVR(kernel='linear'), X, y, cv=5)

In [23]:
np.mean(cv_result)

-12492514.265843943

## SVR rbf

In [24]:
cv_result = cross_val_score(SVR(kernel='rbf'), X, y, cv=5)

In [25]:
np.mean(cv_result)

0.006028421304931131