# Pre processing

In [21]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [22]:
df = pd.read_csv('data/movie_metadata.csv')

In [23]:
for column in df.select_dtypes(['float64','int64']).columns:
    df[column] = df[column].fillna(value=np.mean(df[column]))

In [24]:
df.dropna(inplace=True)

In [25]:
df.drop(columns=['aspect_ratio','title_year','facenumber_in_poster','director_facebook_likes','actor_3_facebook_likes','actor_1_facebook_likes','gross','cast_total_facebook_likes','actor_2_facebook_likes', 'budget'],inplace=True)

In [26]:
for column in ['genres','color','content_rating']:
    dummies = pd.DataFrame(pd.Series(df[column]).str.get_dummies())
    df = pd.concat([df,dummies], axis = 1)
    df.drop([column],axis=1,inplace=True)

In [27]:
from sklearn import preprocessing
le_actors = preprocessing.LabelEncoder()

In [28]:
all_actors = pd.concat([df['actor_1_name'], df['actor_2_name'], df['actor_3_name']], ignore_index=True)


le_actors.fit(all_actors)
le_actors.classes_

numeric_actor_1 = le_actors.transform(df['actor_1_name'])
df['actor_1_name'] = numeric_actor_1


numeric_actor_2 = le_actors.transform(df['actor_2_name'])
df['actor_2_name'] = numeric_actor_2

numeric_actor_3 = le_actors.transform(df['actor_3_name'])
df['actor_3_name'] = numeric_actor_3

In [29]:
le_director_name = preprocessing.LabelEncoder()
le_director_name.fit(df['director_name'])
le_director_name.classes_

numeric_director = le_director_name.transform(df['director_name'])
df['director_name'] = numeric_director

In [30]:
df.drop(['movie_imdb_link', 'movie_title'], axis=1,inplace=True)

In [31]:
le_language = preprocessing.LabelEncoder()
le_language.fit(df['language'])
le_language.classes_

numeric_language = le_language.transform(df['language'])
df['language'] = numeric_language

In [32]:
le_country = preprocessing.LabelEncoder()
le_country.fit(df['country'])
le_country.classes_

numeric_country = le_country.transform(df['country'])
df['country'] = numeric_country

# SVR

SVR é um método de regressão supervisionado de machine learning. Ele é derivado do Support Vector Machine. Alguns elementos básicos do SVM ainda estão presentes como **maximun margin**. O SVR utiliza a função epsilon para ignorar os errors das obsevações que estão presentes dentro da **maximun margin**, também chamada de epsilon _intensive band_. Na execução do SVR as features são mapeadas para um espaço dimensional de _m_ dimensões usando um mapeamento não linear, então o modelo é construido utilizando esse espaço dimensional.


Em conjunto com a epsilon _intensive loss function_ é possível obter uma boa generalização dos dados
Referência: < http://kernelsvm.tripod.com/ >

In [33]:
df_test = df.drop(['plot_keywords'], axis=1)
X = df_test.drop(['imdb_score'],axis=1)
y = df_test['imdb_score']

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

from  sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3)

from sklearn.linear_model import LinearRegression

from sklearn.svm import SVR



In [34]:
results = []
for x in np.arange(0.1,11,0.1):
    svr = SVR(kernel='linear', C=x)
    svr.fit(X_train,y_train)
    results.append(svr.score(X_test,y_test))

In [35]:
results

[0.4203766380369594,
 0.43348025639862764,
 0.43792469153531743,
 0.44002360459614437,
 0.4402143650699555,
 0.44076864923689313,
 0.4415849408928934,
 0.4425812156914996,
 0.44319589664925413,
 0.443172280075121,
 0.4425979298419597,
 0.44271648035315164,
 0.4430310304346373,
 0.44314198143648253,
 0.44329950339563884,
 0.4432265328627142,
 0.44336592333464353,
 0.4433131549488376,
 0.44335672139923954,
 0.44344087498508156,
 0.4435425441393996,
 0.44356606334524806,
 0.44366839230985894,
 0.44382991614448736,
 0.44390646409901735,
 0.44392593571013195,
 0.44401748173485706,
 0.44407143208917477,
 0.4441174748774489,
 0.44403058330107986,
 0.4440698587742396,
 0.4440775947919034,
 0.4440475093807421,
 0.4440381133925411,
 0.4439249843993813,
 0.44385437231133107,
 0.4438501081312738,
 0.44383354766915556,
 0.44383839631489613,
 0.44381315702203294,
 0.4437093059811699,
 0.4436419404044233,
 0.44359274691329176,
 0.44360627592082763,
 0.4436824806224199,
 0.4435798075037559,
 0.4435518

In [36]:
svr = SVR(kernel='rbf')
svr.fit(X_train,y_train)
svr.score(X_test,y_test)

0.4004169783009675

In [37]:
from sklearn.svm import LinearSVR
svrl = LinearSVR()
svrl.fit(X_train,y_train)
svrl.score(X_test,y_test)

0.43359904943948324

# K-fold

## Linear Regression

In [38]:
from sklearn.model_selection import cross_val_score
cv_result = cross_val_score(LinearRegression(),X,y,cv=5)
print(np.mean(cv_result))

0.39817386175261066


## SVR (rbf)

In [39]:
results = []
for x in np.arange(0.1,11,0.1):
    svr = SVR(kernel='rbf', C=x)
    svr.fit(X_train,y_train)
    results.append(svr.score(X_test,y_test))

In [41]:
results

[0.2247539197206837,
 0.2877743782970116,
 0.3233624661903498,
 0.3456727759470095,
 0.3608177980631976,
 0.3722973307443075,
 0.38021443999574067,
 0.3877109014679996,
 0.39439472189735175,
 0.4004169783009675,
 0.4046690119141616,
 0.40885782017838884,
 0.4119715600994419,
 0.4147677051689962,
 0.41739373482539716,
 0.419493592383899,
 0.4223389588872305,
 0.4241818438297451,
 0.42621668482157105,
 0.42759003104217014,
 0.42889073870718736,
 0.4303367055037257,
 0.43218284130151585,
 0.43371002796852953,
 0.4350914717716665,
 0.43589775712380113,
 0.43703378487293565,
 0.43900642048257976,
 0.44024417470312194,
 0.44101926847270995,
 0.44223539295392733,
 0.4432666376985398,
 0.4441827683270248,
 0.44505868867963644,
 0.445930079851967,
 0.44668385574094993,
 0.4476136068390433,
 0.448305860536419,
 0.4490905885547949,
 0.44979500019979707,
 0.4502670219183106,
 0.4507711039900116,
 0.4513072558139805,
 0.45175356893297347,
 0.4520546388936828,
 0.45232976300199756,
 0.45293597197540