# Pre processing

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('data/movie_metadata.csv')

In [3]:
for column in df.select_dtypes(['float64','int64']).columns:
    df[column] = df[column].fillna(value=np.mean(df[column]))

In [4]:
df.dropna(inplace=True)

In [5]:
df.drop(columns=['aspect_ratio','title_year','facenumber_in_poster','director_facebook_likes','actor_3_facebook_likes','actor_1_facebook_likes','gross','cast_total_facebook_likes','actor_2_facebook_likes', 'budget'],inplace=True)

In [6]:
for column in ['genres','color','content_rating']:
    dummies = pd.DataFrame(pd.Series(df[column]).str.get_dummies())
    df = pd.concat([df,dummies], axis = 1)
    df.drop([column],axis=1,inplace=True)

In [7]:
from sklearn import preprocessing
le_actors = preprocessing.LabelEncoder()

In [8]:
all_actors = pd.concat([df['actor_1_name'], df['actor_2_name'], df['actor_3_name']], ignore_index=True)


le_actors.fit(all_actors)
le_actors.classes_

numeric_actor_1 = le_actors.transform(df['actor_1_name'])
df['actor_1_name'] = numeric_actor_1


numeric_actor_2 = le_actors.transform(df['actor_2_name'])
df['actor_2_name'] = numeric_actor_2

numeric_actor_3 = le_actors.transform(df['actor_3_name'])
df['actor_3_name'] = numeric_actor_3

In [9]:
le_director_name = preprocessing.LabelEncoder()
le_director_name.fit(df['director_name'])
le_director_name.classes_

numeric_director = le_director_name.transform(df['director_name'])
df['director_name'] = numeric_director

In [10]:
df.drop(['movie_imdb_link', 'movie_title'], axis=1,inplace=True)

In [11]:
le_language = preprocessing.LabelEncoder()
le_language.fit(df['language'])
le_language.classes_

numeric_language = le_language.transform(df['language'])
df['language'] = numeric_language

In [12]:
le_country = preprocessing.LabelEncoder()
le_country.fit(df['country'])
le_country.classes_

numeric_country = le_country.transform(df['country'])
df['country'] = numeric_country

# SVR

SVR é um método de regressão supervisionado de machine learning. Ele é derivado do Support Vector Machine. Alguns elementos básicos do SVM ainda estão presentes como **maximun margin**. O SVR utiliza a função epsilon para ignorar os errors das obsevações que estão presentes dentro da **maximun margin**, também chamada de epsilon _intensive band_. Na execução do SVR as features são mapeadas para um espaço dimensional de _m_ dimensões usando um mapeamento não linear, então o modelo é construido utilizando esse espaço dimensional.


Em conjunto com a epsilon _intensive loss function_ é possível obter uma boa generalização dos dados
Referência: < http://kernelsvm.tripod.com/ >

In [13]:
df_test = df.drop(['plot_keywords'], axis=1)
X = df_test.drop(['imdb_score'],axis=1)
y = df_test['imdb_score']

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

from  sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3)

from sklearn.linear_model import LinearRegression

from sklearn.svm import SVR



In [14]:
results = []
for x in np.arange(0.1,11,0.1):
    svr = SVR(kernel='linear', C=x)
    svr.fit(X_train,y_train)
    results.append(svr.score(X_test,y_test))

In [15]:
results

[0.3929707338434455,
 0.4021618316000198,
 0.4071313011314531,
 0.4092096276263281,
 0.4097946143083376,
 0.41124914460327544,
 0.41211772041276323,
 0.41289995126488443,
 0.4136321831514781,
 0.4138709687578012,
 0.413932157950654,
 0.4140669950606789,
 0.4143370655672289,
 0.4148048657035499,
 0.4149609358724558,
 0.41514198122047996,
 0.41526859214957457,
 0.41535551465012044,
 0.415288395254368,
 0.41518582092851225,
 0.4153155735614727,
 0.4152851446040393,
 0.4155338348083057,
 0.415829502784293,
 0.4165941695326623,
 0.4166329907794656,
 0.4168468778154647,
 0.4172839965612746,
 0.41730929704471875,
 0.4173658796183431,
 0.41747070574071576,
 0.41744120893296077,
 0.4174676154729238,
 0.4174973775230123,
 0.4175140293874852,
 0.41754277779951876,
 0.41763377954802383,
 0.41759329143479096,
 0.41762684277642576,
 0.4177135070369804,
 0.418076749093125,
 0.41813019549129393,
 0.41822853539100946,
 0.41819538567206954,
 0.41819909768564395,
 0.4180957160227263,
 0.4180574909476131,

In [16]:
svr = SVR(kernel='rbf')
svr.fit(X_train,y_train)
svr.score(X_test,y_test)

0.37675439384430776

In [17]:
from sklearn.svm import LinearSVR
svrl = LinearSVR()
svrl.fit(X_train,y_train)
svrl.score(X_test,y_test)

0.417062604323689

# K-fold

## Linear Regression

In [18]:
from sklearn.model_selection import cross_val_score
cv_result = cross_val_score(LinearRegression(),X,y,cv=5)
print(np.mean(cv_result))

0.39817386175261066
