In [1]:
import numpy as np
import pandas as pd
import os
import math as m
import matplotlib.pyplot as plt
import time

## Import Data Functions

In [2]:
df = pd.read_csv(r"..\data\dataset_new_score_v3.csv").drop(columns="Unnamed: 0")
df.columns

Index(['titre', 'budget', 'recette', 'duree', 'realisateur', 'casting',
       'compagnies_production', 'suite', 'Comedy', 'Drama', 'Horror',
       'Thriller', 'Action', 'Adventure', 'Science Fiction', 'Fantasy',
       'Family', 'Crime', 'Romance', 'Animation', 'Mystery', 'War', 'Music',
       'Western', 'History', 'Documentary', 'TV Movie', 'benefice global',
       'saison', 'casting_score_sum', 'casting_score_mean',
       'casting_score_med', 'casting_score_min', 'casting_score_max',
       'director_score_sum', 'director_score_mean', 'director_score_med',
       'director_score_min', 'director_score_max', 'compagnies_score_sum',
       'compagnies_score_mean', 'compagnies_score_med', 'compagnies_score_min',
       'compagnies_score_max'],
      dtype='object')

## Datasets and Scaling

In [25]:
from sklearn.model_selection import train_test_split

TEST_RATIO = 0.9
 
X_train, X_test, Y_train, Y_test = train_test_split(df.drop(columns=["titre", "budget","recette", "realisateur", "casting", "compagnies_production", "benefice global"]), df["benefice global"], test_size = TEST_RATIO)

In [26]:
from sklearn.preprocessing import StandardScaler

X_train_f = StandardScaler().fit(np.array(X_train)).transform(np.array(X_train))


X_test_f = StandardScaler().fit(np.array(X_test)).transform(np.array(X_test))

print("Done")

Done


## Model

In [31]:
from sklearn import svm

start = time.time()
regr = svm.SVR(gamma=0.01, C=10)
regr.fit(X_train_f, Y_train)
end = time.time()
elapsed = end - start

print("Temps exécution : ", elapsed)

Temps exécution :  0.037999629974365234


In [32]:
from sklearn.metrics import mean_absolute_percentage_error 
from sklearn.metrics import mean_absolute_error

#Get the time of execution
start = time.time()


score_mape = mean_absolute_percentage_error(Y_test, regr.predict(X_test_f))
score_mae = mean_absolute_error(Y_test, regr.predict(X_test_f))


end = time.time()
elapsed = end - start

print("Temps exécution : ", elapsed)
print("MAPE : ", score_mape)
print("MAE : ", score_mae)

Temps exécution :  0.8742508888244629
MAPE :  1463959550109096.5
MAE :  231.1523608555414


In [33]:
result =  pd.DataFrame({
    "Test" : Y_test,
    "Predicted" : regr.predict(X_test_f)
}
)

result.describe()

Unnamed: 0,Test,Predicted
count,4226.0,4226.0
mean,214.603999,65.078986
std,564.914385,34.835354
min,-98.96539,7.604218
25%,-28.0,38.715981
50%,78.605514,56.231263
75%,259.970875,86.24488
max,12400.2821,189.188618


In [34]:
from sklearn.metrics import mean_absolute_percentage_error 
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

#Get the time of execution
start = time.time()

scorer = make_scorer(mean_absolute_error)
score = cross_val_score(regr, X_train_f, Y_train, cv=10, scoring=scorer)


end = time.time()
elapsed = end - start

print("Temps exécution : ", elapsed)
print("%0.2f MAE with a standard deviation of %0.2f" % (score.mean(), score.std()))


Temps exécution :  0.2692759037017822
237.71 MAE with a standard deviation of 66.83


## Tuning some shit

In [None]:
K = 10 #crossvalidation
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_percentage_error 
from sklearn.metrics import make_scorer

# Paramètres à piper so good
scorer = make_scorer(mean_absolute_percentage_error)
parameters = [{'kernel': ['rbf'], 'gamma': [1e-4, 1e-3, 0.01, 0.1, 0.2, 0.5, 0.6, 0.9],'C': [1, 10, 100, 1000, 10000]}]
print("ça tune de la thune")
svm = GridSearchCV(svm(epsilon = 0.01), parameters, cv = K, scorer = scorer)
svm.fit(X_test_f, Y_test_f)

# On check les scores
print("Grid scores")
means = svm.cv_results_['mean_test_score']
stds = svm.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, svm.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, params))

## HomeMade CrossValidation

In [None]:
def training(seed):
    df_train, df_verify, df_test = create_samples(0.8, 10000, 1000, 1000, seed)
    X_train, Y_train = df_train.drop(columns=["Configuration", "Instance", "Solution", "Cost"]), df_train['Cost']
    X_verify, Y_verify = df_verify.drop(columns=["Configuration", "Instance", "Solution", "Cost"]), df_verify['Cost']
    X_test, Y_test = df_test.drop(columns=["Configuration", "Instance", "Solution", "Cost"]), df_test['Cost']

    X_train_f = StandardScaler().fit(X_train).transform(X_train)
    Y_train_f = StandardScaler().fit(Y_train.values.reshape(-1,1)).transform(Y_train.values.reshape(-1,1))

    X_test_f = StandardScaler().fit(X_test).transform(X_test)
    Y_test_f = StandardScaler().fit(Y_test.values.reshape(-1,1)).transform(Y_test.values.reshape(-1,1))

    regr = svm.SVR(gamma=0.01, C=1)
    regr.fit(X_train_f, Y_train_f)
    score_mape = mean_absolute_percentage_error(Y_test_f, regr.predict(X_test_f))
    score_mae = mean_absolute_error(Y_test_f, regr.predict(X_test_f))


    return score_mape, score_mae



scores_mape, scores_mae = [], []


for seed in range(10):
    score_mape, score_mae = training(seed)
    scores_mape.append(score_mape)
    scores_mae.append(score_mae)



print("Mean scores mape : " , np.mean(scores_mape))
print("Mean scores mae : " , np.mean(scores_mae))
    