In [11]:
import numpy as np
import pandas as pd
import os
import math as m
import matplotlib.pyplot as plt
import time

## Import Data Functions

In [12]:
def import_dataset(path):
    df = pd.read_csv(path, sep=';')
    df.columns = ["Name", "Cost","S1","S2","S3","S4","S5","S6","S7","S8","S9","S10","S11","S12","S13","S14","S15","S16","S17", "S18"]
    return df

def reduce_size(df_tot):
    for col in df_tot.columns :
        if col not in ["Configuration", "Instance", "Solution", "Cost"]:
            df_tot[col] = pd.to_numeric(df_tot[col], errors="ignore", downcast='float')
        else : 
            df_tot[col] = pd.to_numeric(df_tot[col], errors="ignore", downcast='integer')
    return df_tot

def tirage_repartition(indice, nb):
    tirage = []
    bond = int((len(indice)-1)/(nb-1))
    for k in range(0, len(indice), bond):
        tirage.append(indice[k])
    
    for element in tirage:
        indice.remove(element)

    return tirage, indice
    
def create_samples(p_train, nb_train, nb_test, nb_verify, seed):
    # Mettre la même seed pour avoir des résultats constants
    np.random.seed(seed)

    # Création des variables utiles
    fichiers_2113 = os.listdir('../HGS/2113/')
    fichiers_2213 = os.listdir('../HGS/2213/')
    fichiers_3113 = os.listdir('../HGS/3113/')
    fichiers_3213 = os.listdir('../HGS/3213/')

    #Shuffle 
    np.random.shuffle(fichiers_2113)
    np.random.shuffle(fichiers_2213)
    np.random.shuffle(fichiers_3113)
    np.random.shuffle(fichiers_3213)

    #Choix des fichiers trains et test (On arrondie au supérieur)
    train_list = {}
    test_list = {}
    train_list["2113"], test_list["2113"] = fichiers_2113[:m.ceil(p_train*len(fichiers_2113))], fichiers_2113[m.ceil(p_train*len(fichiers_2113)):]
    train_list["2213"], test_list["2213"] = fichiers_2213[:m.ceil(p_train*len(fichiers_2213))], fichiers_2213[m.ceil(p_train*len(fichiers_2213)):]
    train_list["3113"], test_list["3113"] = fichiers_3113[:m.ceil(p_train*len(fichiers_3113))], fichiers_3113[m.ceil(p_train*len(fichiers_3113)):]
    train_list["3213"], test_list["3213"] = fichiers_3213[:m.ceil(p_train*len(fichiers_3213))], fichiers_3213[m.ceil(p_train*len(fichiers_3213)):]

    #On créer les trois data_frames
    colonnes = ["Configuration", "Instance", "Solution", "Cost","S1","S2","S3","S4","S5","S6","S7","S8","S9","S10","S11","S12","S13","S14","S15","S16","S17", "S18"]
    lignes_train = []
    lignes_verify = []
    lignes_test = []


    for instance in train_list.keys():
        fichiers_train = train_list[instance]
        fichiers_test = test_list[instance]

        for path_fichier in fichiers_train:
            df = import_dataset(f"../HGS/{instance}/" + path_fichier)
            path_fichier = path_fichier.split("_")
            num = path_fichier[-1].replace(".csv", "")

            #On trie le dataset pour avoir un échantillon représentatif 
            df = df.sort_values(by = "Cost", ascending=False)
            df = df.reset_index().drop(columns=["index"])


            indice = [k for k in range(df.shape[0])]
            tirage_train, indice = tirage_repartition(indice, nb_train)
            tirage_verify, indice = tirage_repartition(indice, nb_verify)

            for i in range(nb_train):
                row_index = tirage_train.pop(0)
                row = list(df.iloc[row_index])
                row.pop(0)
                row = [int(instance), int(num), row_index] + row 
                lignes_train.append(row)
                
            for i in range(nb_verify):
                row_index = tirage_verify.pop(0)
                row = list(df.iloc[row_index])
                row.pop(0)
                row = [int(instance), int(num), row_index] + row 
                lignes_verify.append(row)
                

        for path_fichier in fichiers_test:
            df = import_dataset(f"../HGS/{instance}/" + path_fichier)
            path_fichier = path_fichier.split("_")
            num = path_fichier[-1].replace(".csv", "")

            #On trie le dataset pour avoir un échantillon représentatif 
            df = df.sort_values(by = "Cost", ascending=False)
            df = df.reset_index().drop(columns=["index"])
            indice = [k for k in range(df.shape[0])]
            tirage_test, indice = tirage_repartition(indice, nb_test)
            tirage_verify, indice = tirage_repartition(indice, nb_verify)

            for i in range(nb_test):
                row_index = tirage_test.pop(0)
                row = list(df.iloc[row_index])
                row.pop(0)
                row = [int(instance), int(num), row_index] + row 
                lignes_test.append(row)
            for i in range(nb_verify):
                row_index = tirage_verify.pop(0)
                row = list(df.iloc[row_index])
                row.pop(0)
                row = [int(instance), int(num), row_index] + row 
                lignes_verify.append(row)
        


    #Création des data frames

    df_train = pd.DataFrame(lignes_train, columns=colonnes)
    df_verify = pd.DataFrame(lignes_verify, columns=colonnes)
    df_test = pd.DataFrame(lignes_test, columns=colonnes)

    df_train = df_train.drop(columns=['S7'])
    df_verify = df_verify.drop(columns=['S7'])
    df_test = df_test.drop(columns=['S7'])
    return reduce_size(df_train), reduce_size(df_verify), reduce_size(df_test)


## Datasets and Scaling

In [13]:
df_train, df_verify, df_test = create_samples(0.8, 10000, 1000, 1000, 1)

In [14]:
X_train, Y_train = df_train.drop(columns=["Configuration", "Instance", "Solution", "Cost"]), df_train['Cost']
X_verify, Y_verify = df_verify.drop(columns=["Configuration", "Instance", "Solution", "Cost"]), df_verify['Cost']
X_test, Y_test = df_test.drop(columns=["Configuration", "Instance", "Solution", "Cost"]), df_test['Cost']

In [15]:
from sklearn.preprocessing import StandardScaler

X_train_f = StandardScaler().fit(X_train).transform(X_train)
Y_train_f = StandardScaler().fit(Y_train.values.reshape(-1,1)).transform(Y_train.values.reshape(-1,1))

X_verify_f = StandardScaler().fit(X_verify).transform(X_verify)
Y_verify_f = StandardScaler().fit(Y_verify.values.reshape(-1,1)).transform(Y_verify.values.reshape(-1,1))

X_test_f = StandardScaler().fit(X_test).transform(X_test)
Y_test_f = StandardScaler().fit(Y_test.values.reshape(-1,1)).transform(Y_test.values.reshape(-1,1))

print("Done")

Done


## Model

In [19]:
from sklearn.tree import DecisionTreeRegressor

start = time.time()
regr = DecisionTreeRegressor(random_state=0, max_depth=9, criterion='squared_error')
regr.fit(X_train_f, Y_train_f)
end = time.time()
elapsed = end - start

print("Temps exécution : ", elapsed)


Temps exécution :  9.064994812011719


In [20]:
from sklearn.metrics import mean_absolute_percentage_error 
from sklearn.metrics import mean_absolute_error

#Get the time of execution
start = time.time()


score_mape = mean_absolute_percentage_error(Y_test_f, regr.predict(X_test_f))
score_mae = mean_absolute_error(Y_test_f, regr.predict(X_test_f))


end = time.time()
elapsed = end - start

print("Temps exécution : ", elapsed)
print("MAPE : ", score_mape)
print("MAE : ", score_mae)

Temps exécution :  0.005999326705932617
MAPE :  0.29975145321996705
MAE :  0.20373678260012035


Cross validation

In [21]:
from sklearn.metrics import mean_absolute_percentage_error 
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

#Get the time of execution
start = time.time()

scorer = make_scorer(mean_absolute_percentage_error)
score = cross_val_score(regr, X_train_f, Y_train_f, cv=10, scoring=scorer)


end = time.time()
elapsed = end - start

print("Temps exécution : ", elapsed)
print("%0.2f MAPD with a standard deviation of %0.2f" % (score.mean(), score.std()))


Temps exécution :  10.135998249053955
0.35 MAPD with a standard deviation of 0.47


## Tuning some shit

In [20]:
K = 10 #crossvalidation
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_percentage_error 
from sklearn.metrics import make_scorer

# Paramètres à piper so good
scorer = make_scorer(mean_absolute_percentage_error)
parameters = [{'max_depth': [k for k in range(3,10)], 'criterion': ["squared_error","friedman_mse","absolute_error","poisson"]}]
print("ça tune de la thune")
regr_t = GridSearchCV(DecisionTreeRegressor(), parameters, cv = K, scoring = scorer)
regr_t.fit(X_test_f, Y_test_f)

# On check les scores
print("Grid scores")
means = regr_t.cv_results_['mean_test_score']
stds = regr_t.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, regr_t.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, params))

ça tune de la thune


35 fits failed out of a total of 140.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\kerac\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\kerac\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\tree\_classes.py", line 1342, in fit
    super().fit(
  File "c:\Users\kerac\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\tree\_classes.py", line 185, in fit
    raise ValueError(
ValueError: Some value(s) of y are negative which is not allowed for Poisson regression.

 0.61

Grid scores
0.672 (+/-1.525) for {'criterion': 'squared_error', 'max_depth': 3}
0.611 (+/-1.179) for {'criterion': 'squared_error', 'max_depth': 4}
0.622 (+/-1.251) for {'criterion': 'squared_error', 'max_depth': 5}
0.590 (+/-1.247) for {'criterion': 'squared_error', 'max_depth': 6}
0.593 (+/-1.217) for {'criterion': 'squared_error', 'max_depth': 7}
0.615 (+/-1.245) for {'criterion': 'squared_error', 'max_depth': 8}
0.612 (+/-1.259) for {'criterion': 'squared_error', 'max_depth': 9}
0.672 (+/-1.525) for {'criterion': 'friedman_mse', 'max_depth': 3}
0.611 (+/-1.179) for {'criterion': 'friedman_mse', 'max_depth': 4}
0.599 (+/-1.268) for {'criterion': 'friedman_mse', 'max_depth': 5}
0.618 (+/-1.211) for {'criterion': 'friedman_mse', 'max_depth': 6}
0.575 (+/-1.206) for {'criterion': 'friedman_mse', 'max_depth': 7}
0.606 (+/-1.204) for {'criterion': 'friedman_mse', 'max_depth': 8}
0.575 (+/-1.239) for {'criterion': 'friedman_mse', 'max_depth': 9}
0.824 (+/-1.562) for {'criterion': 'absolut

## HomeMade CrossValidation

In [None]:
def training(seed):
    df_train, df_verify, df_test = create_samples(0.8, 10000, 1000, 1000, seed)
    X_train, Y_train = df_train.drop(columns=["Configuration", "Instance", "Solution", "Cost"]), df_train['Cost']
    X_verify, Y_verify = df_verify.drop(columns=["Configuration", "Instance", "Solution", "Cost"]), df_verify['Cost']
    X_test, Y_test = df_test.drop(columns=["Configuration", "Instance", "Solution", "Cost"]), df_test['Cost']

    X_train_f = StandardScaler().fit(X_train).transform(X_train)
    Y_train_f = StandardScaler().fit(Y_train.values.reshape(-1,1)).transform(Y_train.values.reshape(-1,1))

    X_test_f = StandardScaler().fit(X_test).transform(X_test)
    Y_test_f = StandardScaler().fit(Y_test.values.reshape(-1,1)).transform(Y_test.values.reshape(-1,1))

    regr = DecisionTreeRegressor(random_state=0, max_depth=6, criterion='squared_error')
    regr.fit(X_train_f, Y_train_f)
    score_mape = mean_absolute_percentage_error(Y_test_f, regr.predict(X_test_f))
    score_mae = mean_absolute_error(Y_test_f, regr.predict(X_test_f))


    return score_mape, score_mae



scores_mape, scores_mae = [], []


for seed in range(10):
    score_mape, score_mae = training(seed)
    scores_mape.append(score_mape)
    scores_mae.append(score_mae)



print("Mean scores mape : " , np.mean(scores_mape))
print("Mean scores mae : " , np.mean(scores_mae))
    