## Function to show the results

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

def show_results(ori, pred):
    """Draw the true and the predicted values as two lines on one figure.

    Both series are plotted against the same x positions,
    ``range(len(ori))``, then the figure is shown.

    Args:
        ori: Sequence of true values; its length defines the x range.
        pred: Sequence of predicted values, plotted against that same range.
    """
    sample_positions = range(len(ori))
    # (values, line width, legend label) for each curve, drawn in this order.
    curves = (
        (ori, 1.2, "original"),
        (pred, 1, "predicted"),
    )
    for values, width, legend_label in curves:
        plt.plot(sample_positions, values, linewidth=width, label=legend_label)

    plt.title("y-true and y-predicted data")
    plt.xlabel('X-axis')
    plt.ylabel('Y-axis')
    plt.legend(loc='best', fancybox=True, shadow=True)
    plt.grid(True)
    plt.show()

def show_confusion_matrix(ori, pred):
    """Plot the confusion matrix of the predictions and show the figure.

    Args:
        ori: True labels.
        pred: Predicted labels.
    """
    ConfusionMatrixDisplay.from_predictions(y_true=ori, y_pred=pred)
    plt.show()


Plusieurs algorithmes sont testés. Afin de faciliter les tests, vous pouvez spécifier ici lesquels utiliser.

In [None]:
# Switches selecting which classifiers the following cells run.
# Every model-specific section is guarded by `if <flag>:`, so a model whose
# flag is False is skipped everywhere (fit, grid search and evaluation).
DO_DECISION_TREE = False  # plain decision tree
DO_GRADIENT_BOOSTING = True  # gradient boosting
DO_ADABOOST_WITH_DT = False  # AdaBoost built on a decision tree

## Get the data 

In [None]:
from Preprocessing import getData
# getData() returns the six splits in this order (train / validation / test
# features, then the matching targets).
x_train, x_val, x_test, y_train, y_val, y_test = getData()

# Drop length-1 axes from the splits used for fitting and evaluation.
# The original squeezed x_test twice and never touched y_train; the duplicate
# looked like a copy-paste slip, so y_train is squeezed instead.
# NOTE(review): assumes y_train has the same layout as y_test -- confirm
# against Preprocessing.getData. The validation split is left untouched.
x_train = x_train.squeeze()
y_train = y_train.squeeze()
x_test = x_test.squeeze()
y_test = y_test.squeeze()

## Version brute

In [None]:
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier

def _fit_and_report(label, model):
    """Fit ``model`` on the training split, then print and plot its results.

    Prints ``label``, the score on the training data and the accuracy on the
    test data, then draws the true-vs-predicted plot and the confusion matrix.

    Args:
        label: Text printed before the metrics.
        model: Unfitted estimator exposing ``fit``, ``score`` and ``predict``.

    Returns:
        Tuple ``(training score, test predictions, test accuracy)``.
    """
    print(label)
    model.fit(x_train, y_train)
    train_score = model.score(x_train, y_train)
    print('score:', train_score)

    test_pred = model.predict(x_test)
    test_accuracy = accuracy_score(y_test, test_pred)
    print('accuracy:', test_accuracy)

    show_results(y_test, test_pred)
    # NOTE(review): the original author reported the confusion matrix being
    # rendered twice; the cause is not established here.
    show_confusion_matrix(y_test, test_pred)
    return train_score, test_pred, test_accuracy


# Decision tree with default hyper-parameters
if DO_DECISION_TREE:
    dt = tree.DecisionTreeClassifier()
    score, pred, accuracy = _fit_and_report('decision tree classifier', dt)

# Gradient boosting with default hyper-parameters
if DO_GRADIENT_BOOSTING:
    gs = GradientBoostingClassifier()
    score, pred, accuracy = _fit_and_report('gradient boosting classifier', gs)

# AdaBoost using a default decision tree as its estimator
if DO_ADABOOST_WITH_DT:
    ada_dt = AdaBoostClassifier(tree.DecisionTreeClassifier())
    score, pred, accuracy = _fit_and_report('Adaboost with decision tree', ada_dt)

# not working
# # Adaboost + gb
# if DO_ADABOOST_WITH_GB:
#     print('Adaboost with gradient boosting')
#     ada_gb = AdaBoostClassifier(GradientBoostingClassifier())
#     ada_gb.fit(x_train, y_train)
#     score = ada_gb.score(x_train, y_train)
#     print('score:', score)

#     pred = ada_gb.predict(x_test)
#     accuracy = accuracy_score(y_test, pred)
#     print('accuracy:', accuracy)

#     show_results(y_test, pred)
#     show_confusion_matrix(y_test, pred) # dunno why the confusion matrix appears twice

Maintenant, essayons avec un grid search.

Commençons par installer une pipeline.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# One single-step pipeline and one parameter grid per enabled model.
# Grid keys follow the `<step name>__<parameter>` convention, so the step
# names below must match the key prefixes.

# decision tree classifier
if DO_DECISION_TREE:
    dt_pipe = Pipeline(steps=[
        ('dec_tree', tree.DecisionTreeClassifier()),
    ])

    dt_params = dict(
        dec_tree__criterion=['gini', 'entropy'],
        dec_tree__max_depth=list(range(1, 20)),
    )

# gradient boosting classifier
if DO_GRADIENT_BOOSTING:
    gb_pipe = Pipeline(steps=[
        ('gra_boost', GradientBoostingClassifier()),
    ])

    gb_params = dict(
        gra_boost__criterion=['friedman_mse', 'squared_error'],
        # only depths 1 and 2: it takes a long time to test with higher values
        gra_boost__max_depth=list(range(1, 3)),
    )
    # other settings noted by the author, not searched here:
    # n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0

# AdaBoost on top of a decision tree.
# These names are read by the grid-search cell under this same flag; they were
# previously only present as commented-out code behind a different flag name,
# so enabling DO_ADABOOST_WITH_DT raised a NameError.
if DO_ADABOOST_WITH_DT:
    ada_dt_pipe = Pipeline(steps=[
        # estimator passed positionally, as in the un-tuned AdaBoost run
        ('ada_dt', AdaBoostClassifier(tree.DecisionTreeClassifier())),
    ])

    ada_dt_params = dict(
        # NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn
        # releases -- confirm against the installed version.
        ada_dt__algorithm=['SAMME', 'SAMME.R'],
    )


Et maintenant faisons les grid search

Si vous ne voulez pas les faire, vous pouvez sauter cette étape et lancer le shortcut à la place

In [None]:
# Run one grid search per enabled model, then report the hyper-parameters of
# the best estimator it found.

# decision tree grid search
if DO_DECISION_TREE:
    gs_dt = GridSearchCV(estimator=dt_pipe, param_grid=dt_params)
    gs_dt.fit(x_train, y_train)

    best_dt_settings = gs_dt.best_estimator_.get_params()
    dec_tree__criterion = best_dt_settings['dec_tree__criterion']
    dec_tree__max_depth = best_dt_settings['dec_tree__max_depth']

    print('decision tree classifier')
    print('Best Criterion:', dec_tree__criterion)
    print('Best max_depth:', dec_tree__max_depth)


# gradient boosting grid search
if DO_GRADIENT_BOOSTING:
    gs_gb = GridSearchCV(estimator=gb_pipe, param_grid=gb_params)
    gs_gb.fit(x_train, y_train)

    best_gb_settings = gs_gb.best_estimator_.get_params()
    gra_boost__criterion = best_gb_settings['gra_boost__criterion']
    gra_boost__max_depth = best_gb_settings['gra_boost__max_depth']

    print('gradient boosting classifier')
    print('Best Criterion:', gra_boost__criterion)
    print('Best max_depth:', gra_boost__max_depth)


# AdaBoost + decision tree grid search
if DO_ADABOOST_WITH_DT:
    gs_ada_dt = GridSearchCV(estimator=ada_dt_pipe, param_grid=ada_dt_params)
    gs_ada_dt.fit(x_train, y_train)

    best_ada_settings = gs_ada_dt.best_estimator_.get_params()
    ada_dt__algorithm = best_ada_settings['ada_dt__algorithm']

    print('Ada boost + decision tree classifier')
    print('Best Algorithm:', ada_dt__algorithm)

Shortcut (décommenter pour l'utiliser)

In [None]:
# # the gs_dt is not a gridsearch, only a decision tree classifier
# # the name is used to keep a consistent naming for later code, when taking this shortcut
# if DO_DECISION_TREE:
#     dec_tree__criterion = 'gini'
#     dec_tree__max_depth = 3
#     gs_dt = tree.DecisionTreeClassifier(criterion=dec_tree__criterion, max_depth=dec_tree__max_depth)
#     gs_dt.fit(x_train, y_train)

# # the gs_gb is not a gridsearch, only a gradient boosting classifier
# # the name is used to keep a consistent naming for later code, when taking this shortcut
# if DO_GRADIENT_BOOSTING:
#     gra_boost__criterion = 'friedman_mse'
#     gra_boost__max_depth = 1

#     gs_gb = GradientBoostingClassifier(criterion=gra_boost__criterion, max_depth=gra_boost__max_depth)
#     gs_gb.fit(x_train, y_train)

In [None]:
def _report_test_performance(label, model, note=None):
    """Print the metrics of a fitted model and plot its test predictions.

    Prints ``label``, the model's score on the training split and its accuracy
    on the test split, then draws the true-vs-predicted plot and the confusion
    matrix.

    Args:
        label: Text printed before the metrics.
        model: Fitted object exposing ``score`` and ``predict``.
        note: Optional extra line printed after the accuracy, before the plots.

    Returns:
        Tuple ``(test predictions, test accuracy)``.
    """
    print(label)
    print('score:', model.score(x_train, y_train))
    predictions = model.predict(x_test)
    test_accuracy = accuracy_score(y_test, predictions)

    # Same 'accuracy:' label for every model (one section used 'accuracy::').
    print('accuracy:', test_accuracy)
    if note is not None:
        print(note)
    show_results(y_test, predictions)
    show_confusion_matrix(y_test, predictions)
    return predictions, test_accuracy


# y_pred and accuracy stay assigned at top level, as before.
if DO_DECISION_TREE:
    y_pred, accuracy = _report_test_performance(
        'decision tree classifier',
        gs_dt,
        # open question left by the author, still printed with the results
        note='dunno why it is scaled even if we removed it in the pipeline',
    )

if DO_GRADIENT_BOOSTING:
    y_pred, accuracy = _report_test_performance('gradient boosting classifier', gs_gb)

if DO_ADABOOST_WITH_DT:
    y_pred, accuracy = _report_test_performance('Ada boost + decision tree classifier', gs_ada_dt)