# Decision trees for facial emotion classification

## Fetch the data

In [None]:
import os
import pandas as pd

In [None]:
# Display the current working directory as a sanity check.
# NOTE(review): the relative data paths defined below presumably expect the
# working directory to be the project root once the next cell has run — confirm.
os.getcwd()     # should end up being the root of the project

In [None]:
# The data paths used below are relative to the project root, which is
# presumably two levels above the notebook's folder. Only move up when the
# data folder is not reachable yet, so that re-running this cell does not
# climb two more directories each time.
if not os.path.isdir("CK+_lands"):
    os.chdir("../../")

In [None]:
# Directory holding one sub-directory per subject; each sub-directory is
# expected to contain an "omlands.csv" landmark file (see load_last_faces).
PATH_TO_DATA = "CK+_lands/CK+_centered"
# ';'-separated CSV with the labels; it is merged with the landmark data on
# its "subject" and "file" columns.
PATH_TO_LABEL = "CK+_lands/CK+/emotion.csv"

In [None]:
def load_last_faces(data, labels, only_labelled=True):
    """Load the last landmark frame of every sequence and attach its label.

    ``data`` is a directory holding one sub-directory per subject. Each of
    them must contain an ``omlands.csv`` file (``;``-separated, no header)
    whose first column identifies the sequence a row belongs to and whose
    remaining columns are the coordinates of one frame. For every run of
    consecutive rows sharing the same identifier, only the last row is kept.

    Args:
        data: Path of the directory containing the per-subject folders.
        labels: Path of the ``;``-separated label CSV. It must provide the
            ``subject`` and ``file`` columns, which are used as merge keys.
        only_labelled: When true (default), keep only the sequences that
            have a label (inner join); otherwise keep every sequence and
            every label (outer join).

    Returns:
        A DataFrame with the columns ``subject``, ``file``, the coordinates
        named ``"0"``, ``"1"``, ... and the columns coming from ``labels``.

    Raises:
        ValueError: If no landmark row could be read from ``data``.
    """
    label_df = pd.read_csv(labels, delimiter=";")
    frames = []

    # Sorted so that the row order does not depend on the file system.
    for visage_dir in sorted(os.listdir(data)):
        subject_path = os.path.join(data, visage_dir)
        if not os.path.isdir(subject_path):
            # Stray files (e.g. ".DS_Store") are not subjects.
            continue

        visage_data = pd.read_csv(
            os.path.join(subject_path, "omlands.csv"), delimiter=";", header=None
        )
        if visage_data.empty:
            continue

        ids = visage_data.iloc[:, 0]
        # A row closes its sequence when the following row carries another
        # identifier. The shifted series ends with NaN, so the very last row
        # of the file is always selected as the end of the final sequence.
        is_last = (ids != ids.shift(-1)).to_numpy()
        last_rows = visage_data.loc[is_last]

        subject_faces = last_rows.iloc[:, 1:].copy()
        # Coordinates are exposed under string names "0", "1", ...
        subject_faces.columns = [str(i) for i in range(subject_faces.shape[1])]
        subject_faces.insert(0, "file", last_rows.iloc[:, 0].to_numpy())
        subject_faces.insert(0, "subject", visage_dir)
        frames.append(subject_faces)

    if not frames:
        raise ValueError(f"no landmark data found in {data!r}")

    # One concatenation at the end instead of growing a DataFrame row by row.
    last_faces = pd.concat(frames, ignore_index=True)

    return pd.merge(
        last_faces,
        label_df,
        on=["subject", "file"],
        how="inner" if only_labelled else "outer",
    )

In [None]:
# Build the working table: one row per sequence, restricted to the sequences
# that have a matching entry in the label file.
data = load_last_faces(data=PATH_TO_DATA, labels=PATH_TO_LABEL, only_labelled=True)

In [None]:
# Preview the coordinate columns, selected by label from "0" through "135".
data.loc[:, slice("0", "135")]

In [None]:
# Rows belonging to subject "S005". A boolean mask keeps the column dtypes
# intact and also shows matching rows that contain a missing value.
data[data["subject"] == "S005"]

## Training 

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split

In [None]:
import numpy as np

In [None]:
# Hold out part of the samples for testing (scikit-learn's default split
# size). The seed is fixed so that the scores and the grid search computed in
# the following cells stay comparable from one run of the notebook to the next.
X_train, X_test, y_train, y_test = train_test_split(
    data.loc[:, "0":"135"],
    data["emotion"],
    random_state=0,
)

In [None]:
# Shallow decision tree (depth capped at 3) fitted on the training split.
tree = DecisionTreeClassifier(max_depth=3)
tree = tree.fit(X_train, y_train)

In [None]:
# Score of the decision tree on the samples it was fitted on.
tree.score(X=X_train, y=y_train)

In [None]:
# Score of the decision tree on the held-out samples.
tree.score(X=X_test, y=y_test)

In [None]:
# Draw the fitted decision tree.
plot_tree(decision_tree=tree)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Forest of 1000 depth-1 trees fitted on the training split.
rd_forest = RandomForestClassifier(n_estimators=1000, max_depth=1)
rd_forest = rd_forest.fit(X_train, y_train)

In [None]:
# Score of the random forest on the samples it was fitted on.
rd_forest.score(X=X_train, y=y_train)

In [None]:
# Score of the random forest on the held-out samples.
rd_forest.score(X=X_test, y=y_test)

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Mean cross-validated score of a depth-5, 1000-tree random forest on the
# whole table (default folds and scoring).
cross_val_score(
    estimator=RandomForestClassifier(n_estimators=1000, max_depth=5),
    X=data.loc[:, "0":"135"],
    y=data["emotion"].to_numpy(),
).mean()

In [None]:
# Mean cross-validated score of an unconstrained decision tree on the whole
# table (default folds and scoring).
cross_val_score(
    estimator=DecisionTreeClassifier(),
    X=data.loc[:, "0":"135"],
    y=data["emotion"].to_numpy(),
).mean()

In [None]:
# Chance level of a uniform random guess over the emotion classes, expressed
# as a fraction so it can be compared directly with the scores computed in
# the other cells. ``nunique`` ignores missing labels, which are not a class.
1 / data["emotion"].nunique()

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Fit a standard scaler on the coordinate columns of the whole table.
scaler = StandardScaler()
scaler = scaler.fit(data.loc[:, "0":"135"])

In [None]:
# Coordinate columns as they are before the scaler is applied.
data.loc[:, slice("0", "135")]

In [None]:
# Overwrite the coordinate columns of ``data`` with their scaled values.
# NOTE(review): this mutates ``data`` in place; running the cell a second
# time would apply the same fitted scaler to the already-scaled values.
# X_train / X_test were split in an earlier cell — presumably they still hold
# the unscaled values; verify before comparing scores.
data.loc[:, "0":"135"] = scaler.transform(data.loc[:, "0":"135"])

In [None]:
# Coordinate columns once the scaler has been applied.
data.loc[:, slice("0", "135")]

In [None]:
# Same random-forest cross-validation as before, now on the scaled columns.
cross_val_score(
    estimator=RandomForestClassifier(n_estimators=1000, max_depth=5),
    X=data.loc[:, "0":"135"],
    y=data["emotion"].to_numpy(),
).mean()

In [None]:
# Same decision-tree cross-validation as before, now on the scaled columns.
cross_val_score(
    estimator=DecisionTreeClassifier(),
    X=data.loc[:, "0":"135"],
    y=data["emotion"].to_numpy(),
).mean()

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid explored for the random forest: tree depth from 2 to
# 10, both split criteria, and three forest sizes.
param_grid = dict(
    max_depth=np.arange(2, 11),
    criterion=["gini", "entropy"],
    n_estimators=[100, 500, 1000],
)

# Exhaustive 5-fold search over the grid, run on the training split only.
grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [None]:
# Display the estimator selected by the grid search.
grid_search.best_estimator_

In [None]:
# Rebuild the forest from the hyper-parameters selected by the grid search
# rather than copying them by hand, so this cell cannot drift out of sync
# when the search is re-run and picks different values.
best_rf = RandomForestClassifier(**grid_search.best_params_)
cross_val_score(best_rf, data.loc[:, "0":"135"], data["emotion"].to_numpy()).mean()

## Trying AdaBoost (after seeing it explained on StatQuest)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Mean cross-validated score of AdaBoost with its default settings on the
# whole table (default folds and scoring).
cross_val_score(
    estimator=AdaBoostClassifier(),
    X=data.loc[:, "0":"135"],
    y=data["emotion"].to_numpy(),
).mean()