In [1]:
## Kaggle Competition 2 - Simo Hakim - 20096040

ratio = 0.8
###################### -- Imports -- ######################
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
import xgboost as xgb

import threading
np.random.seed(42)
np.set_printoptions(precision=2, suppress=True)
###################### -- Helper Functions -- ######################

### Pour sauver mes prédictions :
def save_predictions_to_csv(filename, ids, predictions):
    """Write ids and their predictions to ``filename`` as a two-column CSV.

    The file starts with the header line ``id,label`` and is followed by one
    ``<id>,<prediction>`` line per pair. Pairs are formed with ``zip``, so
    surplus items in the longer of the two sequences are ignored. An existing
    file with the same name is overwritten.
    """
    with open(filename, 'w') as out:
        out.write("id,label\n")
        out.writelines(
            f"{sample_id},{label}\n" for sample_id, label in zip(ids, predictions)
        )

### Pour plotter mes resultats :
def plot_results_XGB(results):
    """Plot accuracy against tree depth, drawing one line per learning rate.

    Args:
        results: iterable of ``(learning_rate, max_depth, accuracy)`` tuples,
            in any order.

    The learning rates are plotted in ascending order and each line's points
    are ordered by depth, so the legend and the line shapes do not depend on
    the order in which the results were produced.
    """
    results = list(results)

    plt.figure(figsize=(10, 6))
    for lr in sorted({row[0] for row in results}):
        # (depth, accuracy) pairs for this learning rate, ordered by depth so the
        # line is drawn left to right instead of zigzagging.
        series = sorted((row[1], row[2]) for row in results if row[0] == lr)
        depths = [depth for depth, _ in series]
        accuracies = [acc for _, acc in series]
        plt.plot(depths, accuracies, label=f'Learning Rate {lr}')

    plt.title('Accuracy for different max depths and learning rates')
    plt.xlabel('Max Depth')
    plt.ylabel('Accuracy')
    if results:
        # Only request a legend when at least one line was drawn.
        plt.legend()
    plt.show()

###################### -- Data Handling -- ######################

# Load the raw CSV files (the header row of each file is skipped)
train = np.genfromtxt('sign_mnist_train.csv', skip_header=1, delimiter=',')
test = np.genfromtxt('test.csv', skip_header=1, delimiter=',')

# Training labels are stored in column 0; the remaining columns are the pixels
train_labels = train[:, 0]
train_pixels = train[:, 1:].copy()  # independent copy without the label column

# One-hot encoding of the labels (intended for logistic regression):
# column i is 1 where the label equals the i-th distinct label value, else 0
unique_labels = np.unique(train_labels)
train_labels_one_hot = (train_labels[:, np.newaxis] == unique_labels).astype(float)

# Each test row carries two images side by side; column 0 is skipped
# (presumably the sample id - confirm against test.csv)
test_pixels_a = test[:, 1:785]  # columns 1..784 -> image A
test_pixels_b = test[:, 785:]   # columns 785 onwards -> image B

# Standardise the training pixels column by column (zero mean, unit variance)
train_mean = train_pixels.mean(axis=0)
train_std = train_pixels.std(axis=0)
# A pixel that never varies over the training set has a standard deviation of 0;
# dividing by it would produce NaN/inf in both the train and the test features.
# Using 1 instead leaves such columns at exactly 0 after centering.
train_std[train_std == 0] = 1.0
train_pixels_normalized = (train_pixels - train_mean) / train_std

# Split the normalised training data into a training part and a validation part.
# Rows are taken in file order (no shuffling): the first `ratio` share is used
# for training and the remainder for validation.
split_ratio = int(len(train_pixels_normalized) * ratio)
split_train_pixels, split_validation_pixels = (
    train_pixels_normalized[:split_ratio],
    train_pixels_normalized[split_ratio:],
)
split_train_labels, split_validation_labels = (
    train_labels[:split_ratio],
    train_labels[split_ratio:],
)
split_train_labels_one_hot, split_validation_labels_one_hot = (
    train_labels_one_hot[:split_ratio],
    train_labels_one_hot[split_ratio:],
)
# Column-wise split of the validation pixels at index 784 into an "A" and a "B" part.
# NOTE(review): the training pixels share their per-column statistics with the
# 784-column test image A, so they appear to be 784 columns wide; in that case the
# "B" part below is empty. Confirm before relying on these two arrays.
split_validation_pixels_a, split_validation_pixels_b = (
    split_validation_pixels[:, :784],
    split_validation_pixels[:, 784:],
)

# Normalise both test images with the statistics computed on the training set
test_pixels_a_normalized, test_pixels_b_normalized = (
    (pixels - train_mean) / train_std for pixels in (test_pixels_a, test_pixels_b)
)


###################### -- Import XGBoost  -- ######################
# Implémentation de XGBoost
class XGBoostClassifier:
    """Small wrapper around ``xgb.train`` for multi-class classification.

    The booster is configured with the ``multi:softmax`` objective, the ``dart``
    booster and the ``merror`` evaluation metric.
    """

    def __init__(self, max_depth, eta, num_class):
        """Store the booster parameters; no model is trained yet.

        Args:
            max_depth: maximum tree depth passed to XGBoost.
            eta: learning rate passed to XGBoost.
            num_class: number of classes for the softmax objective.
        """
        self.params = {
            'objective': 'multi:softmax',
            'num_class': num_class,
            'booster': 'dart',
            'eval_metric': 'merror',
            'eta': eta,
            'max_depth': max_depth,
        }
        # Set by train(); stays None until then so misuse can be reported clearly.
        self.model = None

    def train(self, train_data, train_labels, validation_data, validation_labels,
              num_boost_round=200, early_stopping_rounds=20):
        """Fit the booster and keep it in ``self.model``.

        Args:
            train_data, train_labels: features and class indices used for fitting.
            validation_data, validation_labels: held-out data evaluated every round.
            num_boost_round: maximum number of boosting rounds (default 200).
            early_stopping_rounds: stop when the validation metric has not improved
                for this many rounds (default 20).
        """
        dtrain = xgb.DMatrix(train_data, label=train_labels)
        dval = xgb.DMatrix(validation_data, label=validation_labels)
        # The validation set is listed last: with several evaluation sets XGBoost
        # uses the last one for early stopping.
        watchlist = [(dtrain, 'train'), (dval, 'validation')]
        self.model = xgb.train(
            self.params,
            dtrain,
            num_boost_round=num_boost_round,
            evals=watchlist,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=False,
        )

    def compute_predictions(self, data_a, data_b):
        """Predict the class of every image in A and B and merge each pair into one character.

        Args:
            data_a: feature rows of the first image of every pair.
            data_b: feature rows of the second image of every pair.

        Returns:
            A list with one single-character string per pair.

        Raises:
            RuntimeError: if ``train`` has not been called yet.
        """
        if self.model is None:
            raise RuntimeError("compute_predictions() called before train()")
        preds_a = self.model.predict(xgb.DMatrix(data_a))
        preds_b = self.model.predict(xgb.DMatrix(data_b))
        return self._combine_ascii(preds_a, preds_b)

    @staticmethod
    def _combine_ascii(preds_a, preds_b):
        """Turn two arrays of class indices into one character per pair."""
        # Class index -> ASCII code of the upper-case letter (index 0 -> 65, 'A').
        ascii_sum = (np.asarray(preds_a) + 65) + (np.asarray(preds_b) + 65)
        # Sums above 122 ('z') are shifted down by 57.
        # NOTE(review): the original comment justified 57 as "122 - 65 + 1", which
        # is 58; confirm the intended offset against the competition rules.
        ascii_sum_adjusted = np.where(ascii_sum > 122, ascii_sum - 57, ascii_sum)
        return [chr(int(val)) for val in ascii_sum_adjusted]

    def compute_accuracy(self, preds, labels):
        """Return the fraction of positions where ``preds`` equals ``labels``.

        Both arguments are converted to arrays first, so plain Python lists are
        compared element by element rather than as whole objects.

        Raises:
            ValueError: if the two inputs do not have the same shape.
        """
        preds = np.asarray(preds)
        labels = np.asarray(labels)
        if preds.shape != labels.shape:
            raise ValueError(
                f"preds and labels must have the same shape, got {preds.shape} and {labels.shape}"
            )
        return np.mean(preds == labels)

###################### -- Code d'entrainement XGBoost -- ######################
def startXGBOOST(learning_rates=None, max_depths=None):
    """Grid-search XGBoost settings, then predict the test set with the best one.

    Every (learning rate, max depth) combination is trained in its own thread on
    the module-level training split and scored on the validation split by
    single-image classification accuracy. The best combination is retrained and
    used to predict the test image pairs; the merged characters are written to
    ``xgb_predictions_lr<lr>_depth<depth>.csv`` and the accuracies are plotted.

    Args:
        learning_rates: learning rates to try; defaults to ``[0.1]``.
        max_depths: tree depths to try; defaults to ``[3]``.

    Raises:
        RuntimeError: if no configuration finished training.
    """
    if learning_rates is None:
        learning_rates = [0.1]
    if max_depths is None:
        max_depths = [3]
    results_XGB = []

    def train(lr, depth):
        print(f"Training with learning rate: {lr}, max_depth: {depth}\n")
        xgb_model = XGBoostClassifier(max_depth=depth, eta=lr, num_class=26)  # 26 classes (A-Z, excluding J and Z)

        # Fit on the training split; the validation split drives early stopping
        xgb_model.train(split_train_pixels, split_train_labels, split_validation_pixels, split_validation_labels)

        # The validation rows are single labelled images, so the model is scored on
        # plain class accuracy rather than on merged A/B characters.
        val_preds = xgb_model.model.predict(xgb.DMatrix(split_validation_pixels))
        acc = xgb_model.compute_accuracy(val_preds, split_validation_labels)

        print(f"Accuracy for learning rate: {lr}, max_depth: {depth} = {acc}\n")
        # Workers only append here; the best result is picked after all threads
        # have joined, so no shared "best so far" state is updated concurrently.
        results_XGB.append((lr, depth, acc))

    threads = [
        threading.Thread(target=train, args=(lr, depth))
        for lr in learning_rates
        for depth in max_depths
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    if not results_XGB:
        # An exception inside a worker thread does not propagate to this thread.
        raise RuntimeError("No XGBoost configuration finished training; check the worker thread errors above.")

    results_XGB.sort(key=lambda x: (x[0], x[1]))
    # max() keeps the first maximum, so ties resolve to the smallest (lr, depth).
    best_lr, best_depth, best_acc_xgb = max(results_XGB, key=lambda x: x[2])
    print(f"Best accuracy: {best_acc_xgb} with learning rate: {best_lr}, max_depth: {best_depth}\n")

    best_xgb_model = XGBoostClassifier(max_depth=best_depth, eta=best_lr, num_class=26)
    best_xgb_model.train(split_train_pixels, split_train_labels, split_validation_pixels, split_validation_labels)

    # compute_predictions takes both images and already returns the merged characters
    test_final_preds = best_xgb_model.compute_predictions(test_pixels_a_normalized, test_pixels_b_normalized)

    # NOTE(review): column 0 of the test file is assumed to hold the sample ids
    # (the pixel columns start at index 1) - confirm against test.csv.
    test_ids = test[:, 0].astype(int)

    save_predictions_to_csv(f"xgb_predictions_lr{best_lr}_depth{best_depth}.csv", test_ids, test_final_preds)

    plot_results_XGB(results_XGB)

In [None]:
startXGBOOST()