# Partie 3 - Comparaison avec Scikit-Learn

Ce notebook compare notre implémentation "from scratch" avec l'implémentation de sklearn.

**Contenu:**
- Entraînement avec DecisionTreeClassifier de sklearn
- Comparaison des performances
- Visualisation de l'arbre sklearn
- Analyse des résultats

**Auteur**: Projet Data Mining - Arbres de Decision

## 1. Importation des bibliotheques

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Global matplotlib defaults applied to every figure created afterwards
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

## 2. Chargement des donnees

In [None]:
# Load the credit dataset directly from the raw GitHub URL
base_url = 'https://raw.githubusercontent.com/NassimZahri/Data_Mining/main/data/'
dataset_url = base_url + 'credit_simple.csv'
df = pd.read_csv(dataset_url)

# Quick sanity check: overall shape, then class balance of the 'defaut' target
dataset_shape = df.shape
print("Dimensions du dataset:", dataset_shape)
print("\nDistribution de la variable cible:")
target_distribution = df['defaut'].value_counts()
print(target_distribution)

# Last expression of the cell, so the notebook renders the first rows
df.head()

In [None]:
# Data preparation
# One-hot encode the predictors (every column except the 'defaut' target).
X = pd.get_dummies(df.drop('defaut', axis=1))

# Encode the target as integers: 'oui' -> 1, 'non' -> 0.
y = df['defaut'].map({'oui': 1, 'non': 0})

# Series.map yields NaN for any label missing from the mapping. Fail here with
# a clear message instead of letting NaN targets break fitting/scoring later.
unmapped_labels = df.loc[y.isna(), 'defaut'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        "Valeurs inattendues dans 'defaut' (attendu: 'oui' ou 'non'): "
        f"{list(unmapped_labels)}"
    )

# Train/test split (70% / 30%); random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print(f"Taille ensemble d'entrainement: {len(X_train)}")
print(f"Taille ensemble de test: {len(X_test)}")

## 3. Notre implementation "From Scratch"

Réimplémentation des fonctions du notebook précédent, reprises ici pour la comparaison avec sklearn.

In [None]:
# Classes et fonctions de notre implementation
class Node:
    """A node of the decision tree: either an internal split or a leaf.

    Internal nodes carry ``feature`` / ``threshold`` and two children
    (``left`` / ``right``); leaves carry ``value``, the class to predict.
    ``samples`` and ``impurity`` are stored alongside for inspection.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None,
                 value=None, samples=0, impurity=0.0):
        # Split description (stays None on leaves)
        self.feature, self.threshold = feature, threshold
        # Child subtrees
        self.left, self.right = left, right
        # Prediction stored on leaves (stays None on internal nodes)
        self.value = value
        # Bookkeeping supplied by whoever builds the node
        self.samples, self.impurity = samples, impurity

    def is_leaf(self):
        """Return True when the node holds a prediction (``value`` is set)."""
        has_prediction = self.value is not None
        return has_prediction


def gini(y):
    """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels in ``y``.

    ``p_k`` is the share of each distinct label. An empty collection is
    treated as pure and yields 0.
    """
    total = len(y)
    if not total:
        return 0
    label_counts = Counter(y)
    return 1 - sum((count / total) ** 2 for count in label_counts.values())


def best_split(X, y, min_samples_leaf=1):
    """Search every feature/threshold pair for the largest Gini gain.

    Each distinct value of each column of ``X`` is tried as a threshold:
    rows with ``value <= threshold`` go left, rows with ``value > threshold``
    go right.

    Args:
        X: pandas DataFrame of features.
        y: pandas Series of labels, index-aligned with ``X``.
        min_samples_leaf: Minimum number of rows each side of a split must
            keep. The default of 1 only rejects splits with an empty side.

    Returns:
        Tuple ``(feature, threshold, gain)``. ``feature`` and ``threshold``
        are ``None`` and ``gain`` is 0 when no admissible split strictly
        reduces the impurity.
    """
    best_gain = 0
    best_feature = None
    best_threshold = None
    parent_impurity = gini(y)
    n_samples = len(y)
    # A side with no rows is never a usable split, whatever the caller passes.
    min_side = max(1, min_samples_leaf)

    for feature in X.columns:
        column = X[feature]  # looked up once per feature, not once per threshold
        for threshold in sorted(column.unique()):
            left_y = y[column <= threshold]
            right_y = y[column > threshold]

            # Skip candidates that would leave a child below the minimum size.
            if len(left_y) < min_side or len(right_y) < min_side:
                continue

            # Children impurity, weighted by the share of rows sent to each side.
            w_left = len(left_y) / n_samples
            w_right = len(right_y) / n_samples
            child_impurity = w_left * gini(left_y) + w_right * gini(right_y)
            gain = parent_impurity - child_impurity

            # Strict comparison: on ties the first candidate found is kept.
            if gain > best_gain:
                best_gain = gain
                best_feature = feature
                best_threshold = threshold

    return best_feature, best_threshold, best_gain


def build_tree(X, y, depth=0, max_depth=3, min_samples_leaf=1):
    """Recursively grow a binary decision tree using the Gini criterion.

    Args:
        X: pandas DataFrame of features.
        y: pandas Series of labels, index-aligned with ``X``.
        depth: Depth of the node being built (0 for the root).
        max_depth: Depth at which growth stops.
        min_samples_leaf: Minimum number of rows allowed in a leaf. It is
            forwarded to ``best_split`` so that no accepted split creates a
            child smaller than this.

    Returns:
        The root ``Node`` of the (sub)tree built from ``X`` / ``y``.
    """
    n_samples = len(y)
    n_classes = len(set(y))
    impurity = gini(y)

    def make_leaf():
        # Leaves predict the most frequent label; Series.mode() returns the
        # modes sorted, so the smallest label wins when frequencies tie.
        return Node(value=y.mode()[0], samples=n_samples, impurity=impurity)

    # Stop when the node is pure, the depth limit is reached, or the node is
    # too small to be cut into two children of the minimum size.
    if n_classes == 1 or depth >= max_depth or n_samples < min_samples_leaf * 2:
        return make_leaf()

    feature, threshold, gain = best_split(X, y, min_samples_leaf)

    # No admissible split improves the impurity: keep the node as a leaf.
    if feature is None or gain <= 0:
        return make_leaf()

    left_mask = X[feature] <= threshold
    right_mask = X[feature] > threshold

    left_subtree = build_tree(X[left_mask], y[left_mask], depth + 1, max_depth, min_samples_leaf)
    right_subtree = build_tree(X[right_mask], y[right_mask], depth + 1, max_depth, min_samples_leaf)

    return Node(feature=feature, threshold=threshold, left=left_subtree,
                right=right_subtree, samples=n_samples, impurity=impurity)


def predict_one(x, node):
    """Return the class predicted for a single sample ``x``.

    Starting at ``node``, follow the left child when the sample's value for
    the node's feature is ``<=`` the node's threshold and the right child
    otherwise, until a leaf is reached; the leaf's ``value`` is returned.
    ``x`` only needs to support lookup by feature name.
    """
    current = node
    while not current.is_leaf():
        goes_left = x[current.feature] <= current.threshold
        current = current.left if goes_left else current.right
    return current.value


def predict(X, tree):
    """Return the list of classes predicted by ``tree`` for each row of ``X``.

    Rows are visited in order with ``DataFrame.iterrows()`` rather than looked
    up by index label: a label lookup (``X.loc[idx]``) returns a whole
    DataFrame when the index contains duplicate labels, which breaks the
    scalar comparison made in ``predict_one``.

    Args:
        X: pandas DataFrame with one row per sample.
        tree: Root node of a fitted tree, as accepted by ``predict_one``.

    Returns:
        A list with one prediction per row of ``X``, in row order.
    """
    return [predict_one(row, tree) for _, row in X.iterrows()]

## 4. Entrainement et comparaison

In [None]:
# Fit the from-scratch tree, then score it on the training and test splits
print("Entrainement de notre arbre 'from scratch'...")
our_tree = build_tree(X=X_train, y=y_train, max_depth=3)

# Predictions for the rows used for fitting and for the held-out rows
our_predictions_train = predict(X=X_train, tree=our_tree)
our_predictions_test = predict(X=X_test, tree=our_tree)

# Share of correctly classified rows on each split
our_acc_train = accuracy_score(y_true=y_train, y_pred=our_predictions_train)
our_acc_test = accuracy_score(y_true=y_test, y_pred=our_predictions_test)

print(f"  Accuracy (train): {our_acc_train:.2%}")
print(f"  Accuracy (test): {our_acc_test:.2%}")

In [None]:
# Reference model: scikit-learn's decision tree, Gini criterion, depth capped at 3
print("\nEntrainement de l'arbre sklearn...")
sklearn_tree = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=42)
sklearn_tree.fit(X=X_train, y=y_train)

# Predictions for the rows used for fitting and for the held-out rows
sklearn_predictions_train = sklearn_tree.predict(X=X_train)
sklearn_predictions_test = sklearn_tree.predict(X=X_test)

# Share of correctly classified rows on each split
sklearn_acc_train = accuracy_score(y_true=y_train, y_pred=sklearn_predictions_train)
sklearn_acc_test = accuracy_score(y_true=y_test, y_pred=sklearn_predictions_test)

print(f"  Accuracy (train): {sklearn_acc_train:.2%}")
print(f"  Accuracy (test): {sklearn_acc_test:.2%}")

## 5. Tableau comparatif

In [None]:
# Side-by-side accuracy table: one row per metric, one column per implementation
our_scores = [f'{our_acc_train:.2%}', f'{our_acc_test:.2%}']
sklearn_scores = [f'{sklearn_acc_train:.2%}', f'{sklearn_acc_test:.2%}']

comparison_data = {
    'Metrique': ['Accuracy (Train)', 'Accuracy (Test)'],
    'Notre Implementation': our_scores,
    'Sklearn': sklearn_scores,
}
comparison_df = pd.DataFrame(comparison_data)

print("Comparaison des performances:")
print("=" * 60)

# Last expression of the cell, so the notebook renders the table
comparison_df

## 6. Matrices de confusion

In [None]:
# Confusion matrices of both models on the test set, drawn side by side
class_ids = [0, 1]            # encoded target values: 0 = 'non', 1 = 'oui'
tick_names = ['Non', 'Oui']   # display names, in the same order as class_ids

# Passing labels explicitly keeps both matrices 2x2 even when a class is
# missing from the test labels and predictions; without it the matrix shrinks
# and the cell annotations below would index out of bounds.
cm_our = confusion_matrix(y_test, our_predictions_test, labels=class_ids)
cm_sklearn = confusion_matrix(y_test, sklearn_predictions_test, labels=class_ids)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# One panel per model: (matrix, panel title, colour map)
panels = [
    (cm_our, 'Notre Implementation', 'Blues'),
    (cm_sklearn, 'Sklearn', 'Greens'),
]
for ax, (cm, title, cmap) in zip(axes, panels):
    ax.imshow(cm, cmap=cmap)
    ax.set_title(title)
    ax.set_xlabel('Prediction')
    ax.set_ylabel('Vraie valeur')
    # Write each count in the middle of its cell
    # (row i = true class, column j = predicted class)
    for i, row in enumerate(cm):
        for j, count in enumerate(row):
            ax.text(j, i, str(count), ha='center', va='center', fontsize=20)
    ax.set_xticks(class_ids)
    ax.set_yticks(class_ids)
    ax.set_xticklabels(tick_names)
    ax.set_yticklabels(tick_names)

plt.tight_layout()
plt.show()

## 7. Visualisation de l'arbre sklearn

In [None]:
# Draw the fitted sklearn tree as a diagram
tree_plot_options = {
    'feature_names': X.columns.tolist(),
    'class_names': ['Non', 'Oui'],
    'filled': True,
    'rounded': True,
    'fontsize': 10,
}

plt.figure(figsize=(20, 10))
plot_tree(sklearn_tree, **tree_plot_options)
plt.title('Arbre de Decision (sklearn)')
plt.tight_layout()
plt.show()

In [None]:
# Text rendering of the sklearn tree's decision rules
print("Structure de l'arbre sklearn:")
print("=" * 60)
column_names = X.columns.tolist()
tree_rules = export_text(sklearn_tree, feature_names=column_names)
print(tree_rules)

## 8. Rapport de classification detaille

In [None]:
# Per-class precision / recall / F1 of the sklearn tree on the test set
print("Rapport de classification (sklearn sur ensemble de test):")
print("=" * 60)
# labels pins the report to the two encoded classes (0 = 'non', 1 = 'oui') so
# that target_names always lines up; without it the call raises ValueError
# when only one class occurs in the test labels and predictions.
print(classification_report(
    y_test,
    sklearn_predictions_test,
    labels=[0, 1],
    target_names=['Non', 'Oui'],
))

## 9. Importance des features

In [None]:
# Feature importances reported by the fitted sklearn tree, largest first
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'Importance': sklearn_tree.feature_importances_,
})
feature_importance = importance_table.sort_values(by='Importance', ascending=False)

print("Importance des features:")

# Last expression of the cell, so the notebook renders the table
feature_importance

In [None]:
# Horizontal bar chart of the feature importances
bar_labels = feature_importance['Feature']
bar_widths = feature_importance['Importance']

plt.figure(figsize=(10, 6))
plt.barh(bar_labels, bar_widths, color='steelblue')
plt.title('Importance des Features dans l\'Arbre de Decision')
plt.xlabel('Importance')
plt.ylabel('Feature')
# Flip the y axis so the first row of the table is drawn at the top
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

## 10. Conclusion

### Observations:

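1. **Performances similaires**: Notre implémentation et sklearn produisent des résultats comparables, ce qui valide notre algorithme. De légers écarts restent possibles : sklearn place ses seuils au milieu de deux valeurs consécutives, alors que notre version utilise directement les valeurs observées, et les deux ne départagent pas de la même façon les coupures de gain égal.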

2. **Avantages de sklearn**:
   - Optimisations de performance
   - Fonctionnalités supplémentaires (visualisation, importance des features)
   - Code testé et maintenu

3. **Valeur pédagogique**: L'implémentation "from scratch" permet de comprendre en profondeur le fonctionnement des arbres de décision.

In [None]:
# Closing message: end of this notebook and pointer to the next one
print(
    "Fin du notebook - Comparaison avec sklearn",
    "Le notebook suivant analysera le sur-apprentissage et les forets aleatoires.",
    sep="\n",
)