In [1]:
# Chargement des données
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt

# Load the breast cancer dataset and pull out the feature matrix and labels
data = datasets.load_breast_cancer()
X = data.data
y = data.target

In [2]:
# Split the data into training and test sets (25% held out, fixed seed)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [3]:
# Display the feature names of the dataset
print(data.feature_names)

['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [4]:
# Wrap the training features in a DataFrame for easier inspection
import pandas as pd
df = pd.DataFrame(
    X_train,
    columns=data.feature_names,
)
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,11.85,17.46,75.54,432.7,0.08372,0.05642,0.02688,0.0228,0.1875,0.05715,...,13.06,25.75,84.35,517.8,0.1369,0.1758,0.1316,0.0914,0.3101,0.07007
1,11.22,19.86,71.94,387.3,0.1054,0.06779,0.005006,0.007583,0.194,0.06028,...,11.98,25.78,76.91,436.1,0.1424,0.09669,0.01335,0.02022,0.3292,0.06522
2,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,0.05533,...,23.69,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637
3,13.59,17.84,86.24,572.3,0.07948,0.04052,0.01997,0.01238,0.1573,0.0552,...,15.5,26.1,98.91,739.1,0.105,0.07622,0.106,0.05185,0.2335,0.06263
4,16.69,20.2,107.1,857.6,0.07497,0.07112,0.03649,0.02307,0.1846,0.05325,...,19.18,26.56,127.3,1084.0,0.1009,0.292,0.2477,0.08737,0.4677,0.07623


In [5]:
# Display the names of the target classes
print(data.target_names)

['malignant' 'benign']


In [6]:
# Importation des modèles et entraînement
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

# Base models
model_1 = KNeighborsClassifier(n_neighbors=2)
model_2 = SGDClassifier(random_state=0)
model_3 = DecisionTreeClassifier(random_state=0)

# Ensemble combining the three base models above with voting='hard'
model_4 = VotingClassifier(
    estimators=[
        ('SGD', model_2),
        ('Tree', model_3),
        ('KNN', model_1),
    ],
    voting='hard',
)

# Fit every model on the training split and print its score on the test split
for model in [model_1, model_2, model_3, model_4]:
    model.fit(X_train, y_train)
    print(type(model).__name__, ':', model.score(X_test, y_test))

KNeighborsClassifier : 0.9020979020979021
SGDClassifier : 0.9300699300699301
DecisionTreeClassifier : 0.8811188811188811
VotingClassifier : 0.9230769230769231


In [7]:
# Fonctions pour le bootstrap et classe majoritaire
from collections import Counter

# Return the most frequent class label
def class_frequente ( y ) :
    """Return the value that occurs most often in ``y``.

    ``y`` is any iterable of hashable labels. When several labels share the
    highest count, the one encountered first in ``y`` is returned. An empty
    ``y`` raises ``IndexError``.
    """
    counts = Counter(y)
    label , _count = counts.most_common(1)[0]
    return label

# Sample observations with replacement (bootstrap)
def bootstrap ( X , y , random_state = None ) :
    """Draw a bootstrap sample of the same size as the input.

    Row indices are drawn uniformly with replacement and applied to both
    ``X`` and ``y`` so that every sampled row keeps its own label.

    Parameters
    ----------
    X : array-like
        Observations along the first axis.
    y : array-like
        One label per observation of ``X``.
    random_state : None, int, numpy.random.Generator or numpy.random.RandomState
        ``None`` (default) draws from numpy's global random state, exactly as
        before, so ``np.random.seed`` still controls the result. An int seeds
        a fresh generator; an existing generator is used as-is.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_sample, y_sample)`` with the same lengths as the inputs.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not contain the same number of observations.
    """
    # Coerce so that lists (and other array-likes) support integer-array indexing.
    X = np.asarray(X)
    y = np.asarray(y)
    n_observations = X.shape[0]
    if y.shape[0] != n_observations :
        raise ValueError(
            "X and y must have the same number of observations "
            f"(got {n_observations} and {y.shape[0]})"
        )

    if random_state is None :
        # Legacy path: module-level functions backed by the global RandomState.
        sampler = np.random
    elif isinstance(random_state , (np.random.Generator , np.random.RandomState)) :
        sampler = random_state
    else :
        sampler = np.random.default_rng(random_state)

    idx = sampler.choice(n_observations , size = n_observations)
    return X[idx] , y[idx]

# Draw one bootstrap sample from the full dataset and preview its first rows
x_b , y_b = bootstrap(X , y)
df = pd.DataFrame(
    x_b,
    columns=data.feature_names,
)
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,10.75,14.97,68.26,355.3,0.07793,0.05139,0.02251,0.007875,0.1399,0.05688,...,11.95,20.72,77.79,441.2,0.1076,0.1223,0.09755,0.03413,0.23,0.06769
1,16.17,16.07,106.3,788.5,0.0988,0.1438,0.06651,0.05397,0.199,0.06572,...,16.97,19.14,113.1,861.5,0.1235,0.255,0.2114,0.1251,0.3153,0.0896
2,17.27,25.42,112.4,928.8,0.08331,0.1109,0.1204,0.05736,0.1467,0.05407,...,20.38,35.46,132.8,1284.0,0.1436,0.4122,0.5036,0.1739,0.25,0.07944
3,10.26,16.58,65.85,320.8,0.08877,0.08066,0.04358,0.02438,0.1669,0.06714,...,10.83,22.04,71.08,357.4,0.1461,0.2246,0.1783,0.08333,0.2691,0.09479
4,13.85,19.6,88.68,592.6,0.08684,0.0633,0.01342,0.02293,0.1555,0.05673,...,15.63,28.01,100.9,749.1,0.1118,0.1141,0.04753,0.0589,0.2513,0.06911


In [8]:
# Hand-written random forest: bootstrapped decision trees combined by majority vote
class foret_aleatoire :
    """Minimal random-forest classifier built on ``DecisionTreeClassifier``.

    Every tree is fitted on a bootstrap sample (rows drawn with replacement)
    of the training data. ``max_features`` is forwarded to each tree so that
    only a random subset of features is considered at each split; this is
    what distinguishes a random forest from plain bagging. Predictions are
    the majority vote over all trees.

    Parameters
    ----------
    n_arbres : int
        Number of trees to fit.
    max_features : see ``DecisionTreeClassifier``
        Number of features considered at each split. Pass ``None`` to use all
        features (plain bagging, the previous behaviour of this class).
    random_state : None, int or numpy.random.RandomState
        ``None`` (default) uses numpy's global random state, so
        ``np.random.seed`` still governs the run. An int or a ``RandomState``
        makes row sampling and tree construction reproducible.
    """

    def __init__ (self , n_arbres = 100 , max_features = "sqrt" , random_state = None) :
        self.n_arbres = n_arbres
        self.max_features = max_features
        self.random_state = random_state
        self.arbres = []

    def entrainement ( self , X , y ) :
        """Fit ``n_arbres`` trees, each on its own bootstrap sample of ``(X, y)``.

        Any previously fitted trees are discarded. Raises ``ValueError`` when
        ``X`` and ``y`` do not have the same number of observations.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_observations = X.shape[0]
        if y.shape[0] != n_observations :
            raise ValueError(
                "X and y must have the same number of observations "
                f"(got {n_observations} and {y.shape[0]})"
            )

        if self.random_state is None :
            # Legacy path: rows and trees both draw from numpy's global state.
            rng = None
            sampler = np.random
        elif isinstance(self.random_state , np.random.RandomState) :
            rng = sampler = self.random_state
        else :
            rng = sampler = np.random.RandomState(self.random_state)

        self.arbres = []
        for _ in range(self.n_arbres) :
            # Bootstrap: same number of rows, drawn with replacement.
            idx = sampler.choice(n_observations , n_observations)
            # Sharing one RandomState across trees keeps the whole forest
            # reproducible while still giving each tree different draws.
            tree = DecisionTreeClassifier(
                max_features = self.max_features ,
                random_state = rng ,
            )
            tree.fit(X[idx] , y[idx])
            self.arbres.append(tree)

    def prediction ( self , X ) :
        """Return the majority-vote label of the fitted trees for each row of ``X``.

        Ties between labels are resolved by ``class_frequente``. Raises
        ``ValueError`` when called before ``entrainement``.
        """
        if not self.arbres :
            raise ValueError(
                "foret_aleatoire is not fitted yet; call entrainement(X, y) first"
            )
        # Stacked predictions: one row per tree, one column per sample.
        tree_preds = np.array([tree.predict( X ) for tree in self.arbres ])
        # Transpose so that each row holds all the votes for one sample.
        votes_per_sample = np.swapaxes(tree_preds , 0 , 1)
        y_pred = [class_frequente( votes ) for votes in votes_per_sample ]
        return np.array(y_pred)

In [9]:
# Train the custom random forest on the training split and predict the test split
fa = foret_aleatoire()
fa.entrainement(X_train , y_train)
y_pred_fa = fa.prediction(X_test)
# Bare last expression so the notebook displays the predicted labels
y_pred_fa

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0])

In [11]:
# AdaBoost Classifier
from sklearn.ensemble import AdaBoostClassifier

# Initialize and train the AdaBoost model.
# random_state is fixed (as for the SGD and decision-tree models earlier in
# this notebook) so that re-running the cell reproduces the same score.
model = AdaBoostClassifier(n_estimators=100 , random_state=0)
model.fit(X_train , y_train)
model.score(X_test , y_test)



0.986013986013986

In [12]:
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier

# Initialize and train the Gradient Boosting model.
# random_state is fixed (as for the SGD and decision-tree models earlier in
# this notebook) so that re-running the cell reproduces the same score.
model = GradientBoostingClassifier(n_estimators=100 , random_state=0)
model.fit(X_train , y_train)
model.score(X_test , y_test)

0.972027972027972