# Ensemble Learning

### Extraction des données

In [2]:
import sklearn
import sklearn.datasets
from sklearn import tree
import numpy as np
#from sklearn.datasets import load_svmlight_file
#X_train, y_train = load_svmlight_file("a7a")
#X_test, y_test = load_svmlight_file("a7a.t")

In [3]:
ds = sklearn.datasets.make_classification(n_samples=20000,
                                          n_features=30,    # 30 features
                                          n_informative=5,  # only 5 informatives ones
                                          n_redundant=0,
                                          n_repeated=3,     # and 3 duplicate
                                          n_classes=2,
                                          n_clusters_per_class=1,
                                          weights=None,
                                          flip_y=0.03,
                                          class_sep=0.8,
                                          hypercube=True,
                                          shift=0.0,
                                          scale=1.0,
                                          shuffle=True,
                                          random_state=None)

In [4]:
Data= ds[0]
Target= ds[1]
XTrain=Data[:10000]
YTrain=Target[:10000]
XTest=Data[10000:]
YTest=Target[10000:]

### Algorithme du Bagging

In [5]:
class Bagging(object):
    def __init__(self,nbr_stumps):
        self.nbr_stumps = nbr_stumps
        self.fb = {}
        
    def fit(self,data,target):
        self.data = data
        self.target = target
        
    def predict(self, inputs):
        labels = list(set(self.target))
        n = len(self.data)
        for i in range(0,self.nbr_stumps):
            data_tmp = []
            target_tmp = []
            for j in range(0,n):
                index = np.random.randint(0,n)
                data_tmp.append(self.data[index])
                target_tmp.append(self.target[index])
            clf = tree.DecisionTreeClassifier(max_depth=1)
            clf = clf.fit(data_tmp, target_tmp) 
            output = clf.predict(inputs)
            self.fb[i]=output
        
        m = len(inputs)
        nbrLabels=len(labels)  
        output = []
        for indice in range(0,m):
            compter_index = np.zeros(nbrLabels)
            for l in range(0,self.nbr_stumps):
                for indexL in range(0,nbrLabels):
                    if self.fb[l][indice] == labels[indexL]:
                        compter_index[indexL] += 1
            maxIndex = max((v, i) for i, v in enumerate(compter_index))[1]
            label = labels[maxIndex]
            output.append(label)
        return output

    def score(self,prediction,target):
        """
        length = len(prediction)
        nbrCorrect = sum(i == j for i, j in zip(prediction, target))
        precision = nbrCorrect/(length*1.0)
        return precision"""
        return np.mean(prediction == target)

### Test du Bagging

In [6]:
b = Bagging(5)
b.fit(XTrain, YTrain)
outputs = b.predict(XTest)
print b.score(outputs,YTest)

0.7449


### Algorithme du Boosting (AdaBoost)

In [8]:
class Boosting(object):
    def __init__(self,nbr_stumps):
        self.nbr_stumps = nbr_stumps
        self.weights = None
        self.alpha = np.matrix(np.zeros((self.nbr_stumps, 1)))
        self.func = list()
        
    def fit(self,data,target):
        self.data = data
        self.target = target
        self.M = len(self.target)
        self.weights = np.ones(self.M)/self.M
        
    def predict(self, inputs):
        w = np.zeros(self.M)
        for i in range(0,self.nbr_stumps):
            clf = tree.DecisionTreeClassifier(max_depth=1)
            clf = clf.fit(self.data, self.target) 
            output = clf.predict(self.data)
            t = np.array(self.target)
            p = np.array(output)
            Indicator = np.abs(t-p)
            errors=np.dot(Indicator,self.weights)
            self.alpha[i]=0.5*np.log((1-errors)/errors)
            for j in range(self.M):
                if Indicator[j] == 1: 
                    w[j] = self.weights[j] * np.exp(self.alpha[i])
                else: 
                    w[j] = self.weights[j] * np.exp(-self.alpha[i])
            self.weights = w / w.sum()
            self.func.append(clf)
        predictions = []
        f = np.zeros((1,len(inputs)))
        for i in range(0,self.nbr_stumps):
            f += self.alpha[i]*self.func[i].predict(inputs)
        return np.sign(f)
        
    def score(self,prediction,target):
        return np.mean(prediction == target)

### Test du Boosting

In [11]:
boosting = Boosting(5)
boosting.fit(XTrain, YTrain)
outputs = boosting.predict(XTest)
print b.score(outputs,YTest)

0.742


### Conclusion :
Pour dresser une conclusion de nos expérimentations, plusieures affirmations peuvent être faites.
Les algorithmes de Bagging et Boosting sont de bons moyens d'augmenter l'efficacité de son algorithme de classification faibles lorsqu'ils ont des précisions faibles.
#### Comparaison entre les deux algorithmes :
Le Bagging est plus simple à élaborer et peut être parrallélisé.
Le Boosting atteind de meilleurs résultats. Ceci s'explique par l'utilisation de poids sur les exemples. Le poids des exemples mal classifiés est augmenté. Ce qui permet au prochain classifieur de ce concentrer sur les exemples d'entrainement mal classifiés.