# Ensemble methods. Exercises


In this section we have only two exercise:

1. Find the best three classifier in the stacking method using the classifiers from scikit-learn package.

2. Build arcing arc-x4 method. 

In [1]:
%store -r data_set
%store -r labels
%store -r test_data_set
%store -r test_labels
%store -r unique_labels

## Exercise 1: Find the best three classifier in the stacking method

Please use the following classifiers:

* Linear regression,
* Nearest Neighbors,
* Linear SVM,
* Decision Tree,
* Naive Bayes,
* QDA.

In [2]:
import numpy as np
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import itertools as it

import warnings
warnings.filterwarnings('ignore')

In [3]:
def build_classifiers(): 
      
    neighbors = KNeighborsClassifier()
    neighbors.fit(data_set, labels)

    linear_regression = LinearRegression()
    linear_regression.fit(data_set, labels)

    qda = QuadraticDiscriminantAnalysis()
    qda.fit(data_set, labels)
    
    gauss = GaussianNB()
    gauss.fit(data_set, labels)

    svc = SVC()
    svc.fit(data_set, labels)

    return neighbors, linear_regression, qda, gauss, svc

In [4]:
def get_three_out_of_five(classifiers_combinations, stc):
    
    output = [] 
    classifiers_list = []
    
    for model in classifiers_combinations:
        tmp_output = []
        tmp_model = []
        for classifier in model:
            tmp_output.append(classifier.predict(data_set)) 
            tmp_model.append(classifier)
        tmp_output = np.array(tmp_output).reshape((130,3))
        output.append(tmp_output)  # ten set of data, for each of 10 combination 3-elements from 5 elements set
        classifiers_list.append(tmp_model)
        
    model_fit = []
    
    for model_output in output:
        stacked_classifier =  stc
        model_fit.append(stacked_classifier.fit(model_output.reshape((130,3)), labels.reshape((130,))))
      
    return classifiers_list, model_fit

In [5]:
def get_best_accuracy(accuracy_list, test_set, classifiers_list, model_fit):
    
    best_prediction_index = np.argmax(accuracy_list)
    best_classifiers = classifiers_list[best_prediction_index]
    best_stacked_classifier = model_fit[best_prediction_index]
    best_predicted = best_stacked_classifier.predict(test_set)
    
    return  best_classifiers, best_predicted

In [6]:
def get_best_prediction(model_fit, classifiers_list):
    test_predictions_list = []   
    
    for i in range(len(model_fit)):
        classifiers = classifiers_list[i]
        stacked_classifier = model_fit[i]
        test_set = []
        
        for classifier in classifiers:
            test_set.append(classifier.predict(test_data_set))
        test_set = np.array(test_set).reshape((len(test_set[0]),3))
        predicted = stacked_classifier.predict(test_set)
        test_predictions_list.append(predicted)
    
    accuracy_list = []
    for prediction in test_predictions_list:
        accuracy_list.append(accuracy_score(test_labels, prediction))
        accuracy = accuracy_score(test_labels, prediction) 
       
    return get_best_accuracy(accuracy_list, test_set, classifiers_list, model_fit)

In [7]:
def build_stacked_classifier(classifiers, stc = GaussianNB() ):
    
    classifiers_combinations = it.combinations(classifiers, 3)
    classifiers_list, model_fit = get_three_out_of_five(classifiers_combinations, stc)
  
    test_predictions_list = []  
    accuracy_list = []    
    return get_best_prediction(model_fit, classifiers_list)


### Sprawdzenie modelu dla kilku różnych stacked stacked_classifier

In [8]:
raw_list = [KNeighborsClassifier(),LinearRegression(), QuadraticDiscriminantAnalysis(),GaussianNB(),SVC()]  

classifiers = build_classifiers()
best_classifiers, predicted = build_stacked_classifier(classifiers, stc = GaussianNB() )
accuracy = accuracy_score(test_labels, predicted)
print("\nBest set of classifiers:", best_classifiers)
print("\nBest accuracy:",accuracy)


Best set of classifiers: [KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'), LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False), QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
               store_covariance=False, store_covariances=None, tol=0.0001)]

Best accuracy: 1.0


In [9]:
classifiers = build_classifiers()
best_classifiers, predicted = build_stacked_classifier(classifiers, stc = QuadraticDiscriminantAnalysis() )
accuracy = accuracy_score(test_labels, predicted)
print("\nBest set of classifiers:", best_classifiers)
print("\nBest accuracy:",accuracy)


Best set of classifiers: [KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'), QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
               store_covariance=False, store_covariances=None, tol=0.0001), GaussianNB(priors=None, var_smoothing=1e-09)]

Best accuracy: 0.75


## Exercise 2: 

Use the boosting method and change the code to fullfilt the following requirements:

* the weights should be calculated as:
$w_{n}^{(t+1)}=\frac{1+ I(y_{n}\neq h_{t}(x_{n})}{\sum_{i=1}^{N}1+I(y_{n}\neq h_{t}(x_{n})}$,
* the prediction is done with a voting method.

In [10]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# prepare data set

def generate_data(sample_number, feature_number, label_number):
    data_set = np.random.random_sample((sample_number, feature_number))
    labels = np.random.choice(label_number, sample_number)
    return data_set, labels

labels = 2
dimension = 2
test_set_size = 1000
train_set_size = 5000
train_set, train_labels = generate_data(train_set_size, dimension, labels)
test_set, test_labels = generate_data(test_set_size, dimension, labels)

# init weights
number_of_iterations = 10
weights = np.ones((test_set_size,)) / test_set_size


def train_model(classifier, weights):
    return classifier.fit(X=test_set, y=test_labels, sample_weight=weights)

def calculate_accuracy_vector(predicted, labels):
    result = []
    for i in range(len(predicted)):
        if predicted[i] == labels[i]:
            result.append(0)
        else:
            result.append(1)
    return result

def calculate_error(model):
    predicted = model.predict(test_set)
    I=calculate_accuracy_vector(predicted, test_labels)
    Z=np.sum(I)
    return (1+Z)/1.0

Fill the two functions below:

In [11]:
def set_new_weights(model):
    I = calculate_accuracy_vector(model.predict(test_set), test_labels)
    new_weights =  np.add(1,I) /np.sum( np.add(1,I)) 
    return new_weights 

Train the classifier with the code below:

In [12]:
classifier = DecisionTreeClassifier(max_depth=1, random_state=1)
classifier.fit(X=train_set, y=train_labels)
alphas = []
classifiers = []

for iteration in range(number_of_iterations):
    model = train_model(classifier, weights)
    weights = set_new_weights(model)
    classifiers.append(model)

print(weights)

validate_x, validate_label = generate_data(1, dimension, labels)

[0.00133245 0.00133245 0.00066622 0.00133245 0.00066622 0.00066622
 0.00133245 0.00133245 0.00066622 0.00133245 0.00066622 0.00133245
 0.00133245 0.00066622 0.00066622 0.00133245 0.00066622 0.00133245
 0.00133245 0.00066622 0.00066622 0.00066622 0.00066622 0.00066622
 0.00133245 0.00133245 0.00133245 0.00066622 0.00066622 0.00133245
 0.00133245 0.00133245 0.00133245 0.00133245 0.00133245 0.00066622
 0.00133245 0.00066622 0.00066622 0.00066622 0.00066622 0.00133245
 0.00066622 0.00066622 0.00133245 0.00066622 0.00133245 0.00066622
 0.00066622 0.00066622 0.00066622 0.00066622 0.00066622 0.00066622
 0.00133245 0.00066622 0.00066622 0.00066622 0.00066622 0.00066622
 0.00066622 0.00066622 0.00133245 0.00133245 0.00066622 0.00066622
 0.00066622 0.00133245 0.00066622 0.00133245 0.00066622 0.00133245
 0.00133245 0.00133245 0.00066622 0.00066622 0.00066622 0.00133245
 0.00133245 0.00133245 0.00133245 0.00066622 0.00133245 0.00066622
 0.00066622 0.00133245 0.00066622 0.00133245 0.00066622 0.0013

Set the validation data set:

In [13]:
validate_x, validate_label = generate_data(1, dimension, labels)

Fill the prediction code:

In [14]:
def get_prediction(x):
    predictions = []
    for i in range(len(classifiers)):
        predicted = classifiers[i].predict(x)
        predictions.append(predicted)
  
    labels_list = np.unique(test_labels) 
    size_vec = np.zeros(len(labels_list))
    
    for p in predictions:
        for i in range(len(labels_list)):
            if labels_list[i] == p[0]:
                size_vec[i] =  size_vec[i] + 1
                
    index_max = np.argmax(size_vec)           
    
    if size_vec[index_max] > 0.5 * (len(classifiers)):
        return labels_list[index_max]

Test it:

In [15]:
prediction = get_prediction(validate_x)
print(prediction)

0
