## BOOSTING

In [193]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from   sklearn.tree import DecisionTreeClassifier
from   sklearn import preprocessing

Nesse trabalho implementaremos um processo de Boosting assumindo classificação binária com atributos categóricos.

Primeiramente vamos utilizar a base de dados tic tac toe endgame disponível em https://archive.ics.uci.edu/dataset/101/tic+tac+toe+endgame

In [166]:
df = pd.read_csv('tic-tac-toe.data', header=None)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,x,o,positive
2,x,x,x,x,o,o,o,o,x,positive
3,x,x,x,x,o,o,o,b,b,positive
4,x,x,x,x,o,o,b,o,b,positive
...,...,...,...,...,...,...,...,...,...,...
953,o,x,x,x,o,o,o,x,x,negative
954,o,x,o,x,x,o,x,o,x,negative
955,o,x,o,x,o,x,x,o,x,negative
956,o,x,o,o,x,x,x,o,x,negative


Vamos alterar a base para trabalhar com a saída sendo 0 para negativo e 1 para positivo.

In [167]:
df[9] = df[9].map({"positive" : 1, "negative" : 0})
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,x,x,x,x,o,o,x,o,o,1
1,x,x,x,x,o,o,o,x,o,1
2,x,x,x,x,o,o,o,o,x,1
3,x,x,x,x,o,o,o,b,b,1
4,x,x,x,x,o,o,b,o,b,1
...,...,...,...,...,...,...,...,...,...,...
953,o,x,x,x,o,o,o,x,x,0
954,o,x,o,x,x,o,x,o,x,0
955,o,x,o,x,o,x,x,o,x,0
956,o,x,o,o,x,x,x,o,x,0


Também vamos alterar os dados do banco de dados da seguinte forma: valor 1 em caso de ter marcado x, valor 0 em caso de não ter marcado x. Isso porque a única informação que importa para verificar a vitória de x é a disposição de suas marcações.

In [168]:
for i in range(9):
    df[i] = df[i].map({"x" : 1, "o" : 0, "b" : 0})
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1,1,1,1,0,0,1,0,0,1
1,1,1,1,1,0,0,0,1,0,1
2,1,1,1,1,0,0,0,0,1,1
3,1,1,1,1,0,0,0,0,0,1
4,1,1,1,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
953,0,1,1,1,0,0,0,1,1,0
954,0,1,0,1,1,0,1,0,1,0
955,0,1,0,1,0,1,1,0,1,0
956,0,1,0,0,1,1,1,0,1,0


In [169]:
print("quantidade de vitorias de x: ", df.iloc[:,9].sum())
print("quantidade de derrotas de x (ou empate): ", df.iloc[:,9].count() - df.iloc[:,9].sum())

quantidade de vitorias de x:  626
quantidade de derrotas de x (ou empate):  332


Vamos criar uma classe para a implementação do algoritmo de boosting do problema.

In [170]:
df.insert(9, 10, df.shape[0] * [1])
df.set_axis([0,1,2,3,4,5,6,7,8,9,10], axis="columns", inplace=True)

  df.set_axis([0,1,2,3,4,5,6,7,8,9,10], axis="columns", inplace=True)


In [239]:
class Boosting:

    def __init__(self, df, rounds):
        self.rounds = rounds
        self.df = df
        self.w = np.array(self.df.shape[0] * [1/self.df.shape[0]])
        self.h = lambda i : lambda p: lambda x : 1 if x[i] == p else -1
        self.y = [-1 if i == 0 else i for i in self.df.iloc[:,10]]
        self.create_stumps()

    def get_df(self):
        return self.df

    def create_stumps(self):
        # quantidade de possiveis stumps (2 * 10)
        self.stumps = {}
        for i in range(10):
            stumpP = []
            stumpN = []
            for j in range(self.df.shape[0]):
                if self.df.iloc[j,i] == self.df.iloc[j,10]:
                    stumpN.append(j)
                else:
                    stumpP.append(j)
            
            self.stumps["{0}p".format(i)] = stumpP
            self.stumps["{0}n".format(i)] = stumpN

    def get_stumps(self):
        return self.stumps

    def get_better_stump(self):
        better = self.df.shape[0] * [0]
        error = 1
        name = ""
        for n, stump in self.stumps.items():
            e = 0
            for i in stump:
                e += self.w[i]
            if e < error:
                better = stump
                name = n
                error = e
        return name, better, error
    
    def alphat(self, error):
        return 1/2 * np.log((1 - error)/error)
    
    def wtp1(self, alphat, ht):
        wt = np.zeros(self.w.shape)
        for i in range(len(self.w)):
            wt[i] = np.float64(self.w[i]) * np.exp((-alphat)*(ht(self.df.iloc[i]))*(self.y[i]))
        wt = (wt + min(wt))/sum(wt + min(wt))
        self.w = wt

    def get_w(self):
        return self.w


In [243]:
from sklearn.preprocessing import MinMaxScaler
model = Boosting(df, 0)
model.get_df()
# print(model.get_stumps())
name, stump, error = model.get_better_stump()
# print(name)
alpha = model.alphat(error)
h = lambda i : lambda p: lambda x : 1 if x[i] == p else -1

model.wtp1(alpha, h(9)(1))

model.get_w()

array([0.00090498, 0.00090498, 0.00090498, 0.00090498, 0.00090498,
       0.00090498, 0.00090498, 0.00090498, 0.00090498, 0.00090498,
       0.00090498, 0.00090498, 0.00090498, 0.00090498, 0.00090498,
       0.00090498, 0.00090498, 0.00090498, 0.00090498, 0.00090498,
       0.00090498, 0.00090498, 0.00090498, 0.00090498, 0.00090498,
       0.00090498, 0.00090498, 0.00090498, 0.00090498, 0.00090498,
       0.00090498, 0.00090498, 0.00090498, 0.00090498, 0.00090498,
       0.00090498, 0.00090498, 0.00090498, 0.00090498, 0.00090498,
       0.00090498, 0.00090498, 0.00090498, 0.00090498, 0.00090498,
       0.00090498, 0.00090498, 0.00090498, 0.00090498, 0.00090498,
       0.00090498, 0.00090498, 0.00090498, 0.00090498, 0.00090498,
       0.00090498, 0.00090498, 0.00090498, 0.00090498, 0.00090498,
       0.00090498, 0.00090498, 0.00090498, 0.00090498, 0.00090498,
       0.00090498, 0.00090498, 0.00090498, 0.00090498, 0.00090498,
       0.00090498, 0.00090498, 0.00090498, 0.00090498, 0.00090

Realizaremos a avaliação do modelo com a validação cruzada com 5 partições. A medida de eficácia a ser considerada é a taxa de erro simples.