# 1.1 Dataset Preparation

In [1]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits
from __future__ import division
from sklearn import cross_validation
digits = load_digits()
data = digits [ "data" ]
images = digits [ "images" ]
target = digits [ "target" ]
target_names = digits [ "target_names" ]

## 1.1 Dataset preperation

In [None]:
# Subtract mean vector and divide each feature vector by the standard deviation of the whole dataset
X = (data - np.mean(data,axis=0))
X = X/np.std(X)

# X^k
X_0 = X[np.where(target==0)]
X_1 = X[np.where(target==1)]
X_2 = X[np.where(target==2)]
X_3 = X[np.where(target==3)]
X_4 = X[np.where(target==4)]
X_5 = X[np.where(target==5)]
X_6 = X[np.where(target==6)]
X_7 = X[np.where(target==7)]
X_8 = X[np.where(target==8)]
X_9 = X[np.where(target==9)]

In [3]:
#select a test and training set
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, target,
                                            random_state=0, test_size=0.3)

# 1.2 One vs Rest

In [7]:
class one_vs_rest:
    def __init__(self, X,Y, binary=True, subsampling=True):
        self.X = X
        self.Y = Y
        self.binary = binary
        if subsampling:
            self.subsampling = "balanced_subsample"  # may be "subsample" or "balanced_subsample", depending on version of sklearn
        else:
            self.subsampling = "auto"
    
    def train(self):
        self.clfs = []
        for i in range(0,10):
            y = np.zeros(self.Y.size)
            y[np.where(self.Y == i)] = 1 
            rfc = RandomForestClassifier(n_estimators = 10, class_weight = self.subsampling)
            rfc.fit(self.X,y)
            self.clfs.append(rfc)
            
    def predict(self, test):  
        pred = []
        
        if self.binary:
            for i in range(0,10):
                pred.append(self.clfs[i].predict(test))
            return np.array(pred)
        else:
            for i in range(0,10):
                pred.append(self.clfs[i].predict_proba(test))
            return np.argmax(np.array(pred),axis=0)
        
    def tot_error(self, test_x, test_y):
            pred = self.predict(test_x)
            if self.binary:
                y = np.zeros(pred.shape)
                for num, i in enumerate(test_y):
                    y[i,num]=1
                return np.sum(np.not_equal(pred,y))/len(test_x)
                
            else:
                pred = pred[:,1]
                return np.sum(np.not_equal(pred,test_y))/len(test_x)

In [8]:
# Without weighted loss, argmax
ovr = one_vs_rest(X_train,y_train, 0,0)
ovr.train()
err = ovr.tot_error(X_test,y_test)
err

0.048148148148148148

In [9]:
# With subsampling, argmax
ovr = one_vs_rest(X_train,y_train, 0,1)
ovr.train()
err = ovr.tot_error(X_test,y_test)
err

0.05185185185185185

In [10]:
# Without weighted loss, binary
ovr = one_vs_rest(X_train,y_train, 1,0)
ovr.train()
err = ovr.tot_error(X_test,y_test)
err

0.22222222222222221

In [11]:
# Subsampling, binary
ovr = one_vs_rest(X_train,y_train, 1,1)
ovr.train()
err = ovr.tot_error(X_test,y_test)
err

0.22592592592592592

None of the methods seems to profit significantly from the weighted loss.

# 1.4 One - vs - One Classifier

In [67]:
class one_vs_one:
    def __init__(self, X,Y, subsampling=True, method=1):
        self.X = X
        self.Y = Y
        self.method = method
        if subsampling:
            self.subsampling = "balanced_subsample" # may be "subsample" or "balanced_subsample", depending on version of sklearn
        else:
            self.subsampling = "auto"
    
    def train(self):
        self.clfs = []
        if self.method == 1:
            for i in range(0,10):
                for j in range(i+1,10):
                    X_classes = self.X[(self.Y==i)|(self.Y==j)]
                    y = self.Y[(self.Y==i)|(self.Y==j)]
                    y[y == i] = 1
                    y[y == j] = 0
                    rfc = RandomForestClassifier(n_estimators = 10, class_weight = self.subsampling)
                    rfc.fit(X_classes,y)
                    self.clfs.append(rfc)
        else:
            for i in range(0,9):
                X_classes = self.X[(self.Y==i)|(self.Y==i+1)]
                y = self.Y[(self.Y==i)|(self.Y==i+1)]
                y[y == i] = 0
                y[y == i+1] = 1
                rfc = RandomForestClassifier(n_estimators = 10, class_weight = self.subsampling)
                rfc.fit(X_classes,y)
                self.clfs.append(rfc)
     
            
    def predict(self, test):  
        if self.method == 1:
            preds = np.zeros((np.shape(test)[0],10))
            k = 0
            for i in range(0,10):
                for j in range(i+1,10):
                    pred=(self.clfs[k].predict(test))
                    preds[:,i] = preds[:,i]+pred
                    preds[:,j] = preds[:,j]-(pred-1)
                    k = k+1
            return np.argmax(preds,axis = 1)
        else:
            pred = np.zeros((np.shape(test)[0]))
            for i in range(0,np.shape(test)[0]):
                a = 0
                b = 10
                while (a != b-1):
                    a_prob = self.clfs[a].predict_proba(test[i:i+1,:])[0,0]
                    b_prob =  self.clfs[b-2].predict_proba(test[i:i+1,:])[0,0]
                    if a_prob>(1-b_prob):
                        b = b-1
                    else:
                        a = a+1
                pred[i] = a
            return pred
        
    def tot_error(self, test_x, test_y):
            pred = self.predict(test_x)
            return np.sum(np.not_equal(pred,test_y))/len(test_x)


## Method 1

In [69]:
ovr = one_vs_one(X_train,y_train, 1,1)
ovr.train()
err = ovr.tot_error(X_test,y_test)
err


0.062962962962962957

## Method 2

In [70]:
ovr = one_vs_one(X_train,y_train, 1,2)
ovr.train()
err = ovr.tot_error(X_test,y_test)
err

0.24259259259259258

# 1.5 Multi-Class Random Forest

In [12]:
class multiclass:
    def __init__(self, X,Y, subsampling=True,binary = True):
        self.X = X
        self.Y = Y
        self.binary = binary
        if subsampling:
            self.subsampling = "balanced_subsample" # may be "subsample" or "balanced_subsample", depending on version of sklearn
        else:
            self.subsampling = "auto"
    
    def train(self):
        rfc = RandomForestClassifier(n_estimators = 100, class_weight = self.subsampling)
        self.clfs = rfc.fit(self.X,self.Y)
     
    def predict(self, test):  
        if self.binary:
            return np.array(self.clfs.predict(test))
        else:
            return np.argmax(self.clfs.predict_proba(test),axis = 1)

        
    def tot_error(self, test_x, test_y):
            pred = self.predict(test_x)
            return np.sum(np.not_equal(pred,test_y))/len(test_x)


In [13]:
# Without weighted loss, binary
ovr = multiclass(X_train,y_train, 0,1)
ovr.train()
err = ovr.tot_error(X_test,y_test)
print err

0.0296296296296


In [14]:
# Without weighted loss, argmax
ovr = multiclass(X_train,y_train, 0,0)
ovr.train()
err = ovr.tot_error(X_test,y_test)
print err

0.0277777777778


In [15]:
# With subsampling, binary
ovr = multiclass(X_train,y_train, 1,1)
ovr.train()
err = ovr.tot_error(X_test,y_test)
print err

0.0277777777778


In [16]:
# With subsampling, argmax
ovr = multiclass(X_train,y_train, 1,0)
ovr.train()
err = ovr.tot_error(X_test,y_test)
print err

0.0185185185185
