## Stacked ensemble tests

In [1]:
import pandas as pd
import numpy as np

# read the dataset
dt = pd.read_csv('hgsc.csv')
dt.head()

Unnamed: 0,class,ABAT,ABHD2,ACTB,ACTR2,ACTR5,ACVR2A,ADAMDEC1,ADCYAP1R1,AEBP1,...,WT1,XPO7,XPOT,YTHDC2,ZDHHC14,ZDHHC7,ZEB1,ZFP36,ZHX3,ZNF423
0,PRO.C5,-0.010674,0.263376,-0.115492,-0.323565,0.005161,-0.504271,-1.28372,-0.433908,0.673072,...,0.077048,0.459961,-0.072049,0.243935,-0.056318,-0.204971,0.179639,-0.292136,-0.034261,0.490152
1,MES.C1,-0.710741,0.110421,0.532555,-0.253877,-0.389024,-0.121941,-1.73292,-0.72788,1.70611,...,0.54712,-0.674773,-0.236746,0.551354,0.215982,0.196677,1.46732,2.46104,0.415041,2.11688
2,DIF.C4,0.881506,0.372862,0.052344,0.028721,-0.848119,-1.28118,1.52437,-0.288317,-2.01083,...,1.05817,0.350895,-5.1e-05,0.010498,0.592285,-0.338954,-0.842242,0.096242,-0.471005,-1.66219
3,MES.C1,-1.08509,0.415651,0.395376,-0.27105,0.146536,-0.36327,0.993823,-0.450427,1.99917,...,-0.677226,-0.109778,0.033163,0.76008,-1.16903,0.325604,1.78576,-0.212328,0.537493,-0.102138
4,MES.C1,-0.93223,0.045352,0.595068,0.187856,-0.200287,0.211144,1.84464,-0.416482,1.3278,...,0.961688,-0.00901,0.529045,-0.55147,-0.188697,0.157393,0.469166,1.748,0.144196,-0.561641


In [2]:
dt=dt.dropna()

In [3]:
# Now x, y
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

X = dt.loc[:,dt.columns!='class']
#dt['class']=dt['class'].astype('integer') # the following is useless.. We can use LabelEncoder from sklearn.preprocessing
y = dt['class']
y = le.fit_transform(y)

In [4]:
X.shape

(489, 321)

In [5]:
# now split in test and train set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.5, random_state=42)

In [6]:
# Now let us define 4 pipelines. Start with the SVM pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import svm

pipe_svm = Pipeline([('scl', StandardScaler()),
                   ('clf', svm.SVC(C=1.0, kernel='linear', probability=True))])

In [7]:
# Now define a LDA pipeline.
#from sklearn.lda import LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

pipe_lda = Pipeline([('sc', StandardScaler()),
                    ('clf', LDA())])

In [8]:
# Next define the QDA pipeline
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

pipe_qda = Pipeline([('sc', StandardScaler()),
                    ('clf', QDA())])

In [9]:
# Last, let's define the Random Forest Pipeline

from  sklearn.ensemble import RandomForestClassifier as RFC

pipe_rf = Pipeline([('sc', StandardScaler()),
                    ('clf', RFC())])

In [10]:
# Now let's try to use the above pipelines
#SVM
pipe_svm.fit(X_train, y_train)
#LDA
pipe_lda.fit(X_train, y_train)
#QDA
pipe_qda.fit(X_train, y_train)
#RF
pipe_rf.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('sc', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
         ...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [11]:
# Checking the accuracies of each classifier
svm_acc = pipe_svm.score(X_test, y_test)
lda_acc = pipe_lda.score(X_test, y_test)
qda_acc = pipe_qda.score(X_test, y_test)
rf_acc  = pipe_rf.score(X_test, y_test)

In [12]:
# Now print the accuracies
print('SVM accuracy: ', svm_acc)
print('LDA accuracy: ', lda_acc)
print('QDA accuracy: ', qda_acc)
print('RFC accuracy: ', rf_acc)

SVM accuracy:  0.85306122449
LDA accuracy:  0.563265306122
QDA accuracy:  0.338775510204
RFC accuracy:  0.804081632653


## First trial: there is no J-folding here

In [13]:
# Trying to code the stacked ensemble classifier

# Before implementing the actual J-fold methid, I want to train and predict on the same thing... just to practice.

from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import six
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
from sklearn.model_selection import KFold
import numpy as np
import operator

class StackedEnsembleClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, lev0_clfs, lev1_clf, J ,weights = None):
        self.lev0_clfs = lev0_clfs
        self.named_clfs = {key: value for key, value in _name_estimators(lev0_clfs)}
        self.lev1_clf = lev1_clf
        self.weights = weights
        # number of folds
        self.J = J


    def fit(self, X, y):
                
        # lev0_clfs_ is the set of fitted classifiers.
        self.lev0_clfs_ = []
        
        # Now fit each classifier
        for clf in self.lev0_clfs:
            fitted_clf = clf.fit(X,y)
            self.lev0_clfs_.append(fitted_clf)
        
        # Initialize the array for the level 1 classifier
        rows = X.shape[0]
        columns = len(np.unique(y))*len(self.lev0_clfs)
        
        self.num_classes = len(np.unique(y))
        
        X2=np.zeros((rows, columns))
        
        # Prepare the data for the level 1 classifier
        for i in range(len(self.lev0_clfs_)):
            clf = self.lev0_clfs_[i]
            X_temp=clf.predict_proba(X)
            for j in range(X_temp.shape[1]):
                for k in range(X2.shape[0]):
                    c = len(np.unique(y))*i+j
                    X2[k][c] = X_temp[k][j]
        
        
        # now train the level 1 classifier
        self.lev1_clf_ = self.lev1_clf.fit(X2,y)
        
        # That's it        
        return self
    
    def predict(self,X):
        
        rows = X.shape[0]
        columns = self.num_classes*len(self.lev0_clfs_)
    
        X2=np.zeros((rows, columns))
        print('X2.shape:',X2.shape)
        
        for i in range(len(self.lev0_clfs_)):
            clf = self.lev0_clfs_[i]
            X_temp=clf.predict_proba(X)
            for j in range(X_temp.shape[1]):
                for k in range(X2.shape[0]):
                    c = len(np.unique(y))*i+j
                    #print(k,c)
                    X2[k][c] = X_temp[k][j]
                    
                    
        # now feed this stuff into the lev1_classifier
        y_pred = self.lev1_clf_.predict(X2)
        return(y_pred)
        
    def predict_proba(self,X):
        # will modify this later.
        
        rows = X.shape[0]
        columns = self.num_classes*len(self.lev0_clfs_)
    
        X2=np.zeros((rows, columns))
        print('X2.shape:',X2.shape)
        
        for i in range(len(self.lev0_clfs_)):
            clf = self.lev0_clfs_[i]
            X_temp=clf.predict_proba(X)
            for j in range(X_temp.shape[1]):
                for k in range(X2.shape[0]):
                    c = len(np.unique(y))*i+j
                    #print(k,c)
                    X2[k][c] = X_temp[k][j]
                    
                    
        # now feed this stuff into the lev1_classifier
        
        p_pred = self.lev1_clf_.predict_proba(X)
        
        return(p_pred)
    
        # return prediction probabilities

In [15]:
from sklearn.linear_model import LogisticRegression

lr1 = LogisticRegression()
lr2 = LogisticRegression()
svc1 = svm.SVC(C=1.0, kernel = 'linear', probability=True, random_state= 42)
svc2 = svm.SVC(C=1.0, kernel = 'rbf', probability=True, random_state= 42)
lda = LDA()
qda = QDA()
rf = RFC()

#sec=StackedEnsembleClassifier([svc, lda, rf], lr)
sec=StackedEnsembleClassifier([svc1, svc2], lr1, J=2)

In [16]:
# Now try to fit the thing
sc = StandardScaler()

X_train = sc.fit_transform(X_train)

In [17]:
# Now train the ensemble.
sec.fit(X_train, y_train)

StackedEnsembleClassifier(J=2,
             lev0_clfs=[SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=42, shrinking=True,
  tol=0.001, verbose=False), SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=42, shrinking=True,
  tol=0.001, verbose=False)],
             lev1_clf=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
             weights=None)

In [18]:
y_pred = sec.predict(X_test)

X2.shape: (245, 8)


In [19]:
accuracy = 1.0 - (y_pred != y_test).sum()/len(y_test)

In [20]:
print('accuracy Stacked Ensemble:', accuracy)

accuracy Stacked Ensemble: 0.873469387755


In [21]:
# accuracy of a single thing
svc1.fit(X_train, y_train)
y_pred = svc1.predict(X_test)
accuracy = 1.0 - (y_pred!=y_test).sum()/len(y_test)
print('accuracy SVM1:', accuracy)

svc2.fit(X_train, y_train)
y_pred = svc2.predict(X_test)
accuracy = 1.0 - (y_pred!=y_test).sum()/len(y_test)
print('accuracy SVM2:', accuracy)



accuracy SVM1: 0.85306122449
accuracy SVM2: 0.865306122449


## Implementing J-foldings right now

In [None]:
# Trying to code the stacked ensemble classifier

# Before implementing the actual J-fold methid, I want to train and predict on the same thing... just to practice.

# Now J-folding

from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import six
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
from sklearn.model_selection import KFold
import numpy as np
import operator

class StackedEnsembleClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, lev0_clfs, lev1_clf, J ,weights = None):
        self.lev0_clfs = lev0_clfs
        self.named_clfs = {key: value for key, value in _name_estimators(lev0_clfs)}
        self.lev1_clf = lev1_clf
        self.weights = weights
        # number of folds
        self.J = J


    def fit(self, X, y):
        
        # Initialize the k-fold
        
        # Initialize the Kfold stuff
        #kf = KFold(n_splits = self.J, random_state=42)
        
        # lev0_clfs_ is the set of fitted classifiers.
        self.lev0_clfs_ = []
        
        # Now fit each classifier
        for clf in self.lev0_clfs:
            fitted_clf = clf.fit(X,y)
            self.lev0_clfs_.append(fitted_clf)
            
        #print('classifiers fitted..')
        
        
        # Initialize the array for the level 1 classifier
        rows = X.shape[0]
        columns = len(np.unique(y))*len(self.lev0_clfs)
        
        self.num_classes = len(np.unique(y))
        
        X2=np.zeros((rows, columns))
        #print('X2.shape:',X2.shape)


        #first_iter = True
        #ind=0
        #for (train, test) in kf.split(X, y):
        #    #print('k:',k)
        #    ind+=1
        #    print('ind:', ind)

            # Initialize a temporary array
        #    rows_temp = len(y[test])
        #    X_temp = np.zeros((rows_temp, columns))
        #    
            # level 0 for the k-th fold
        #    for i in range(len(self.lev0_clfs)):
        #        self.lev0_clfs[i].fit(X[train], y[train])
        #        X_temp2 = self.lev0_clfs[i].predict_proba(X[test])
                
        #        for n in range(X_temp.shape[0]):
        #            for m in range(X_temp2.shape[1]):
        #                c = len(np.unique(y))*i+m
        #                X_temp[n][c] = X_temp2[n][m]
        #                
        #    if (first_iter):
        #        X2 = X_temp
        #        first_iter = False
        #    else:
        #        X2 = np.concatenate((X2,X_temp))
              
            
        #print('X2.shape:', X2.shape)
                
                        
        
        
        for i in range(len(self.lev0_clfs_)):
            clf = self.lev0_clfs_[i]
            X_temp=clf.predict_proba(X)
            for j in range(X_temp.shape[1]):
                for k in range(X2.shape[0]):
                    c = len(np.unique(y))*i+j
        #            #print(k,c)
                    X2[k][c] = X_temp[k][j]
        
        
        # now train the level 1 classifier
        self.lev1_clf_ = self.lev1_clf.fit(X2,y)
        
        # That's it        
        return self
    
    def predict(self,X):
        
        rows = X.shape[0]
        columns = self.num_classes*len(self.lev0_clfs_)
    
        X2=np.zeros((rows, columns))
        print('X2.shape:',X2.shape)
        
        for i in range(len(self.lev0_clfs_)):
            clf = self.lev0_clfs_[i]
            X_temp=clf.predict_proba(X)
            for j in range(X_temp.shape[1]):
                for k in range(X2.shape[0]):
                    c = len(np.unique(y))*i+j
                    #print(k,c)
                    X2[k][c] = X_temp[k][j]
                    
                    
        # now feed this stuff into the lev1_classifier
        y_pred = self.lev1_clf_.predict(X2)
        return(y_pred)
        # return prediction
        
    def predict_proba(self,X):
        # will modify this later.
        
        rows = X.shape[0]
        columns = self.num_classes*len(self.lev0_clfs_)
    
        X2=np.zeros((rows, columns))
        print('X2.shape:',X2.shape)
        
        for i in range(len(self.lev0_clfs_)):
            clf = self.lev0_clfs_[i]
            X_temp=clf.predict_proba(X)
            for j in range(X_temp.shape[1]):
                for k in range(X2.shape[0]):
                    c = len(np.unique(y))*i+j
                    #print(k,c)
                    X2[k][c] = X_temp[k][j]
                    
                    
        # now feed this stuff into the lev1_classifier
        
        p_pred = self.lev1_clf_.predict_proba(X)
        
        return(p_pred)
    
        # return prediction probabilities