In [4]:
# Comparative accuracy of a bagging classifier vs. just one classifier.
# p is the accuracy of a single classifier; p_ accumulates the binomial
# probability that at most int(N/k) of the N classifiers are right, so
# 1-p_ is the probability that more than N/k of them are right.
from scipy.special import comb
N, p, k = 100, 1./3, 3.
max_votes = int(N/k)
p_ = 0
for i in range(max_votes + 1):
    # binomial probability of exactly i correct classifiers out of N
    pmf_i = comb(N, i) * p**i * (1-p)**(N-i)
    p_ += pmf_i
print(p, 1-p_)

0.3333333333333333 0.4811966952738904


In [11]:
#three ways of setting up a random forest 
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# avgU = average uniqueness between samples; it is used as max_samples for the
# two bagging ensembles below.
avgU = 1.0

# 1) A plain random forest with per-bootstrap class re-weighting.
clf0 = RandomForestClassifier(n_estimators=1000, class_weight='balanced_subsample', criterion='entropy')

# 2) Bagged decision trees, each bag limited to max_samples=avgU.
# 'sqrt' is what max_features='auto' meant for classifiers; the 'auto' alias
# was removed from recent scikit-learn releases, 'sqrt' works on old and new.
clf1 = DecisionTreeClassifier(criterion='entropy', max_features='sqrt', class_weight='balanced')
# The base estimator is passed positionally because its keyword was renamed
# (base_estimator -> estimator); the first positional slot is stable across versions.
clf1 = BaggingClassifier(clf1, n_estimators=1000, max_samples=avgU)

# 3) Bagged single-tree forests (bootstrap disabled inside the forest, so the
# resampling is done only by the outer BaggingClassifier).
clf2 = RandomForestClassifier(n_estimators=1, criterion='entropy', bootstrap=False, class_weight='balanced_subsample')
clf2 = BaggingClassifier(clf2, n_estimators=1000, max_samples=avgU, max_features=1.)

In [None]:
#purging overlapping observations in the training dataset
def getTrainTimes(t1, testTimes):
    '''
    Drop from t1 every observation whose [start, end] span overlaps a test span.

    t1.index: time when observation started
    t1.value: time when observation ended
    testTimes: Series of testing observations, iterated as (start, end) pairs
               (index: start of the test span, value: end of the test span)

    Returns a purged copy of t1; the t1 passed in is left untouched.
    '''
    trn = t1.copy(deep=True)
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for start, end in testTimes.items():
        startsWithin = trn[(start <= trn.index) & (trn.index <= end)].index  # train starts within test
        endsWithin = trn[(start <= trn) & (trn <= end)].index  # train ends within test
        envelops = trn[(trn.index <= start) & (end <= trn)].index  # train envelops test
        trn = trn.drop(startsWithin.union(endsWithin).union(envelops))
    return trn


In [None]:
def getEmbargoTimes(times, pctEmbargo):
    '''
    Get the embargo time for every bar.

    times: ordered bar times; sliced positionally and used both as data and as
           index (assumes an Index-like object such as pd.DatetimeIndex — TODO
           confirm against callers)
    pctEmbargo: fraction of the number of bars used as the embargo width

    Returns a pd.Series indexed by bar time. Each value is the time of the bar
    `step` positions later, where step = int(len(times) * pctEmbargo); the last
    `step` bars map to the final bar time. With step == 0 every bar maps to itself.
    '''
    step = int(times.shape[0]*pctEmbargo)
    if step == 0:
        mbrg = pd.Series(times, index=times)
    else:
        mbrg = pd.Series(times[step:], index=times[:-step])
        # The trailing bars have no bar `step` positions ahead: pin them to the last time.
        tail = pd.Series(times[-1], index=times[-step:])
        # pd.concat replaces Series.append, which was removed in pandas 2.0.
        mbrg = pd.concat([mbrg, tail])
    return mbrg

'''
#applying purging to dataset
testTimes=pd.Series(mbrg[dt1],index=[dt0]) #include embargo before purge
trainTimes = getTrainTimes(t1, testTimes)
testTimes=t1.loc[dt0:dt1].index
'''

In [1]:
#CV when observations overlap
# Base class for the splitter below; imported here because the class statement
# needs it at definition time.
from sklearn.model_selection._split import _BaseKFold

class PurgedKFold(_BaseKFold):
    '''
    Extend KFold to work with labels that span intervals.
    The train set is purged of observations overlapping test-label intervals.
    The test set is assumed contiguous (shuffle=False), w/o training examples in between.

    n_splits: number of folds
    t1: pd.Series, index = time an observation starts, value = time its label ends
    pctEmbargo: fraction of the observations skipped after each test set
    '''
    def __init__(self, n_splits=3, t1=None, pctEmbargo=0.):
        if not isinstance(t1, pd.Series):
            raise ValueError('must be pd series')
        super(PurgedKFold, self).__init__(n_splits, shuffle=False, random_state=None)
        self.t1 = t1
        self.pctEmbargo = pctEmbargo

    def split(self, X, y=None, groups=None):
        '''
        Yield (train_indices, test_indices) as positional integer arrays.

        Raises ValueError when X and t1 do not share the same index.
        '''
        # A length mismatch is checked first so it raises the intended message
        # instead of failing inside the element-wise index comparison.
        if X.shape[0] != len(self.t1) or (X.index == self.t1.index).sum() != len(self.t1):
            raise ValueError('X and thruDateValues must have the same index')
        indices = np.arange(X.shape[0])
        mbrg = int(X.shape[0]*self.pctEmbargo)  # embargo width, in observations
        test_starts = [(i[0], i[-1]+1) for i in np.array_split(np.arange(X.shape[0]), self.n_splits)]
        for i, j in test_starts:
            t0 = self.t1.index[i]  # start of test set
            test_indices = indices[i:j]
            # Position of the first observation starting at/after the latest
            # label end inside the test set (.iloc: test_indices are positions).
            maxT1Idx = self.t1.index.searchsorted(self.t1.iloc[test_indices].max())
            # Left train set: observations whose label ends no later than the test start.
            train_indices = self.t1.index.searchsorted(self.t1[self.t1 <= t0].index)
            # Right train set, after the embargo; the slice is empty when it runs past the end.
            train_indices = np.concatenate((train_indices, indices[maxT1Idx+mbrg:]))
            yield train_indices, test_indices

In [None]:
def cvScore(clf, X, y, sample_weight, scoring='neg_log_loss', t1=None, cv=None, cvGen=None, pctEmbargo=None):
    '''
    Cross-validation scorer to be used in place of sklearn's cross_val_score
    for financial applications.

    There are problems with the cross-validation in sklearn:
    1. scoring functions do not know classes_, as a consequence of sklearn's
       reliance on numpy arrays rather than pandas series;
    2. cross_val_score will give different results because it passes weights
       to the fit method, but not to the log loss method.

    clf: estimator exposing fit / predict / predict_proba / classes_
    X, y, sample_weight: pandas objects, sliced positionally with .iloc
    scoring: 'neg_log_loss' or 'accuracy'
    t1, cv, pctEmbargo: forwarded to PurgedKFold when cvGen is None
                        (pctEmbargo=None is treated as no embargo)
    cvGen: optional splitter exposing split(X=...); built from PurgedKFold when None

    Returns a numpy array with one score per fold.
    Raises Exception when scoring is not one of the supported methods.
    '''
    if scoring not in ['neg_log_loss', 'accuracy']:
        raise Exception('wrong scoring method')
    from sklearn.metrics import log_loss, accuracy_score
    if cvGen is None:
        # Imported only when needed, so callers supplying their own cvGen do
        # not depend on the clfSequential module.
        from clfSequential import PurgedKFold
        embargo = 0. if pctEmbargo is None else pctEmbargo
        cvGen = PurgedKFold(n_splits=cv, t1=t1, pctEmbargo=embargo)  # purged
    scores = []
    for train, test in cvGen.split(X=X):
        fit = clf.fit(X=X.iloc[train, :], y=y.iloc[train], sample_weight=sample_weight.iloc[train].values)
        if scoring == 'neg_log_loss':
            prob = fit.predict_proba(X.iloc[test, :])
            # labels are passed explicitly so the loss knows every class seen in training
            score_ = -log_loss(y.iloc[test], prob, sample_weight=sample_weight.iloc[test].values, labels=clf.classes_)
        else:
            pred = fit.predict(X.iloc[test, :])
            score_ = accuracy_score(y.iloc[test], pred, sample_weight=sample_weight.iloc[test].values)
        scores.append(score_)
    return np.array(scores)