In [6]:
#estimating the uniqueness of a label for concurrent returns

#finding overlapping events
def mpNumCoEvents(closeIdx, t1, molecule):
    '''Compute the number of concurrent events per bar.

    Parameters
    ----------
    closeIdx : pandas.Index
        Sorted index of the price bars.
    t1 : pandas.Series
        Event end times indexed by event start times; NaN marks an event
        that has not closed yet.
    molecule : sequence of event start labels
        molecule[0] is the date of the first event on which the weight will
        be computed, molecule[-1] the date of the last one.

    Returns
    -------
    pandas.Series
        Number of events spanning each bar between molecule[0] and
        t1[molecule].max(). Any event that starts before t1[molecule].max()
        impacts the count.
    '''
    # 1) find events that span the period [molecule[0], molecule[-1]]
    t1 = t1.fillna(closeIdx[-1])  # unclosed events must still impact other weights
    t1 = t1[t1 >= molecule[0]]  # events that end at or after molecule[0]
    lastEnd = t1.loc[molecule].max()
    t1 = t1.loc[:lastEnd]  # events that start at or before t1[molecule].max()

    # 2) count events spanning a bar
    iloc = closeIdx.searchsorted(np.array([t1.index[0], t1.max()]))
    # float counter so that the float increments below never change the dtype
    count = pd.Series(0., index=closeIdx[iloc[0]:iloc[1] + 1])
    for tIn, tOut in t1.items():
        count.loc[tIn:tOut] += 1.
    return count.loc[molecule[0]:lastEnd]

In [7]:
def mpSampleTW(t1, numCoEvents, molecule):
    '''Derive the average uniqueness of each event over its lifespan.

    Parameters
    ----------
    t1 : pandas.Series
        Event end labels indexed by event start labels.
    numCoEvents : pandas.Series
        Number of concurrent events per bar.
    molecule : sequence of event start labels
        The events for which a weight is computed.

    Returns
    -------
    pandas.Series
        Indexed by molecule; each value is the mean of 1/numCoEvents over
        the bars from the event's start to its end (both inclusive).
    '''
    # explicit float dtype: a dtype-less empty Series is object-typed on current pandas
    wght = pd.Series(index=molecule, dtype=float)
    for tIn, tOut in t1.loc[wght.index].items():
        wght.loc[tIn] = (1. / numCoEvents.loc[tIn:tOut]).mean()
    return wght
    

In [8]:
#using sequential bootstrapping to draw samples by likelihood of uniqueness, thus creating a sampling closer to IID 
#than using standard bootstrapping on a dataset with overlapping features

#build an indicator matrix 
import pandas as pd
import numpy as np 

def getIndMatrix(barIx, t1):
    '''Build the indicator matrix of which bars each event spans.

    Parameters
    ----------
    barIx : index-like
        Labels of the bars (rows of the result).
    t1 : pandas.Series
        Event end labels indexed by event start labels.

    Returns
    -------
    pandas.DataFrame
        One row per bar and one column per event (columns are numbered
        0..len(t1)-1 in the order of t1); a cell is 1 when the bar lies
        between the event's start and end (both inclusive), otherwise 0.
    '''
    indM = pd.DataFrame(0, index=barIx, columns=range(t1.shape[0]))
    # distinct loop names so the t1 argument is not shadowed while iterating
    for i, (tStart, tEnd) in enumerate(t1.items()):
        indM.loc[tStart:tEnd, i] = 1
    return indM


In [9]:
def getAvgUniqueness(indM):
    '''Average uniqueness of each column of an indicator matrix.

    Each entry is divided by its row total (the concurrency of that bar),
    and the strictly positive results are averaged column by column.
    '''
    concurrency = indM.sum(axis=1)
    uniqueness = indM.div(concurrency, axis=0)
    # mask non-positive entries so that only bars the event spans enter the mean
    spanned = uniqueness.where(uniqueness > 0)
    return spanned.mean()

def seqBootstrap(indM, sLength=None):
    '''Generate a sample via sequential bootstrap.

    Parameters
    ----------
    indM : pandas.DataFrame
        Indicator matrix (rows are bars, columns are events).
    sLength : int, optional
        Number of draws; defaults to the number of columns of indM.

    Returns
    -------
    list
        Drawn column labels, in draw order. At every step each candidate
        column is drawn with probability proportional to the average
        uniqueness it would have if appended to the columns drawn so far.
    '''
    if sLength is None:
        sLength = indM.shape[1]
    phi = []
    while len(phi) < sLength:
        # Explicit float dtype: a Series grown from a dtype-less pd.Series() is
        # object-typed on current pandas, which is not a valid probability vector.
        avgU = pd.Series(
            {i: getAvgUniqueness(indM[phi + [i]]).iloc[-1] for i in indM},
            dtype=float,
        )
        prob = avgU / avgU.sum()  # draw probabilities
        phi += [np.random.choice(indM.columns, p=prob.values)]
    return phi

In [10]:
def main():
    '''Compare average uniqueness of a standard and a sequential bootstrap
    draw on a three-event toy example, printing both figures.'''
    eventEnds = pd.Series([2,3,5], index=[0,2,4]) # start label -> end label
    bars = range(eventEnds.max()+1)
    indicator = getIndMatrix(bars, eventEnds)

    # standard bootstrap: uniform draw with replacement over the events
    nEvents = indicator.shape[1]
    draw = np.random.choice(indicator.columns, size=nEvents)
    print(draw)
    standardU = getAvgUniqueness(indicator[draw]).mean()
    print('Standard Uniqueness:', standardU)

    # sequential bootstrap over the same indicator matrix
    draw = seqBootstrap(indicator)
    sequentialU = getAvgUniqueness(indicator[draw]).mean()
    print('Sequential Uniqueness:', sequentialU)
    return

# Run the toy comparison: prints the standard-bootstrap draw and both average-uniqueness figures.
main()

[0 2 2]
Standard Uniqueness: 0.6666666666666666
Sequential Uniqueness: 0.8611111111111112


In [11]:
# monte carlo experiment 
def getRndT1(numObs, numBars, maxH):
    '''Generate a random series of event spans.

    Parameters
    ----------
    numObs : int
        Number of (start, end) pairs to draw.
    numBars : int
        Start bars are drawn uniformly from [0, numBars).
    maxH : int
        Each end is start + h with h drawn uniformly from [1, maxH).

    Returns
    -------
    pandas.Series
        int64 end bars indexed by start bar, sorted by start. A start bar
        drawn more than once keeps only its last end, so the result can hold
        fewer than numObs events.
    '''
    ends = {}
    for _ in range(numObs):
        ix = np.random.randint(0, numBars)
        ends[ix] = ix + np.random.randint(1, maxH)
    # Explicit integer dtype: callers pass t1.max() to range(), and growing a
    # dtype-less empty Series leaves the resulting dtype version-dependent.
    return pd.Series(ends, dtype='int64').sort_index()

def auxMC(numObs,numBars,maxH):
    '''One Monte Carlo trial (run as a parallel job): draw random events and
    return the mean average uniqueness of a standard draw ('stdU') and of a
    sequential-bootstrap draw ('seqU').'''
    eventEnds = getRndT1(numObs, numBars, maxH)
    indicator = getIndMatrix(range(eventEnds.max() + 1), eventEnds)

    # standard bootstrap first, so the random draws happen in the usual order
    nEvents = indicator.shape[1]
    standardDraw = np.random.choice(indicator.columns, size=nEvents)
    result = {'stdU': getAvgUniqueness(indicator[standardDraw]).mean()}

    sequentialDraw = seqBootstrap(indicator)
    result['seqU'] = getAvgUniqueness(indicator[sequentialDraw]).mean()
    return result
    

In [2]:
import multiprocessing as mp
import time
import datetime as dt
import sys
def expandCall(kargs):
    '''Expand the arguments of a callback function.

    kargs is a dict holding the callable under the key 'func'; every other
    entry is passed to that callable as a keyword argument. Returns whatever
    the callable returns. The dict itself is left untouched, so the same job
    can be executed again.
    '''
    func = kargs['func']
    # filter into a new dict rather than deleting 'func' from the caller's job
    params = {key: value for key, value in kargs.items() if key != 'func'}
    return func(**params)

def processJobs_(jobs):
    '''Run jobs one after another in the current process (for debugging);
    returns the list of results in job order.'''
    return [expandCall(job) for job in jobs]

def reportProgress(jobNum, numJobs, time0, task):
    '''Report progress on stderr as asynchronous jobs are completed.

    Parameters
    ----------
    jobNum : int
        Number of jobs completed so far.
    numJobs : int
        Total number of jobs.
    time0 : float
        time.time() value taken when processing started.
    task : str
        Label shown in the message.

    The line ends with a carriage return while jobs remain (so the next
    report overwrites it) and with a newline once jobNum reaches numJobs.
    '''
    fraction = float(jobNum) / numJobs
    elapsed = (time.time() - time0) / 60.
    # linear extrapolation of the time left; undefined before the first job completes
    remaining = elapsed * (1 / fraction - 1) if fraction > 0 else float('inf')
    timeStamp = str(dt.datetime.fromtimestamp(time.time()))
    msg = (timeStamp + ' ' + str(round(fraction * 100, 2)) + '% ' + task +
           ' done after ' + str(round(elapsed, 2)) + ' minutes. Remaining ' +
           str(round(remaining, 2)) + ' minutes.')
    if jobNum < numJobs:
        sys.stderr.write(msg + '\r')
    else:
        sys.stderr.write(msg + '\n')
    return

def processJobs(jobs, task=None, numThreads=24):
    '''Run jobs in parallel on a multiprocessing pool.

    Parameters
    ----------
    jobs : list of dict
        Each job must contain a 'func' callback, for expandCall.
    task : str, optional
        Label used in progress messages; defaults to the name of the first
        job's callback.
    numThreads : int
        Number of worker processes.

    Returns
    -------
    list
        Job results in completion order (not necessarily job order).
    '''
    if len(jobs) == 0:
        return []  # nothing to run; also avoids indexing jobs[0] below
    if task is None:
        task = jobs[0]['func'].__name__
    pool = mp.Pool(processes=numThreads)
    out, time0 = [], time.time()
    try:
        outputs = pool.imap_unordered(expandCall, jobs)
        # process asynchronous output, report progress
        for i, out_ in enumerate(outputs, 1):
            out.append(out_)
            reportProgress(i, len(jobs), time0, task)
        pool.close()
    except BaseException:
        # a failed job or an interrupt must not leave worker processes behind
        pool.terminate()
        raise
    finally:
        pool.join()  # prevent memory leaks
    return out

In [13]:
def mainMC(numObs=10, numBars=100, maxH=5, numIters=1E6, numThreads=24):
    '''Monte Carlo experiment: run auxMC numIters times (sequentially when
    numThreads is 1, otherwise in parallel) and print summary statistics of
    the collected results.'''
    jobs = [
        {'func': auxMC, 'numObs': numObs, 'numBars': numBars, 'maxH': maxH}
        for _ in range(int(numIters))
    ]
    if numThreads != 1:
        results = processJobs(jobs, numThreads=numThreads)
    else:
        results = processJobs_(jobs)
    summary = pd.DataFrame(results).describe()
    print(summary)
    return

# Launch the experiment with its defaults (numIters=1E6, numThreads=24);
# the progress line recorded below shows this run finishing after 1116.9 minutes.
mainMC()

2019-01-24 13:47:17.294702 100.0% auxMCdone after 1116.9 minutes. Remianing 0.0minutes.....


                 seqU            stdU
count  1000000.000000  1000000.000000
mean         0.693431        0.607656
std          0.092250        0.100507
min          0.301562        0.200000
25%          0.628333        0.541667
50%          0.700000        0.600000
75%          0.761667        0.672222
max          1.000000        1.000000


In [2]:
#determination of sample weight by absolute return attribution 
def mpSampleW(t1, numCoEvents, close, molecule):
    '''Derive sample weights by absolute return attribution.

    Parameters
    ----------
    t1 : pandas.Series
        Event end labels indexed by event start labels.
    numCoEvents : pandas.Series
        Number of concurrent events per bar.
    close : pandas.Series
        Prices per bar.
    molecule : sequence of event start labels
        The events for which a weight is computed.

    Returns
    -------
    pandas.Series
        Indexed by molecule; each value is the absolute value of the sum,
        over the event's bars, of the bar's log-return divided by the number
        of concurrent events on that bar.
    '''
    ret = np.log(close).diff()  # log-returns, so that they are additive
    # explicit float dtype: a dtype-less empty Series is object-typed on current pandas
    wght = pd.Series(index=molecule, dtype=float)
    for tIn, tOut in t1.loc[wght.index].items():
        wght.loc[tIn] = (ret.loc[tIn:tOut] / numCoEvents.loc[tIn:tOut]).sum()
    return wght.abs()

In [4]:
def getTimeDecay(tW, clfLastW=1.):
    '''Apply piecewise-linear decay to observed uniqueness.

    Parameters
    ----------
    tW : pandas.Series
        Observed uniqueness per observation; it is sorted by index and
        cumulated before the decay is applied.
    clfLastW : float
        The newest observation gets weight 1. For clfLastW >= 0 the weights
        are clfLastW + (1 - clfLastW) * (cumulated uniqueness / total), so
        the oldest end of the line tends towards clfLastW. For clfLastW < 0
        the line is steeper and the weights that fall below zero are set to 0.

    Returns
    -------
    pandas.Series
        Decay factors indexed like the sorted tW; empty input yields an
        empty Series.
    '''
    clfW = tW.sort_index().cumsum()
    if clfW.empty:
        return clfW  # nothing to decay; avoids indexing the last element
    total = clfW.iloc[-1]
    if clfLastW >= 0:
        slope = (1. - clfLastW) / total
    else:
        slope = 1. / ((clfLastW + 1) * total)
    const = 1. - slope * total  # makes the newest observation weigh exactly 1
    clfW = const + slope * clfW
    clfW[clfW < 0] = 0
    return clfW
