In [1]:
np.random.seed(2020)
from os import listdir
from os.path import isfile, join
import datetime as dt

In [2]:
%matplotlib inline

In [3]:
def preprocess_df(data, col_name):
    data = data[data.체결시각 != 31000000]
    data.일자     = data.일자.astype(str)
    data.종목코드 = data.종목코드.astype(str).str.zfill(6)
    data.체결시각 = data.체결시각.astype(str).str.zfill(8)
    data["거래대금"] = data.현재가 * data.체결수량
    data.index = pd.to_datetime(data.일자 + data.체결시각, format="%Y%m%d%H%M%S%f")
    data.index.name = "시각"    
    return data[set(col_name + ["거래대금"]) - set(["일자", "체결시각"])]

def value_bar(df, value_size):
    df = df.reset_index()
    
    df["value_num"] = df["거래대금"].cumsum() // value_size
    groupby = df.groupby("value_num")
    
    bars = groupby["현재가"].ohlc()
    bars[["volume", "value"]] = groupby[["체결수량", "거래대금"]].sum()
    bars["시각"] = groupby["시각"].first()
    bars.set_index("시각", inplace = True)
    return bars    

def read_all_csv(path, col_name):
    df = pd.DataFrame([])
    files = [f for f in listdir(path) if isfile(join(path, f))]
    for file in files:
        data = pd.read_csv(path + "/"+file, names = col_name)
        df = pd.concat([df, data], axis = 0)
    df = preprocess_df(df, col_name)
    df = value_bar(df, 5_000_000)
    df = df[df.volume > 0]
    return df

def preprocess_df(data, col_name):
    data = data[data.체결시각 != 31000000]
    data.일자     = data.일자.astype(str)
    data.종목코드 = data.종목코드.astype(str).str.zfill(6)
    data.체결시각 = data.체결시각.astype(str).str.zfill(8)
    data["거래대금"] = data.현재가 * data.체결수량
    data.index = pd.to_datetime(data.일자 + data.체결시각, format="%Y%m%d%H%M%S%f")
    data.index.name = "시각"    
    return data[set(col_name + ["거래대금"]) - set(["일자", "체결시각"])]

In [4]:
def mpPandasObj(func,pdObj,numThreads=24,mpBatches=1,linMols=True,**kargs):
    '''
    Parallelize jobs, return a dataframe or series
    + func: function to be parallelized. Returns a DataFrame
    + pdObj[0]: Name of argument used to pass the molecule
    + pdObj[1]: List of atoms that will be grouped into molecules
    + kwds: any other argument needed by func
    Example: df1=mpPandasObj(func,('molecule',df0.index),24,**kwds)
    '''
    import pandas as pd
    #if linMols:parts=linParts(len(argList[1]),numThreads*mpBatches)
    #else:parts=nestedParts(len(argList[1]),numThreads*mpBatches)
    if linMols:parts=linParts(len(pdObj[1]),numThreads*mpBatches)
    else:parts=nestedParts(len(pdObj[1]),numThreads*mpBatches)

    jobs=[]
    for i in range(1,len(parts)):
        job={pdObj[0]:pdObj[1][parts[i-1]:parts[i]],'func':func}
        job.update(kargs)
        jobs.append(job)
    if numThreads==1:out=processJobs_(jobs)
    else: out=processJobs(jobs,numThreads=numThreads)
    if isinstance(out[0],pd.DataFrame):df0=pd.DataFrame()
    elif isinstance(out[0],pd.Series):df0=pd.Series()
    else:return out
    for i in out:df0=df0.append(i)
    df0=df0.sort_index()
    return df0


def linParts(numAtoms,numThreads):
    # partition of atoms with a single loop
    parts=np.linspace(0,numAtoms,min(numThreads,numAtoms)+1)
    parts=np.ceil(parts).astype(int)
    return parts

def nestedParts(numAtoms,numThreads,upperTriang=False):
    # partition of atoms with an inner loop
    parts,numThreads_=[0],min(numThreads,numAtoms)
    for num in range(numThreads_):
        part=1+4*(parts[-1]**2+parts[-1]+numAtoms*(numAtoms+1.)/numThreads_)
        part=(-1+part**.5)/2.
        parts.append(part)
    parts=np.round(parts).astype(int)
    if upperTriang: # the first rows are heaviest
        parts=np.cumsum(np.diff(parts)[::-1])
        parts=np.append(np.array([0]),parts)
    return parts

def processJobs_(jobs):
    # Run jobs sequentially, for debugging
    out=[]
    for job in jobs:
        out_=expandCall(job)
        out.append(out_)
    return out
# =======================================================
# Example of async call to multiprocessing lib [20.9]
import multiprocessing as mp
import datetime as dt

#________________________________
def reportProgress(jobNum,numJobs,time0,task):
    # Report progress as asynch jobs are completed
    msg=[float(jobNum)/numJobs, (time.time()-time0)/60.]
    msg.append(msg[1]*(1/msg[0]-1))
    timeStamp=str(dt.datetime.fromtimestamp(time.time()))
    msg=timeStamp+' '+str(round(msg[0]*100,2))+'% '+task+' done after '+ \
        str(round(msg[1],2))+' minutes. Remaining '+str(round(msg[2],2))+' minutes.'
    if jobNum<numJobs:sys.stderr.write(msg+'\r')
    else:sys.stderr.write(msg+'\n')
    return
#________________________________
def processJobs(jobs,task=None,numThreads=24):
    # Run in parallel.
    # jobs must contain a 'func' callback, for expandCall
    if task is None:task=jobs[0]['func'].__name__
    pool=mp.Pool(processes=numThreads)
    outputs,out,time0=pool.imap_unordered(expandCall,jobs),[],time.time()
    # Process asyn output, report progress
    for i,out_ in enumerate(outputs,1):
        out.append(out_)
        reportProgress(i,len(jobs),time0,task)
    pool.close();pool.join() # this is needed to prevent memory leaks
    return out
# =======================================================
# Unwrapping the Callback [20.10]
def expandCall(kargs):
    # Expand the arguments of a callback function, kargs['func']
    func=kargs['func']
    del kargs['func']
    out=func(**kargs)
    return out
# =======================================================
# Pickle Unpickling Objects [20.11]
def _pickle_method(method):
    func_name=method.im_func.__name__
    obj=method.im_self
    cls=method.im_class
    return _unpickle_method, (func_name,obj,cls)
#________________________________
def _unpickle_method(func_name,obj,cls):
    for cls in cls.mro():
        try:func=cls.__dict__[func_name]
        except KeyError:pass
        else:break
    return func.__get__(obj,cls)
#________________________________

In [5]:
## 2.4. 대칭 CUSUM 필터
def getTEventS(gRaw, h):
    tEvents, sPos, sNeg = [], 0, 0
    diff = gRaw.diff()
    for i in diff.index[1:]:
        sPos, sNeg = max(0, sPos + diff.loc[i]), min(0, sNeg + diff.loc[i])
        if sNeg < -h:
            sNeg = 0
            tEvents.append(i)
        elif sNeg > h:
            sPos = 0
            tEvents.append(i)
    return pd.DatetimeIndex(tEvents)

# 3.1.일별 변동성 계산 
def getDailyVol(close, span0=100):
    # daily vol, reindexed to cloes
    df0 = close.index.searchsorted(close.index - pd.Timedelta(days=1))
    df0 = df0[df0>0]
    df0 = pd.Series(close.index[df0 - 1], index=close.index[close.shape[0] - df0.shape[0]:])
    df0 = close.loc[df0.index] / close.loc[df0.values].values - 1 # daily returns
    df0 = df0.ewm(span=span0).std()
    return df0

# 3.3. 최초 도달 시간 측정
def getEvents(close, tEvents, ptSl, trgt, minRet, numThreads=1, t1=False, side=None):
    # 1) 목표 구하기
    trgt = trgt.loc[tEvents]
    trgt = trgt[trgt > minRet]
    # 2) t1 구하기(최대 보유 기간)
    if t1 is False:
        t1 = pd.Series(pd.NaT, index=tEvents)
    # 3) t1에 손절을 적용해 이벤트 객체를 형성
    if side is None:
        side_, ptSl_ = pd.Series(1.0, index=trgt.index), [ptSl[0], ptSl[0]]
    else:
        side_, ptSl_ = side.loc[trgt.index], ptSl[:2]
    events = pd.concat({'t1': t1, 'trgt': trgt, 'side': side_}, axis=1).dropna(subset=['trgt'])
    df0 = mpPandasObj(func=applyPtSlOnT1, pdObj=('molecule', events.index), numThreads=numThreads, close=close, events=events, ptSl=ptSl_)
    events['t1'] = df0.dropna(how='all').min(axis=1) # pd.min ignores NaN
    if side is None:
        events = events.drop('side', axis=1)

    # store for later
    events['pt'] = ptSl[0]
    events['sl'] = ptSl[1]

    return events

# 3.2. 트리플-배리어 레이블 기법
def applyPtSlOnT1(close, events, ptSl, molecule):
    # apply stop loss/profit taking, if it takes place before t1 (end of event)
    events_ = events.loc[molecule]
    out = events_[['t1']].copy(deep=True)

    if ptSl[0] > 0:
        pt = ptSl[0] * events_['trgt']
    else:
        pt = pd.Series(index=events.index) # NaNs

    if ptSl[1] > 0:
        sl = - ptSl[1] * events_['trgt']
    else:
        sl = pd.Series(index=events.index) # 'mo NaNs

    for loc, t1 in events_['t1'].fillna(close.index[-1]).iteritems():
        df0 = close[loc:t1] # path prices
        df0 = (df0 / close[loc] - 1) * events_.at[loc, 'side'] # path returns
        out.loc[loc, 'sl'] = df0[df0<sl[loc]].index.min() # earliest stop loss
        out.loc[loc, 'pt'] = df0[df0>pt[loc]].index.min() # earliest profit take
    return out

In [6]:
## 4.1. 레이블의 고유성 계산
def mpNumCoEvents(closeIdx,t1,molecule):
    """
    바별로 공존하는 이벤트 개수 계산
    +molecule[0]은 가중값이 계산될 첫 이벤트 날짜
    +molecule[-1]은 가중값이 계산될 마지막 이벤트 날짜다.
    t1[moldecule].max() 이전에 발생하는 모든 이벤트는 개수에 영향을 미침
    """
    #1) [molecule[0],molecule[-1]] 구간에서 이벤트 탐색
    t1=t1.fillna(closeIdx[-1]) # 드러난 이벤트는 다른 가중값에 영향 미침
    t1=t1[t1>=molecule[0]] # molecule[0] 마지막이나 이후에 발생하는 이벤트
    t1=t1.loc[:t1[molecule].max()] # t1[molecule].max() 이전이나 시작시에 발생하는 이벤트
    
    #2) 이벤트 바 개수 확인 
    iloc=closeIdx.searchsorted(np.array([t1.index[0],t1.max()]))
    count=pd.Series(0,index=closeIdx[iloc[0]:iloc[1]+1])
    for tIn,tOut in t1.iteritems():count.loc[tIn:tOut]+=1.
    return count.loc[molecule[0]:t1[molecule].max()]

In [7]:
path = './005930_1M_20200517_20200814'
col_name = ["일자", "종목코드", "체결시각", "체결수량", "현재가", 
            "매도최우선호가", "매수최우선호가" ,"매도최우선호가잔량", "매수최우선호가잔량", 
            "매도10단계호가잔량합", "매수10단계호가잔량합", "매도성향체결수량", "매수성향체결수량"]

In [8]:
close = read_all_csv(path, col_name).close

In [9]:
daily_vol = getDailyVol(close)
daily_vol.head()

시각
2020-05-19 09:01:00         NaN
2020-05-19 09:02:00    0.003025
2020-05-19 09:03:00    0.002457
2020-05-19 09:04:00    0.002190
2020-05-19 09:05:00    0.002748
Name: close, dtype: float64

In [10]:
## 전체 길이가 h런 있을 경우만 표본 추출, h는 daily_vol의 mean값 사용
filtered_bars = getTEventS(close, daily_vol.mean())

In [11]:
len(filtered_bars) / len(close)

0.08413776178010471

In [12]:
trip_barr_events = getEvents(close, filtered_bars, ptSl=[1,1], trgt=daily_vol, minRet=0.01)
t1 = trip_barr_events["t1"]
trip_barr_events.head()

Unnamed: 0,t1,trgt,pt,sl
2020-05-20 09:08:00,2020-05-22 09:39:00,0.012649,1,1
2020-05-20 09:09:00,2020-05-22 10:11:00,0.013324,1,1
2020-05-20 09:12:00,2020-05-22 10:11:00,0.014298,1,1
2020-05-20 09:17:00,2020-05-22 10:11:00,0.01547,1,1
2020-05-20 09:41:00,2020-05-22 10:11:00,0.015548,1,1


In [13]:
numCoEvents = mpNumCoEvents(close.index, trip_barr_events['t1'], filtered_bars)
numCoEvents.head()

시각
2020-05-20 09:08:00    1.0
2020-05-20 09:09:00    2.0
2020-05-20 09:10:00    2.0
2020-05-20 09:11:00    2.0
2020-05-20 09:12:00    3.0
dtype: float64

In [14]:
close.head()

시각
2020-05-18 09:00:00.990    47850
2020-05-18 09:01:00.000    47950
2020-05-18 09:02:00.000    47950
2020-05-18 09:03:00.000    47900
2020-05-18 09:04:00.000    47950
Name: close, dtype: int64

In [15]:
molecule = getTEventS(close, daily_vol.mean())
molecule[:5]

DatetimeIndex(['2020-05-18 09:03:00', '2020-05-18 09:05:00',
               '2020-05-18 09:10:00', '2020-05-18 09:11:00',
               '2020-05-18 09:12:00'],
              dtype='datetime64[ns]', freq=None)

In [16]:
## 절대 수익률 기여도에 의한 표본 가중값 결정
def mpSampleW(t1, numCoEvents, close, molecule):
    # 수익률 기여에 따른 샘플 가중값 도출
    ret = np.log(close).diff()
    wght = pd.Series(index=molecule)
    for tIn, tOut in t1.loc[wght.index].iteritems():
        wght.loc[tIn] = (ret.loc[tIn:tOut] / numCoEvents.loc[tIn:tOut]).sum()
    return wght.abs()

In [17]:
ret = np.log(close).diff()
wght = pd.Series(index=molecule)
ls =[]
for tIn, tOut in t1.loc[wght.index].iteritems():
    wght.loc[tIn] = (ret.loc[tIn:tOut] / numCoEvents.loc[tIn:tOut]).sum()
    ls.append(tOut)

In [19]:
w = mpSampleW(t1, numCoEvents, close, filtered_bars)
w.tail()

2020-08-13 15:07:00    0.001503
2020-08-13 15:08:00    0.001333
2020-08-13 15:10:00    0.001143
2020-08-13 15:13:00    0.000858
2020-08-13 15:15:00    0.000000
dtype: float64

In [20]:
w *= w.shape[0] / w.sum()
w.tail()

2020-08-13 15:07:00    4.302653
2020-08-13 15:08:00    3.814569
2020-08-13 15:10:00    3.271328
2020-08-13 15:13:00    2.455071
2020-08-13 15:15:00    0.000000
dtype: float64

## 4.11

In [21]:
# 4.2. 레이블의 평균 고유성
def mpSampleTW(t1, numCoEvents, molecule):
    ## 이벤트 생명 주기 동안의 평균 고유성 도출
    wght = pd.Series(index=molecule)
    for tIn, tOut in t1.loc[wght.index].iteritems():
        wght.loc[tIn] = (1.0 / numCoEvents.loc[tIn:tOut]).mean()
    return wght


In [27]:
tW = mpSampleTW(t1, numCoEvents, molecule)
tW

2020-05-18 09:03:00         inf
2020-05-18 09:05:00         inf
2020-05-18 09:10:00         inf
2020-05-18 09:11:00         inf
2020-05-18 09:12:00         inf
                         ...   
2020-08-13 15:07:00    0.184921
2020-08-13 15:08:00    0.199074
2020-08-13 15:10:00    0.229167
2020-08-13 15:13:00    0.250000
2020-08-13 15:15:00         NaN
Length: 2057, dtype: float64

In [24]:
def getTimeDecay(tW, clfLastW=1.0):
    # 관측된 고유성(tW)에 구간-선형 감쇄 적용
    # 최신 관측값  weight = 1, 가장 오래된 관측값 : clfLastW
    clfW = tW.sort_index().cumsum()
    if clfLastW >= 0:
        slope = (1.0 - clfLastW) / clfW.iloc[-1]
    else:
        slope = 1 / ((clfLastW + 1) * clfW.iloc[-1])
    const = 1.0 - slope * clfW.iloc[-1]
    clfW = const + slope * clfW
    clfW[clfW < 0] = 0
    print(const, slope)
    return clfW

## 4.1. 레이블의 고유성 계산

In [2]:
def mpNumCoEvents(closeIdx,t1,molecule):
    """
    바별로 공존하는 이벤트 개수 계산
    +molecule[0]은 가중값이 계산될 첫 이벤트 날짜
    +molecule[-1]은 가중값이 계산될 마지막 이벤트 날짜다.
    t1[moldecule].max() 이전에 발생하는 모든 이벤트는 개수에 영향을 미침
    """
    #1) [molecule[0],molecule[-1]] 구간에서 이벤트 탐색
    t1=t1.fillna(closeIdx[-1]) # 드러난 이벤트는 다른 가중값에 영향 미침
    t1=t1[t1>=molecule[0]] # molecule[0] 마지막이나 이후에 발생하는 이벤트
    t1=t1.loc[:t1[molecule].max()] # t1[molecule].max() 이전이나 시작시에 발생하는 이벤트
    
    #2) 이벤트 바 개수 확인 
    iloc=closeIdx.searchsorted(np.array([t1.index[0],t1.max()]))
    count=pd.Series(0,index=closeIdx[iloc[0]:iloc[1]+1])
    for tIn,tOut in t1.iteritems():count.loc[tIn:tOut]+=1.
    return count.loc[molecule[0]:t1[molecule].max()]

In [None]:
conc_events = mpNumCoEvents(filtered_bars, trip_barr_events['t1'], filtered_bars)

## 4.2. 레이블의 평균 고유성 계산

In [3]:
def mpSampleTW(t1, numCoEvents, molecule):
    ## 이벤트 생명 주기 동안의 평균 고유성 도출
    wght = pd.Series(index=molecule)
    for tIn, tOut in t1.loc[wght.index].iteritems():
        wght.loc[tIn] = (1.0 / numCoEvents.loc[tIn:tOut]).mean()
    return wght


## 4.3. 지표 행렬 구축

In [4]:
def getIndMatrix(barIx, t1):
    ## 지표 행렬 구하기
    indM = pd.DataFrame(0, index=barIx, columns=range(t1.shape[0]))
    for i, (t0, t1) in enumerate(t1.iteritems()):
        indM.loc[t0:t1, i] = 1.0
    return indM

## 4.4 평균 고유성 계산

In [5]:
def getAvgUniqueness(indM):
    # 지표 행렬로부터의 평균 고유성 
    c = indM.sum(axis=1)  # 공존
    u = indM.div(c, axis=0)  # 공유성
    avgU = u[u > 0].mean()  # 평균 고유성
    return avgU

## 4.5. 순차적 부트스랩으로부터 수익률 표본 추출

In [6]:
def seqBootstrap(indM,sLength = None):
    # Generate a sample via sequential bootstrap
    if sLength is None:sLength = indM.shape[1]
    phi = []
    while len(phi) < sLength:
        avgU = pd.Series()
        for i in indM:
            indM_ = indM[phi + [i]] # reduce indM
            avgU.loc[i] = getAvgUniqueness(indM_).iloc[-1]
        prob = avgU/avgU.sum() # draw prob
        phi += [np.random.choice(indM.columns, p=prob)]
    return phi


## 4.7. 랜덤 T1 Series 생성

In [7]:
def getRndT1(numObs, numBars, maxH):
    # Serie t1 aleatoria
    t1 = pd.Series()
    for i in range(numObs):
        ix = np.random.randint(0, numBars)
        val = ix + np.random.randint(1, maxH)
        t1.loc[ix] = val
    return t1.sort_index()

## 4.8. 표준과 순차적 부트스트랩에서의 고유성

In [8]:
def auxMC(numObs,numBars,maxH):
    # Parallelized auxiliary function
    t1 = getRndT1(numObs,numBars,maxH)
    barIx = range(t1.max()+1)
    indM = getIndMatrix(barIx,t1)
    phi = np.random.choice(indM.columns,size = indM.shape[1])
    stdU = getAvgUniqueness(indM[phi]).mean()
    phi = seqBootstrap(indM)
    seqU = getAvgUniqueness(indM[phi]).mean()
    return { 'stdU':stdU,'seqU':seqU }


## 4.10 절대 수익률 기여도에 의한 표본 가중값 결정

In [9]:
def mpSampleW(t1, numCoEvents, close, molecule):
    # 수익률 기여에 따른 샘플 가중값 도출
    ret = np.log(close).diff() # 로그 리턴이브로 가산적
    wght = pd.Series(index=molecule)
    for tIn, tOut in t1.loc[wght.index].iteritems():
        wght.loc[tIn] = (ret.loc[tIn:tOut] / numCoEvents.loc[tIn:tOut]).sum()
    return wght.abs()

In [10]:
def seqBootstrap(indM,sLength = None):
    # Generate a sample via sequential bootstrap
    if sLength is None:sLength = indM.shape[1]
    phi = []
    while len(phi) < sLength:
        avgU = pd.Series()
        for i in indM:
            indM_ = indM[phi + [i]] # reduce indM
            avgU.loc[i] = getAvgUniqueness(indM_).iloc[-1]
        prob = avgU/avgU.sum() # draw prob
        phi += [np.random.choice(indM.columns, p=prob)]
    return phi





def mainMC(numbObs=10, numBars=100, maxH=5, numIters=1000, numThreads=24):
    # Experimentos Monte Carlo
    jobs = []
    for i in range(int(numIters)):
        job = {"func": auxMC,
               "numObs": numbObs,
               "numBars": numBars,
               "maxH": maxH}
        jobs.append(job)
    return pd.DataFrame(jobs).describe()


def mpSampleW(t1, numCoEvents, close, molecule):
    # Derivar el peso de ejemplar por la atribución devuelta
    ret = np.log(close).diff()
    wght = pd.Series(index=molecule)
    for tIn, tOut in t1.loc[wght.index].iteritems():
        wght.loc[tIn] = (ret.loc[tIn:tOut] / numCoEvents.loc[tIn:tOut]).sum()
    return wght.abs()


def getTimeDecay(tW, clfLastW=1.0):
    # Aplicar un decaimiento lineal por pieza a la unicidad obserbada tW
    # Las observaciones más recientes obtienen un weight=1, las más antiguas obtiene weight=clfLastW
    clfW = tW.sort_index().cumsum()
    if clfLastW >= 0:
        slope = (1.0 - clfLastW) / clfW.iloc[-1]
    else:
        slope = 1.0 / ((clfLastW + 1) * clfW.iloc[-1])
    const = 1.0 - slope * clfW.iloc[-1]
    clfW = const + slope * clfW
    clfW[clfW < 0] = 0
    print(const, slope)
    return clfW


In [11]:
t1 = pd.Series([2, 3, 5], index=[0, 2, 4])

In [12]:
barIx = range(t1.max() + 1)

In [26]:
indM = getIndMatrix(barIx, t1)
indM.columns = [f"$i={i}$" for i in indM.columns]
indM.index =  [f"$1_{i},i$" for i in indM.index]
indM.astype(int)

Unnamed: 0,$i=0$,$i=1$,$i=2$
"$1_0,i$",1,0,0
"$1_1,i$",1,0,0
"$1_2,i$",1,1,0
"$1_3,i$",0,1,0
"$1_4,i$",0,0,1
"$1_5,i$",0,0,1


In [20]:
if __name__ == '__main__':
    t1 = pd.Series([2, 3, 5], index=[0, 2, 4])
    barIx = range(t1.max() + 1)
    indM = getIndMatrix(barIx, t1)
    phi = np.random.choice(indM.columns, size=indM.shape[1])
    print(phi)
    print("Unicidad estándar:", getAvgUniqueness(indM[phi]).mean())
    phi = seqBootstrap(indM)
    print(phi)
    print("Unicidad secuencial:", getAvgUniqueness(indM[phi]).mean())
    print(mainMC())

[0 1 1]
Unicidad estándar: 0.5370370370370371
[0, 2, 0]
Unicidad secuencial: 0.6666666666666666
       numObs  numBars    maxH
count  1000.0   1000.0  1000.0
mean     10.0    100.0     5.0
std       0.0      0.0     0.0
min      10.0    100.0     5.0
25%      10.0    100.0     5.0
50%      10.0    100.0     5.0
75%      10.0    100.0     5.0
max      10.0    100.0     5.0


In [12]:
t1 = pd.Series([2, 3, 5], index=[0, 2, 4])
t1

0    2
2    3
4    5
dtype: int64

In [13]:
barIx = range(t1.max() + 1)
barIx

range(0, 6)

In [14]:
indM = getIndMatrix(barIx, t1)
indM.columns = [f"y_{i}" for i in indM.columns]
indM.index =  [f"r_{i}" for i in indM.index]
indM.astype(int)

Unnamed: 0,y_0,y_1,y_2
r_0,1,0,0
r_1,1,0,0
r_2,1,1,0
r_3,0,1,0
r_4,0,0,1
r_5,0,0,1


In [15]:
phi = np.random.choice(indM.columns, size=indM.shape[1])
print("Unicidad estándar:", getAvgUniqueness(indM[phi]).mean())
phi

Unicidad estándar: 0.5185185185185185


array(['y_1', 'y_0', 'y_0'], dtype=object)

In [16]:
phi = seqBootstrap(indM)
print(phi)

['y_2', 'y_2', 'y_1']
Unicidad secuencial: 0.6666666666666666
       numObs  numBars    maxH
count  1000.0   1000.0  1000.0
mean     10.0    100.0     5.0
std       0.0      0.0     0.0
min      10.0    100.0     5.0
25%      10.0    100.0     5.0
50%      10.0    100.0     5.0
75%      10.0    100.0     5.0
max      10.0    100.0     5.0


In [22]:
print("Unicidad secuencial:", getAvgUniqueness(indM[phi]).mean())
mainMC().astype(int)

Unicidad secuencial: 0.6666666666666666


Unnamed: 0,numObs,numBars,maxH
count,1000,1000,1000
mean,10,100,5
std,0,0,0
min,10,100,5
25%,10,100,5
50%,10,100,5
75%,10,100,5
max,10,100,5
