In [1]:
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile

In [2]:
def zip_to_df(link):
    """
    Reads ATUS zip files into pandas dataframes.
    Args:
    link: str
    Returns:
    pandas.DataFrame
    """   
    with urlopen(link) as zipresp:
        with ZipFile(BytesIO(zipresp.read())) as zfile:
            txt1 = zfile.read("_".join(link.split('/')[-1].split('.')[0].split('-')) + '.dat')
    txt_lst = [i.split(',') for i in txt1.decode("utf-8").split('\r\n')]
    df1 = pd.DataFrame(txt_lst)
    df1.columns = df1.iloc[0]
    df1 = df1[1:]
    return df1

In [3]:
import pandas as pd

df = zip_to_df('https://www.bls.gov/tus/special.requests/atusact-0320.zip')
df.head()

Unnamed: 0,TUCASEID,TUACTIVITY_N,TUACTDUR24,TUCC5,TUCC5B,TRTCCTOT_LN,TRTCC_LN,TRTCOC_LN,TUSTARTTIM,TUSTOPTIME,...,TRTONHH_LN,TRTOHH_LN,TRTHH_LN,TRTNOHH_LN,TEWHERE,TUCC7,TRWBELIG,TRTEC_LN,TUEC24,TUDURSTOP
1,20030100013280,1,60,-1,-1,-1,-1,-1,04:00:00,05:00:00,...,-1,-1,-1,-1,9,-1,-1,-1,-1,-1
2,20030100013280,2,30,-1,-1,-1,-1,-1,05:00:00,05:30:00,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,20030100013280,3,600,-1,-1,-1,-1,-1,05:30:00,15:30:00,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,20030100013280,4,150,-1,-1,-1,-1,-1,15:30:00,18:00:00,...,-1,-1,-1,-1,1,-1,-1,-1,-1,-1
5,20030100013280,5,5,-1,-1,-1,-1,-1,18:00:00,18:05:00,...,-1,-1,-1,-1,1,-1,-1,-1,-1,-1


In [4]:
df.shape

(4276393, 29)

In [5]:
df_ec = zip_to_df('https://www.bls.gov/tus/special.requests/atusrostec-1120.zip')
df_ec.head()

Unnamed: 0,TUCASEID,TEAGE_EC,TEELDUR,TEELWHO,TEELYRS,TRELHH,TUECLNO,TULINENO
1,20110101110074,70,4,44,2,0,5,-1
2,20110101110156,85,4,46,2,0,5,-1
3,20110101110507,80,1,55,-1,0,2,-1
4,20110101110521,85,3,43,-1,0,3,-1
5,20110101110522,80,4,44,6,0,2,-1


In [6]:
df_ec.shape

(26436, 8)

## Filter caregivers

In [7]:
elec = df[df['TUCASEID'].isin(df_ec['TUCASEID'])]
elec.head()

Unnamed: 0,TUCASEID,TUACTIVITY_N,TUACTDUR24,TUCC5,TUCC5B,TRTCCTOT_LN,TRTCC_LN,TRTCOC_LN,TUSTARTTIM,TUSTOPTIME,...,TRTONHH_LN,TRTOHH_LN,TRTHH_LN,TRTNOHH_LN,TEWHERE,TUCC7,TRWBELIG,TRTEC_LN,TUEC24,TUDURSTOP
2228592,20110101110074,1,10,0,0,0,-1,0,04:00:00,04:10:00,...,-1,-1,-1,-1,1,0,-1,-1,-1,-1
2228593,20110101110074,2,300,0,0,0,-1,0,04:10:00,09:10:00,...,-1,-1,-1,-1,1,0,-1,-1,-1,-1
2228594,20110101110074,3,5,0,0,0,-1,0,09:10:00,09:15:00,...,-1,-1,-1,-1,13,0,-1,-1,-1,-1
2228595,20110101110074,4,525,0,0,0,-1,0,09:15:00,18:00:00,...,-1,-1,-1,-1,3,0,-1,-1,-1,-1
2228596,20110101110074,5,5,0,0,0,-1,0,18:00:00,18:05:00,...,-1,-1,-1,-1,13,0,-1,-1,-1,-1


In [8]:
elec.shape

(395956, 29)

In [9]:
not_elec = df[~df['TUCASEID'].isin(df_ec['TUCASEID'])]
not_elec.shape

(3880437, 29)

## creating two dataframes = demographic & diary

In [10]:
elec.columns

Index(['TUCASEID', 'TUACTIVITY_N', 'TUACTDUR24', 'TUCC5', 'TUCC5B',
       'TRTCCTOT_LN', 'TRTCC_LN', 'TRTCOC_LN', 'TUSTARTTIM', 'TUSTOPTIME',
       'TRCODEP', 'TRTIER1P', 'TRTIER2P', 'TUCC8', 'TUCUMDUR', 'TUCUMDUR24',
       'TUACTDUR', 'TR_03CC57', 'TRTO_LN', 'TRTONHH_LN', 'TRTOHH_LN',
       'TRTHH_LN', 'TRTNOHH_LN', 'TEWHERE', 'TUCC7', 'TRWBELIG', 'TRTEC_LN',
       'TUEC24', 'TUDURSTOP'],
      dtype='object', name=0)

In [11]:
elec.head()

Unnamed: 0,TUCASEID,TUACTIVITY_N,TUACTDUR24,TUCC5,TUCC5B,TRTCCTOT_LN,TRTCC_LN,TRTCOC_LN,TUSTARTTIM,TUSTOPTIME,...,TRTONHH_LN,TRTOHH_LN,TRTHH_LN,TRTNOHH_LN,TEWHERE,TUCC7,TRWBELIG,TRTEC_LN,TUEC24,TUDURSTOP
2228592,20110101110074,1,10,0,0,0,-1,0,04:00:00,04:10:00,...,-1,-1,-1,-1,1,0,-1,-1,-1,-1
2228593,20110101110074,2,300,0,0,0,-1,0,04:10:00,09:10:00,...,-1,-1,-1,-1,1,0,-1,-1,-1,-1
2228594,20110101110074,3,5,0,0,0,-1,0,09:10:00,09:15:00,...,-1,-1,-1,-1,13,0,-1,-1,-1,-1
2228595,20110101110074,4,525,0,0,0,-1,0,09:15:00,18:00:00,...,-1,-1,-1,-1,3,0,-1,-1,-1,-1
2228596,20110101110074,5,5,0,0,0,-1,0,18:00:00,18:05:00,...,-1,-1,-1,-1,13,0,-1,-1,-1,-1


In [12]:
elec['TUEC24'].unique()
#TUEC24
#-1 "Blank"
#-2 "Don't Know"
#-3 "Refused"
#1 "Activity identified as eldercare"
#96 "All day"
#97 "No more activities"

array(['-1', '1', '97', '96', '-2', '-3', None], dtype=object)

In [30]:
elec.loc[elec['TUEC24']=='96', ]

Unnamed: 0,TUCASEID,TUACTIVITY_N,TUACTDUR24,TUCC5,TUCC5B,TRTCCTOT_LN,TRTCC_LN,TRTCOC_LN,TUSTARTTIM,TUSTOPTIME,...,TRTONHH_LN,TRTOHH_LN,TRTHH_LN,TRTNOHH_LN,TEWHERE,TUCC7,TRWBELIG,TRTEC_LN,TUEC24,TUDURSTOP
2229083,20110101110664,1,270,0,0,0,-1,0,04:00:00,08:30:00,...,-1,-1,-1,-1,-1,0,-1,0,96,-1
2234583,20110111101482,1,90,0,0,0,-1,0,04:00:00,05:30:00,...,-1,-1,-1,-1,-1,0,-1,0,96,-1
2251269,20110112102311,1,360,1,0,0,0,0,04:00:00,10:00:00,...,-1,0,0,-1,-1,0,-1,0,96,-1
2268452,20110212100920,1,300,0,0,0,-1,0,04:00:00,09:00:00,...,-1,-1,-1,-1,-1,0,-1,0,96,-1
2268537,20110212101031,1,240,0,0,0,-1,0,04:00:00,08:00:00,...,-1,-1,-1,-1,-1,0,-1,0,96,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4262818,20201111200879,1,180,1,0,0,0,0,04:00:00,07:00:00,...,-1,0,0,-1,-1,0,-1,0,96,2
4267706,20201211200620,1,240,0,0,0,-1,0,04:00:00,08:00:00,...,-1,-1,-1,-1,-1,0,-1,0,96,2
4270129,20201211201240,1,45,0,0,0,-1,0,04:00:00,04:45:00,...,-1,-1,-1,-1,1,0,-1,45,96,1
4272062,20201211201671,1,420,0,0,0,-1,0,04:00:00,11:00:00,...,-1,-1,-1,-1,-1,0,-1,0,96,2


In [12]:
diary = elec[['TUCASEID', 'TUACTIVITY_N', 'TRCODEP', 'TUACTDUR24', 'TUSTARTTIM', 'TUSTOPTIME']].dropna()


In [13]:
diary.columns = ['caseid', 'actline', 'activity', 'duration', 'start', 'stop']
diary.head()

Unnamed: 0,caseid,actline,activity,duration,start,stop
2228592,20110101110074,1,110101,10,04:00:00,04:10:00
2228593,20110101110074,2,20402,300,04:10:00,09:10:00
2228594,20110101110074,3,180482,5,09:10:00,09:15:00
2228595,20110101110074,4,500101,525,09:15:00,18:00:00
2228596,20110101110074,5,180482,5,18:00:00,18:05:00


In [14]:
not_diary =  not_elec[['TUCASEID', 'TUACTIVITY_N', 'TRCODEP', 'TUACTDUR24', 'TUSTARTTIM', 'TUSTOPTIME']].dropna()
not_diary.columns = ['caseid', 'actline', 'activity', 'duration', 'start', 'stop']
not_diary.head()

Unnamed: 0,caseid,actline,activity,duration,start,stop
1,20030100013280,1,130124,60,04:00:00,05:00:00
2,20030100013280,2,10201,30,05:00:00,05:30:00
3,20030100013280,3,10101,600,05:30:00,15:30:00
4,20030100013280,4,120303,150,15:30:00,18:00:00
5,20030100013280,5,110101,5,18:00:00,18:05:00


## creating tempotable

In [27]:
import activityDictionary
from importlib import reload
activityDictionary = reload(activityDictionary)

In [29]:
activityDictionary.activityDictionary("10599")

2

In [18]:
#from timeStampDictionary import numberToOneFourFourZeroScale, timestampToNumber, clocktimeToNumber, numberToClocktime

In [20]:
import timeStampDictionary
timeStampDictionary = reload(timeStampDictionary)
timeStampDictionary.number_to_clocktime(2)

'04:01:00'

In [22]:
timeStampDictionary.number_to_one_four_four_zero_scale(1)

255

In [23]:
timeStampDictionary.timestamp_to_number(15960)

27

In [24]:
timeStampDictionary.clocktime_to_number('13:31:00')

572

In [25]:
diary['lst_act'] = diary['activity'].astype(int).astype(str).apply(activityDictionary.activityDictionary)
diary.head()

Unnamed: 0,caseid,actline,activity,duration,start,stop,lst_act
2228592,20110101110074,1,110101,10,04:00:00,04:10:00,9
2228593,20110101110074,2,20402,300,04:10:00,09:10:00,3
2228594,20110101110074,3,180482,5,09:10:00,09:15:00,11
2228595,20110101110074,4,500101,525,09:15:00,18:00:00,11
2228596,20110101110074,5,180482,5,18:00:00,18:05:00,11


In [30]:
not_diary['lst_act'] = not_diary['activity'].astype(int).astype(str).apply(activityDictionary.activityDictionary)
not_diary.head()

Unnamed: 0,caseid,actline,activity,duration,start,stop,lst_act
1,20030100013280,1,130124,60,04:00:00,05:00:00,10
2,20030100013280,2,10201,30,05:00:00,05:30:00,2
3,20030100013280,3,10101,600,05:30:00,15:30:00,1
4,20030100013280,4,120303,150,15:30:00,18:00:00,8
5,20030100013280,5,110101,5,18:00:00,18:05:00,9


In [31]:
# renaming the redundant activities
diary = diary.replace({'lst_act': {2: 1, 8: 10, 9: 10}})

In [32]:
not_diary = not_diary.replace({'lst_act': {2: 1, 8: 10, 9: 10}})

In [33]:
from itertools import repeat

diary['seq'] = diary['lst_act'].apply(lambda x: [x])
diary.head()

Unnamed: 0,caseid,actline,activity,duration,start,stop,lst_act,seq
2228592,20110101110074,1,110101,10,04:00:00,04:10:00,10,[10]
2228593,20110101110074,2,20402,300,04:10:00,09:10:00,3,[3]
2228594,20110101110074,3,180482,5,09:10:00,09:15:00,11,[11]
2228595,20110101110074,4,500101,525,09:15:00,18:00:00,11,[11]
2228596,20110101110074,5,180482,5,18:00:00,18:05:00,11,[11]


In [34]:
not_diary['seq'] = not_diary['lst_act'].apply(lambda x: [x])
not_diary.head()

Unnamed: 0,caseid,actline,activity,duration,start,stop,lst_act,seq
1,20030100013280,1,130124,60,04:00:00,05:00:00,10,[10]
2,20030100013280,2,10201,30,05:00:00,05:30:00,1,[1]
3,20030100013280,3,10101,600,05:30:00,15:30:00,1,[1]
4,20030100013280,4,120303,150,15:30:00,18:00:00,10,[10]
5,20030100013280,5,110101,5,18:00:00,18:05:00,10,[10]


In [35]:
diary['seq'] = diary['seq']*diary['duration'].astype(int)
diary.head()

Unnamed: 0,caseid,actline,activity,duration,start,stop,lst_act,seq
2228592,20110101110074,1,110101,10,04:00:00,04:10:00,10,"[10, 10, 10, 10, 10, 10, 10, 10, 10, 10]"
2228593,20110101110074,2,20402,300,04:10:00,09:10:00,3,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
2228594,20110101110074,3,180482,5,09:10:00,09:15:00,11,"[11, 11, 11, 11, 11]"
2228595,20110101110074,4,500101,525,09:15:00,18:00:00,11,"[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1..."
2228596,20110101110074,5,180482,5,18:00:00,18:05:00,11,"[11, 11, 11, 11, 11]"


In [36]:
not_diary['seq'] = not_diary['seq']*not_diary['duration'].astype(int)
not_diary.head()

Unnamed: 0,caseid,actline,activity,duration,start,stop,lst_act,seq
1,20030100013280,1,130124,60,04:00:00,05:00:00,10,"[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."
2,20030100013280,2,10201,30,05:00:00,05:30:00,1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,20030100013280,3,10101,600,05:30:00,15:30:00,1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,20030100013280,4,120303,150,15:30:00,18:00:00,10,"[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."
5,20030100013280,5,110101,5,18:00:00,18:05:00,10,"[10, 10, 10, 10, 10]"


In [37]:
not_tempoTable ={}
for i in not_diary['caseid']:
    not_tempoTable[i] = []

In [38]:
tempoTable ={}
for i in diary['caseid']:
    tempoTable[i] = []

In [39]:
for i in diary.index:
    tempoTable[diary.loc[i, 'caseid']].extend(diary.loc[i, 'seq'])

In [40]:
for i in not_diary.index:
    not_tempoTable[not_diary.loc[i, 'caseid']].extend(not_diary.loc[i, 'seq'])

In [41]:
del diary
diary = pd.DataFrame(tempoTable).T
diary.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1430,1431,1432,1433,1434,1435,1436,1437,1438,1439
20110101110074,10,10,10,10,10,10,10,10,10,10,...,1,1,1,1,1,1,1,1,1,1
20110101110156,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
20110101110507,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
20110101110521,10,10,10,10,10,10,10,10,10,10,...,1,1,1,1,1,1,1,1,1,1
20110101110522,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [42]:
del not_diary
not_diary = pd.DataFrame(not_tempoTable).T
not_diary.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1430,1431,1432,1433,1434,1435,1436,1437,1438,1439
20030100013280,10,10,10,10,10,10,10,10,10,10,...,10,10,10,10,10,10,10,10,10,10
20030100013344,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
20030100013352,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
20030100013848,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
20030100014165,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [43]:
diary.shape

(18776, 1440)

In [44]:
not_diary.shape

(200592, 1440)

In [48]:
#rename columns in diary
diary.columns = ['var' + str(i) for i in diary.columns]

diary.head()


Unnamed: 0,var0,var1,var2,var3,var4,var5,var6,var7,var8,var9,...,var1430,var1431,var1432,var1433,var1434,var1435,var1436,var1437,var1438,var1439
20110101110074,10,10,10,10,10,10,10,10,10,10,...,1,1,1,1,1,1,1,1,1,1
20110101110156,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
20110101110507,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
20110101110521,10,10,10,10,10,10,10,10,10,10,...,1,1,1,1,1,1,1,1,1,1
20110101110522,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [49]:
# rename columns in not_diary

not_diary.columns = ['var' + str(i) for i in not_diary.columns]

not_diary.head()



Unnamed: 0,var0,var1,var2,var3,var4,var5,var6,var7,var8,var9,...,var1430,var1431,var1432,var1433,var1434,var1435,var1436,var1437,var1438,var1439
20030100013280,10,10,10,10,10,10,10,10,10,10,...,10,10,10,10,10,10,10,10,10,10
20030100013344,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
20030100013352,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
20030100013848,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
20030100014165,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [50]:
diary.to_csv('diary.csv')

In [51]:
not_diary.to_csv('not_diary.csv')

## Needleman Wunsch

In [4]:
import numpy as np

def nw(x, y, match = 1, mismatch = 1, gap = 1):
    nx = len(x)
    ny = len(y)
    # Optimal score at each possible pair of characters.
    F = np.zeros((nx + 1, ny + 1))
    F[:,0] = np.linspace(0, -nx, nx + 1)
    F[0,:] = np.linspace(0, -ny, ny + 1)
    # Pointers to trace through an optimal aligment.
    P = np.zeros((nx + 1, ny + 1))
    P[:,0] = 3
    P[0,:] = 4
    # Temporary scores.
    t = np.zeros(3)
    for i in range(nx):
        for j in range(ny):
            if x[i] == y[j]:
                t[0] = F[i,j] + match
            else:
                t[0] = F[i,j] - mismatch
            t[1] = F[i,j+1] - gap
            t[2] = F[i+1,j] - gap
            tmax = np.max(t)
            F[i+1,j+1] = tmax
            if t[0] == tmax:
                P[i+1,j+1] += 2
            if t[1] == tmax:
                P[i+1,j+1] += 3
            if t[2] == tmax:
                P[i+1,j+1] += 4
    # Trace through an optimal alignment.
    i = nx
    j = ny
    rx = []
    ry = []
    while i > 0 or j > 0:
        if P[i,j] in [2, 5, 6, 9]:
            rx.append(x[i-1])
            ry.append(y[j-1])
            i -= 1
            j -= 1
        elif P[i,j] in [3, 5, 7, 9]:
            rx.append(x[i-1])
            ry.append('-')
            i -= 1
        elif P[i,j] in [4, 6, 7, 9]:
            rx.append('-')
            ry.append(y[j-1])
            j -= 1
    # Reverse the strings.
    rx = ''.join(rx)[::-1]
    ry = ''.join(ry)[::-1]
    return '\n'.join([rx, ry])




In [5]:
x = "GATTACA"
y = "GCATGCU"
print(nw(x, y))


np.random.seed(42)
x = np.random.choice(['A', 'T', 'G', 'C'], 50)
y = np.random.choice(['A', 'T', 'G', 'C'], 50)

print(nw(x, y, gap = 0))


G-ATTACA
GCA-TGCU
G-----C--AGGCAAGTGGGGCACCCGTATCCT-T-T-C-C-AACTTACAAGGGT-C-CC-----CGT-T
GTGCGCCAGAGG-AAGT----CA--C-T-T--TATATCCGCG--C--AC---GGTACTCCTTTTTC-TA-


In [6]:
print(nw(x, y, gap = 1))


print(nw(x, y, gap = 2))

GCAG-GCAAGTGG--GGCAC-CCGTATCCTTTC-CAAC-TTACAAGGGTCC-CCGT-T-
G-TGCGCCAGAGGAAGTCACTTTATATCC--GCGC-ACGGTAC-----TCCTTTTTCTA
GCAGGCAAGTGG--GGCAC-CCGTATCCTTTCCAACTTACAAGGGTCCCCGTT
GTGCGCCAGAGGAAGTCACTTTATATCC-GCGCACGGTAC-TCCTTTTTC-TA


In [7]:
x = "GATTACA"
y = "GCATGCU"
nx = len(x)
ny = len(y)
# Optimal score at each possible pair of characters.
F = np.zeros((nx + 1, ny + 1))
F[:,0] = np.linspace(0, nx, nx + 1)
F[0,:] = np.linspace(0, ny, ny + 1)
F

array([[0., 1., 2., 3., 4., 5., 6., 7.],
       [1., 0., 0., 0., 0., 0., 0., 0.],
       [2., 0., 0., 0., 0., 0., 0., 0.],
       [3., 0., 0., 0., 0., 0., 0., 0.],
       [4., 0., 0., 0., 0., 0., 0., 0.],
       [5., 0., 0., 0., 0., 0., 0., 0.],
       [6., 0., 0., 0., 0., 0., 0., 0.],
       [7., 0., 0., 0., 0., 0., 0., 0.]])

In [8]:
match = 0
mismatch = 2
gap = 1
# Temporary scores.
t = np.zeros(3)
for i in range(nx):
    for j in range(ny):
        if x[i] == y[j]:
            t[0] = F[i,j] - match
        else:
            t[0] = F[i,j] + mismatch
        t[1] = F[i,j+1] + gap
        t[2] = F[i+1,j] + gap
        tmin = np.min(t)
        F[i+1,j+1] = tmin
F[nx, ny]

6.0

In [9]:
np.random.seed(42)

dct_seq = {}

for i in range(20000):
    dct_seq[i] = np.random.choice(['A', 'T', 'G', 'C'], 96)

len(dct_seq)

20000

In [10]:
OM_mat = np.zeros((len(dct_seq), len(dct_seq)))
OM_mat.shape[0]

20000

In [11]:
def nw_social(x, y, match = 0, mismatch = 2, gap=1):
    nx = len(x)
    ny = len(y)
    # Optimal score at each possible pair of characters.
    F = np.zeros((nx + 1, ny + 1))
    F[:,0] = np.linspace(0, nx, nx + 1)
    F[0,:] = np.linspace(0, ny, ny + 1)

    # Temporary scores.
    t = np.zeros(3)
    for i in range(nx):
        for j in range(ny):
            if x[i] == y[j]:
                t[0] = F[i,j] - match
            else:
                t[0] = F[i,j] + mismatch
            t[1] = F[i,j+1] + gap
            t[2] = F[i+1,j] + gap
            tmin = np.min(t)
            F[i+1,j+1] = tmin
    return F[nx, ny]

In [12]:
x = "GATTACA"
y = "GCATGCU"
nw_social(x, y)

6.0

In [None]:
#for i in range(OM_mat.shape[0]):
#    for j in range(OM_mat.shape[1]):
#        OM_mat[i, j] = nw_social(dct_seq[i], dct_seq[j])

In [13]:
## try applying a function to all elements
import pandas as pd

In [32]:
#seq = pd.DataFrame(dct_seq).T
#seq.columns = ['var' + str(i) for i in range(0, 96)]
#seq.head()

In [21]:
dct_seq

{0: array(['G', 'C', 'A', 'G', 'G', 'C', 'A', 'A', 'G', 'T', 'G', 'G', 'G',
        'G', 'C', 'A', 'C', 'C', 'C', 'G', 'T', 'A', 'T', 'C', 'C', 'T',
        'T', 'T', 'C', 'C', 'A', 'A', 'C', 'T', 'T', 'A', 'C', 'A', 'A',
        'G', 'G', 'G', 'T', 'C', 'C', 'C', 'C', 'G', 'T', 'T', 'G', 'T',
        'G', 'C', 'G', 'C', 'C', 'A', 'G', 'A', 'G', 'G', 'A', 'A', 'G',
        'T', 'C', 'A', 'C', 'T', 'T', 'T', 'A', 'T', 'A', 'T', 'C', 'C',
        'G', 'C', 'G', 'C', 'A', 'C', 'G', 'G', 'T', 'A', 'C', 'T', 'C',
        'C', 'T', 'T', 'T', 'T'], dtype='<U1'),
 1: array(['T', 'C', 'T', 'A', 'G', 'T', 'T', 'C', 'T', 'T', 'T', 'C', 'T',
        'G', 'C', 'G', 'C', 'T', 'G', 'C', 'A', 'T', 'C', 'A', 'C', 'A',
        'T', 'G', 'A', 'C', 'T', 'A', 'C', 'C', 'C', 'A', 'A', 'A', 'G',
        'A', 'A', 'A', 'G', 'A', 'C', 'A', 'C', 'C', 'C', 'G', 'G', 'G',
        'A', 'C', 'G', 'G', 'A', 'G', 'A', 'T', 'G', 'T', 'A', 'C', 'G',
        'A', 'C', 'C', 'T', 'A', 'C', 'G', 'G', 'T', 'C', 'A', 'G', 'C

In [33]:
from itertools import combinations
comb = combinations(dct_seq.values(), 2)
comb_keys = combinations(dct_seq, 2)

In [31]:
list(comb)[0]

(array(['G', 'C', 'A', 'G', 'G', 'C', 'A', 'A', 'G', 'T', 'G', 'G', 'G',
        'G', 'C', 'A', 'C', 'C', 'C', 'G', 'T', 'A', 'T', 'C', 'C', 'T',
        'T', 'T', 'C', 'C', 'A', 'A', 'C', 'T', 'T', 'A', 'C', 'A', 'A',
        'G', 'G', 'G', 'T', 'C', 'C', 'C', 'C', 'G', 'T', 'T', 'G', 'T',
        'G', 'C', 'G', 'C', 'C', 'A', 'G', 'A', 'G', 'G', 'A', 'A', 'G',
        'T', 'C', 'A', 'C', 'T', 'T', 'T', 'A', 'T', 'A', 'T', 'C', 'C',
        'G', 'C', 'G', 'C', 'A', 'C', 'G', 'G', 'T', 'A', 'C', 'T', 'C',
        'C', 'T', 'T', 'T', 'T'], dtype='<U1'),
 array(['T', 'C', 'T', 'A', 'G', 'T', 'T', 'C', 'T', 'T', 'T', 'C', 'T',
        'G', 'C', 'G', 'C', 'T', 'G', 'C', 'A', 'T', 'C', 'A', 'C', 'A',
        'T', 'G', 'A', 'C', 'T', 'A', 'C', 'C', 'C', 'A', 'A', 'A', 'G',
        'A', 'A', 'A', 'G', 'A', 'C', 'A', 'C', 'C', 'C', 'G', 'G', 'G',
        'A', 'C', 'G', 'G', 'A', 'G', 'A', 'T', 'G', 'T', 'A', 'C', 'G',
        'A', 'C', 'C', 'T', 'A', 'C', 'G', 'G', 'T', 'C', 'A', 'G', 'C',
   

In [34]:
list(comb_keys)[0] 

(0, 1)

In [50]:
new_seq = pd.DataFrame(index = range(len(dct_seq)), columns = range(len(dct_seq)))

new_seq.head()

In [57]:
#for i in range(len(dct_seq)):
#    for j in range(len(dct_seq)):
#        new_seq.loc[i, j] = set([i, j])

In [56]:
#res = seq.apply(nw_social, axis=0)