## Import OGER output

In [41]:
%%time

import pandas as pd
from datetime import datetime
# pd.options.mode.chained_assignment = None  # default='warn'

CPU times: user 11 µs, sys: 1e+03 ns, total: 12 µs
Wall time: 14.1 µs


In [46]:
cols = ['BiosampleID', 'Biolink', 'BeginTerm', 'EndTerm', 'TokenizedTerm', 'PreferredTerm', 'CURIE', 'NaN1', 'S-Type', 'NaN2', 'UMLS_CUI']
df = pd.read_csv('../output/FullNLPOutput.tsv', sep='\t', names=cols )
df = df.drop(['NaN1', 'NaN2', 'UMLS_CUI'], axis = 1)
print('Total number of rows: ', str(len(df)))
df.head()

Total number of rows:  2386121


Unnamed: 0,BiosampleID,Biolink,BeginTerm,EndTerm,TokenizedTerm,PreferredTerm,CURIE,S-Type
0,SAMN10586215,biolink:OntologyClass,16,24,contains,contains,RO:0001019,S1
1,SAMN10586215,biolink:OrganismalEntity,119,127,Bacteria,Bacteria,NCBITaxon:2,S1
2,SAMN05720810,biolink:OntologyClass,50,57,part of,part of,BFO:0000050,S1
3,SAMN05720810,biolink:OntologyClass,79,90,Agriculture,agricultural process,ENVO:01001246,S1
4,SAMN05720810,biolink:OntologyClass,79,90,Agriculture,animal husbandry,ENVO:01001248,S1


In [3]:
print('Unique BiosampleIds: '+ str(len(df.BiosampleID.unique())))
print('Unique CURIEs: '+ str(len(df.CURIE.unique())))
print('Unique Biolinks: '+ str(len(df.Biolink.unique())))
print('Unique Standard Terms: '+ str(len(df.PreferredTerm.unique())))

Unique CURIEs: 1686
Unique Biolinks: 4
Unique Standard Terms: 1677


In [4]:
curieOfInterest = ['ENVO', 'PATO']
envoPato = df[df.CURIE.str.contains('|'.join(curieOfInterest))]
print('Number of rows: '+ str(len(envoPato)))
envoPato.head()

Number of rows: 1222326


Unnamed: 0,BiosampleID,Biolink,BeginTerm,EndTerm,TokenizedTerm,PreferredTerm,CURIE,S-Type
3,SAMN05720810,biolink:OntologyClass,79,90,Agriculture,agricultural process,ENVO:01001246,S1
4,SAMN05720810,biolink:OntologyClass,79,90,Agriculture,animal husbandry,ENVO:01001248,S1
5,SAMN05720810,biolink:OntologyClass,79,90,Agriculture,agriculture,ENVO:01001442,S1
8,SAMN15691669,biolink:OntologyClass,52,68,marine sediments,marine sediment,ENVO:03000033,S1
12,SAMN14402390,biolink:OntologyClass,47,58,rhizosphere,rhizosphere,ENVO:00005801,S1


In [5]:
salineTerms = ['sali', 'salt']
envPatSal = envoPato[envoPato.PreferredTerm.str.contains('|'.join(salineTerms))]
print('Number of rows: '+ str(len(envPatSal)))
envPatSal.head()

Number of rows: 20083


Unnamed: 0,BiosampleID,Biolink,BeginTerm,EndTerm,TokenizedTerm,PreferredTerm,CURIE,S-Type
60,SAMN09200348,biolink:OntologyClass,0,7,Wetland,saline marsh,ENVO:00000054,S1
572,SAMN13482251,biolink:OntologyClass,11,15,Lake,saline evaporation pond,ENVO:00000055,S1
575,SAMN13482251,biolink:OntologyClass,11,15,Lake,container of an intermittent saline lake,ENVO:00000502,S1
582,SAMN13482251,biolink:OntologyClass,11,15,Lake,intermittent saline evaporation pond,ENVO:00000532,S1
772,SAMN10915792,biolink:OntologyClass,0,4,Lake,saline evaporation pond,ENVO:00000055,S1


### Form Phrases for each biosample

In [44]:
%%time
# Function to phrase-ify
def phrasify(bsIdList, columnNames):
    count = 0
    returnDF = pd.DataFrame(columns=columnNames)
    for i in bsIdList:
        count += 1
        tmpDF = pd.DataFrame()
        bpList = []
        phrase1 = []
        phrase2 = []

        tmpDF = df.loc[(df.BiosampleID == i)]
        tmpDF.sort_values(by=['BeginTerm'], inplace=True)
        break;

        for row in tmpDF.iterrows():
            bpList.append(row[1].BeginTerm)
            phrase1.append(row[1].TokenizedTerm)
            phrase2.append(row[1].PreferredTerm)

        returnDF = returnDF.append(pd.DataFrame([[i, bpList, phrase1, phrase2]], columns=columnNames))
        if count % 1000 == 0:
                print('Counter value: '+ str(count//1000)+ 'K - '+ datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
        
    return returnDF

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 7.87 µs


In [None]:
%%time

pCols = ['BiosampleID', 'BeginPosList', 'Phrase', 'StdPhrase']
phraseDF = phrasify(df.BiosampleID.unique(),pCols)
phraseDF.head()

Counter value: 1K - 22/10/2020 18:27:22
Counter value: 2K - 22/10/2020 18:30:13
Counter value: 3K - 22/10/2020 18:33:02
Counter value: 4K - 22/10/2020 18:35:51
Counter value: 5K - 22/10/2020 18:38:40
Counter value: 6K - 22/10/2020 18:41:33
Counter value: 7K - 22/10/2020 18:44:22
Counter value: 8K - 22/10/2020 18:47:12
Counter value: 9K - 22/10/2020 18:50:01
Counter value: 10K - 22/10/2020 18:52:55
Counter value: 11K - 22/10/2020 18:55:45
Counter value: 12K - 22/10/2020 18:58:34
Counter value: 13K - 22/10/2020 19:01:25
Counter value: 14K - 22/10/2020 19:04:15
Counter value: 15K - 22/10/2020 19:07:05
Counter value: 16K - 22/10/2020 19:09:56
Counter value: 17K - 22/10/2020 19:12:50
Counter value: 18K - 22/10/2020 19:15:40
Counter value: 19K - 22/10/2020 19:18:25
Counter value: 20K - 22/10/2020 19:21:09
Counter value: 21K - 22/10/2020 19:23:46
Counter value: 22K - 22/10/2020 19:26:26
Counter value: 23K - 22/10/2020 19:29:05
Counter value: 24K - 22/10/2020 19:31:50
Counter value: 25K - 22/1

Counter value: 199K - 23/10/2020 04:14:34
Counter value: 200K - 23/10/2020 04:17:47
Counter value: 201K - 23/10/2020 04:21:00
Counter value: 202K - 23/10/2020 04:24:14
Counter value: 203K - 23/10/2020 04:27:27
Counter value: 204K - 23/10/2020 04:30:41
Counter value: 205K - 23/10/2020 04:33:55
Counter value: 206K - 23/10/2020 04:37:09
Counter value: 207K - 23/10/2020 04:40:23
Counter value: 208K - 23/10/2020 04:43:37
Counter value: 209K - 23/10/2020 04:46:51
Counter value: 210K - 23/10/2020 04:50:05
Counter value: 211K - 23/10/2020 04:53:19
Counter value: 212K - 23/10/2020 04:56:35
Counter value: 213K - 23/10/2020 04:59:49
Counter value: 214K - 23/10/2020 05:03:04
Counter value: 215K - 23/10/2020 05:06:19
Counter value: 216K - 23/10/2020 05:09:33
Counter value: 217K - 23/10/2020 05:12:49
Counter value: 218K - 23/10/2020 05:16:04
Counter value: 219K - 23/10/2020 05:19:20
Counter value: 220K - 23/10/2020 05:22:34
Counter value: 221K - 23/10/2020 05:25:51
Counter value: 222K - 23/10/2020 0

In [None]:
phraseDF.to_csv('../output/enitityPhrases.tsv', sep='\t', index=False)