## Import OGER output

In [8]:
%%time

import pandas as pd
from datetime import datetime
pd.options.mode.chained_assignment = None  # default='warn'

CPU times: user 50 µs, sys: 0 ns, total: 50 µs
Wall time: 55.1 µs


In [107]:
cols = ['BiosampleId', 'Biolink', 'BeginTerm', 'EndTerm', 'TokenizedTerm', 'PreferredTerm', 'CURIE', 'NaN1', 'S-Type', 'NaN2', 'UMLS_CUI']
ogerOutput = pd.read_csv('../output/FullNLPOutput.tsv', sep='\t', names=cols )
ogerOutput = ogerOutput.drop(['NaN1', 'NaN2', 'UMLS_CUI'], axis = 1)
print('Total number of rows: ', str(len(ogerOutput)))
ogerOutput.head()
#pd.set_option("display.max_rows", None)
#print(ogerOutput.CURIE.value_counts())

Total number of rows:  2386121


Unnamed: 0,BiosampleId,Biolink,BeginTerm,EndTerm,TokenizedTerm,PreferredTerm,CURIE,S-Type
0,SAMN10586215,biolink:OntologyClass,16,24,contains,contains,RO:0001019,S1
1,SAMN10586215,biolink:OrganismalEntity,119,127,Bacteria,Bacteria,NCBITaxon:2,S1
2,SAMN05720810,biolink:OntologyClass,50,57,part of,part of,BFO:0000050,S1
3,SAMN05720810,biolink:OntologyClass,79,90,Agriculture,agricultural process,ENVO:01001246,S1
4,SAMN05720810,biolink:OntologyClass,79,90,Agriculture,animal husbandry,ENVO:01001248,S1


In [39]:
ogerInput = pd.read_csv('../output/biosampleDescriptionDF.tsv', sep='\t')
ogerInput.BiosampleId = ogerInput.BiosampleId.map(lambda x: x.replace('BIOSAMPLE:',''))
ogerInput = ogerInput.drop(['StudyId', 'Name', 'Title'], axis = 1)
print('Number of rows: '+ str(len(ogerInput)))
print('Number of unique Ids: '+ str(len(ogerInput.BiosampleId.unique())))
print('Number of unique Desc: '+ str(len(ogerInput.Description.unique())))
ogerInput = ogerInput.drop_duplicates()
print('******After duplicate row removal*****')
print('Number of rows: '+ str(len(ogerInput)))
print('Number of unique Ids: '+ str(len(ogerInput.BiosampleId.unique())))
print('Number of unique Description: '+ str(len(ogerInput.Description.unique())))
ogerInput.head()

Number of rows: 948401
Number of unique Ids: 938333
Number of unique Desc: 59789
******After duplicate row removal*****
Number of rows: 947374
Number of unique Ids: 938333
Number of unique Description: 59789


Unnamed: 0,Description,BiosampleId
0,Treponema denticola ATCC 35405. Treponema dent...,SAMN02603967
1,Treponema pallidum subsp. pallidum. This organ...,SAMN02604348
2,Campylobacter fetus subsp. venerealis str. Azu...,SAMN02471365
3,Campylobacter jejuni strain NCTC11168. This st...,SAMEA1705929
4,Francisella tularensis subsp. tularensis strai...,SAMEA3138185


In [3]:
print('Unique BiosampleIds: '+ str(len(ogerOutput.BiosampleID.unique())))
print('Unique CURIEs: '+ str(len(ogerOutput.CURIE.unique())))
print('Unique Biolinks: '+ str(len(ogerOutput.Biolink.unique())))
print('Unique Standard Terms: '+ str(len(ogerOutput.PreferredTerm.unique())))

Unique BiosampleIds: 649677
Unique CURIEs: 1686
Unique Biolinks: 4
Unique Standard Terms: 1677


In [112]:
curieOfInterest = ['ENVO', 'PATO', 'RO']
envoPato = ogerOutput[ogerOutput.CURIE.str.contains('|'.join(curieOfInterest))]
print('Number of rows: '+ str(len(envoPato)))
envoPato.head()

Number of rows: 1531659


Unnamed: 0,BiosampleId,Biolink,BeginTerm,EndTerm,TokenizedTerm,PreferredTerm,CURIE,S-Type
0,SAMN10586215,biolink:OntologyClass,16,24,contains,contains,RO:0001019,S1
3,SAMN05720810,biolink:OntologyClass,79,90,Agriculture,agricultural process,ENVO:01001246,S1
4,SAMN05720810,biolink:OntologyClass,79,90,Agriculture,animal husbandry,ENVO:01001248,S1
5,SAMN05720810,biolink:OntologyClass,79,90,Agriculture,agriculture,ENVO:01001442,S1
6,SAMN16364345,biolink:OntologyClass,16,24,contains,contains,RO:0001019,S1


In [113]:
salineTerms = ['sali', 'salt']
envPatSal = envoPato[envoPato.TokenizedTerm	.str.contains('|'.join(salineTerms))]
print('Number of rows: '+ str(len(envPatSal)))
envPatSal.head()

Number of rows: 2325


Unnamed: 0,BiosampleId,Biolink,BeginTerm,EndTerm,TokenizedTerm,PreferredTerm,CURIE,S-Type
1302,SAMN13189859,biolink:OntologyClass,35,39,salt,sodium chloride salt,ENVO:01000681,S1
1304,SAMN13189859,biolink:OntologyClass,35,45,salt marsh,saline marsh,ENVO:00000054,S1
2815,SAMN08794385,biolink:OntologyClass,1025,1029,salt,sodium chloride salt,ENVO:01000681,S7
4336,SAMN03334908,biolink:AnatomicalEntity,243,249,saliva,saliva,UBERON:0001836,S2
7911,SAMN09201718,biolink:OntologyClass,19,30,saline lake,saline lake,ENVO:00000019,S1


### Form Phrases for each biosample

In [130]:
%%time
# Function to phrase-ify
def phrasify(dataFrame, columnNames):
    count = 0
    returnDF = pd.DataFrame(columns=columnNames)
    
    bsIdList = dataFrame.BiosampleId.unique()
    
    for i in bsIdList:
        count += 1
        tmpDF = pd.DataFrame()
        bpList = []
        phrase1 = []
        phrase2 = []
        curie = []

        tmpDF = dataFrame.loc[(dataFrame.BiosampleId == i)]
        
        # Filter: Consider ONLY 3 or more DISTINCT terms AND if an RO CURIE is present
        
        if len(tmpDF) > 2 \
        and len(tmpDF.TokenizedTerm.unique()) > 2 \
        and tmpDF.CURIE.str.contains('RO').any():
            
            tmpDF.sort_values(by=['BeginTerm'], inplace=True)

            for row in tmpDF.iterrows():
                bpList.append(row[1].BeginTerm)
                phrase1.append(row[1].TokenizedTerm)
                phrase2.append(row[1].PreferredTerm)
                curie.append(row[1].CURIE)

            returnDF = returnDF.append(pd.DataFrame([[i, bpList, phrase1, phrase2, curie]], columns=columnNames))
            
        if count % 1000 == 0:
            print('Number of relevant rows: '+ str(len(returnDF))+ ' - '+ datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
        
        # *********** FOR DEBUG PURPOSES ONLY ****************
        if count == 5000:
            break;
        
    return returnDF

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 6.91 µs


In [None]:
%%time

pCols = ['BiosampleId', 'BeginPosList', 'Phrase', 'StdPhrase', 'CURIE']
phraseDF = phrasify(envoPato,pCols)
print('*****Number of rows: '+ str(len(phraseDF)) + ' ********')
phraseDF.head()

Number of relevant rows: 177 - 23/10/2020 17:43:19
Number of relevant rows: 352 - 23/10/2020 17:45:06
Number of relevant rows: 543 - 23/10/2020 17:46:54
Number of relevant rows: 761 - 23/10/2020 17:48:42


In [None]:
result = ogerInput.merge(phraseDF, how='inner', on='BiosampleId')
result.head()

In [94]:
result.to_csv('../output/enitityPhrases.tsv', sep='\t', index=False)