# Pivot Merged NCBITaxon KG Edges
### This notebook pivots the data from the NCBITaxon Merged KG Edges tsv.  
### Output is saved to the file `data/pivot_merged-kg_edges_NCBITaxon.tsv.gz`

In [1]:
import pandas as pds

## Load file `data/merged-kg_edges_NCBITaxon_clean.tsv.gz` into dataframe

In [2]:
# Read the tab-separated edge table into a dataframe.
kgEdgesDf = pds.read_csv(
    'data/merged-kg_edges_NCBITaxon_clean.tsv.gz',
    sep='\t',
)

# Number of rows (edges) that were loaded.
kgEdgesDf.shape[0]

73808

## Create a unique list of columns from the objects and get a unique list of subjects.
### The subjects are sorted in order to create a more efficient index.

In [4]:
# Column labels for the pivot: a leading 'subject' column followed by every
# distinct value found in the 'object' column.
cols = ['subject', *kgEdgesDf['object'].unique()]

# Distinct subjects, sorted in place so the pivot's index ends up ordered.
subjects = kgEdgesDf['subject'].unique()
subjects.sort()

print('num cols:', len(cols))
print('num subjects:', len(subjects))

num cols: 328
num subjects: 31835


## Create an empty pivot "template" with subjects as index and each unique object as a column.
### The NaNs are filled with 0 in order to make filling the template easier.

In [20]:
# Start from an empty frame with one column per label in `cols`, then fill its
# 'subject' column so the frame gets one row per subject (all other cells NaN).
pivotDf = pds.DataFrame(columns=cols).assign(subject=subjects)

In [21]:
# Replace the NaN placeholders with 0, then promote 'subject' from a regular
# column to the row index.
pivotDf = pivotDf.fillna(0).set_index('subject')

In [22]:
# pivotDf.head()  # uncomment to peek at the first rows of the zero-filled template

Unnamed: 0_level_0,ECOCORE:00000172,ENVO:00000215,ECOCORE:00000173,Shape:bacillus,ENVO:01000306,ECOCORE:00000177,ECOCORE:00000179,ENVO:00002007,ECOCORE:00000180,ENVO:00000051,...,NCBITaxon:9526,NCBITaxon:376913,NCBITaxon:1890424,NCBITaxon:1437180,NCBITaxon:1446379,NCBITaxon:314293,NCBITaxon:436880,NCBITaxon:7705,NCBITaxon:1206794,NCBITaxon:314146
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENVO:00000062,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENVO:01000355,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENVO:01000993,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENVO:01000996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENVO:01001000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


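## Fill in the pivot template
### Each cell ends up holding the number of edges whose subject is the row label and whose object is the column label.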

In [32]:
# Count the edges for every (subject, object) pair in one vectorized pass.
# This replaces a Python loop that read and wrote a single cell via .loc for
# each edge row, which is very slow on a frame of this size.
edgeCounts = pds.crosstab(kgEdgesDf['subject'], kgEdgesDf['object'])

# Re-align the counts to the template's row order and column order; any pair
# without an edge is 0.  Because pivotDf is rebuilt from the edge list rather
# than incremented, re-running this cell yields the same counts instead of
# adding them on top of the previous run.
pivotDf = edgeCounts.reindex(
    index=pivotDf.index,
    columns=pivotDf.columns,
    fill_value=0,
)

In [43]:
# pivotDf.head()  # uncomment to peek at the first rows of the filled-in pivot

## Save pivoted data

In [45]:
# Write the pivot (index included) as a gzip-compressed, tab-separated file.
pivotDf.to_csv(
    'data/pivot_merged-kg_edges_NCBITaxon.tsv.gz',
    sep='\t',
    compression='gzip',
)