In [2]:
import pandas as pds
import numpy as np
from pandasql import sqldf

def pysqldf(q):
    """Run the SQL query string `q` through pandasql against this notebook's globals.

    `globals()` is looked up on every call, so the query can refer by name to
    module-level data frames such as `df`.
    Returns whatever `sqldf` returns for the query.
    """
    return sqldf(q, globals())

## Load MIxS 6 environmental package data

In [3]:
# Read the "MIxS6 packages - to edit" sheet of the term-updates workbook into `df`.
df = pds.read_excel(
    io="data/mixs-6-term-updates.xlsx",
    sheet_name="MIxS6 packages - to edit",
)

In [4]:
df.head(n=5)  # preview the first five rows of the loaded sheet

Unnamed: 0,Environmental package,Structured comment name,Package item,Definition,Expected value,Value syntax,Example,Requirement,Preferred unit,Occurrence,Position,MIXS ID
0,air,alt,altitude,Altitude is a term used to identify heights of...,measurement value,{float} {unit},100 meter,M,meter,1,0,
1,air,elev,elevation,Elevation of the sampling site is its height a...,measurement value,{float} {unit},100 meter,C,meter,1,0,
2,air,barometric_press,barometric pressure,Force per unit area exerted against a surface ...,measurement value,{float} {unit},5 millibar,X,millibar,1,1,
3,air,carb_dioxide,carbon dioxide,Carbon dioxide (gas) amount or concentration a...,measurement value,{float} {unit},410 parts per million,X,"micromole per liter, parts per million",1,1,
4,air,carb_monoxide,carbon monoxide,Carbon monoxide (gas) amount or concentration ...,measurement value,{float} {unit},0.1 parts per million,X,"micromole per liter, parts per million",1,1,


### Find term/definition distinct counts -- looking for terms with more than one definition

In [25]:
# Group the rows by term and keep only the terms that carry more than one
# distinct definition.
multi_definition_sql = """
select
    [Package item], count(distinct Definition) as definition_count
from
    df
group by
    [Package item]
having
  count(distinct Definition) > 1
"""
pysqldf(multi_definition_sql)

Unnamed: 0,Package item,definition_count
0,host disease status,2


### Examine differences for host disease status definitions

In [30]:
# Widen the column display so the long definition texts are not cut off
# (the option is global and stays in effect for the cells that follow).
pds.set_option("display.max_colwidth", 999)

# The distinct definition texts recorded for 'host disease status'.
host_disease_definitions_sql = """
select distinct
    [Package item], Definition
from
    df
where
    [Package item] = 'host disease status'
"""
pysqldf(host_disease_definitions_sql)

Unnamed: 0,Package item,Definition
0,host disease status,"List of diseases with which the host has been diagnosed; can include multiple diagnoses. The value of the field depends on host; for humans the terms should be chosen from the DO (Human Disease Ontology) at http://www.disease-ontology.org, non-human host diseases are free text"
1,host disease status,"List of diseases with which the host has been diagnosed; can include multiple diagnoses. The value of the field depends on host; for humans the terms should be chosen from do (disease ontology) at http://www.disease-ontology.org, other hosts are free text"


### List the definition differences by environmental package

In [31]:
# Keep the definition column wide enough to show the full text.
pds.set_option("display.max_colwidth", 999)

# Which environmental package uses which 'host disease status' definition,
# ordered so that packages sharing a definition are listed together.
host_disease_by_package_sql = """
select distinct
    [Environmental Package], [Package item], Definition
from
    df
where
    [Package item] = 'host disease status'
order by Definition, [Environmental Package]
"""
pysqldf(host_disease_by_package_sql)

Unnamed: 0,Environmental package,Package item,Definition
0,human-oral,host disease status,"List of diseases with which the host has been diagnosed; can include multiple diagnoses. The value of the field depends on host; for humans the terms should be chosen from do (disease ontology) at http://www.disease-ontology.org, other hosts are free text"
1,host-associated,host disease status,"List of diseases with which the host has been diagnosed; can include multiple diagnoses. The value of the field depends on host; for humans the terms should be chosen from the DO (Human Disease Ontology) at http://www.disease-ontology.org, non-human host diseases are free text"
2,human-associated,host disease status,"List of diseases with which the host has been diagnosed; can include multiple diagnoses. The value of the field depends on host; for humans the terms should be chosen from the DO (Human Disease Ontology) at http://www.disease-ontology.org, non-human host diseases are free text"
3,human-gut,host disease status,"List of diseases with which the host has been diagnosed; can include multiple diagnoses. The value of the field depends on host; for humans the terms should be chosen from the DO (Human Disease Ontology) at http://www.disease-ontology.org, non-human host diseases are free text"
4,human-skin,host disease status,"List of diseases with which the host has been diagnosed; can include multiple diagnoses. The value of the field depends on host; for humans the terms should be chosen from the DO (Human Disease Ontology) at http://www.disease-ontology.org, non-human host diseases are free text"
5,human-vaginal,host disease status,"List of diseases with which the host has been diagnosed; can include multiple diagnoses. The value of the field depends on host; for humans the terms should be chosen from the DO (Human Disease Ontology) at http://www.disease-ontology.org, non-human host diseases are free text"
6,plant-associated,host disease status,"List of diseases with which the host has been diagnosed; can include multiple diagnoses. The value of the field depends on host; for humans the terms should be chosen from the DO (Human Disease Ontology) at http://www.disease-ontology.org, non-human host diseases are free text"


## Create data frame of distinct field name / package name pairings

In [11]:
# One row per (field, definition, package) combination; field_count is the
# number of source rows in that combination.
# GROUP BY over these three columns already yields exactly one row per
# combination, so the former DISTINCT was redundant and has been removed.
q = """
select
    [Package item] as field, [Definition] as definition, [Environmental package] as package, count(*) as field_count
from
    df
group by
    [Package item], [Definition], [Environmental package]
order by
    [Package item], [Environmental package], [Definition]
"""
fieldsdf = pysqldf(q)

In [12]:
fieldsdf.head(n=5)  # preview the first five field/package rows

Unnamed: 0,field,definition,package,field_count
0,API gravity,API gravity is a measure of how heavy or light...,hydrocarbon resources-cores,1
1,API gravity,API gravity is a measure of how heavy or light...,hydrocarbon resources-fluids/swabs,1
2,HRT,Whether subject had hormone replacement therap...,human-vaginal,1
3,IHMC ethnicity,Ethnicity of the subject,human-associated,1
4,IHMC ethnicity,Ethnicity of the subject,human-gut,1


In [33]:
## testing
# q = """
# select * from fieldsdf 
# where field = 'host body product'
# """
# sqldf(q)

### Pivot the fields data frame so that the package values are columns with the field_count as the column values

In [34]:
# Spread the packages into columns: one row per (field, definition), one column
# per package, each cell holding the summed field_count (0 where the pairing
# does not occur).
# The string 'sum' replaces np.sum: recent pandas emits a FutureWarning when
# handed the NumPy callable and asks for the string alias instead.
# `values` stays a list, so the columns come back two-level with 'field_count'
# on top; that level is dropped in the clean-up cells further down.
pivotdf = fieldsdf.pivot_table(
    index=['field', 'definition'],
    columns=['package'],
    values=['field_count'],
    aggfunc='sum',
    fill_value=0,
)


In [35]:
# Raise the column-display cap so every package column is rendered.
pds.set_option("display.max_columns", 999)
pivotdf.head(n=5)  # preview the pivoted table

Unnamed: 0_level_0,Unnamed: 1_level_0,field_count,field_count,field_count,field_count,field_count,field_count,field_count,field_count,field_count,field_count,field_count,field_count,field_count,field_count,field_count,field_count,field_count
Unnamed: 0_level_1,package,air,built environment,host-associated,human-associated,human-gut,human-oral,human-skin,human-vaginal,hydrocarbon resources-cores,hydrocarbon resources-fluids/swabs,microbial mat/biofilm,miscellaneous natural or artificial environment,plant-associated,sediment,soil,wastewater/sludge,water
field,definition,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
API gravity,API gravity is a measure of how heavy or light a petroleum liquid is compared to water (source: https://en.wikipedia.org/wiki/API_gravity) (e.g. 31.1¬∞ API),0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0
HRT,"Whether subject had hormone replacement theraphy, and if yes start date",0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
IHMC ethnicity,Ethnicity of the subject,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0
IHMC medication code,Can include multiple medication codes,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0
absolute air humidity,Actual mass of water vapor - mh20 - present in the air water vapor mixture,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Perform some data clean up
- drop the column hierarchical index (if you scroll to the right, you will see the name 'field_count' above 'water')
- add a column to sum up the number of packages a term occurs in (this may allow for easy filtering)
- reset the index

In [36]:
# Drop the outer 'field_count' level so the package names become the column labels.
# Guarded so that re-running this cell once the level is gone is a no-op
# instead of an error (droplevel cannot remove the only remaining level).
if pivotdf.columns.nlevels > 1:
    pivotdf.columns = pivotdf.columns.droplevel()

In [37]:
# Row-wise sum of the per-package counts.
# An existing 'total' column is left out and only numeric columns are summed,
# so re-running this cell (even after the index has been reset into text
# columns) cannot fold a stale total or non-numeric values into the result.
pivotdf['total'] = pivotdf.drop(columns='total', errors='ignore').sum(axis=1, numeric_only=True)

In [38]:
# Blank out the columns-axis name so the 'package' label no longer shows at the left of the table.
pivotdf.columns = pivotdf.columns.rename("")

In [39]:
# Turn the 'field' / 'definition' index levels into ordinary columns.
# Reassigns instead of mutating in place, and only acts while the index still
# carries named levels, so re-running the cell does not add a stray 'index' column.
if any(level_name is not None for level_name in pivotdf.index.names):
    pivotdf = pivotdf.reset_index()

In [40]:
pivotdf.head(n=5)  # preview the cleaned-up pivot table

Unnamed: 0,field,definition,air,built environment,host-associated,human-associated,human-gut,human-oral,human-skin,human-vaginal,hydrocarbon resources-cores,hydrocarbon resources-fluids/swabs,microbial mat/biofilm,miscellaneous natural or artificial environment,plant-associated,sediment,soil,wastewater/sludge,water,total
0,API gravity,API gravity is a measure of how heavy or light a petroleum liquid is compared to water (source: https://en.wikipedia.org/wiki/API_gravity) (e.g. 31.1¬∞ API),0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,2
1,HRT,"Whether subject had hormone replacement theraphy, and if yes start date",0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
2,IHMC ethnicity,Ethnicity of the subject,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,5
3,IHMC medication code,Can include multiple medication codes,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,5
4,absolute air humidity,Actual mass of water vapor - mh20 - present in the air water vapor mixture,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [54]:
pivotdf.shape[0]  # number of (field, definition) rows

510

### Create spreadsheet of terms that exist in multiple packages

In [51]:
# Keep only the rows whose total is above 1, ordered by field.
# Goes through the notebook's pysqldf helper like the other query cells, so the
# table name is resolved against the notebook globals explicitly.
# NOTE(review): the filter works per (field, definition) row, so a definition
# variant used by a single package is dropped even when the same field appears
# in other packages under a different definition (cf. 'host disease status',
# whose human-oral wording differs) -- confirm this is intended.
q = """
select 
    *
from 
    pivotdf
where 
    total > 1
order by
    field
"""
multi_termdf = pysqldf(q)

In [52]:
multi_termdf.head(n=5)  # preview the multi-package rows

Unnamed: 0,field,definition,air,built environment,host-associated,human-associated,human-gut,human-oral,human-skin,human-vaginal,hydrocarbon resources-cores,hydrocarbon resources-fluids/swabs,microbial mat/biofilm,miscellaneous natural or artificial environment,plant-associated,sediment,soil,wastewater/sludge,water,total
0,API gravity,API gravity is a measure of how heavy or light a petroleum liquid is compared to water (source: https://en.wikipedia.org/wiki/API_gravity) (e.g. 31.1¬∞ API),0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,2
1,IHMC ethnicity,Ethnicity of the subject,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,5
2,IHMC medication code,Can include multiple medication codes,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,5
3,additional info,Information that doesn't fit anywhere else. Can also be used to propose new entries for fields with controlled vocabulary,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,2
4,alkalinity,"Alkalinity, the ability of a solution to neutralize acids to the equivalence point of carbonate or bicarbonate",0,0,0,0,0,0,0,0,1,1,1,1,0,1,0,1,1,7


In [53]:
multi_termdf.shape[0]  # number of rows with total > 1

147

### Save spreadsheets

In [57]:
# Write the full term-by-package table to a workbook, without the row index.
pivotdf.to_excel(
    excel_writer="output/mixs-package-term-v6.xlsx",
    engine="xlsxwriter",
    index=False,
)

In [58]:
# Write the multi-package subset to its own workbook, without the row index.
multi_termdf.to_excel(
    excel_writer="output/multi-package-mixs-terms-only-v6.xlsx",
    engine="xlsxwriter",
    index=False,
)