# Build non-human samples
The notebook creates a dataset of non-human samples. It does this by:
1. Creating a list of MIxS 5 terms from just the air, soil, sediment, plant-associated, and water packages.
2. Reducing the columns in the harmonized table to match only these MIxS 5 terms, and removing rows in which the taxonomy_id is 9606 (Homo sapiens).
3. Subsetting the data (rows) to the air, soil, sediment, plant-associated, and water packages.
4. Removing the columns from #3 that only have NaNs.

In [1]:
import pandas as pds
from pandasql import sqldf

# 1. Creating a list of MIxS 5 terms from just the air, soil, sediment, plant-associated, and water packages.

## Load MIxS 5 spreadsheet
- coreDf contains terms from the MIxS sheet; these terms apply to all packages
- packageDf contains terms associated with each environmental package

In [2]:
# Open the MIxS v5 workbook once; the individual sheets are parsed from this handle in the next cell.
excel = pds.ExcelFile('../../mixs-spreadsheets/mixs_v5.xlsx', engine='openpyxl')

In [3]:
# Core MIxS terms from the 'MIxS' sheet.
coreDf = excel.parse(
    'MIxS',
    usecols=['Structured comment name', 'Item', 'Value syntax'],
)

# Package-specific terms; 'Package item' is renamed to 'Item' so the column
# names line up with coreDf when the two frames are concatenated later.
packageDf = excel.parse(
    'environmental_packages',
    usecols=['Environmental package', 'Structured comment name', 'Package item', 'Value syntax'],
).rename(columns={'Package item': 'Item'})

In [4]:
# peek at data
# coreDf
# packageDf

### get list of packages

In [5]:
# distinct values of the 'Environmental package' column, in order of first appearance
packageDf['Environmental package'].unique().tolist()

['air',
 'built environment',
 'host-associated',
 'human-associated',
 'human-gut',
 'human-oral',
 'human-skin',
 'human-vaginal',
 'hydrocarbon resources-cores',
 'hydrocarbon resources-fluids/swabs',
 'microbial mat/biofilm',
 'miscellaneous natural or artificial environment',
 'plant-associated',
 'sediment',
 'soil',
 'wastewater/sludge',
 'water',
 nan]

### subset package dataframe to air, soil, sediment, plant-associated, and water packages and concatenate package-specific terms with core terms

In [6]:
# SQL (run by pandasql against the packageDf frame) that selects the distinct
# term rows belonging to the five target environmental packages.
q = """
select distinct 
    [Structured comment name], [Item], [Value syntax]
from
    packageDf
where 
    [Environmental package] in ('air', 'soil', 'sediment', 'plant-associated', 'water')
"""
packageTermsDf = sqldf(q)

# Stack the package-specific terms on top of the core terms and drop exact duplicate rows.
envDf = pds.concat([packageTermsDf, coreDf]).drop_duplicates()

In [7]:
envDf.shape[0]  # number of term rows after de-duplication

307

In [8]:
# distinct structured comment names; used below to pick the matching table columns
envTerms = envDf['Structured comment name'].unique().tolist()

# 2. Match harmonized table columns to only air, soil, sediment, plant-associated, and water environmental package terms.

### Load harmonized table data from parquet

In [9]:
# df = pds.read_parquet('../../target/harmonized-table.parquet.gz')
# NOTE(review): this reads a TSV named non-human-samples.tsv rather than the harmonized
# parquet table referred to by the heading above, and the final cell of this notebook writes
# a file with the same name -- this may be re-reading a previous output of the notebook.
# Confirm which input is intended.
df = pds.read_csv('../../target/non-human-samples.tsv', sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
df.shape[0]  # number of samples (rows) loaded

45951

### remove human taxonomy ids (taxonomy_id 9606)

In [11]:
# Drop Homo sapiens rows (NCBI taxonomy id 9606).
# The id column may be loaded as numbers (e.g. from the TSV) or as text (e.g. '9606');
# comparing a numeric column with the string '9606' never matches, so no rows would be
# removed. Coerce to numeric first so both representations are handled. Values that
# cannot be parsed become NaN, compare unequal to 9606, and are therefore kept.
df = df[pds.to_numeric(df['taxonomy_id'], errors='coerce') != 9606]

In [12]:
len(df) # find number of remaining samples

45951

### get a list of all columns in the dataframe

In [13]:
allCols = df.columns.tolist()  # every column label of the loaded table, in order

In [14]:
# allCols

In [15]:
# total number of columns in the loaded table
len(allCols)

156

### match the columns from the full dataframe to the env package terms

In [16]:
# keep the table columns (in their original order) whose name is one of the MIxS terms
matchedCols = list(filter(lambda col: col in envTerms, allCols))

In [17]:
len(matchedCols) # number of table columns whose name is a MIxS term

152

### add some extra columns and create slice of data with matching MIxS columns

In [18]:
# Extra (non-MIxS) columns to carry along with the matched MIxS term columns.
extraCols = ['id', 'taxonomy_id', 'taxonomy_name', 'package']

# Only add names that are not already present. Re-running this cell, or an overlap between
# these names and the matched terms, would otherwise produce duplicate column labels in the
# slice taken from the table below.
matchedCols.extend([c for c in extraCols if c not in matchedCols])

In [19]:
# all rows, restricted to the matched MIxS columns plus the extra columns
sliceDf = df.loc[:, matchedCols]

# 3. Subset data to rows where the env_package contains air, soil, sediment, plant-associated, or water.

In [20]:
# Number of rows per env_package value, as a two-column frame (env_package, count).
# Rows with a missing env_package do not form a group and so are not counted.
env_packageCount = (
    sliceDf.groupby('env_package')
           .size()
           .rename('count')
           .reset_index()
)
# env_packageCount

In [21]:
# Show every row when displaying frames. The fully-qualified option name is used because
# the short form 'max_rows' is treated as a pattern and is ambiguous on newer pandas
# versions (it also matches styler.render.max_rows), which makes set_option raise.
pds.set_option('display.max_rows', None)

# Keywords identifying the air, soil, sediment, water and plant-associated packages.
# NOTE(review): this is a case-insensitive *substring* match, so values such as
# 'wastewater' are kept because they contain 'water' -- confirm that this is intended.
packageKeywords = ('air', 'soil', 'sediment', 'water', 'plant')
env_packages = [
    x for x in env_packageCount.env_package.unique()
    if any(keyword in x.lower() for keyword in packageKeywords)
]

In [22]:
# show the env_package values selected by the keyword match
env_packages

['Coastal water body (ENVO:02000049)',
 'Deep-sea water',
 'MIGS.plant-associated',
 'MIGS/MIMS/MIMARKS.air',
 'MIGS/MIMS/MIMARKS.plant-associated',
 'MIGS/MIMS/MIMARKS.sediment',
 'MIGS/MIMS/MIMARKS.soil',
 'MIGS/MIMS/MIMARKS.wastewater',
 'MIGS/MIMS/MIMARKS.water',
 'MIMARKS_water',
 'Sea water',
 'Sediment',
 'Soil',
 'Water',
 'air',
 'fresh water',
 'freshwater sediment',
 'marine sediment (ENVO:00002113)',
 'plant',
 'plant-associated',
 'sea water',
 'sea water, [ENVO:00002149]',
 'seawater',
 'sediment',
 'soil',
 'soil-associated',
 'sterile water',
 'wastewater',
 'wastewater sludge',
 'wastewater/sludge',
 'wastewater|sludge',
 'water']

### subset data

In [23]:
# keep only the rows whose env_package is one of the selected values
keepRow = sliceDf['env_package'].isin(env_packages)
sliceDf = sliceDf[keepRow]

In [24]:
sliceDf.shape[0]  # number of remaining samples

45951

# 4. Remove the columns that only have NaNs.

### get counts of each non-human package

In [25]:
# Recompute the per-package row counts on the subset and show the largest packages first.
env_packageCount = (
    sliceDf.groupby('env_package')
           .size()
           .rename('count')
           .reset_index()
)
env_packageCount.sort_values(by='count', ascending=False)

Unnamed: 0,env_package,count
31,water,16367
24,soil,11974
23,sediment,5969
6,MIGS/MIMS/MIMARKS.soil,3784
19,plant-associated,2076
8,MIGS/MIMS/MIMARKS.water,1763
5,MIGS/MIMS/MIMARKS.sediment,1177
4,MIGS/MIMS/MIMARKS.plant-associated,1063
30,wastewater|sludge,417
7,MIGS/MIMS/MIMARKS.wastewater,385


### get describe information about slice

In [26]:
# Summary statistics of the slice; the 'count' row holds the number of non-null values per
# described column. With default arguments pandas describes only the numeric columns of a
# mixed-dtype frame, so text columns do not appear in this summary.
describeDf = sliceDf.describe()

In [27]:
# Show every column when displaying frames. The fully-qualified option name is used because
# the short form 'max_columns' is treated as a pattern and is ambiguous on newer pandas
# versions (it also matches styler.render.max_columns), which makes set_option raise.
pds.set_option('display.max_columns', None)
describeDf

Unnamed: 0,tot_part_carb,taxonomy_id
count,22.0,45951.0
mean,199.07044,611001.8
std,200.39601,390252.9
min,5.06,2.0
25%,9.8475,410658.0
50%,74.978,412755.0
75%,364.913,939928.0
max,545.26,2742685.0


### get list of columns with a count of zero

In [28]:
# Count the non-null values in every column of the slice. DataFrame.count() covers all
# dtypes, whereas the 'count' row of the describe() summary only exists for the numeric
# columns, so text columns that are entirely NaN would never be detected from it.
valueCount = sliceDf.count() # non-null count per column
zeroCount = valueCount[valueCount == 0] # subset to those with counts of 0

In [29]:
# the index of zeroCount holds the labels of the columns that have no non-null values
zeroCountCols = zeroCount.index # find the columns with zero counts

### drop columns with zero counts from slice

In [30]:
# (rows, columns) before dropping the empty columns
sliceDf.shape

(45951, 156)

In [31]:
# Drop the empty columns by reassignment instead of inplace=True: sliceDf was produced by
# indexing another frame, and mutating such a result in place can raise pandas'
# SettingWithCopyWarning. Reassigning yields the same frame without that ambiguity.
sliceDf = sliceDf.drop(columns=zeroCountCols)

In [32]:
# (rows, columns) after dropping the empty columns
sliceDf.shape

(45951, 156)

### save dataset

In [33]:
# sliceDf.to_csv('../../target/non-human-samples.tsv.gz', sep='\t', compression='gzip', index=False)

In [34]:
# Write the result as an uncompressed, tab-separated file without the index column.
# NOTE(review): the path is relative to the working directory, whereas the load cell reads
# '../../target/non-human-samples.tsv' -- confirm which location is intended.
sliceDf.to_csv('non-human-samples.tsv', sep='\t', index=False)