# Build non-human samples
The notebook creates a dataset of non-human samples. It does this by:
1. Creating a list of MIxS 5 terms from just the air, soil, sediment, plant-associated, and water packages.
2. Reducing the columns in the harmonized table to match only these MIxS 5 terms, and removing rows in which the taxonomy_id is 9606 (Homo sapiens).
3. Subsetting the data (rows) to the air, soil, sediment, plant-associated, and water packages.
4. Removing the columns from #3 that only have NaNs.

In [1]:
import pandas as pds
from pandasql import sqldf

# 1. Creating a list of MIxS 5 terms from just the air, soil, sediment, plant-associated, and water packages.

## Load MIxS 5 spreadsheet
- coreDf contains terms from the MIxS sheet; these terms apply to all packages
- packageDf contains terms associated with each environmental package

In [2]:
# Open the MIxS v5 workbook; individual sheets are parsed from this handle below.
excel = pds.ExcelFile('../../mixs-spreadsheets/mixs_v5.xlsx')

In [3]:
# Core terms come from the 'MIxS' sheet.
coreDf = excel.parse(
    'MIxS',
    usecols=['Structured comment name', 'Item', 'Value syntax'],
)

# Package terms come from the 'environmental_packages' sheet; 'Package item'
# is renamed to 'Item' so the column names line up with coreDf.
packageDf = excel.parse(
    'environmental_packages',
    usecols=['Environmental package', 'Structured comment name', 'Package item', 'Value syntax'],
).rename(columns={'Package item': 'Item'})

In [4]:
# peek at data
# coreDf
# packageDf

### get list of packages

In [5]:
# Distinct environmental package names, in order of first appearance.
packageDf['Environmental package'].unique().tolist()

['air',
 'built environment',
 'host-associated',
 'human-associated',
 'human-gut',
 'human-oral',
 'human-skin',
 'human-vaginal',
 'hydrocarbon resources-cores',
 'hydrocarbon resources-fluids/swabs',
 'microbial mat/biofilm',
 'miscellaneous natural or artificial environment',
 'plant-associated',
 'sediment',
 'soil',
 'wastewater/sludge',
 'water']

### subset package dataframe to air, soil, sediment, plant-associated, and water packages and concatenate package-specific terms with core terms

In [6]:
# Distinct (name, item, syntax) rows for the five target packages, selected
# with SQL against the in-memory packageDf.
q = """
select distinct 
    [Structured comment name], [Item], [Value syntax]
from
    packageDf
where 
    [Environmental package] in ('air', 'soil', 'sediment', 'plant-associated', 'water')
"""
packageTermsDf = sqldf(q)

# Stack the package-specific terms on top of the core terms and remove any
# row that appears in both.
envDf = pds.concat([packageTermsDf, coreDf]).drop_duplicates()

In [7]:
# Number of distinct term rows (core + selected packages).
envDf.shape[0]

307

In [8]:
# Unique structured comment names; these are the candidate column names.
envTerms = envDf['Structured comment name'].unique().tolist()

# 2. Match harmonized table columns to only the air, soil, sediment, plant-associated, and water environmental package terms.

### Load harmonized table data from parquet

In [9]:
# Load the full harmonized table from the gzip-compressed parquet file.
df = pds.read_parquet('../../target/harmonized-table.parquet.gz')

In [10]:
df.shape[0]  # number of samples (rows) in the harmonized table

14300584

### remove human taxonomy ids (taxonomy_id 9606)

In [11]:
# Keep only rows whose taxonomy_id is not '9606' (Homo sapiens); the id is
# compared as a string.
df = df.loc[df['taxonomy_id'] != '9606']

In [12]:
df.shape[0]  # number of samples left after removing taxonomy_id 9606

7480877

### get a list of all columns in the dataframe

In [13]:
# Column names of the harmonized table as a plain Python list.
allCols = df.columns.tolist()

In [14]:
# allCols

In [15]:
# Total number of columns in the harmonized table.
len(allCols)

464

### match the columns from the full dataframe to the env package terms

In [16]:
# Keep the table columns whose name is one of the selected MIxS terms,
# preserving the table's column order.
envTermSet = set(envTerms)
matchedCols = [col for col in allCols if col in envTermSet]

In [17]:
len(matchedCols) # number of table columns that match a selected MIxS term (205 matches)

205

### add some extra columns and create slice of data with matching MIxS columns

In [18]:
# Add the identifier/bookkeeping columns that are not MIxS terms but are
# needed in the output. A name is appended only if it is not already in the
# list, so re-running this cell (or an overlap with a matched term) cannot
# produce duplicate column names in the slice taken from df.
for extraCol in ('id', 'taxonomy_id', 'taxonomy_name', 'package'):
    if extraCol not in matchedCols:
        matchedCols.append(extraCol)

In [19]:
# All rows, restricted to the matched MIxS columns plus the extra columns.
sliceDf = df.loc[:, matchedCols]

# 3. Subset data to rows where the env_package contains air, soil, sediment, plant-associated, or water.

In [25]:
# Number of samples per env_package value, as a two-column frame
# (env_package, count).
env_packageCount = (
    sliceDf.groupby('env_package')
           .env_package.count()
           .to_frame(name='count')
           .reset_index()
)
# env_packageCount

In [26]:
# Show every row when displaying frames. The fully qualified option name is
# required: the bare 'max_rows' pattern also matches 'styler.render.max_rows'
# in newer pandas releases, which makes set_option raise OptionError.
pds.set_option('display.max_rows', None)
# Collect every env_package label that mentions one of the target
# environments (case-insensitive substring match).
# NOTE(review): matching on the substring 'water' also keeps labels such as
# 'wastewater' and 'sterile water' -- confirm that this is intended.
env_packages = [
    label
    for label in env_packageCount.env_package.unique()
    if any(keyword in label.lower()
           for keyword in ('air', 'soil', 'sediment', 'water', 'plant'))
]

In [27]:
# Display the env_package labels selected by the substring filter.
env_packages

['Coastal water body (ENVO:02000049)',
 'Deep-sea water',
 'MIGS.plant-associated',
 'MIGS/MIMS/MIMARKS.air',
 'MIGS/MIMS/MIMARKS.plant-associated',
 'MIGS/MIMS/MIMARKS.sediment',
 'MIGS/MIMS/MIMARKS.soil',
 'MIGS/MIMS/MIMARKS.wastewater',
 'MIGS/MIMS/MIMARKS.water',
 'MIMARKS_water',
 'Sea water',
 'Sediment',
 'Soil',
 'Water',
 'air',
 'fresh water',
 'freshwater sediment',
 'marine sediment (ENVO:00002113)',
 'plant',
 'plant-associated',
 'sea water',
 'sea water, [ENVO:00002149]',
 'seawater',
 'sediment',
 'soil',
 'soil-associated',
 'sterile water',
 'wastewater',
 'wastewater sludge',
 'wastewater/sludge',
 'wastewater|sludge',
 'water']

### subset data

In [28]:
# Keep only the samples whose env_package is one of the selected labels.
sliceDf = sliceDf.loc[sliceDf['env_package'].isin(env_packages)]

In [29]:
sliceDf.shape[0]  # number of samples left after the env_package filter

45951

# 4. Remove the columns that only have NaNs.

### get counts of each non-human package

In [30]:
# Recount samples per env_package on the filtered slice and list the
# packages from most to least frequent.
env_packageCount = (
    sliceDf.groupby('env_package')
           .env_package.count()
           .to_frame(name='count')
           .reset_index()
)
env_packageCount.sort_values('count', ascending=False)

Unnamed: 0,env_package,count
31,water,16367
24,soil,11974
23,sediment,5969
6,MIGS/MIMS/MIMARKS.soil,3784
19,plant-associated,2076
8,MIGS/MIMS/MIMARKS.water,1763
5,MIGS/MIMS/MIMARKS.sediment,1177
4,MIGS/MIMS/MIMARKS.plant-associated,1063
30,wastewater|sludge,417
7,MIGS/MIMS/MIMARKS.wastewater,385


### get describe information about slice

In [31]:
# Summarize every column. include='all' is passed explicitly: by default
# describe() reports only numeric columns whenever at least one is present,
# which would hide text columns from the zero-count check that follows.
# For a frame with no numeric columns the result is unchanged.
describeDf = sliceDf.describe(include='all')

In [32]:
# Show every column when displaying frames. The fully qualified option name
# is required: the bare 'max_columns' pattern also matches
# 'styler.render.max_columns' in newer pandas releases, which makes
# set_option raise OptionError.
pds.set_option('display.max_columns', None)
# Display the per-column summary (count / unique / top / freq).
describeDf

Unnamed: 0,tot_diss_nitro,salinity_meth,bac_prod,size_frac,fertilizer_regm,watering_regm,host_dry_mass,extreme_event,diss_hydrogen,heavy_metals,microbial_biomass,soil_type,ventilation_type,host_phenotype,tot_inorg_nitro,ph_regm,tidal_stage,encoded_traits,num_replicons,nitrate,rel_to_oxygen,potassium,sodium,petroleum_hydrocarb,virus_enrich_appr,down_par,env_package,bac_resp,soluble_react_phosp,biomass,samp_store_dur,conduc,diss_inorg_nitro,pollutants,methane,propagation,previous_land_use,radiation_regm,mechanical_damage,samp_mat_process,sulfide,host_taxid,fao_class,bishomohopanol,phosphate,light_intensity,porosity,geo_loc_name,barometric_press,org_matter,humidity_regm,herbicide_regm,mineral_nutr_regm,host_tot_mass,al_sat_meth,soil_type_meth,tot_depth_water_col,tot_carb,estimated_size,samp_salinity,host_wet_mass,diss_org_carb,drainage_class,water_current,al_sat,phaeopigments,host_life_stage,non_mineral_nutr_regm,biotic_relationship,host_age,chem_administration,profile_position,pool_dna_extracts,nitrite,tot_org_c_meth,ref_biomaterial,isol_growth_condt,slope_gradient,atmospheric_data,density,plant_product,standing_water_regm,link_climate_info,project_name,salinity,horizon,host_infra_specific_name,microbial_biomass_meth,salt_regm,elev,misc_param,water_temp_regm,flooding,sulfate,link_class_info,slope_aspect,crop_rotation,investigation_type,collection_date,extreme_salinity,rainfall_regm,temp,tot_org_carb,turbidity,season_environment,cur_vegetation_meth,previous_land_use_meth,suspend_part_matter,pressure,samp_store_loc,env_broad_scale,samp_size,tillage,texture,diss_carb_dioxide,org_nitro,tot_nitro,water_content_soil_meth,wind_direction,store_cond,samp_vol_we_dna_ext,extrachrom_elements,mean_frict_vel,heavy_metals_meth,oxy_stat_samp,env_local_scale,humidity,redox_potential,bacteria_carb_prod,local_class_meth,glucosidase_act,diss_org_nitro,pathogenicity,organism_count,perturbation,ventilation_rate,n_alkanes,phosplipid_fatt_acid,subspecf_gen_lin,texture_meth,chloride,pesticide
_regm,host_infra_specific_rank,fungicide_regm,tot_phosp,org_carb,carb_nitro_ratio,mean_peak_frict_vel,diss_oxygen,part_org_carb,chem_mutagen,agrochem_addition,air_temp_regm,alkalinity,source_uvig,tiss_cult_growth_med,alkyl_diethers,env_medium,growth_hormone_regm,host_length,gaseous_environment,sediment_type,local_class,primary_prod,fluor,diss_inorg_carb,part_org_nitro,cur_land_use,horizon_meth,fire,climate_environment,chlorophyll,oxygen,host_genotype,antibiotic_regm,host_height,samp_collect_device,cur_vegetation,ammonium,photon_flux,nitro,bromide,wind_speed,particle_class,lat_lon,water_content,ph,ph_meth,magnesium,link_addit_analys,ploidy,diether_lipids,samp_store_temp,carb_dioxide,gravity,experimental_factor,depth,silicate,sieving,tot_part_carb,trophic_level,calcium,resp_part_matter,diss_inorg_phosp,aminopept_act,id,taxonomy_id,taxonomy_name,package
count,3159,330,0.0,0.0,0.0,35,0.0,338,2,158,178,1902,0.0,0.0,0.0,0.0,2,1,398,5109,4256,3787,3428,9,0.0,0.0,45951,0.0,0.0,2,260,1898,0.0,0.0,56,274,849,0.0,0.0,5170,52,7825,505,0.0,4625,65,320,41176,0.0,327.0,35,0.0,35,0.0,108,869,992,1242,277,86,0.0,3609,1317,0.0,108,281,50,0.0,18,495,0.0,337,185,967,863,267,517,257.0,255,968,4826,0.0,109,22345,6285,598,2,185,0.0,29719,649,0.0,126,3191,108,196,222,20419,18203,165,0.0,13186,1693,102,62,188,109,10,403,476,45756,10314,98,629,3348,348,2926,1533,8,1791,24,0.0,0.0,137,724,45754,16,135,60,323,0.0,117.0,1,202,429,0.0,6,0.0,194,122,3285,0.0,0.0,0.0,205,686,932,0.0,2883.0,134,0.0,622,35,579,0.0,0.0,0.0,44834,0.0,0.0,0.0,127,552,80,215,3353,134,1871,108,291,23,916,4338,37,0.0,496,5632,983,1675,0.0,93.0,51,381.0,29,13156,849,11596,7429,3773,0.0,280,0.0,691,0.0,0.0,4479,37143.0,1122,863,22.0,16,3788,0.0,0.0,0.0,45951,45951,45951,45951
unique,99,4,0.0,0.0,0.0,1,0.0,2,1,5,152,62,0.0,0.0,0.0,0.0,1,1,15,973,24,511,538,7,0.0,0.0,32,0.0,0.0,2,19,601,0.0,0.0,16,19,18,0.0,0.0,43,6,32,6,0.0,425,27,31,1046,0.0,25.0,1,0.0,1,0.0,1,4,253,322,33,8,0.0,939,4,0.0,1,19,3,0.0,1,2,0.0,3,8,144,9,38,73,99.0,31,202,1,0.0,2,877,926,8,2,121,0.0,1451,391,0.0,3,206,1,96,27,19,2075,23,0.0,3074,934,48,2,12,2,10,17,8,693,81,4,261,242,39,961,11,2,15,2,0.0,0.0,3,31,1278,6,92,6,3,0.0,12.0,1,46,41,0.0,3,0.0,2,3,205,0.0,0.0,0.0,23,51,375,0.0,768.0,94,0.0,33,1,74,0.0,0.0,0.0,461,0.0,0.0,0.0,24,10,65,107,961,89,32,1,8,1,276,425,7,0.0,2,76,96,395,0.0,63.0,6,69.0,10,2591,445,861,365,1311,0.0,8,0.0,13,0.0,0.0,30,1502.0,540,32,22.0,5,2034,0.0,0.0,0.0,45951,220,218,34
top,Missing: Not reported,Missing:Not reported,,,,daily,,Missing: Not provided,0.50%,N/D,820,"silt loam (2-4% slopes; fine-silty, mixed, sup...",,,,,low,"antibiotic resistance: streptomycine, ampicili...",1,Missing: Not provided,aerobe,Missing: Not provided,Missing: Not provided,iso alkanes,,,water,,,50 g,02/01/2011-12/01/2011,Missing: Not provided,,,Missing: Not collected,self-breeding,Cunninghamia lanceolata plantation,,,lifeguard preservation solution,Missing: Not collected,4577,Cambisol,,Missing: Not provided,5338,981744518,USA:AK,,26.2,60% (day); 85% (night),,hoaglands solution every 2 weeks,,N/D,Gray,6 m,Missing: Not Provided,500mb,8,,Missing: Not provided,moderately well,,N/D,0.01 microgram per liter,seedling,,free living,not applicable,,footslope,No,Missing: Not Provided,Dry_combustion,missing,Two-weeks old,0.5,No sea ice,169,maize,,http://inta.gob.ar/documentos/atlas-climatico-...,DSMP,Missing: Not provided,A,Olote Colorado,Chloroform fumigation-incubation,,missing: not collected,For accompanying metadata and related data pro...,,No,Missing: Not provided,http://geointa.inta.gov.ar/visor/?p=model_suelos,North west,yes,metagenome,2015-06-06,N/D,,Missing: Not provided,Missing: Not provided,1.10 NTU,plants were grown in Duke Phytotron growth cha...,Field observation,Surveys to landowners,2.510^61.210^6 VLPs mlÃ¢ÂÂ1,1.56 bar,"DRI, Murray lab",ENVO:cropland biome,".1,g",No tillage,sandy loam,Missing: Not provided,Missing: Not Provided,Missing: Not Provided,gravimetric,WNW,-80C,10 L,,,N/D,aerobic,plant-associated habitat,40%,Missing: Not collected,327.96 pmol leu/L/hr,http://soils.usda.gov/technical/classification...,,0.248,barley,zooplankton individuals per liter,"Anti bird net applied, plants removed in bucke...",,none,,bacteria and archaea,INTA soil map (http://geointa.inta.gov.ar/viso...,Missing: Not provided,,,,15.1 ÃÂµg/L,Missing: Not Provided,Missing: Not Provided,,0.1,44.47microMolePerLiter,,67 kg N ha-1,21.1 oC (day); 15.6 oC (night),Missing: Not 
Provided,,,,soil,,,,silt,silty clay loam,Missing: Not collected,0.61 mgm-3,Missing: Not provided,4.32microMolePerLiter,grass/herbaceous cover,Field observation,2004,riparian zone,0.05 microgram per liter,Missing: Not provided,wildtype,,not applicable,multiple corer,montaine steppe,Missing: Not Provided,,0.0,Missing: Not collected,3.08,0.30 % silt and clay; 99.70 % sand,30.274 N 120.155 E,9696,Missing: Not provided,Missing: Not provided,Missing: Not provided,,diploid,,-80 degrees Celcius,,,gene|marine metagenome|uncultured microorganis...,0.01,Missing: Not provided,one forest plot had the size 25.8 m x 25.8 m w...,403.15,photosynthetic,Missing: Not provided,,,,BIOSAMPLE:SAMEA4707948,410658,soil metagenome,Generic.1.0
freq,978,176,,,,35,,230,2,108,3,708,,,,,2,1,363,2001,3945,1111,1111,2,,,16367,,,1,60,232,,,31,219,394,,,3139,40,4739,394,,2067,6,144,3037,,144.0,35,,35,,108,708,284,276,219,49,,942,886,,108,66,35,,18,487,,162,108,276,708,66,219,45.0,34,144,4826,,108,3917,3002,214,1,18,,3917,48,,108,2698,108,39,38,8043,904,108,,1403,74,10,35,72,108,1,144,328,4856,4856,80,71,1904,276,276,797,6,708,14,,,108,376,5126,4,10,10,214,,26.0,1,48,140,,3,,135,108,2698,,,,140,276,276,,207.0,6,,355,35,180,,,,12603,,,,48,166,14,10,1631,6,950,108,150,23,24,2826,23,,487,3821,218,276,,9.0,40,31.0,3,219,144,2773,4517,1155,,247,,328,,,3917,6877.0,180,394,1.0,9,1152,,,,1,8740,8740,36982


### get list of columns with a count of zero from describe

In [33]:
# The 'count' row of the summary holds the number of non-null values per column.
valueCount = describeDf.loc['count', :]
# Keep only the entries whose count is zero, i.e. columns with no values at all.
zeroCount = valueCount.loc[valueCount.eq(0)]

In [34]:
# The index labels of zeroCount are the names of the all-empty columns.
zeroCountCols = zeroCount.index # find the columns with zero counts

### drop columns with zero counts from slice

In [35]:
# (rows, columns) before dropping the empty columns.
sliceDf.shape

(45951, 209)

In [36]:
# Remove the all-empty columns by rebinding sliceDf to a new frame rather
# than dropping in place: sliceDf was derived by slicing another frame, and
# an in-place drop on such a frame can raise SettingWithCopyWarning.
# errors='ignore' keeps the cell re-runnable once the columns are gone.
sliceDf = sliceDf.drop(columns=zeroCountCols, errors='ignore')

In [37]:
# (rows, columns) after dropping the empty columns.
sliceDf.shape

(45951, 156)

### save dataset

In [38]:
# Write the filtered table as a gzip-compressed, tab-separated file without
# the row index.
sliceDf.to_csv(
    '../../target/non-human-samples.tsv.gz',
    sep='\t',
    compression='gzip',
    index=False,
)