# Match sample metadata to NOAA sheet


In [25]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import glob

#For illustrator import:
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42

### Import Dada2 data

In [26]:
#location of data:
directory = '../data/Dada2_seq_data/'
markers = ['12S']
prefix = 'CN19S'
otus= []
taxas=[]
metas=[]
for marker in markers:
    print('XXXXXX    ',marker)
    df = pd.read_csv(directory+prefix+'_'+marker+'_Dada2_otu_merged.csv')
    df = df.rename(columns={'Unnamed: 0':'ASV'})
    df.set_index('ASV', inplace=True)
    otus.append(df)
    print('Number samples in otu_table:', len(list(df)))
    df = pd.read_csv(directory+prefix+'_'+marker+'_Dada2_meta_merged.csv')
    df.set_index('sample_name', inplace=True)
    #date handling
    print(list(df))
    df['eventDate'] = pd.to_datetime(df['eventDate'])
    df['month'] = df['eventDate'].dt.month
    metas.append(df)
    print('Number samples in metadata table:', len(df.index))
    df = pd.read_csv(directory+prefix+'_'+marker+'_Dada2_taxa_merged.csv')
    df = df.rename(columns={'Unnamed: 0':'ASV'})
    df.set_index('ASV', inplace=True)
    print('Number ASVs in taxa table:', len(df.index))
    taxas.append(df)
taxas[0].head()


XXXXXX     12S
Number samples in otu_table: 286
['FilterID', 'target_gene', 'PlateID', 'library', 'local_time', 'time_label', 'SAMPLING_cruise', 'depth', 'SAMPLING_platform', 'SC', 'ESP', 'SAMPLING_station_number', 'SAMPLING_station', 'SAMPLING_bottle', 'decimalLongitude', 'decimalLatitude', 'sample_type', 'Plates', 'Markers', 'Status', 'Dewar_name', 'Sampling_method', 'replicate', 'SAMPLING_rdepth', 'project_name', 'nitrate', 'fluor', 'density', 'pressure', 'minimumDepthInMeters', 'maximumDepthInMeters', 'start_GMT', 'end_GMT', 'temp', 'salinity', 'sigmat', 'spice', 'diss_oxygen', 'PAR (umol/s/m2)', 'altitude', 'chlorophyll', 'bbp470 (count)', 'bbp650 (count)', 'SAMPLING_project', 'ESP_name', 'diel', 'month', 'day', 'hour', 'eventDate']
Number samples in metadata table: 286
Number ASVs in taxa table: 2827


  df['eventDate'] = pd.to_datetime(df['eventDate'])


Unnamed: 0_level_0,Kingdom,Phylum,Class,Order,Family,Genus,Species
ASV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ASV_1,Eukaryota,Chordata,Actinopteri,Clupeiformes,Engraulidae,Engraulis,Engraulis mordax
ASV_2,Eukaryota,Chordata,Actinopteri,Myctophiformes,Myctophidae,Diaphus,Diaphus theta
ASV_3,Eukaryota,Chordata,Actinopteri,Gadiformes,Macrouridae,unassigned,unassigned
ASV_4,Eukaryota,Chordata,Actinopteri,Gadiformes,Merlucciidae,Merluccius,Merluccius productus
ASV_5,Eukaryota,Chordata,Actinopteri,Myctophiformes,Myctophidae,Stenobrachius,Stenobrachius leucopsarus


In [27]:
metas[0]

Unnamed: 0_level_0,FilterID,target_gene,PlateID,library,local_time,time_label,SAMPLING_cruise,depth,SAMPLING_platform,SC,...,chlorophyll,bbp470 (count),bbp650 (count),SAMPLING_project,ESP_name,diel,month,day,hour,eventDate
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CN19SESPMV1_SC58_eDNA_CE,CN19SESPMV1_SC58_eDNA,12S,CE,CE40,2019-05-29 02:42:00,05-29 night,CN19S,17.6279,daphne,58.0,...,2.7972,100.9675,109.2667,CANON,,night,5.0,29.0,2.0,2019-05-29 09:42:00
CN19SESPKOA_SC58_eDNA_CE,CN19SESPKOA_SC58_eDNA,12S,CE,CE3,2019-05-29 12:32:00,05-29 day,CN19S,32.5799,makai,58.0,...,0.5502,83.3809,149.0871,CANON,,day,5.0,29.0,12.0,2019-05-29 19:32:00
CN19SESPKOA_SC57_eDNA_CE,CN19SESPKOA_SC57_eDNA,12S,CE,CE4,2019-05-29 13:47:00,05-29 day,CN19S,202.5248,makai,57.0,...,0.0782,78.4872,146.0477,CANON,,day,5.0,29.0,13.0,2019-05-29 20:47:00
CN19SESPKOA_SC56_eDNA_CE,CN19SESPKOA_SC56_eDNA,12S,CE,CE5,2019-05-29 14:50:00,05-29 day,CN19S,267.6256,makai,56.0,...,0.0833,78.2715,148.8699,CANON,,day,5.0,29.0,14.0,2019-05-29 21:50:00
CN19SESPKOA_SC55_eDNA_CE,CN19SESPKOA_SC55_eDNA,12S,CE,CE6,2019-05-29 23:17:00,05-29 night,CN19S,27.4770,makai,55.0,...,0.3254,80.2178,143.2971,CANON,,night,5.0,29.0,23.0,2019-05-30 06:17:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ArtComm1_CE,ArtComm1,12S,CE,CE92,,,,,,,...,,,,,,,,,,NaT
ArtComm2_CE,ArtComm2,12S,CE,CE93,,,,,,,...,,,,,,,,,,NaT
CN19SP3_OSU_EB1_CE,CN19SP3_OSU_EB1,12S,CE,CE94,,,,,,,...,,,,,,,,,,NaT
CN19SP3_OSU_EB2_CE,CN19SP3_OSU_EB2,12S,CE,CE95,,,,,,,...,,,,,,,,,,NaT


## Get Filenames data

In [52]:
marker = '12S'
merged_directory = '/Volumes/MBON/processed/banzai_Dada2/12S/Merged_dataset/Results_20220824/'
file = 'Collapsed_meta_table_unfiltered.csv'
# import and filter to samples in this study
df = pd.read_csv(merged_directory + file)
# samples in study:
study_samps = metas[0].index.tolist()
df = df.loc[df['sample_name'].isin(study_samps)==True]
df.set_index('sample_name', inplace=True)
df = df.dropna(how='all', axis=1)
print(list(df))
meta_all = df.copy()
meta_all[['primer_sequence_forward', 'primer_sequence_reverse', 'diss_oxygen', 'samp_store_temp', 'samp_filter_ext_type', 'samp_filter_size_ext']]


['ESP', 'PCR_settings', 'PlateID', 'R1', 'R2', 'SAMPLING_PI', 'SAMPLING_bottle', 'SAMPLING_campaign', 'SAMPLING_cruise', 'SAMPLING_date_time', 'SAMPLING_dec_lat', 'SAMPLING_dec_long', 'SAMPLING_institute', 'SAMPLING_platform', 'SAMPLING_platform_type', 'SAMPLING_project', 'SAMPLING_rdepth', 'SAMPLING_real_depth', 'SAMPLING_station', 'SAMPLING_station_number', 'SC', 'Unnamed: 0', 'chlorophyll', 'collection_date', 'date_pcr', 'day', 'decimalLatitude', 'decimalLongitude', 'density', 'depth', 'diss_oxygen', 'end_GMT', 'env_biome', 'env_broad_scale', 'env_feature', 'env_local_scale', 'env_material', 'env_medium', 'env_package', 'eventDate', 'fluor', 'geo_loc_name', 'investigation_type', 'library', 'library_tag_combo', 'maximumDepthInMeters', 'minimumDepthInMeters', 'month', 'nitrate', 'order', 'original_name', 'pcr_primer_name_forward', 'pcr_primer_name_reverse', 'pcr_primer_reference', 'pressure', 'pressure_dbar', 'primer_sequence_F', 'primer_sequence_R', 'primer_sequence_forward', 'primer

Unnamed: 0_level_0,primer_sequence_forward,primer_sequence_reverse,diss_oxygen,samp_store_temp,samp_filter_ext_type,samp_filter_size_ext
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
V4195_CB_eDNA_BT,GTCGGTAAAACTCGTGCCAGC,CATAGTGGGGTATCTAATCCCAGTTTG,,-80,Poretics,0.22um
CN19Sc24_04_eDNA_BT,GTCGGTAAAACTCGTGCCAGC,CATAGTGGGGTATCTAATCCCAGTTTG,,-80,Poretics,0.22um
CN19Sc24_09_eDNA_BT,GTCGGTAAAACTCGTGCCAGC,CATAGTGGGGTATCTAATCCCAGTTTG,,-80,Poretics,0.22um
CN19Sc26_02_eDNA_BT,GTCGGTAAAACTCGTGCCAGC,CATAGTGGGGTATCTAATCCCAGTTTG,,-80,Poretics,0.22um
CN19Sc26_04_eDNA_BT,GTCGGTAAAACTCGTGCCAGC,CATAGTGGGGTATCTAATCCCAGTTTG,,-80,Poretics,0.22um
...,...,...,...,...,...,...
ArtComm1_CE,GTCGGTAAAACTCGTGCCAGC,CATAGTGGGGTATCTAATCCCAGTTTG,,-80,Poretics,0.22um
ArtComm2_CE,GTCGGTAAAACTCGTGCCAGC,CATAGTGGGGTATCTAATCCCAGTTTG,,-80,Poretics,0.22um
CN19SP3_OSU_EB1_CE,GTCGGTAAAACTCGTGCCAGC,CATAGTGGGGTATCTAATCCCAGTTTG,,-80,Poretics,0.22um
CN19SP3_OSU_EB2_CE,GTCGGTAAAACTCGTGCCAGC,CATAGTGGGGTATCTAATCCCAGTTTG,,-80,Poretics,0.22um


## Import Filtration metadata

In [28]:
# environmental samples
file = '../data/metadata/CN19S_extraction_metadata.csv'
df = pd.read_csv(file)
df.set_index('sample_name', inplace=True)
extract = df.copy()
df

Unnamed: 0_level_0,extraction_date,water_filt_mL,DNA_concentration_ng_uL,measurement_method
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CN19Sc03_1_eDNA_JJ,20190916,900.000,1.44,qubit
CN19Sc03_10_eDNA_JJ,20190916,1000.000,16.10,nanodrop
CN19Sc03_12_eDNA_JJ,20190916,1000.000,68.00,nanodrop
CN19Sc03_3_eDNA_JJ,20190916,1000.000,4.20,nanodrop
CN19Sc03_4_eDNA_JJ,20190916,700.000,5.10,nanodrop
...,...,...,...,...
CN19SESPMV1_SC52_eDNA_CE,20200814,1000.000,7.86,qubit
CN19SESPMV1_SC53_eDNA_CE,20200814,484.645,3.76,qubit
CN19SESPMV1_SC54_eDNA_CE,20200814,1000.000,3.99,qubit
CN19SESPMV1_SC55_eDNA_CE,20200814,1000.000,4.86,qubit


## Join and make metadata columns needed by SRA

In [95]:
df = pd.concat([metas[0], extract], axis=1)
df = pd.concat([df, meta_all[['R1', 'R2']]], axis=1)
#df = df.loc[df['water_filt_mL'].isna()==True]
# drop columns that are only for internal use
cols = ['ESP_name', 'diel', 'month', 'day', 'hour', 'time_label', 'local_time', 'SC', 'ESP', 'Plates', 'Markers', 'Status', 'Dewar_name']
df = df.drop(cols, axis=1)



# convert names to NOAA standard terms
df = df.rename(columns={'SAMPLING_cruise':'cruise_id',
                       'FilterID':'source_material_id',
                        'SAMPLING_station_number':'station',
                        'SAMPLING_bottle':'ctd_bottle_no',
                        'water_filt_mL':'samp_vol_we_dna_ext',
                        'SAMPLING_platform':'samp_collect_device',
                        'SAMPLING_station':'locationID',
                        'DNA_concentration_ng_uL':'dna_conc',
                        'concentrationUnit' : 'measurement_method',
                        'library' : 'library_ID',
                        'R1': 'filename',
                        'R2': 'filename2'
                       })



# Standard Columns to Add
# df['organism'] = 'seawater metagenome'  # only if environmental sample
df['organism'] = ''
df.loc[df['sample_type']=='environmental', 'organism'] = 'seawater metagenome'
# term 'control sample' can be used for negative controls: (https://www.ddbj.nig.ac.jp/biosample/submission-e.html)
df.loc[df['sample_type'].isin(['negative','MSU_control'])==True, 'organism'] = 'control sample'
# for artificial/mock communities, term is synthetic metagenome
df.loc[df.index.str.contains('ArtC|Art_C', case=False), 'organism'] = 'synthetic metagenome'
# ESP positive controls are seawater
df.loc[(df['sample_type']=='positive') & (df['Sampling_method']=='ESP'), 'organism'] = 'seawater metagenome'

print(df['organism'].unique())
# If sample is from seawater the following applies:
df.loc[df['organism']=='seawater metagenome','env_broad_scale' ] = 'marine biome [ENVO:00000447]'
df.loc[df['organism']=='seawater metagenome','env_local_scale' ] = 'marine mesopelagic zone [ENVO:00000213]'
df.loc[df['organism']=='seawater metagenome','env_medium' ] = 'sea water [ENVO:00002149]'
df.loc[df['organism']=='seawater metagenome','geo_loc_name' ] = 'USA: Monterey Bay'
df.loc[df['organism']=='seawater metagenome','size_frac' ] = '0.22 micrometer'

df['collection_method'] = df['Sampling_method']
df['collection_method'] = df['collection_method'].replace('CTD', 'CTD rosette')
df['collection_method'] = df['collection_method'].replace('ESP', 'Environmental Sample Processor onboard long-range autonomous underwater vehicle (LRAUV-ESP)')
print(df['collection_method'].unique())
# date in UTC
df['collection_date'] = df['eventDate']
# need to be in these formats:
# "DD-Mmm-YYYY", "Mmm-YYYY", "YYYY" or ISO 8601 standard "YYYY-mm-dd", "YYYY-mm", "YYYY-mm-ddThh:mm:ss";
df['collection_date'] = pd.to_datetime(df['collection_date'])
df['collection_date'] = df['collection_date'].dt.strftime('%Y-%m-%dT%H:%M:%S')
df['collection_date'] = df['collection_date'] +'Z'
# join lat lon information
# since we are adding 'W' we can drop the negative before the longitude value
df['decimalLongitude_str'] = -df['decimalLongitude']
df['lat_lon'] = df['decimalLatitude'].astype(str) +' N ' + df['decimalLongitude_str'].astype(str) + ' W'
df['lat_lon'] = df['lat_lon'].str.replace('nan N nan W', '')
# study level metadata
df['project_id'] = 'CN19S'
df['project_name'] = 'eDNA from Controlled Agile and Novel Observing Network (CANON) cruise in spring of 2019 (CN19S)'
df['project_id_external'] = 'MBARI-BOG-CN19S'
df['project_contact'] = 'Kathleen Pitz, kpitz@mbari.org, https://orcid.org/0000-0002-4931-8592'
df['project_description'] = ''
df['type'] = 'Occurence'
df['license'] = 'CC-BY 4.0'
# Amplicon Prep Data Columns (NOAA - becomes SRA metadata)
df['title'] = '12S amplicon metabarcoding of marine metagenome: Monterey Bay, CA (USA)'
df['library_strategy'] = 'AMPLICON'
df['library_source'] = 'METAGENOMIC'
df['library_selection'] = 'PCR'
# library_layout - 'lib_layout' in NOAA sheet
df['library_layout'] = 'paired'
df['platform'] = 'ILLUMINA'
df['instrument_model'] = 'Illumina MiSeq'
df['design_description'] = 'Seawater was filtered onto 0.22 PVDF membrane filters through shipboard and vehicle sampling'
df['filetype'] = 'fastq'
# design description supposed to be a mini 'methods' section:
df.loc[df['PlateID'].isin(['JJ', 'RR']), 'design_description'] = 'Seawater was filtered onto 0.22 PVDF membrane filters through shipboard and vehicle sampling. \
DNA was extracted using the Qiagen DNeasy Blood and Tissue Kit. A two-step PCR approach was used, targeting mitochondrial 12S with primers MiFish-U-F (5-GTCGGTAAAACTCGTGCCAGC-3) \
and MiFish-U-R (5-CATAGTGGGGTATCTAATCCCAGTTTG-3). Fluidigm tail sequences CS1 (5-ACACTGACGACATGGTTCTACA-3) and CS2 (5-TACGGTAGCAGAGACTTGGTCT-3) were added to the 5-prime ends of \
the forward and reverse primer sequences. Secondary amplification and next-generation sequencing were performed at Michigan State University where Illumina compatible adapters with \
barcodes targeted CS1/CS2 ends of the primery PCR products. Pooled product was sequenced in a 2x250bp paired end format on an Illumina MiSeq.'

df.loc[df['PlateID'].isin(['CE', 'BT']), 'design_description'] = 'Seawater was filtered onto 0.22 PVDF membrane filters through shipboard and vehicle sampling. \
DNA was extracted using the Qiagen DNeasy Blood and Tissue Kit. A two-step PCR approach was used, targeting mitochondrial 12S with primers MiFish-U-F (5-GTCGGTAAAACTCGTGCCAGC-3) \
and MiFish-U-R (5-CATAGTGGGGTATCTAATCCCAGTTTG-3). Fluidigm tail sequences CS1 (5-ACACTGACGACATGGTTCTACA-3) and CS2 (5-TACGGTAGCAGAGACTTGGTCT-3) were added to the 5-prime ends of \
the forward and reverse primer sequences. Secondary amplification and next-generation sequencing were performed at Michigan State University where Illumina compatible adapters with \
barcodes targeted CS1/CS2 ends of the primery PCR products. Pooled product was size selected with a Pippin HT to select for the vertebrate/fish band (~350bp) and remove co-amplified \
bacteria (~435bp). It was then sequenced in a 2x250bp paired end format on an Illumina MiSeq.'

print(df['design_description'].unique())

# we use sample_type differently - if 'environmental', should be 'seawater'
df['sample_type'] = df['sample_type'].str.replace('environmental', 'seawater')

# mandatory columns, followed by optional MixS MIMARKS columns; index is sample_name (mandatory); export as tab deliminated
Biosample_cols = ['organism', 'collection_date', 'depth', 'env_broad_scale', 'env_local_scale', 'env_medium', 'geo_loc_name', 'lat_lon',
                 'chlorophyll', 'density', 'diss_oxygen', 'fluor', 'nitrate', 'pressure', 'salinity', 'samp_collect_device',
                  'samp_vol_we_dna_ext', 'source_material_id', 'temp']
# additional NOAA columns in water_sample_data tab - can include
NOAA_water_sample_data = ['cruise_id', 'station', 'locationID', 'ctd_bottle_no', 'decimalLongitude', 'decimalLatitude', 'sample_type',
                         'dna_conc','measurement_method','minimumDepthInMeters', 'maximumDepthInMeters', 'collection_method']
# Used to generate BioProject
NOAA_study_data = ['project_id', 'project_name','project_id_external','project_contact', 'project_description', 'type','license'  ]
# Will need to be submitted as SRA metadata
NOAA_amplicon_prep_data = ['library_ID', 'title', 'library_strategy', 'library_source', 'library_selection', 'library_layout',
                            'platform', 'instrument_model', 'design_description', 'filetype', 'filename', 'filename2']
#df = df[Biosample_cols]



# need to add units to measurements
# density is currently sigma-t (kg/m^3 - 1000)
df['density'] = df['density'] + 1000
cols = ['density', 'chlorophyll', 'depth', 'diss_oxygen', 'fluor', 'nitrate', 'pressure', 'salinity', 'samp_vol_we_dna_ext', 'temp', 'dna_conc']
add_units = ['kilogram per cubic meter', 'milligram per cubic meter', 'meter', 'milliliter per liter', 'volts',
             'micromole per liter', 'decibar', 'parts per thousand', 'milliliter', 'degree Celsius', '']

for i, unit in zip(cols,add_units):
    print(i)
    print(unit)
    #df.loc[df[i].isna()==False,df[i]] = df[i].astype(str) + ' ' + unit
    df[i] = df[i].astype('string') + ' ' + unit
    # if was nan beforehand then don't want a field now

# samp_collect_device has controlled vocabulary need to change
# concentrationUnit - dna_conc unit
df['concentrationUnit'] = 'nanograms per milliliter'

# For the biosample submission, remove control samples and just include environmental samples
# only environmental samples have 'depth'
df = df.loc[df['depth'].isna()==False]

# change sample name to be the filter name, not the sequenced sample id (remove the plate ID)
df = df.reset_index()

df['sample_name'] = df['sample_name'].str.split('_').str[:-1]
df['sample_name'] = df['sample_name'].str.join('_')
print(len(df['sample_name'].unique()))
df.set_index('sample_name', inplace=True)

# remove samples not included because they had fewer than 500 reads:
df = df.loc[df['samp_vol_we_dna_ext'].isna()==False]
print(len(df.index))

file = '/Users/kpitz/Documents/biosample_metadata.csv'
df[Biosample_cols+NOAA_water_sample_data].to_csv(file, sep='\t')

# SRA: If you are submitting metagenomic and/or metatranscriptomic data sets, sequence data should be split by each sample barcode, for individual data files

file = '/Users/kpitz/Documents/sra_metadata.csv'
df[NOAA_amplicon_prep_data].to_csv(file, sep='\t')


# generate script (.sh file) to copy fastq files to the same folder to make submission easier?
# /Volumes/MBON/raw_sequence_data/12S/CE_20220629_12S_PE250/CE1/CN19SESPKoa_SC60_eDNA_S1_L001_R1_001.fastq.gz
# cp [original_location + filename] [new_location + filename]
#plate_folders = ['CE_20220629_12S_PE250', 'BT_20220425_12S_PE250', 'JJ_20191127_12S_PE250', 'RR_20200702_12S_PE250']
#plate_ids = ['CE', 'BT', 'JJ', 'RR']
original_location = '/Volumes/MBON/raw_sequence_data/12S/'

new_location = '/Users/kpitz/Projects/SRA_submission/DVM_CN19S/'

R1s = df['filename'].tolist()
R2s = df['filename2'].tolist()
plates = df['PlateID'].tolist()
lib_ids = df['library_ID'].tolist()
print(df['PlateID'].unique())
outfile = '/Users/kpitz/Documents/copy_fastq_files.sh'

##Write File
resultsFile = open(outfile, "w") #open resultsfile
pfolder=[]
for i in range(len(R1s)):
    if plates[i] == 'CE':
        pfolder = 'CE_20220629_12S_PE250'
    elif plates[i] == 'BT':
        pfolder = 'BT_20220425_12S_PE250'
    elif plates[i] == 'JJ':
        pfolder = 'JJ_20191127_12S_PE250'    
    elif plates[i] == 'RR':
        pfolder = 'RR_20200702_12S_PE250'    
    else:
        print('ERROR: unknown plate ', plates[i])
        break
    resultsFile.write('cp '+ original_location + pfolder+'/'+lib_ids[i]+ '/'+R1s[i]+' '+ new_location+R1s[i]+'\n')
    resultsFile.write('cp '+ original_location + pfolder+'/'+lib_ids[i]+ '/'+R2s[i]+' '+ new_location+R2s[i]+'\n')
resultsFile.close()

print('Done!')




# look at what's left:
df = df.drop(Biosample_cols, axis=1)
df = df.drop(NOAA_water_sample_data, axis=1)
df = df.drop(NOAA_study_data, axis=1)

print(df['Sampling_method'].unique())
print(list(df))

df

['seawater metagenome' 'control sample' 'synthetic metagenome']
['Environmental Sample Processor onboard long-range autonomous underwater vehicle (LRAUV-ESP)'
 'CTD rosette' nan]
['Seawater was filtered onto 0.22 PVDF membrane filters through shipboard and vehicle sampling. DNA was extracted using the Qiagen DNeasy Blood and Tissue Kit. A two-step PCR approach was used, targeting mitochondrial 12S with primers MiFish-U-F (5-GTCGGTAAAACTCGTGCCAGC-3) and MiFish-U-R (5-CATAGTGGGGTATCTAATCCCAGTTTG-3). Fluidigm tail sequences CS1 (5-ACACTGACGACATGGTTCTACA-3) and CS2 (5-TACGGTAGCAGAGACTTGGTCT-3) were added to the 5-prime ends of the forward and reverse primer sequences. Secondary amplification and next-generation sequencing were performed at Michigan State University where Illumina compatible adapters with barcodes targeted CS1/CS2 ends of the primery PCR products. Pooled product was size selected with a Pippin HT to select for the vertebrate/fish band (~350bp) and remove co-amplified bacter

  df.loc[df['organism']=='seawater metagenome','env_broad_scale' ] = 'marine biome [ENVO:00000447]'
  df.loc[df['organism']=='seawater metagenome','env_local_scale' ] = 'marine mesopelagic zone [ENVO:00000213]'
  df.loc[df['organism']=='seawater metagenome','env_medium' ] = 'sea water [ENVO:00002149]'
  df.loc[df['organism']=='seawater metagenome','geo_loc_name' ] = 'USA: Monterey Bay'
  df.loc[df['organism']=='seawater metagenome','size_frac' ] = '0.22 micrometer'


Unnamed: 0_level_0,target_gene,PlateID,library_ID,Sampling_method,replicate,SAMPLING_rdepth,start_GMT,end_GMT,sigmat,spice,...,title,library_strategy,library_source,library_selection,library_layout,platform,instrument_model,design_description,filetype,concentrationUnit
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CN19SESPMV1_SC58_eDNA,12S,CE,CE40,ESP,,,5/29/19 9:42,5/29/19 10:07,25.0274,0.2338,...,12S amplicon metabarcoding of marine metagenom...,AMPLICON,METAGENOMIC,PCR,paired,ILLUMINA,Illumina MiSeq,Seawater was filtered onto 0.22 PVDF membrane ...,fastq,nanograms per milliliter
CN19SESPKOA_SC58_eDNA,12S,CE,CE3,ESP,,,5/29/19 19:32,5/29/19 20:18,25.4315,0.0592,...,12S amplicon metabarcoding of marine metagenom...,AMPLICON,METAGENOMIC,PCR,paired,ILLUMINA,Illumina MiSeq,Seawater was filtered onto 0.22 PVDF membrane ...,fastq,nanograms per milliliter
CN19SESPKOA_SC57_eDNA,12S,CE,CE4,ESP,,,5/29/19 20:47,5/29/19 21:28,26.5216,0.0742,...,12S amplicon metabarcoding of marine metagenom...,AMPLICON,METAGENOMIC,PCR,paired,ILLUMINA,Illumina MiSeq,Seawater was filtered onto 0.22 PVDF membrane ...,fastq,nanograms per milliliter
CN19SESPKOA_SC56_eDNA,12S,CE,CE5,ESP,,,5/29/19 21:50,5/29/19 22:33,26.6542,0.0243,...,12S amplicon metabarcoding of marine metagenom...,AMPLICON,METAGENOMIC,PCR,paired,ILLUMINA,Illumina MiSeq,Seawater was filtered onto 0.22 PVDF membrane ...,fastq,nanograms per milliliter
CN19SESPKOA_SC55_eDNA,12S,CE,CE6,ESP,,,5/30/19 6:17,5/30/19 7:00,25.6208,-0.0159,...,12S amplicon metabarcoding of marine metagenom...,AMPLICON,METAGENOMIC,PCR,paired,ILLUMINA,Illumina MiSeq,Seawater was filtered onto 0.22 PVDF membrane ...,fastq,nanograms per milliliter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CN19SESPKOA_SC07_eDNA,12S,CE,CE32,ESP,,,6/6/19 7:21,6/6/19 8:04,25.8611,0.0925,...,12S amplicon metabarcoding of marine metagenom...,AMPLICON,METAGENOMIC,PCR,paired,ILLUMINA,Illumina MiSeq,Seawater was filtered onto 0.22 PVDF membrane ...,fastq,nanograms per milliliter
CN19SESPKOA_SC06_eDNA,12S,CE,CE33,ESP,,,6/6/19 8:24,6/6/19 9:07,26.2025,0.1392,...,12S amplicon metabarcoding of marine metagenom...,AMPLICON,METAGENOMIC,PCR,paired,ILLUMINA,Illumina MiSeq,Seawater was filtered onto 0.22 PVDF membrane ...,fastq,nanograms per milliliter
CN19SESPKOA_SC05_eDNA,12S,CE,CE34,ESP,,,6/6/19 9:24,6/6/19 10:06,26.2557,0.1402,...,12S amplicon metabarcoding of marine metagenom...,AMPLICON,METAGENOMIC,PCR,paired,ILLUMINA,Illumina MiSeq,Seawater was filtered onto 0.22 PVDF membrane ...,fastq,nanograms per milliliter
CN19SESPKOA_SC04_eDNA,12S,CE,CE35,ESP,,,6/6/19 10:24,6/6/19 11:05,26.3305,0.1342,...,12S amplicon metabarcoding of marine metagenom...,AMPLICON,METAGENOMIC,PCR,paired,ILLUMINA,Illumina MiSeq,Seawater was filtered onto 0.22 PVDF membrane ...,fastq,nanograms per milliliter
