# Export Taxa List

- Uses only the 12S marker data.
- Exports only the unique taxonomic assignments; these will later be sorted into ecological categories.

In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import glob

# Set the PDF and PostScript font type to 42 so exported figures can be
# imported into Illustrator.
plt.rcParams.update({'pdf.fonttype': 42, 'ps.fonttype': 42})

### Import Filtered data

In [2]:
# Folder holding the filtered sequence tables, and the markers to load.
directory = '../data/filtered_seq_data/'
markers = ['12S']

# One table per marker, appended in the same order as `markers`.
otus = []
taxas = []
metas = []
for marker in markers:
    print('XXXXXX    ',marker)
    # Every file for a marker shares this path prefix.
    prefix = directory + 'CN19S_' + marker

    # OTU table: the unnamed first CSV column holds the ASV ids.
    otu_table = (
        pd.read_csv(prefix + '_otu_Filtered.csv')
        .rename(columns={'Unnamed: 0': 'ASV'})
        .set_index('ASV')
    )
    otus.append(otu_table)
    print('Number samples in otu_table:', len(otu_table.columns))

    # Metadata table, indexed by sample name.
    meta_table = pd.read_csv(prefix + '_meta_Filtered.csv').set_index('sample_name')
    print(list(meta_table))
    # Date handling: parse eventDate and (re)derive the month column from it.
    meta_table['eventDate'] = pd.to_datetime(meta_table['eventDate'])
    meta_table['month'] = meta_table['eventDate'].dt.month
    metas.append(meta_table)
    print('Number samples in metadata table:', len(meta_table.index))

    # Taxonomy table, indexed by ASV id like the OTU table.
    taxa_table = (
        pd.read_csv(prefix + '_taxa_Filtered.csv')
        .rename(columns={'Unnamed: 0': 'ASV'})
        .set_index('ASV')
    )
    print('Number ASVs in taxa table:', len(taxa_table.index))
    taxas.append(taxa_table)

# Preview the taxonomy table of the first marker.
taxas[0].head()

XXXXXX     12S
Number samples in otu_table: 245
['FilterID', 'target_gene', 'PlateID', 'library', 'local_time', 'time_label', 'SAMPLING_cruise', 'depth', 'SAMPLING_platform', 'SC', 'ESP', 'SAMPLING_station_number', 'SAMPLING_station', 'SAMPLING_bottle', 'decimalLongitude', 'decimalLatitude', 'sample_type', 'Plates', 'Markers', 'Status', 'Dewar_name', 'Sampling_method', 'replicate', 'SAMPLING_rdepth', 'project_name', 'nitrate', 'fluor', 'density', 'pressure', 'minimumDepthInMeters', 'maximumDepthInMeters', 'start_GMT', 'end_GMT', 'temp', 'salinity', 'sigmat', 'spice', 'diss_oxygen', 'PAR (umol/s/m2)', 'altitude', 'chlorophyll', 'bbp470 (count)', 'bbp650 (count)', 'SAMPLING_project', 'ESP_name', 'diel', 'month', 'day', 'hour', 'eventDate']
Number samples in metadata table: 245
Number ASVs in taxa table: 442


Unnamed: 0_level_0,Kingdom,Phylum,Class,Order,Family,Genus,Species
ASV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ASV_1,Eukaryota,Chordata,Actinopteri,Clupeiformes,Engraulidae,Engraulis,Engraulis mordax
ASV_2,Eukaryota,Chordata,Actinopteri,Myctophiformes,Myctophidae,Diaphus,Diaphus theta
ASV_3,Eukaryota,Chordata,Actinopteri,Gadiformes,Macrouridae,unassigned,unassigned
ASV_4,Eukaryota,Chordata,Actinopteri,Gadiformes,Merlucciidae,Merluccius,Merluccius productus
ASV_5,Eukaryota,Chordata,Actinopteri,Myctophiformes,Myctophidae,Stenobrachius,Stenobrachius leucopsarus


### Deduplicate taxa list; get summed reads and number of samples

In [3]:
# Columns of the taxonomy table; they serve as the grouping keys below.
levels = list(taxas[0])

# Place each ASV's taxonomy next to its per-sample read counts (both tables
# are indexed by ASV), then sum the reads of all ASVs that share an identical
# taxonomy so each unique taxon becomes one row.
# NOTE(review): groupby drops rows whose key contains a missing value by
# default. The taxonomy preview shows placeholder strings such as
# 'unassigned' rather than NaN -- confirm no level is ever NaN, otherwise
# those taxa are silently excluded from the export.
reads_by_taxon = pd.concat([taxas[0], otus[0]], axis=1).groupby(levels).sum()

# Derive both summaries from the sample columns only. The previous approach
# counted cells >= 1 after Total_reads had been added and then subtracted 1,
# which yields -1 for a taxon whose reads are zero in every sample.
df = reads_by_taxon.assign(
    Total_reads=reads_by_taxon.sum(axis=1),
    Total_samples=(reads_by_taxon >= 1).sum(axis=1),
)
print(list(df))
print(len(list(df)))

# Keep only the two summary columns, ordered by taxonomy, and export.
df = df[['Total_reads', 'Total_samples']].sort_values(levels)
df.to_csv('../data/Deduplicated_Taxa_list.csv')
df

['CN19SESPMV1_SC58_eDNA_CE', 'CN19SESPKOA_SC58_eDNA_CE', 'CN19SESPKOA_SC57_eDNA_CE', 'CN19SESPKOA_SC56_eDNA_CE', 'CN19SESPKOA_SC55_eDNA_CE', 'CN19SESPMV1_SC55_eDNA_CE', 'CN19SESPMV1_SC54_eDNA_CE', 'CN19SESPKOA_SC54_eDNA_CE', 'CN19SESPKOA_SC53_eDNA_CE', 'CN19SESPMV1_SC53_eDNA_CE', 'CN19SESPMV1_SC52_eDNA_CE', 'CN19Sc03_12_eDNA_JJ', 'CN19Sc03_9_eDNA_JJ', 'CN19Sc03_1_eDNA_JJ', 'CN19Sc03_3_eDNA_JJ', 'CN19Sc03_7_eDNA_JJ', 'CN19Sc03_5_eDNA_JJ', 'CN19Sc03_4_eDNA_JJ', 'CN19Sc03_10_eDNA_JJ', 'CN19SESPMV1_SC51_eDNA_CE', 'CN19Sc10_9_eDNA_JJ', 'CN19Sc10_2_eDNA_JJ', 'CN19Sc10_10_eDNA_JJ', 'CN19Sc10_4_eDNA_JJ', 'CN19Sc10_7_eDNA_JJ', 'CN19Sc10_12_eDNA_JJ', 'CN19Sc10_1_eDNA_JJ', 'CN19Sc10_3_eDNA_JJ', 'CN19Sc10_5_eDNA_JJ', 'CN19Sc11_10_eDNA_JJ', 'CN19Sc11_7_eDNA_JJ', 'CN19Sc11_5_eDNA_JJ', 'CN19SESPMV1_SC50_eDNA_CE', 'CN19Sc12_5_eDNA_JJ', 'CN19Sc12_1_eDNA_JJ', 'CN19Sc12_2_eDNA_JJ', 'CN19Sc12_3_eDNA_JJ', 'CN19Sc12_12_eDNA_JJ', 'CN19Sc12_10_eDNA_JJ', 'CN19Sc12_9_eDNA_JJ', 'CN19Sc12_7_eDNA_JJ', 'CN19Sc12_4_

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Total_reads,Total_samples
Kingdom,Phylum,Class,Order,Family,Genus,Species,Unnamed: 7_level_1,Unnamed: 8_level_1
Eukaryota,Chordata,Actinopteri,Alepocephaliformes,Platytroctidae,Holtbyrnia,s_,7361,9
Eukaryota,Chordata,Actinopteri,Alepocephaliformes,Platytroctidae,Sagamichthys,Sagamichthys abei,15304,11
Eukaryota,Chordata,Actinopteri,Anabantiformes,Channidae,g_,s_,72,1
Eukaryota,Chordata,Actinopteri,Argentiniformes,Argentinidae,g_,s_,26340,21
Eukaryota,Chordata,Actinopteri,Argentiniformes,Bathylagidae,Bathylagus,Bathylagus pacificus,170,10
Eukaryota,Chordata,...,...,...,...,...,...,...
Eukaryota,Chordata,Mammalia,Cetacea,Ziphiidae,Ziphius,Ziphius cavirostris,25621,14
Eukaryota,Chordata,Mammalia,Cetacea,Ziphiidae,g_,s_,88,3
Eukaryota,Chordata,Mammalia,Cetacea,unassigned,g_,s_,71,4
Eukaryota,Chordata,Mammalia,unassigned,unassigned,g_,s_,6,1
