# Transcripts per L

In [1]:
import pandas as pd
import plotly.express as px



Transcripts per L have been calculated [here](../../scripts/analysis/SpikeIn_analysis_Normalisation.ipynb). Transcripts per L are TPM values scaled to the amount of seawater that has been processed. This might reflect ecosystem productivity better than TPM values!
Let's start by loading the data and adding it to the dataframe.

## Data preparation

In [2]:
# Read the transcripts per L table and reshape it to the long format
## NOTE: this is the transcript read mapping, prior to protein prediction!
transcripts_per_L = (
    pd.read_csv('../../data/kallisto/transcripts_per_L.csv', engine='pyarrow')
    # The identifier column is called target_id in the file; use transcript_id instead
    .rename(columns={'target_id': 'transcript_id'})
    # Every remaining column becomes a value of 'sample', giving one row per
    # (transcript_id, sample) pair
    .melt(id_vars=['transcript_id'], var_name='sample', value_name='Transcripts_per_L')
)
transcripts_per_L.head()

Unnamed: 0,transcript_id,sample,Transcripts_per_L
0,c_000000000001,1_130_S31,0.0
1,c_000000000002,1_130_S31,0.0
2,c_000000000003,1_130_S31,0.0
3,c_000000000004,1_130_S31,0.0
4,c_000000000005,1_130_S31,0.0


In [3]:
# Flag, for each transcript, whether or not a taxonomic match was found.
# p_id is interpolated into the file name below, so it selects which alignment file is read.
# NOTE(review): presumably a percent-identity cut-off — confirm.
p_id = 60
annotation = pd.read_table(f'../../data/annotation/taxonomy_eukprot/eukprot_DB.firsthit.{p_id}plus_alnscore.m8',  engine='pyarrow', header=None)

# Keep only the part of the query id in front of the first ".".
# n=1 restricts the split to that first ".", so column 0 of the expanded result always
# exists and is a single column, whether an id holds zero, one or several dots.
annotation.iloc[:, 0] = annotation.iloc[:, 0].str.split(".", n=1, expand=True)[0]

# Add a boolean column to the transcripts_per_L dataframe: True when the transcript id
# occurs among the annotated query ids, False otherwise
transcripts_per_L['taxonomic_match'] = transcripts_per_L['transcript_id'].isin(annotation[0])

# View the data
transcripts_per_L.head()

Unnamed: 0,transcript_id,sample,Transcripts_per_L,taxonomic_match
0,c_000000000001,1_130_S31,0.0,False
1,c_000000000002,1_130_S31,0.0,False
2,c_000000000003,1_130_S31,0.0,False
3,c_000000000004,1_130_S31,0.0,False
4,c_000000000005,1_130_S31,0.0,False


In [4]:
# Sanity check: how many rows do and do not carry a taxonomic match?
transcripts_per_L.loc[:, 'taxonomic_match'].value_counts()

taxonomic_match
False    295220626
True     145816932
Name: count, dtype: int64

In [5]:
# Read the sample sheet (semicolon-separated, first column used as the index)
meta = pd.read_csv('../../samples.csv', sep=';', index_col=0)

# Attach the metadata of each sample to every row of that sample
transcripts_per_L = pd.merge(transcripts_per_L, meta, on='sample')

transcripts_per_L.head()

Unnamed: 0,transcript_id,sample,Transcripts_per_L,taxonomic_match,date,time,month,station
0,c_000000000001,1_130_S31,0.0,False,18/01/2021,9:05,January_2021,130
1,c_000000000002,1_130_S31,0.0,False,18/01/2021,9:05,January_2021,130
2,c_000000000003,1_130_S31,0.0,False,18/01/2021,9:05,January_2021,130
3,c_000000000004,1_130_S31,0.0,False,18/01/2021,9:05,January_2021,130
4,c_000000000005,1_130_S31,0.0,False,18/01/2021,9:05,January_2021,130


## Average sum of transcripts per L per station

Let's first plot the percentage of transcripts that have been annotated with a taxonomic identifier.

In [6]:
# Annotation rate per month, using only rows with more than 1 transcript per L
expressed_mask = transcripts_per_L['Transcripts_per_L'] > 1

# Summing the boolean flag counts the rows with a taxonomic match per month
annotation_grouped = (
    transcripts_per_L.loc[expressed_mask, ['month', 'taxonomic_match']]
    .groupby('month')
    .sum()
)

# Denominator: the total number of transcript ids per month
observations_per_month = (
    transcripts_per_L.loc[expressed_mask, ['month', 'transcript_id']]
    .groupby('month')['transcript_id']
    .count()
)
del expressed_mask  # the mask is only needed for the two selections above

# Express the matches as a percentage of all rows of that month
annotation_grouped['taxonomic_match_rate'] = annotation_grouped['taxonomic_match'] / observations_per_month * 100
annotation_grouped = annotation_grouped.reset_index()

# Chronological order of the sampled months
month_order = [
    "July_2020", "August_2020", "September_2020",
    "November_2020", "December_2020", "January_2021",
    "February_2021", "April_2021", "May_2021",
    "June_2021", "July_2021",
]
annotation_grouped['month'] = pd.Categorical(annotation_grouped['month'], categories=month_order)

# Horizontal bars, one per month; the frame is sorted in descending month order first
fig = px.histogram(
    annotation_grouped.sort_values("month", ascending=False),
    x="taxonomic_match_rate",
    y="month",
    orientation='h',
)

fig.update_layout(
    font={
        "family": "Times New Roman, serif",
        "size": 12,
        "color": "#7f7f7f",
    },
    autosize=False,
    width=500,
    height=350,
    margin={"l": 0, "r": 25, "b": 25, "t": 25},
    # Percentages: fix the x axis to the full 0-100 range
    xaxis_range=[0, 100],
    xaxis_title_text='Percentage of transcripts with taxonomic match',
)

fig.show()

# Store the figure as both svg and png
fig.write_image("../../figures/taxonomic_analysis/percent_annotated_transcripts_updated.svg", scale=2)
fig.write_image("../../figures/taxonomic_analysis/percent_annotated_transcripts_updated.png", scale=2)

In [7]:
# Summarise the monthly annotation rates as mean and standard deviation
monthly_rates = annotation_grouped['taxonomic_match_rate']
mean, std = monthly_rates.mean(), monthly_rates.std()
print(f'In any given month, we have annotated {mean:.2f}% (+- {std:.2f} SD) of the transcripts.')

In any given month, we have annotated 37.18% (+- 3.18 SD) of the transcripts.


In [8]:
# Summary statistics of transcripts per L per month (rows with more than 1 transcript per L only)
grouped_df = (
    transcripts_per_L.loc[transcripts_per_L['Transcripts_per_L'] > 1, ['month', 'Transcripts_per_L']]
    .groupby('month')['Transcripts_per_L']
    .agg(['sum', 'median', 'mean', 'min', 'max'])
    # Columns become Transcripts_per_L_sum, Transcripts_per_L_median, ...
    .add_prefix('Transcripts_per_L_')
    .reset_index()
)

# Chronological order of the sampled months
month_order = [
    "July_2020", "August_2020", "September_2020",
    "November_2020", "December_2020", "January_2021",
    "February_2021", "April_2021", "May_2021",
    "June_2021", "July_2021",
]
grouped_df['month'] = pd.Categorical(grouped_df['month'], categories=month_order)

# Number of stations visited in each month; dividing the monthly sum by it gives
# the average sum per station
station_mapper = {
    'July_2020': 5,
    'August_2020': 6,
    'September_2020': 6,
    'November_2020': 6,
    'December_2020': 6,
    'January_2021': 5,
    'February_2021': 5,
    'April_2021': 4,
    'May_2021': 6,
    'June_2021': 6,
    'July_2021': 6,
}
stations_visited = grouped_df['month'].map(station_mapper)
grouped_df['Transcripts_per_L_sum_norm'] = grouped_df['Transcripts_per_L_sum'] / stations_visited

In [9]:
# Horizontal bars of the station-normalised sum of transcripts per L, one per month
monthly_descending = grouped_df.sort_values("month", ascending=False)
fig = px.histogram(
    monthly_descending,
    x="Transcripts_per_L_sum_norm",
    y="month",
    orientation='h',
)

fig.update_layout(
    font={
        "family": "Times New Roman, serif",
        "size": 12,
        "color": "#7f7f7f",
    },
    autosize=False,
    width=500,
    height=350,
    margin={"l": 0, "r": 25, "b": 25, "t": 25},
    xaxis_title_text='Average sum of transcripts per L',
)

fig.show()

# Store the figure as svg
fig.write_image("../../figures/taxonomic_analysis/average_sum_transcripts_per_L_per_month_updated.svg", scale=2)

## Spatial variance in transcripts per L

In [10]:
# Summary statistics of transcripts per L for every month x station combination
# (rows with more than 1 transcript per L only)
grouped_df = (
    transcripts_per_L[transcripts_per_L['Transcripts_per_L'] > 1]
    .groupby(['month', 'station'])['Transcripts_per_L']
    .agg(['sum', 'median', 'mean', 'min', 'max'])
    # Columns become Transcripts_per_L_sum, Transcripts_per_L_median, ...
    .add_prefix('Transcripts_per_L_')
    .reset_index()
)

# Chronological order of the sampled months
month_order = [
    "July_2020", "August_2020", "September_2020",
    "November_2020", "December_2020", "January_2021",
    "February_2021", "April_2021", "May_2021",
    "June_2021", "July_2021",
]
grouped_df['month'] = pd.Categorical(grouped_df['month'], categories=month_order)

# Grouped bars: months on the x axis, one coloured bar per station
fig = px.bar(
    grouped_df,
    x='month',
    y='Transcripts_per_L_sum',
    color='station',
    category_orders={
        "station": ["ZG02", "330", "780", "120", "130", "700"],
        "month": list(month_order),
    },
    barmode='group',
)

fig.show()

In [11]:
stations = ['ZG02', '120', '330', '130', '780', '700']

# Settings shared by all station figures
month_order = [
    "July_2020", "August_2020", "September_2020",
    "November_2020", "December_2020", "January_2021",
    "February_2021", "April_2021", "May_2021",
    "June_2021", "July_2021",
]
station_layout = {
    "font": {
        "family": "Times New Roman, serif",
        "size": 12,
        "color": "#7f7f7f",
    },
    "autosize": False,
    "width": 500,
    "height": 350,
    "margin": {"l": 0, "r": 25, "b": 25, "t": 25},
    "xaxis_title_text": 'Sum of transcripts per L',
    # Same x limits in every figure so that the stations can be compared
    "xaxis_range": [0, 4500000000],
}

for station in stations:
    # Summarise the rows of this station that have more than 1 transcript per L
    station_mask = (transcripts_per_L['Transcripts_per_L'] > 1) & (transcripts_per_L['station'] == station)
    grouped_df = (
        transcripts_per_L.loc[station_mask, ['month', 'station', 'Transcripts_per_L']]
        .groupby(['month', 'station'])['Transcripts_per_L']
        .agg(['sum', 'median', 'mean', 'min', 'max'])
        .add_prefix('Transcripts_per_L_')
        .reset_index()
    )
    grouped_df['month'] = pd.Categorical(grouped_df['month'], categories=month_order)

    # Horizontal bars, one per month
    fig = px.histogram(
        grouped_df.sort_values("month", ascending=False),
        x="Transcripts_per_L_sum",
        y="month",
        category_orders={
            "station": ["780", "700", "330", "130", "ZG02", "120"],
            "month": list(month_order),
        },
        orientation='h',
    )

    fig.update_layout(title_text='Station {}'.format(station), **station_layout)

    fig.show()

    # Store the figure as both png and svg
    fig.write_image("../../figures/taxonomic_analysis/TPL_per_month_{}_updated.png".format(station), scale=2)
    fig.write_image("../../figures/taxonomic_analysis/TPL_per_month_{}_updated.svg".format(station), scale=2)

In [12]:
# Preview the first rows of the alignment table before it is expanded
annotation.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,c_000004644880,EP00450_Noctiluca_scintillans_P011804,0.913,136,12,0,1,136,257,392,2.865e-78,262
1,c_000001576219,EP00113_Trichoplax_adhaerens_P002960,0.859,178,25,0,3,180,496,673,2.935e-97,319
2,c_000001590569,EP00090_Calanus_glacialis_P028022,0.717,144,40,0,1,144,1,144,3.8490000000000004e-60,210
3,c_000001590608,EP00450_Noctiluca_scintillans_P037545,0.881,270,28,0,1,239,61,330,1.496e-157,496
4,c_000001612250,EP00521_Asterionellopsis_glacialis_P000010,0.862,872,120,0,28,899,7,875,0.0,1559


In [13]:
# Build the annotation x transcripts per L dataframe

## Give the alignment columns descriptive names
annotation.columns = ['query_id', 'target_id', 'p_ident', 'alnlen', 'mismatch', 'gapopen', 'qstart', 'qend', 'tstart', 'tend', 'evalue', 'bits']

## Reduce the target id to the EukProt ID, i.e. the part in front of the first "_"
eukprot_ID = annotation['target_id'].str.split("_", n=1, expand=True)[0]
annotation['target_id'] = eukprot_ID

## Load the EukProt taxonomy and keep only the columns that are needed
eukprot_taxonomy = (
    pd.read_table('../../data/annotation/taxonomy_eukprot/EukProt_included_data_sets.v03.2021_11_22.txt')
    .drop(columns=[
        'Previous_Names', 'Replaces_EukProt_ID', 'Data_Source_URL', 'Data_Source_Name',
        'Paper_DOI', 'Actions_Prior_to_Use', 'Data_Source_Type', 'Notes',
        'Columns_Modified_Since_Previous_Version', 'Merged_Strains',
        'Alternative_Strain_Names', '18S_Sequence_GenBank_ID', '18S_Sequence',
        '18S_Sequence_Source', '18S_Sequence_Other_Strain_GenBank_ID',
        '18S_Sequence_Other_Strain_Name', '18S_and_Taxonomy_Notes',
    ])
)

## Use spaces instead of underscores in the Name_to_Use column
eukprot_taxonomy['Name_to_Use'] = eukprot_taxonomy['Name_to_Use'].str.replace('_', ' ')

## Add the taxonomy to every alignment hit, then discard the alignment columns
## that are not needed
annotation = (
    annotation
    .merge(eukprot_taxonomy, left_on='target_id', right_on='EukProt_ID', how='left')
    .drop(columns=['target_id', 'gapopen', 'qstart', 'qend', 'tstart', 'tend', 'evalue', 'bits'])
)

## Add the transcripts per L rows of every annotated transcript
transcripts_per_L_2 = (
    annotation
    .merge(transcripts_per_L, left_on='query_id', right_on='transcript_id', how='left')
    .drop(columns='query_id')
)

transcripts_per_L_2.head()

Unnamed: 0,p_ident,alnlen,mismatch,EukProt_ID,Name_to_Use,Strain,Genus_UniEuk,Epithet_UniEuk,Supergroup_UniEuk,Taxogroup1_UniEuk,Taxogroup2_UniEuk,Taxonomy_UniEuk,transcript_id,sample,Transcripts_per_L,taxonomic_match,date,time,month,station
0,0.913,136,12,EP00450,Noctiluca scintillans,,Noctiluca,scintillans,Alveolata,Dinoflagellata,core-Noctilucales,Eukaryota;Diaphoretickes;Sar;Alveolata;Myzozoa...,c_000004644880,1_130_S31,0.0,True,18/01/2021,9:05,January_2021,130
1,0.913,136,12,EP00450,Noctiluca scintillans,,Noctiluca,scintillans,Alveolata,Dinoflagellata,core-Noctilucales,Eukaryota;Diaphoretickes;Sar;Alveolata;Myzozoa...,c_000004644880,1_330_S34,0.0,True,18/01/2021,14:05,January_2021,330
2,0.913,136,12,EP00450,Noctiluca scintillans,,Noctiluca,scintillans,Alveolata,Dinoflagellata,core-Noctilucales,Eukaryota;Diaphoretickes;Sar;Alveolata;Myzozoa...,c_000004644880,1_700_S32,0.0,True,18/01/2021,11:45,January_2021,700
3,0.913,136,12,EP00450,Noctiluca scintillans,,Noctiluca,scintillans,Alveolata,Dinoflagellata,core-Noctilucales,Eukaryota;Diaphoretickes;Sar;Alveolata;Myzozoa...,c_000004644880,1_780_S33,0.0,True,18/01/2021,13:00,January_2021,780
4,0.913,136,12,EP00450,Noctiluca scintillans,,Noctiluca,scintillans,Alveolata,Dinoflagellata,core-Noctilucales,Eukaryota;Diaphoretickes;Sar;Alveolata;Myzozoa...,c_000004644880,1_ZG02_S35,0.0,True,18/01/2021,15:35,January_2021,ZG02


In [14]:
# For every station: stacked horizontal bars of the summed transcripts per L (TPL)
# per month, coloured by taxonomic group. Groups contributing at most `rare_threshold`
# of a month's total are pooled into a single 'Rare' group.
stations = ['ZG02', '120', '330', '130', '780', '700']

tax_level = 'Taxogroup2_UniEuk'

cm_to_px = 96 / 2.54  # Conversion factor from centimeters to pixels
width_in_cm = 7
height_in_cm = 6

# Fraction of the monthly TPL at or below which a group is pooled into 'Rare'
rare_threshold = 0.02

# Colour assigned to each taxonomic group, filled while the figures are drawn
tax_colors = {}

# Chronological order of the sampled months
month_order = ["July_2020", "August_2020", "September_2020",
               "November_2020", "December_2020", "January_2021",
               "February_2021", "April_2021", "May_2021",
               "June_2021", "July_2021"]

# Preferred order of the taxonomic groups in the figures
tax_order = ['Rare',  'Diatomeae',  'Dinophyceae',  'core-Noctilucales', 'Annelida',
             'Arthropoda', 'Brachiopoda', 'Cnidaria', 'Colpodellida', 'Ctenophora',
             'Diplonemidae', 'Florideophyceae', 'Hemistasiidae', 'Mollusca', 'Pelagophyceae',
             'Spirotrichea', 'Vertebrata', 'gregarinomorphea']

# Fixed colours, so that a group looks the same in every figure
color_map = {
    "Rare": "#545454",
    "core-Noctilucales": "#56B4E8",
    'Odontella': "#C44601",
    'Prymnesiophyceae': "#009E73",
    "Diatomeae": "#E69F00",
    "Dinophyceae": "#56B4E9",
    "Spirotrichea": "#F0E442",
    "Arthropoda": "#0072B2",
    "Ctenophora": "#8F4500",
    "Foraminifera": "#D55E00",
    "Platyhelminthes": "#CC79A7",
    "gregarinomorphea": "#999999",
    "Echinodermata": "#009A83",
    "Oligohymenophorea": "#900101",
    "Colpodellida": "#B1CE00",
    "Annelida": "#FFF365",
    "Urochordata": "#FFCC99",
    "Vertebrata": "#6600CC",
    "Acantharea": "#BA91DE",
    "Mollusca": "#FF6600",
    "Brachiopoda": "#006C0A",
    "Hemistasiidae": "#FF1818",
    "Diplonemidae": "#50C878",
    "Pelagophyceae": "#A0F8FF",
    "Florideophyceae": "#FF009D"
}

# Change Transcripts per L column name to TPL
transcripts_per_L_2 = transcripts_per_L_2.rename(columns={'Transcripts_per_L': 'TPL'})

for station in stations:
    # Summarise TPL per taxonomic group and month for this station
    # (rows with a TPL above 1 only)
    grouped_df = transcripts_per_L_2[(transcripts_per_L_2['TPL'] > 1) & (transcripts_per_L_2['station'] == station)][[tax_level, 'month', 'TPL']]
    grouped_df = grouped_df.groupby([tax_level, 'month']).agg({'TPL':['sum', 'mean', 'min', 'max']})
    grouped_df.columns = ['TPL_sum', 'TPL_mean', 'TPL_min', 'TPL_max']
    grouped_df  = grouped_df.reset_index()
    grouped_df['month'] = pd.Categorical(grouped_df['month'], categories=month_order)

    # Normalise sum of TPL values to the total TPL of that month
    grouped_df["rel_expression_per_month"] = grouped_df.TPL_sum / grouped_df.groupby('month').TPL_sum.transform('sum')

    # Combine low-abundant groups
    rare_groups = grouped_df['rel_expression_per_month'] <= rare_threshold
    grouped_df.loc[rare_groups, tax_level] = 'Rare'
    taxonomic_order = sorted(grouped_df[tax_level].unique())
    # Always put 'Rare' first. A station may have no group below the threshold,
    # in which case there is nothing to move (an unguarded remove() would raise).
    if 'Rare' in taxonomic_order:
        taxonomic_order.remove('Rare')
        taxonomic_order.insert(0, 'Rare')

    grouped_df[tax_level] = pd.Categorical(grouped_df[tax_level], categories=taxonomic_order, ordered=True)

    fig = px.histogram(grouped_df.sort_values(["month", tax_level], ascending=[False, True]),
                x="TPL_sum",
                y="month",
                color=tax_level,
                color_discrete_map=color_map,
                orientation='h',
                # Specify all the months that need to be included,
                # even if no sample has been taken.
                # The group order is keyed on tax_level so that it stays attached
                # to the colour column when tax_level is changed.
                category_orders={"month": list(month_order),
                                 tax_level: list(tax_order)},
                )

    fig.update_layout(
        font=dict(
            family="Times New Roman, serif",  # Set the font family to Times New Roman
            size=8,  # Set the font size
            color="black"  # Set the font color
        ),
        autosize=False,
        width=width_in_cm * cm_to_px,
        height=height_in_cm * cm_to_px,
        margin=dict( # Set the margins
            l=0,  # Left margin
            r=25,  # Right margin
            b=25,  # Bottom margin
            t=25  # Top margin
        ),
        showlegend=False,
        xaxis_title_text='Sum of Transcripts per L',
        title_text='Station {}'.format(station),
        yaxis_title_text=''
        )

    fig.show()

    # Record the colour of every group that was drawn, and make sure no two
    # groups ended up with exactly the same colour
    for trace in fig.data:
        # If trace is not in the dictionary, add it
        if trace.name not in tax_colors:
            # If trace contains a color already in use, give an error
            if trace.marker.color in tax_colors.values():
                raise ValueError(f'Color for {trace.name} already in use, review color disctrete map')
            tax_colors[trace.name] = trace.marker.color

    # Save figure as png and svg
    fig.write_image("../../figures/taxonomic_analysis/eukprot_transcripts_per_L_per_month_{}_{}_p_id_{}.png".format(tax_level, station, p_id), scale=1)
    fig.write_image("../../figures/taxonomic_analysis/eukprot_transcripts_per_L_per_month_{}_{}_p_id_{}.svg".format(tax_level, station, p_id), scale=1)

























In [15]:
# For every station: stacked horizontal bars of the fraction of the monthly
# transcripts per L (TPL) contributed by each taxonomic group. Groups contributing
# at most `rare_threshold` of a month's total are pooled into a single 'Rare' group.

# Fraction of the monthly TPL at or below which a group is pooled into 'Rare'
rare_threshold = 0.02

# Chronological order of the sampled months
month_order = ["July_2020", "August_2020", "September_2020",
               "November_2020", "December_2020", "January_2021",
               "February_2021", "April_2021", "May_2021",
               "June_2021", "July_2021"]

# Preferred order of the taxonomic groups in the figures
tax_order = ['Rare',  'Diatomeae',  'Dinophyceae',  'core-Noctilucales', 'Annelida',
             'Arthropoda', 'Brachiopoda', 'Cnidaria', 'Colpodellida', 'Ctenophora',
             'Diplonemidae', 'Florideophyceae', 'Hemistasiidae', 'Mollusca', 'Pelagophyceae',
             'Spirotrichea', 'Vertebrata', 'gregarinomorphea']

# Fixed colours, so that a group looks the same in every figure
color_map = {
    "Rare": "#545454",
    "core-Noctilucales": "#56B4E8",
    'Odontella': "#C44601",
    'Prymnesiophyceae': "#009E73",
    "Diatomeae": "#E69F00",
    "Dinophyceae": "#56B4E9",
    "Spirotrichea": "#F0E442",
    "Arthropoda": "#0072B2",
    "Ctenophora": "#8F4500",
    "Foraminifera": "#D55E00",
    "Platyhelminthes": "#CC79A7",
    "gregarinomorphea": "#999999",
    "Echinodermata": "#009A83",
    "Oligohymenophorea": "#900101",
    "Colpodellida": "#B1CE00",
    "Annelida": "#FFF365",
    "Urochordata": "#FFCC99",
    "Vertebrata": "#6600CC",
    "Acantharea": "#BA91DE",
    "Mollusca": "#FF6600",
    "Brachiopoda": "#006C0A",
    "Hemistasiidae": "#FF1818",
    "Diplonemidae": "#50C878",
    "Pelagophyceae": "#A0F8FF",
    "Florideophyceae": "#FF009D"
}

for station in stations:
    # Summarise TPL per taxonomic group and month for this station
    # (rows with a TPL above 1 only)
    grouped_df = transcripts_per_L_2[(transcripts_per_L_2['TPL'] > 1) & (transcripts_per_L_2['station'] == station)][[tax_level, 'month', 'TPL']]
    grouped_df = grouped_df.groupby([tax_level, 'month']).agg({'TPL':['sum', 'mean', 'min', 'max']})
    grouped_df.columns = ['TPL_sum', 'TPL_mean', 'TPL_min', 'TPL_max']
    grouped_df  = grouped_df.reset_index()
    grouped_df['month'] = pd.Categorical(grouped_df['month'], categories=month_order)

    # Normalise sum of TPL values to the total TPL of that month
    grouped_df["rel_expression_per_month"] = grouped_df.TPL_sum / grouped_df.groupby('month').TPL_sum.transform('sum')

    # Combine low-abundant groups
    rare_groups = grouped_df['rel_expression_per_month'] <= rare_threshold
    grouped_df.loc[rare_groups, tax_level] = 'Rare'
    taxonomic_order = sorted(grouped_df[tax_level].unique())
    # Always put 'Rare' first. A station may have no group below the threshold,
    # in which case there is nothing to move (an unguarded remove() would raise).
    if 'Rare' in taxonomic_order:
        taxonomic_order.remove('Rare')
        taxonomic_order.insert(0, 'Rare')

    grouped_df[tax_level] = pd.Categorical(grouped_df[tax_level], categories=taxonomic_order, ordered=True)

    fig = px.histogram(grouped_df.sort_values(["month", tax_level], ascending=[False, True]),
                x="rel_expression_per_month",
                y="month",
                color=tax_level,
                color_discrete_map=color_map,
                orientation='h',
                # Specify all the months that need to be included,
                # even if no sample has been taken.
                # The group order is keyed on tax_level so that it stays attached
                # to the colour column when tax_level is changed.
                category_orders={"month": list(month_order),
                                 tax_level: list(tax_order)},
                )

    fig.update_layout(
        font=dict(
            family="Times New Roman, serif",  # Set the font family to Times New Roman
            size=8,  # Set the font size
            color="black"  # Set the font color
        ),
        autosize=False,
        width=width_in_cm * cm_to_px,
        height=height_in_cm * cm_to_px,
        margin=dict( # Set the margins
            l=0,  # Left margin
            r=25,  # Right margin
            b=25,  # Bottom margin
            t=25  # Top margin
        ),
        showlegend=False,
        # The x axis shows each group's share of the monthly total, not an absolute sum
        xaxis_title_text='Fraction of Transcripts per L',
        title_text='Station {}'.format(station),
        yaxis_title_text=''
        )

    fig.show()

    # Record the colour of every group that was drawn, and make sure no two
    # groups ended up with exactly the same colour
    for trace in fig.data:
        # If trace is not in the dictionary, add it
        if trace.name not in tax_colors:
            # If trace contains a color already in use, give an error
            if trace.marker.color in tax_colors.values():
                raise ValueError(f'Color for {trace.name} already in use, review color disctrete map')
            tax_colors[trace.name] = trace.marker.color

    # Save figure as png and svg
    fig.write_image("../../figures/taxonomic_analysis/eukprot_transcripts_per_L_fraction_per_month_{}_{}_p_id_{}.png".format(tax_level, station, p_id), scale=1)
    fig.write_image("../../figures/taxonomic_analysis/eukprot_transcripts_per_L_fraction_per_month_{}_{}_p_id_{}.svg".format(tax_level, station, p_id), scale=1)























