In [3]:
import os
import pandas as pd
import plotly.express as px
from tqdm import tqdm

## Data Preparation

In [4]:
# Load the EukProt taxonomy table, reading only the columns used downstream
taxonomy_cols = ['EukProt_ID', 'Name_to_Use', 'Taxogroup2_UniEuk', 'Genus_UniEuk']  # Adjust columns as necessary
eukprot_taxonomy = pd.read_table('../data/annotation/taxonomy_eukprot/EukProt_included_data_sets.v03.2021_11_22.txt', usecols=taxonomy_cols)
# Species names use underscores in the table; replace them with spaces
eukprot_taxonomy['Name_to_Use'] = eukprot_taxonomy['Name_to_Use'].str.replace('_', ' ')

# Column names for the headerless tabular alignment output (.m8)
eukprot_annotation_cols = ['query_id', 'target_id', 'p_ident', 'alnlen', 'mismatch',
                           'gapopen', 'qstart', 'qend', 'tstart', 'tend', 'evalue', 'bits']


def _load_station_annotation(station):
    """Load the EukProt alignment hits of one station and attach taxonomy.

    Parameters
    ----------
    station : int or str
        Station identifier; used as the sub-directory name below
        ``../data/annotation/taxonomy_eukprot/``.

    Returns
    -------
    pandas.DataFrame
        One row per alignment hit with the taxonomy columns left-joined on
        ``target_id`` == ``EukProt_ID``. Hits without a taxonomy match are kept
        (their taxonomy columns are NaN).
    """
    annotation = pd.read_table(f'../data/annotation/taxonomy_eukprot/{station}/eukprot_annotation.m8',
                               header=None, names=eukprot_annotation_cols)

    # Keep only the text before the first "." of the query id and before the
    # first "_" of the target id; the trimmed target id is the taxonomy join key
    annotation['query_id'] = annotation['query_id'].str.split(".", n=1).str[0]
    annotation['target_id'] = annotation['target_id'].str.split("_", n=1).str[0]

    annotation = annotation.merge(eukprot_taxonomy, left_on='target_id', right_on='EukProt_ID', how='left')
    print(f'The merged annotation and taxonomy file for station {station} contains {len(annotation)} rows')
    return annotation


# Both stations go through exactly the same loading steps
eukprot_annotation_130 = _load_station_annotation(130)
eukprot_annotation_51 = _load_station_annotation(51)

The merged annotation and taxonomy file for station 130 contains 768693 rows
The merged annotation and taxonomy file for station 51 contains 239786 rows


In [5]:
# Read the per-sample metadata sheet
meta = pd.read_excel('../data/samples.xlsx')

# Cast the date and time columns to text so they can be joined into one string
for column in ('date', 'time'):
    meta[column] = meta[column].astype(str)

# Parse the combined "<date> <time>" text into a single datetime column
timestamp_text = meta['date'] + ' ' + meta['time']
meta['datetime'] = pd.to_datetime(timestamp_text, format='%Y-%m-%d %H:%M:%S')

meta.head()

Unnamed: 0,sample_name,date,time,station,SW filtered (L),Eluate volume (mL),RNA extraction volume (mL),datetime
0,51_1,2023-04-18,13:00:00,51,50,9.0,4.0,2023-04-18 13:00:00
1,51_2,2023-04-18,14:00:00,51,50,8.0,4.0,2023-04-18 14:00:00
2,51_3,2023-04-18,15:00:00,51,50,8.0,4.0,2023-04-18 15:00:00
3,51_4,2023-04-18,16:00:00,51,50,7.0,3.0,2023-04-18 16:00:00
4,51_5,2023-04-18,17:00:00,51,50,7.0,3.0,2023-04-18 17:00:00


In [6]:
# Read the station 130 TPM table and reshape it to long format in one chain
tpm_130 = (
    pd.read_csv('../data/quantification/130_tpm.csv')
    .rename(columns={'target_id': 'transcript_id'})
    .set_index('transcript_id')
    # Optional filter: keep transcripts whose TPM summed over all columns is at least 1
    .loc[lambda wide: wide.sum(axis=1) >= 1]
    .reset_index()
    # One row per transcript/sample pair
    .melt(id_vars=['transcript_id'], var_name='sample', value_name='TPM')
    # Compact dtypes to reduce the memory footprint of the long table
    .astype({'sample': 'category', 'TPM': 'float32'})
)

In [7]:
# Read the station 51 TPM table and reshape it to long format in one chain
tpm_51 = (
    pd.read_csv('../data/quantification/51_tpm.csv')
    .rename(columns={'target_id': 'transcript_id'})
    .set_index('transcript_id')
    # Optional filter: keep transcripts whose TPM summed over all columns is at least 1
    .loc[lambda wide: wide.sum(axis=1) >= 1]
    .reset_index()
    # One row per transcript/sample pair
    .melt(id_vars=['transcript_id'], var_name='sample', value_name='TPM')
    # Compact dtypes to reduce the memory footprint of the long table
    .astype({'sample': 'category', 'TPM': 'float32'})
)

In [8]:
# Station 130: attach TPM values to every annotated hit (hits without TPM rows are kept),
# then add the sample metadata and drop the join keys that are no longer needed
data_130 = (
    eukprot_annotation_130
    .merge(tpm_130, left_on='query_id', right_on='transcript_id', how='left')
    .drop(columns=['query_id'])
    .merge(meta, left_on='sample', right_on='sample_name', how='left')
    .drop(columns='sample_name')
)

# Station 51: identical sequence of joins
data_51 = (
    eukprot_annotation_51
    .merge(tpm_51, left_on='query_id', right_on='transcript_id', how='left')
    .drop(columns=['query_id'])
    .merge(meta, left_on='sample', right_on='sample_name', how='left')
    .drop(columns='sample_name')
)

## Taxonomic turnover

### Taxonomic Classes

In [None]:
# Fixed colour per taxonomic group so the same group looks the same in every figure.
# Built from (group, hex colour) pairs; the insertion order matches the original mapping.
color_discrete_map = dict([
    ("Rare", "#545454"),
    ("core-Noctilucales", "#56B4E8"),
    ("Odontella", "#C44601"),
    ("Prymnesiophyceae", "#009E73"),
    ("Diatomeae", "#E69F00"),
    ("Dinophyceae", "#56B4E9"),
    ("Spirotrichea", "#F0E442"),
    ("Arthropoda", "#0072B2"),
    ("Ctenophora", "#ADA7A7"),
    ("Vertebrata", "#FF5349"),
    ("Phaeodarea", "#D55E00"),
    ("Florideophyceae", "#CC79A7"),
    ("Acantharea", "#719A10"),
])

In [42]:
# Keep only hits whose percent identity reaches the class-level cut-off
filtered_data = data_130.loc[data_130['p_ident'].ge(0.75)]

# Summed TPM per taxonomic class and sampling time
grouped_df = filtered_data.groupby(['Taxogroup2_UniEuk', 'datetime'], as_index=False)['TPM'].sum()

# Share of each class in the total TPM of its time point
total_tpm_per_hour = grouped_df.groupby('datetime')['TPM'].transform('sum')
grouped_df['rel_expression_per_hour'] = grouped_df['TPM'].div(total_tpm_per_hour)

# Classes at or below the threshold share are relabelled as 'Rare'
# NOTE(review): this happens before 'core-Noctilucales' is folded into 'Dinophyceae',
# so each of the two is compared against the threshold on its own - confirm this is intended.
threshold = 0.05
is_rare = grouped_df['rel_expression_per_hour'].le(threshold)
grouped_df['Taxogroup2_UniEuk'] = grouped_df['Taxogroup2_UniEuk'].mask(is_rare, 'Rare')

# Report 'core-Noctilucales' under 'Dinophyceae'
grouped_df['Taxogroup2_UniEuk'] = grouped_df['Taxogroup2_UniEuk'].replace('core-Noctilucales', 'Dinophyceae')

# Drop rows without any expression
grouped_df = grouped_df.loc[grouped_df['rel_expression_per_hour'].gt(0)]

# Re-aggregate so that relabelled rows collapse into one row per group and time point
grouped_df = grouped_df.groupby(['Taxogroup2_UniEuk', 'datetime'], as_index=False)['rel_expression_per_hour'].sum()

# Stacked bar chart of the relative expression over time
taxon_order = ['Arthropoda', 'Acantharea', 'Ctenophora', 'Diatomeae', 'Dinophyceae',
               'Florideophyceae', 'Odontella', 'Phaeodarea', 'Prymnesiophyceae',
               'Spirotrichea', 'Vertebrata', 'Rare']
fig = px.bar(
    grouped_df,
    x='datetime',
    y='rel_expression_per_hour',
    color='Taxogroup2_UniEuk',
    title='Relative Expression Per Hour at Station 130',
    category_orders={"Taxogroup2_UniEuk": taxon_order},
    color_discrete_map=color_discrete_map,
)

layout_options = {
    'width': 800,
    'height': 400,
    'xaxis_title': 'Date and Time',
    'yaxis_title': 'Relative Expression',
    'xaxis_tickangle': -45,
    'legend_title': 'Taxonomic Group',
}
fig.update_layout(**layout_options)

# Write the figure as PNG and SVG
filename_base = "../figures/metatranscriptomics/130_relative_expression_per_hour"
for extension in (".png", ".svg"):
    fig.write_image(filename_base + extension, scale=1)

fig.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [43]:
# Re-derive the identity-filtered hits here so this cell always plots station 130,
# no matter which cell assigned `filtered_data` last
filtered_data = data_130[data_130['p_ident'] >= 0.75]

# Group by taxonomic class and sampling time, then sum TPM
grouped_df = filtered_data.groupby(['Taxogroup2_UniEuk', 'datetime'])['TPM'].sum().reset_index()

# Relative expression per time point; only used to decide which classes count as rare
total_tpm_per_hour = grouped_df.groupby('datetime')['TPM'].transform('sum')
grouped_df['rel_expression_per_hour'] = grouped_df['TPM'] / total_tpm_per_hour

# Combine low-abundant groups into a single 'Rare' category
# NOTE(review): the threshold is applied before 'core-Noctilucales' is merged into
# 'Dinophyceae', so each is tested against it separately - confirm this is intended.
threshold = 0.05
grouped_df.loc[grouped_df['rel_expression_per_hour'] <= threshold, 'Taxogroup2_UniEuk'] = 'Rare'

# Combine 'core-Noctilucales' and 'Dinophyceae'
grouped_df['Taxogroup2_UniEuk'] = grouped_df['Taxogroup2_UniEuk'].replace({'core-Noctilucales': 'Dinophyceae'})

# Remove categories with no expression
grouped_df = grouped_df[grouped_df['rel_expression_per_hour'] > 0]

# Group once more so relabelled rows collapse into one TPM sum per group and time point
grouped_df = grouped_df.groupby(['Taxogroup2_UniEuk', 'datetime'])['TPM'].sum().reset_index()

# Plot
fig = px.bar(grouped_df,
            x='datetime',
            y='TPM',
            color='Taxogroup2_UniEuk',
            title='TPM Expression Per hour at Station 130',
            category_orders={"Taxogroup2_UniEuk": ['Arthropoda', 'Acantharea', 'Ctenophora', 'Diatomeae', 'Dinophyceae',
                                                    'Florideophyceae','Odontella', 'Phaeodarea', 'Prymnesiophyceae',
                                                    'Spirotrichea', 'Vertebrata', 'Rare',]},
            color_discrete_map=color_discrete_map
            )

fig.update_layout(
    width=800,
    height=400,
    xaxis_title='Date and Time',
    # The bars show summed TPM, not relative expression
    yaxis_title='TPM',
    xaxis_tickangle=-45,
    legend_title='Taxonomic Group'
)

# Save the figure
filename_base = "../figures/metatranscriptomics/130_expression_per_hour"
fig.write_image(filename_base + ".png", scale=1)
fig.write_image(filename_base + ".svg", scale=1)

fig.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [24]:
# Keep only hits whose percent identity reaches the class-level cut-off
filtered_data = data_51.loc[data_51['p_ident'].ge(0.75)]

# Summed TPM per taxonomic class and sampling time
grouped_df = filtered_data.groupby(['Taxogroup2_UniEuk', 'datetime'], as_index=False)['TPM'].sum()

# Share of each class in the total TPM of its time point
total_tpm_per_hour = grouped_df.groupby('datetime')['TPM'].transform('sum')
grouped_df['rel_expression_per_hour'] = grouped_df['TPM'].div(total_tpm_per_hour)

# Classes at or below the threshold share are relabelled as 'Rare'
# NOTE(review): this happens before 'core-Noctilucales' is folded into 'Dinophyceae',
# so each of the two is compared against the threshold on its own - confirm this is intended.
threshold = 0.05
is_rare = grouped_df['rel_expression_per_hour'].le(threshold)
grouped_df['Taxogroup2_UniEuk'] = grouped_df['Taxogroup2_UniEuk'].mask(is_rare, 'Rare')

# Report 'core-Noctilucales' under 'Dinophyceae'
grouped_df['Taxogroup2_UniEuk'] = grouped_df['Taxogroup2_UniEuk'].replace('core-Noctilucales', 'Dinophyceae')

# Drop rows without any expression
grouped_df = grouped_df.loc[grouped_df['rel_expression_per_hour'].gt(0)]

# Re-aggregate so that relabelled rows collapse into one row per group and time point
grouped_df = grouped_df.groupby(['Taxogroup2_UniEuk', 'datetime'], as_index=False)['rel_expression_per_hour'].sum()

# Stacked bar chart of the relative expression over time
taxon_order = ['Arthropoda', 'Acantharea', 'Ctenophora', 'Diatomeae', 'Dinophyceae',
               'Florideophyceae', 'Odontella', 'Phaeodarea', 'Prymnesiophyceae',
               'Spirotrichea', 'Vertebrata', 'Rare']
fig = px.bar(
    grouped_df,
    x='datetime',
    y='rel_expression_per_hour',
    color='Taxogroup2_UniEuk',
    title='Relative Expression Per Hour at Station 51',
    category_orders={"Taxogroup2_UniEuk": taxon_order},
    color_discrete_map=color_discrete_map,
)

layout_options = {
    'width': 800,
    'height': 400,
    'xaxis_title': 'Date and Time',
    'yaxis_title': 'Relative Expression',
    'xaxis_tickangle': -45,
    'legend_title': 'Taxonomic Group',
}
fig.update_layout(**layout_options)

# Write the figure as PNG and SVG
filename_base = "../figures/metatranscriptomics/51_relative_expression_per_hour"
for extension in (".png", ".svg"):
    fig.write_image(filename_base + extension, scale=1)

fig.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [25]:
# Re-derive the identity-filtered hits here so this cell always plots station 51,
# no matter which cell assigned `filtered_data` last
filtered_data = data_51[data_51['p_ident'] >= 0.75]

# Group by taxonomic class and sampling time, then sum TPM
grouped_df = filtered_data.groupby(['Taxogroup2_UniEuk', 'datetime'])['TPM'].sum().reset_index()

# Relative expression per time point; only used to decide which classes count as rare
total_tpm_per_hour = grouped_df.groupby('datetime')['TPM'].transform('sum')
grouped_df['rel_expression_per_hour'] = grouped_df['TPM'] / total_tpm_per_hour

# Combine low-abundant groups into a single 'Rare' category
# NOTE(review): the threshold is applied before 'core-Noctilucales' is merged into
# 'Dinophyceae', so each is tested against it separately - confirm this is intended.
threshold = 0.05
grouped_df.loc[grouped_df['rel_expression_per_hour'] <= threshold, 'Taxogroup2_UniEuk'] = 'Rare'

# Combine 'core-Noctilucales' and 'Dinophyceae'
grouped_df['Taxogroup2_UniEuk'] = grouped_df['Taxogroup2_UniEuk'].replace({'core-Noctilucales': 'Dinophyceae'})

# Remove categories with no expression
grouped_df = grouped_df[grouped_df['rel_expression_per_hour'] > 0]

# Group once more so relabelled rows collapse into one TPM sum per group and time point
grouped_df = grouped_df.groupby(['Taxogroup2_UniEuk', 'datetime'])['TPM'].sum().reset_index()

# Plot
fig = px.bar(grouped_df,
            x='datetime',
            y='TPM',
            color='Taxogroup2_UniEuk',
            title='TPM Expression Per hour at Station 51',
            category_orders={"Taxogroup2_UniEuk": ['Arthropoda', 'Acantharea', 'Ctenophora', 'Diatomeae', 'Dinophyceae',
                                                    'Florideophyceae','Odontella', 'Phaeodarea', 'Prymnesiophyceae',
                                                    'Spirotrichea', 'Vertebrata', 'Rare',]},
            color_discrete_map=color_discrete_map
            )

fig.update_layout(
    width=800,
    height=400,
    xaxis_title='Date and Time',
    # The bars show summed TPM, not relative expression
    yaxis_title='TPM',
    xaxis_tickangle=-45,
    legend_title='Taxonomic Group'
)

# Save the figure
filename_base = "../figures/metatranscriptomics/51_expression_per_hour"
fig.write_image(filename_base + ".png", scale=1)
fig.write_image(filename_base + ".svg", scale=1)

fig.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



### Genera

In [26]:
# Colour mapping for the genus-level figures; replaces the mapping defined earlier
# in the notebook under the same name.
# Built from (label, hex colour) pairs; the insertion order matches the original mapping.
color_discrete_map = dict([
    ("Rare", "#545454"),
    ("Noctiluca", "#56B4E8"),
    ("Odontella", "#C44601"),
    ("Phaeocystis", "#009E73"),
    ("Diatomeae", "#E69F00"),
    ("Dinophyceae", "#56B4E9"),
    ("Spirotrichea", "#F0E442"),
    ("Arthropoda", "#0072B2"),
    ("Ctenophora", "#ADA7A7"),
    ("Vertebrata", "#FF5349"),
    ("Phaeodarea", "#D55E00"),
    ("Florideophyceae", "#CC79A7"),
    ("Acantharea", "#719A10"),
])

In [27]:
# Keep only hits whose percent identity reaches the genus-level cut-off
filtered_data = data_130.loc[data_130['p_ident'].ge(0.95)]

# Summed TPM per genus and sampling time
grouped_df = filtered_data.groupby(['Genus_UniEuk', 'datetime'], as_index=False)['TPM'].sum()

# Share of each genus in the total TPM of its time point
total_tpm_per_hour = grouped_df.groupby('datetime')['TPM'].transform('sum')
grouped_df['rel_expression_per_hour'] = grouped_df['TPM'].div(total_tpm_per_hour)

# Genera at or below the threshold share are relabelled as 'Rare'
threshold = 0.05
is_rare = grouped_df['rel_expression_per_hour'].le(threshold)
grouped_df['Genus_UniEuk'] = grouped_df['Genus_UniEuk'].mask(is_rare, 'Rare')

# Drop rows without any expression
grouped_df = grouped_df.loc[grouped_df['rel_expression_per_hour'].gt(0)]

# Re-aggregate so that all 'Rare' rows collapse into one row per time point
grouped_df = grouped_df.groupby(['Genus_UniEuk', 'datetime'], as_index=False)['rel_expression_per_hour'].sum()

# Stacked bar chart of the relative expression over time
# NOTE(review): the ordering below lists class-level names although the colour
# column holds genera - confirm whether a genus-level ordering was intended.
taxon_order = ['Arthropoda', 'Acantharea', 'Ctenophora', 'Diatomeae', 'Dinophyceae',
               'Florideophyceae', 'Odontella', 'Phaeodarea', 'Prymnesiophyceae',
               'Spirotrichea', 'Vertebrata', 'Rare']
fig = px.bar(
    grouped_df,
    x='datetime',
    y='rel_expression_per_hour',
    color='Genus_UniEuk',
    title='Relative Expression Per Hour at Station 130',
    category_orders={"Genus_UniEuk": taxon_order},
    color_discrete_map=color_discrete_map,
)

layout_options = {
    'width': 800,
    'height': 400,
    'xaxis_title': 'Date and Time',
    'yaxis_title': 'Relative Expression',
    'xaxis_tickangle': -45,
    'legend_title': 'Taxonomic Group',
}
fig.update_layout(**layout_options)

# Write the figure as PNG and SVG
filename_base = "../figures/metatranscriptomics/130_relative_expression_per_hour_genus"
for extension in (".png", ".svg"):
    fig.write_image(filename_base + extension, scale=1)

fig.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [28]:
# Keep only hits whose percent identity reaches the genus-level cut-off
filtered_data = data_51.loc[data_51['p_ident'].ge(0.95)]

# Summed TPM per genus and sampling time
grouped_df = filtered_data.groupby(['Genus_UniEuk', 'datetime'], as_index=False)['TPM'].sum()

# Share of each genus in the total TPM of its time point
total_tpm_per_hour = grouped_df.groupby('datetime')['TPM'].transform('sum')
grouped_df['rel_expression_per_hour'] = grouped_df['TPM'].div(total_tpm_per_hour)

# Genera at or below the threshold share are relabelled as 'Rare'
threshold = 0.05
is_rare = grouped_df['rel_expression_per_hour'].le(threshold)
grouped_df['Genus_UniEuk'] = grouped_df['Genus_UniEuk'].mask(is_rare, 'Rare')

# Drop rows without any expression
grouped_df = grouped_df.loc[grouped_df['rel_expression_per_hour'].gt(0)]

# Re-aggregate so that all 'Rare' rows collapse into one row per time point
grouped_df = grouped_df.groupby(['Genus_UniEuk', 'datetime'], as_index=False)['rel_expression_per_hour'].sum()

# Stacked bar chart of the relative expression over time
# NOTE(review): the ordering below lists class-level names although the colour
# column holds genera - confirm whether a genus-level ordering was intended.
taxon_order = ['Arthropoda', 'Acantharea', 'Ctenophora', 'Diatomeae', 'Dinophyceae',
               'Florideophyceae', 'Odontella', 'Phaeodarea', 'Prymnesiophyceae',
               'Spirotrichea', 'Vertebrata', 'Rare']
fig = px.bar(
    grouped_df,
    x='datetime',
    y='rel_expression_per_hour',
    color='Genus_UniEuk',
    title='Relative Expression Per Hour at Station 51',
    category_orders={"Genus_UniEuk": taxon_order},
    color_discrete_map=color_discrete_map,
)

layout_options = {
    'width': 800,
    'height': 400,
    'xaxis_title': 'Date and Time',
    'yaxis_title': 'Relative Expression',
    'xaxis_tickangle': -45,
    'legend_title': 'Taxonomic Group',
}
fig.update_layout(**layout_options)

# Write the figure as PNG and SVG
filename_base = "../figures/metatranscriptomics/51_relative_expression_per_hour_genus"
for extension in (".png", ".svg"):
    fig.write_image(filename_base + extension, scale=1)

fig.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



### Transcripts per L
Here, we will calculate the number of transcripts per L for each sample and plot its distribution across samples.

In [24]:
# Use TPM sums to calculate Transcripts per Liter
ERCC_normalisation = pd.read_csv('../data/ERCC92/ERCC_normalisation.csv')


def _add_tpl(frame, ercc):
    """Return ``frame`` with the ERCC normalisation columns and a ``TPL`` column.

    The ERCC table is left-joined on ``sample`` == ``sample_name`` and
    ``TPL`` is computed as ``TPM * ERCC_norm_factor``.

    Columns left behind by a previous run of this cell (the ERCC columns and
    ``TPL``) are dropped first. Without that, re-running the cell would create
    ``_x``/``_y`` suffixed duplicates and the ``ERCC_norm_factor`` lookup would
    raise a KeyError.

    Parameters
    ----------
    frame : pandas.DataFrame
        Long table containing at least ``sample`` and ``TPM``.
    ercc : pandas.DataFrame
        Normalisation table containing ``sample_name`` and ``ERCC_norm_factor``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    stale_columns = [col for col in [*ercc.columns, 'TPL'] if col in frame.columns]
    merged = frame.drop(columns=stale_columns).merge(
        ercc, left_on='sample', right_on='sample_name', how='left'
    )
    merged['TPL'] = merged['TPM'] * merged['ERCC_norm_factor']
    return merged


# Merge ERCC normalisation data into both station tables and calculate TPL
data_130 = _add_tpl(data_130, ERCC_normalisation)
data_51 = _add_tpl(data_51, ERCC_normalisation)

In [31]:
# Keep only hits whose percent identity reaches the genus-level cut-off
filtered_data = data_130.loc[data_130['p_ident'].ge(0.95)]

# Summed TPL per genus and sampling time
grouped_df = filtered_data.groupby(['Genus_UniEuk', 'datetime'], as_index=False)['TPL'].sum()

# Share of each genus in the total TPL of its time point; only used to flag rare genera
total_TPL_per_hour = grouped_df.groupby('datetime')['TPL'].transform('sum')
grouped_df['rel_expression_per_hour'] = grouped_df['TPL'].div(total_TPL_per_hour)

# Genera at or below the threshold share are relabelled as 'Rare'
threshold = 0.05
is_rare = grouped_df['rel_expression_per_hour'].le(threshold)
grouped_df['Genus_UniEuk'] = grouped_df['Genus_UniEuk'].mask(is_rare, 'Rare')

# Drop rows without any expression
grouped_df = grouped_df.loc[grouped_df['rel_expression_per_hour'].gt(0)]

# Re-aggregate so that all 'Rare' rows collapse into one TPL sum per time point
grouped_df = grouped_df.groupby(['Genus_UniEuk', 'datetime'], as_index=False)['TPL'].sum()

# Stacked bar chart of the summed TPL over time
# NOTE(review): the ordering below lists class-level names although the colour
# column holds genera - confirm whether a genus-level ordering was intended.
taxon_order = ['Arthropoda', 'Acantharea', 'Ctenophora', 'Diatomeae', 'Dinophyceae',
               'Florideophyceae', 'Odontella', 'Phaeodarea', 'Prymnesiophyceae',
               'Spirotrichea', 'Vertebrata', 'Rare']
fig = px.bar(
    grouped_df,
    x='datetime',
    y='TPL',
    color='Genus_UniEuk',
    title='TPL sum at Station 130',
    category_orders={"Genus_UniEuk": taxon_order},
    color_discrete_map=color_discrete_map,
)

layout_options = {
    'width': 800,
    'height': 400,
    'xaxis_title': 'Date and Time',
    'yaxis_title': 'TPL',
    'xaxis_tickangle': -45,
    'legend_title': 'Taxonomic Group',
}
fig.update_layout(**layout_options)

# Write the figure as PNG and SVG
filename_base = "../figures/metatranscriptomics/130_TPL_per_hour_genus"
for extension in (".png", ".svg"):
    fig.write_image(filename_base + extension, scale=1)

fig.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [30]:
# Keep only hits whose percent identity reaches the genus-level cut-off
filtered_data = data_51.loc[data_51['p_ident'].ge(0.95)]

# Summed TPL per genus and sampling time
grouped_df = filtered_data.groupby(['Genus_UniEuk', 'datetime'], as_index=False)['TPL'].sum()

# Share of each genus in the total TPL of its time point; only used to flag rare genera
total_TPL_per_hour = grouped_df.groupby('datetime')['TPL'].transform('sum')
grouped_df['rel_expression_per_hour'] = grouped_df['TPL'].div(total_TPL_per_hour)

# Genera at or below the threshold share are relabelled as 'Rare'
threshold = 0.05
is_rare = grouped_df['rel_expression_per_hour'].le(threshold)
grouped_df['Genus_UniEuk'] = grouped_df['Genus_UniEuk'].mask(is_rare, 'Rare')

# Drop rows without any expression
grouped_df = grouped_df.loc[grouped_df['rel_expression_per_hour'].gt(0)]

# Re-aggregate so that all 'Rare' rows collapse into one TPL sum per time point
grouped_df = grouped_df.groupby(['Genus_UniEuk', 'datetime'], as_index=False)['TPL'].sum()

# Stacked bar chart of the summed TPL over time
# NOTE(review): the ordering below lists class-level names although the colour
# column holds genera - confirm whether a genus-level ordering was intended.
taxon_order = ['Arthropoda', 'Acantharea', 'Ctenophora', 'Diatomeae', 'Dinophyceae',
               'Florideophyceae', 'Odontella', 'Phaeodarea', 'Prymnesiophyceae',
               'Spirotrichea', 'Vertebrata', 'Rare']
fig = px.bar(
    grouped_df,
    x='datetime',
    y='TPL',
    color='Genus_UniEuk',
    title='TPL sum at Station 51',
    category_orders={"Genus_UniEuk": taxon_order},
    color_discrete_map=color_discrete_map,
)

layout_options = {
    'width': 800,
    'height': 400,
    'xaxis_title': 'Date and Time',
    'yaxis_title': 'TPL',
    'xaxis_tickangle': -45,
    'legend_title': 'Taxonomic Group',
}
fig.update_layout(**layout_options)

# Write the figure as PNG and SVG
filename_base = "../figures/metatranscriptomics/51_TPL_per_hour_genus"
for extension in (".png", ".svg"):
    fig.write_image(filename_base + extension, scale=1)

fig.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



## Species Binning
Here, I want to extract the transcriptome of Phaeocystis to be used in subsequent analyses.

In [9]:
# Count annotated hits per genus, considering only hits with p_ident >= 0.95
transcript_count = eukprot_annotation_130[eukprot_annotation_130['p_ident'] >= 0.95]['Genus_UniEuk'].value_counts()
# Print the first 5 entries
print(transcript_count.head())

# Get the top 5 genera with the most transcripts
top_genera = transcript_count.head(5)

# Make sure the output directory exists; to_csv does not create missing directories
bin_dir = '../data/annotation/taxonomy_eukprot/130/genus_bins/'
os.makedirs(bin_dir, exist_ok=True)

# Save one file per top genus.
# Note: each file contains ALL hits of the genus regardless of p_ident, whereas the
# printed count refers to the p_ident >= 0.95 subset used for the ranking.
for genus in top_genera.index:
    genus_eukprot_annotation_130 = eukprot_annotation_130[eukprot_annotation_130['Genus_UniEuk'] == genus]
    filename = f"{genus}_transcriptome_bin.csv"
    genus_eukprot_annotation_130.to_csv(os.path.join(bin_dir, filename), index=False)
    print(f"{genus}: {top_genera[genus]} transcripts - Saved")

print(f"Files have been saved for the top {len(top_genera)} genus with the most transcripts.")

Genus_UniEuk
Noctiluca      41201
Phaeocystis    22095
Hemistasia      1660
Acartia         1149
Eurytemora       819
Name: count, dtype: int64
Noctiluca: 41201 transcripts - Saved
Phaeocystis: 22095 transcripts - Saved
Hemistasia: 1660 transcripts - Saved
Acartia: 1149 transcripts - Saved
Eurytemora: 819 transcripts - Saved
Files have been saved for the top 5 genus with the most transcripts.


In [12]:
# Count annotated hits per genus, considering only hits with p_ident >= 0.95
transcript_count = eukprot_annotation_51[eukprot_annotation_51['p_ident'] >= 0.95]['Genus_UniEuk'].value_counts()
# Print the first 5 entries
print(transcript_count.head())

# Get the top 5 genera with the most transcripts
top_genera = transcript_count.head(5)

# Make sure the output directory exists; to_csv does not create missing directories
bin_dir = '../data/annotation/taxonomy_eukprot/51/genus_bins/'
os.makedirs(bin_dir, exist_ok=True)

# Save one file per top genus.
# Note: each file contains ALL hits of the genus regardless of p_ident, whereas the
# printed count refers to the p_ident >= 0.95 subset used for the ranking.
for genus in top_genera.index:
    genus_eukprot_annotation_51 = eukprot_annotation_51[eukprot_annotation_51['Genus_UniEuk'] == genus]
    filename = f"{genus}_transcriptome_bin.csv"
    genus_eukprot_annotation_51.to_csv(os.path.join(bin_dir, filename), index=False)
    print(f"{genus}: {top_genera[genus]} transcripts - Saved")

print(f"Files have been saved for the top {len(top_genera)} genus with the most transcripts.")

Genus_UniEuk
Noctiluca      11549
Phaeocystis     3314
Homo            3057
Calanus         2284
Acartia         2082
Name: count, dtype: int64
Noctiluca: 11549 transcripts - Saved
Phaeocystis: 3314 transcripts - Saved
Homo: 3057 transcripts - Saved
Calanus: 2284 transcripts - Saved
Acartia: 2082 transcripts - Saved
Files have been saved for the top 5 genus with the most transcripts.


## Phaeocystis
For station 130, we'll extract the transcriptome bin of Phaeocystis, once as summed TPM values and once as TPL values.

In [13]:
# Genera whose transcriptome bins are processed below
genera = ['Phaeocystis']

In [14]:
# For every genus of interest, build a transcript x sample TPM matrix from its saved bin
for genus in tqdm(genera, desc='Binning Genera'):
    print(f'Processing {genus}')
    genus_transcripts = pd.read_csv(f'../data/annotation/taxonomy_eukprot/130/genus_bins/{genus}_transcriptome_bin.csv', usecols=['query_id', 'p_ident'])
    # Only retain transcripts with a p_ident of 0.8 or higher
    genus_transcripts = genus_transcripts[genus_transcripts['p_ident'] >= 0.8]

    # Extract the long-format TPM rows of transcripts that belong to the genus of interest
    genus_data = tpm_130[tpm_130['transcript_id'].isin(genus_transcripts['query_id'])]

    # Print the number of distinct transcripts. The count is computed outside the
    # f-string because reusing the same quote type inside a replacement field is a
    # SyntaxError before Python 3.12.
    n_transcripts = genus_data['transcript_id'].nunique(dropna=False)
    print(f'{genus} has {n_transcripts} transcripts')

    # Create the matrix: rows are transcripts, columns are samples, cells are TPM.
    # Passing values='TPM' avoids pivoting every column only to select TPM afterwards.
    genus_data = genus_data.pivot(index='transcript_id', columns='sample', values='TPM')

    # Save the data
    genus_data.to_csv(f'../data/annotation/taxonomy_eukprot/130/genus_bins/{genus}_transcript_expression_sum.csv')

Binning Genera:   0%|          | 0/1 [00:00<?, ?it/s]

Processing Phaeocystis
Phaeocystis has 111973 transcripts


Binning Genera: 100%|██████████| 1/1 [00:14<00:00, 14.41s/it]


In [18]:
# Sum the TPM of all hits assigned to Phaeocystis globosa, per sample.
# Only the two columns that are needed are selected before dropping NaN rows, so
# missing values in unrelated columns (and columns added to data_130 by other cells)
# cannot change which rows are summed.
is_phaeocystis = data_130['Name_to_Use'] == 'Phaeocystis globosa'
phaeocystis_bin = (
    data_130.loc[is_phaeocystis, ['sample', 'TPM']]
    .dropna()
    # `sample` is categorical; observed=False keeps the long-standing default of one
    # row per category and avoids the FutureWarning about the changing default
    .groupby('sample', observed=False)
    .sum()
    .reset_index()
)

# Export the data
phaeocystis_bin.to_csv('../data/analysis/phaeocystis_bin_tpm.csv', index=False)

# Use TPM sums to calculate Transcripts per Liter
ERCC_normalisation = pd.read_csv('../data/ERCC92/ERCC_normalisation.csv')

# Merge ERCC normalisation data with the TPM sums
phaeocystis_bin = phaeocystis_bin.merge(ERCC_normalisation, left_on='sample', right_on='sample_name', how='left')

# Add a column for Transcripts per Liter
phaeocystis_bin['TPL'] = phaeocystis_bin['TPM'] * phaeocystis_bin['ERCC_norm_factor']

# Remove the ERCC normalisation factor
phaeocystis_bin = phaeocystis_bin.drop(columns='ERCC_norm_factor')

# Export the data
phaeocystis_bin[['sample', 'TPL']].to_csv('../data/analysis/phaeocystis_bin_tpl.csv', index=False)

For the RCM analysis, we need a dataframe of KEGG KO IDs x samples that contains summed count values.
Let's create this dataframe.

In [19]:
# Preview the identity-filtered hits left over from the last iteration of the binning loop
genus_transcripts.head(5)

Unnamed: 0,query_id,p_ident
0,c_000000852232,0.992
1,c_000000852269,0.985
4,c_000001418346,0.821
6,c_000002247011,0.916
8,c_000002692290,1.0


In [20]:
# Preview the per-sample TPM and TPL sums of the Phaeocystis bin
phaeocystis_bin.head(5)

Unnamed: 0,sample,TPM,sample_name,TPL
0,130_1,113736.367188,130_1,16027850.0
1,130_10,120183.765625,130_10,27304760.0
2,130_11,119309.898438,130_11,24200540.0
3,130_12,195464.265625,130_12,56681710.0
4,130_13,100914.476562,130_13,36623220.0


In [22]:
# Load the functional annotation table of station 130
annotation_path = '../data/annotation/functional/130/functional_annotation.emapper.annotations'
functional_annotation = pd.read_table(annotation_path)

# Show the first rows
functional_annotation.head()

Unnamed: 0,#query,seed_ortholog,evalue,score,eggNOG_OGs,max_annot_lvl,COG_category,Description,Preferred_name,GOs,...,KEGG_ko,KEGG_Pathway,KEGG_Module,KEGG_Reaction,KEGG_rclass,BRITE,KEGG_TC,CAZy,BiGG_Reaction,PFAMs
0,NODE_43315_length_315_cov_15.627660_g42640_i0.p2,3218.PP1S18_265V6.1,5.151e-43,158.0,"COG1643@1|root,KOG0925@2759|Eukaryota,37KVD@33...",35493|Streptophyta,A,Pre-mRNA-splicing factor ATP-dependent RNA hel...,-,"GO:0003674,GO:0003676,GO:0003723,GO:0003724,GO...",...,ko:K12820,"ko03040,map03040",-,-,-,"ko00000,ko00001,ko01000,ko03009,ko03041",-,-,-,"AAA_22,DEAD,HA2,Helicase_C,OB_NTP_bind"
1,NODE_43321_length_312_cov_4.410811_g42892_i0.p1,7213.XP_004521134.1,7.741e-08,57.0,"COG0526@1|root,KOG0191@2759|Eukaryota,39X8I@33...",33208|Metazoa,O,It is involved in the biological process descr...,-,-,...,-,-,-,-,-,-,-,-,-,"Thioredoxin,Thioredoxin_6"
2,NODE_43329_length_505_cov_1.023810_g42339_i0.p1,2903.EOD39159,2.059e-46,172.0,"2CNC4@1|root,2QV4U@2759|Eukaryota",2759|Eukaryota,S,Phytanoyl-CoA dioxygenase (PhyH),-,-,...,-,-,-,-,-,-,-,-,-,PhyH
3,NODE_43331_length_505_cov_1.015873_g42341_i0.p1,157072.XP_008880268.1,9.688e-49,179.0,"COG0325@1|root,KOG3157@2759|Eukaryota",2759|Eukaryota,F,pyridoxal phosphate binding,-,-,...,ko:K06997,-,-,-,-,ko00000,-,-,-,Ala_racemase_N
4,NODE_43338_length_415_cov_5.663194_g42112_i0.p1,7897.ENSLACP00000017934,0.0002876,48.0,"KOG3714@1|root,KOG3714@2759|Eukaryota,38H9H@33...",33208|Metazoa,O,metalloendopeptidase activity,BMP1,"GO:0000003,GO:0000578,GO:0000902,GO:0000904,GO...",...,"ko:K05502,ko:K08076,ko:K09608,ko:K13045,ko:K13...",-,-,-,-,"ko00000,ko01000,ko01002,ko04052",-,-,-,"Astacin,CUB,EGF_CA,FXa_inhibition"
