In [1]:
import os
import pandas as pd
import plotly.express as px
from tqdm import tqdm

## Data Preparation

In [2]:
# Load eukprot taxonomy with only necessary columns
taxonomy_cols = ['EukProt_ID', 'Name_to_Use', 'Taxogroup2_UniEuk', 'Genus_UniEuk']  # Adjust columns as necessary
eukprot_taxonomy = pd.read_table(
    '../data/annotation/taxonomy_eukprot/EukProt_included_data_sets.v03.2021_11_22.txt',
    usecols=taxonomy_cols,
)
# Species names use underscores in the table; show them with spaces
eukprot_taxonomy['Name_to_Use'] = eukprot_taxonomy['Name_to_Use'].str.replace('_', ' ')

# Column names assigned to the header-less .m8 annotation tables
eukprot_annotation_cols = ['query_id', 'target_id', 'p_ident', 'alnlen', 'mismatch',
                           'gapopen', 'qstart', 'qend', 'tstart', 'tend', 'evalue', 'bits']


def load_eukprot_annotation(station, taxonomy):
    """Load one station's EukProt annotation and attach the taxonomy columns.

    Parameters
    ----------
    station : str or int
        Station folder name below ``../data/annotation/taxonomy_eukprot/``.
    taxonomy : pandas.DataFrame
        Taxonomy table with an ``EukProt_ID`` column to join on.

    Returns
    -------
    pandas.DataFrame
        Annotation rows merged with taxonomy, without rows containing missing
        values and without rows whose ``Taxogroup2_UniEuk`` is 'Vertebrata'.
    """
    annotation = pd.read_table(
        f'../data/annotation/taxonomy_eukprot/{station}/eukprot_annotation.m8',
        header=None, names=eukprot_annotation_cols,
    )

    # Fix transcript names: keep the part before the first "." / "_"
    annotation['query_id'] = annotation['query_id'].str.split(".", n=1, expand=True)[0]
    annotation['target_id'] = annotation['target_id'].str.split("_", n=1, expand=True)[0]

    # Merge annotation and taxonomy
    annotation = annotation.merge(taxonomy, left_on='target_id', right_on='EukProt_ID', how='left')
    # Drop rows with missing taxonomy (dropna() removes a row if ANY column is missing)
    annotation = annotation.dropna()
    # Remove transcripts annotated as vertebrates (most are 'Homo'); this occurs in different
    # samples for the different sequencing sets and is most likely contamination
    return annotation[~annotation['Taxogroup2_UniEuk'].isin(['Vertebrata'])]


# Station 130
eukprot_annotation_130 = load_eukprot_annotation(130, eukprot_taxonomy)
print(f'The merged annotation and taxonomy file for station 130 contains {len(eukprot_annotation_130)} rows')

# Station 51
eukprot_annotation_51 = load_eukprot_annotation(51, eukprot_taxonomy)
print(f'The merged annotation and taxonomy file for station 51 contains {len(eukprot_annotation_51)} rows')

The merged annotation and taxonomy file for station 130 contains 1066978 rows
The merged annotation and taxonomy file for station 51 contains 385769 rows


In [3]:
# Sample metadata, one row per sample
meta = pd.read_csv('../data/samples_env.csv')

# Parse the existing Date column into pandas timestamps
meta = meta.assign(Date=pd.to_datetime(meta['Date'], format='%Y-%m-%d %H:%M:%S'))

meta.head()

Unnamed: 0,Station,StationPrefix,StationSuffix,Latitude,Longitude,Date,day_moment,day_length,Temperature,Salinity,...,Fluorescence,NH4,NO2,NO3,NOX,PO4,Si,TEP,sea_surface_height_above_sea_level,surface_baroclinic_sea_water_velocity
0,51_1,51,1,51.531661,3.182804,2023-04-18 11:11:00,Day,14.033333,9.9259,32.513,...,2.001848,3.08,0.16,6.96,7.12,0.1,7.24,65.846667,1.817039,0.737313
1,51_2,51,2,51.533392,3.184085,2023-04-18 12:05:00,Day,14.033333,9.9105,32.683,...,,3.45,0.15,6.19,6.34,0.14,7.06,167.438667,2.129479,0.933577
2,51_3,51,3,51.532763,3.185113,2023-04-18 13:08:00,Day,14.033333,9.9231,32.661,...,1.472718,3.53,0.15,6.13,6.28,0.12,7.13,210.709333,1.852063,0.844862
3,51_4,51,4,51.533244,3.184346,2023-04-18 14:05:00,Day,14.033333,9.9013,32.703,...,1.423711,,,,,,,255.861333,1.210772,0.645538
4,51_5,51,5,51.533104,3.183575,2023-04-18 15:07:00,Day,14.033333,9.9336,32.59,...,1.315579,4.18,0.15,6.54,6.69,0.15,7.53,223.878667,0.336479,0.489504


In [4]:
# TPM table for station 130, indexed by transcript
tpm_130 = (
    pd.read_csv('../data/quantification/130/130_tpm.csv')
    .rename(columns={'target_id': 'transcript_id'})
    .set_index('transcript_id')
)

# Optional: keep only transcripts whose TPM summed over all samples is at least 1
expressed_130 = tpm_130.sum(axis=1) >= 1

# Long format (transcript_id, sample, TPM) with compact dtypes
tpm_130 = (
    tpm_130[expressed_130]
    .reset_index()
    .melt(id_vars=['transcript_id'], var_name='sample', value_name='TPM')
    .astype({'sample': 'category', 'TPM': 'float32'})
)

In [5]:
# TPM table for station 51, indexed by transcript
tpm_51 = (
    pd.read_csv('../data/quantification/51/51_tpm.csv')
    .rename(columns={'target_id': 'transcript_id'})
    .set_index('transcript_id')
)

# Optional: keep only transcripts whose TPM summed over all samples is at least 1
expressed_51 = tpm_51.sum(axis=1) >= 1

# Long format (transcript_id, sample, TPM) with compact dtypes
tpm_51 = (
    tpm_51[expressed_51]
    .reset_index()
    .melt(id_vars=['transcript_id'], var_name='sample', value_name='TPM')
    .astype({'sample': 'category', 'TPM': 'float32'})
)

In [6]:
# Station 130: annotation -> TPM (long format) -> sample metadata.
# NOTE(review): the left merge repeats a transcript's TPM rows once per annotation row
# sharing its query_id - confirm the .m8 table holds one hit per transcript, otherwise
# downstream TPM sums count that transcript several times.
data_130 = (
    eukprot_annotation_130
    .merge(tpm_130, left_on='query_id', right_on='transcript_id', how='left')
    .drop(columns=['query_id'])
    .merge(meta, left_on='sample', right_on='Station', how='left')
    .drop(columns='Station')
)

# Station 51: same chain of merges
data_51 = (
    eukprot_annotation_51
    .merge(tpm_51, left_on='query_id', right_on='transcript_id', how='left')
    .drop(columns=['query_id'])
    .merge(meta, left_on='sample', right_on='Station', how='left')
    .drop(columns='Station')
)

## Taxonomic turnover

### Taxonomic Classes

In [7]:
# Colour per taxonomic group for the class-level bar charts.
# "Rare" is the bucket that low-abundance groups are relabelled to before plotting.
color_discrete_map = {
    "Rare": "#545454",
    "core-Noctilucales": "#56B4E8",
    "Odontella": "#C44601",
    "Prymnesiophyceae": "#009E73",
    "Diatomeae": "#E69F00",
    "Dinophyceae": "#56B4E9",
    "Spirotrichea": "#F0E442",
    "Arthropoda": "#0072B2",
    "Ctenophora": "#ADA7A7",
    "Vertebrata": "#FF5349",
    "Phaeodarea": "#D55E00",
    "Florideophyceae": "#CC79A7",
    "Acantharea": "#719A10",
}

In [8]:
# Keep rows with p_ident >= 0.75 (no TPM cut-off is applied in this cell)
filtered_data = data_130.loc[data_130['p_ident'] >= 0.75]

# Total TPM of every taxonomic group at every sampling time
grouped_df = (
    filtered_data
    .groupby(['Taxogroup2_UniEuk', 'Date'])['TPM']
    .sum()
    .reset_index()
)

# Share of each group in the summed TPM of its sampling time
total_tpm_per_hour = grouped_df.groupby('Date')['TPM'].transform('sum')
grouped_df['rel_expression_per_hour'] = grouped_df['TPM'].div(total_tpm_per_hour)

# Groups at or below the threshold share are pooled under 'Rare'
threshold = 0.05
is_rare = grouped_df['rel_expression_per_hour'] <= threshold
grouped_df.loc[is_rare, 'Taxogroup2_UniEuk'] = 'Rare'

# 'core-Noctilucales' is shown as part of 'Dinophyceae'
grouped_df['Taxogroup2_UniEuk'] = grouped_df['Taxogroup2_UniEuk'].replace({'core-Noctilucales': 'Dinophyceae'})

# Discard groups without any expression
has_expression = grouped_df['rel_expression_per_hour'] > 0
grouped_df = grouped_df[has_expression]

# Re-aggregate so the relabelled rows collapse into one bar segment per group and time
grouped_df = (
    grouped_df
    .groupby(['Taxogroup2_UniEuk', 'Date'])['rel_expression_per_hour']
    .sum()
    .reset_index()
)

# Stacking / legend order of the groups
taxogroup_order = ['Arthropoda', 'Acantharea', 'Ctenophora', 'Diatomeae', 'Dinophyceae',
                   'Florideophyceae', 'Odontella', 'Phaeodarea', 'Prymnesiophyceae',
                   'Spirotrichea', 'Vertebrata', 'Rare']

fig = px.bar(
    grouped_df,
    x='Date',
    y='rel_expression_per_hour',
    color='Taxogroup2_UniEuk',
    title='Relative Expression Per Hour at Station 130',
    category_orders={"Taxogroup2_UniEuk": taxogroup_order},
    color_discrete_map=color_discrete_map,
)
fig.update_layout(
    width=800,
    height=400,
    xaxis_title='Date and Time',
    yaxis_title='Relative Expression',
    xaxis_tickangle=-45,
    legend_title='Taxonomic Group',
)

# Write the figure as PNG and SVG
filename_base = "../figures/metatranscriptomics/130_relative_expression_per_hour"
for extension in (".png", ".svg"):
    fig.write_image(filename_base + extension, scale=1)

fig.show()

  v = v.dt.to_pydatetime()


In [9]:
# Re-derive the station 130 selection here so this cell does not depend on whichever
# station an earlier cell last stored in `filtered_data`
filtered_data = data_130[data_130['p_ident'] >= 0.75]

# Group by taxonomic level and sampling time, then sum TPM
grouped_df = filtered_data.groupby(['Taxogroup2_UniEuk', 'Date'])['TPM'].sum().reset_index()

# Relative expression per sampling time; only used to decide which groups are 'Rare'
total_tpm_per_hour = grouped_df.groupby('Date')['TPM'].transform('sum')
grouped_df['rel_expression_per_hour'] = grouped_df['TPM'] / total_tpm_per_hour

# Combine low-abundant groups into a single 'Rare' category
threshold = 0.05
grouped_df.loc[grouped_df['rel_expression_per_hour'] <= threshold, 'Taxogroup2_UniEuk'] = 'Rare'

# Combine 'core-Noctilucales' and 'Dinophyceae'
grouped_df['Taxogroup2_UniEuk'] = grouped_df['Taxogroup2_UniEuk'].replace({'core-Noctilucales': 'Dinophyceae'})

# Remove categories with no expression
grouped_df = grouped_df[grouped_df['rel_expression_per_hour'] > 0]

# Group once more to combine Rares; this plot shows absolute TPM sums
grouped_df = grouped_df.groupby(['Taxogroup2_UniEuk', 'Date'])['TPM'].sum().reset_index()

# Plot
fig = px.bar(grouped_df,
            x='Date',
            y='TPM',
            color='Taxogroup2_UniEuk',
            title='TPM Expression Per hour at Station 130',
            category_orders={"Taxogroup2_UniEuk": ['Arthropoda', 'Acantharea', 'Ctenophora', 'Diatomeae', 'Dinophyceae',
                                                    'Florideophyceae','Odontella', 'Phaeodarea', 'Prymnesiophyceae',
                                                    'Spirotrichea', 'Vertebrata', 'Rare',]},
            color_discrete_map=color_discrete_map
            )

fig.update_layout(
    width=800,
    height=400,
    xaxis_title='Date and Time',
    # The y values are summed TPM, not a relative share
    yaxis_title='TPM',
    xaxis_tickangle=-45,
    legend_title='Taxonomic Group'
)

# Save the figure
filename_base = "../figures/metatranscriptomics/130_expression_per_hour"
fig.write_image(filename_base + ".png", scale=1)
fig.write_image(filename_base + ".svg", scale=1)

fig.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [10]:
# Keep rows with p_ident >= 0.75 (no TPM cut-off is applied in this cell)
filtered_data = data_51.loc[data_51['p_ident'] >= 0.75]

# Total TPM of every taxonomic group at every sampling time
grouped_df = (
    filtered_data
    .groupby(['Taxogroup2_UniEuk', 'Date'])['TPM']
    .sum()
    .reset_index()
)

# Share of each group in the summed TPM of its sampling time
total_tpm_per_hour = grouped_df.groupby('Date')['TPM'].transform('sum')
grouped_df['rel_expression_per_hour'] = grouped_df['TPM'].div(total_tpm_per_hour)

# Groups at or below the threshold share are pooled under 'Rare'
threshold = 0.05
is_rare = grouped_df['rel_expression_per_hour'] <= threshold
grouped_df.loc[is_rare, 'Taxogroup2_UniEuk'] = 'Rare'

# 'core-Noctilucales' is shown as part of 'Dinophyceae'
grouped_df['Taxogroup2_UniEuk'] = grouped_df['Taxogroup2_UniEuk'].replace({'core-Noctilucales': 'Dinophyceae'})

# Discard groups without any expression
has_expression = grouped_df['rel_expression_per_hour'] > 0
grouped_df = grouped_df[has_expression]

# Re-aggregate so the relabelled rows collapse into one bar segment per group and time
grouped_df = (
    grouped_df
    .groupby(['Taxogroup2_UniEuk', 'Date'])['rel_expression_per_hour']
    .sum()
    .reset_index()
)

# Stacking / legend order of the groups
taxogroup_order = ['Arthropoda', 'Acantharea', 'Ctenophora', 'Diatomeae', 'Dinophyceae',
                   'Florideophyceae', 'Odontella', 'Phaeodarea', 'Prymnesiophyceae',
                   'Spirotrichea', 'Vertebrata', 'Rare']

fig = px.bar(
    grouped_df,
    x='Date',
    y='rel_expression_per_hour',
    color='Taxogroup2_UniEuk',
    title='Relative Expression Per Hour at Station 51',
    category_orders={"Taxogroup2_UniEuk": taxogroup_order},
    color_discrete_map=color_discrete_map,
)
fig.update_layout(
    width=800,
    height=400,
    xaxis_title='Date and Time',
    yaxis_title='Relative Expression',
    xaxis_tickangle=-45,
    legend_title='Taxonomic Group',
)

# Write the figure as PNG and SVG
filename_base = "../figures/metatranscriptomics/51_relative_expression_per_hour"
for extension in (".png", ".svg"):
    fig.write_image(filename_base + extension, scale=1)

fig.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [11]:
# Re-derive the station 51 selection here so this cell does not depend on whichever
# station an earlier cell last stored in `filtered_data`
filtered_data = data_51[data_51['p_ident'] >= 0.75]

# Group by taxonomic level and sampling time, then sum TPM
grouped_df = filtered_data.groupby(['Taxogroup2_UniEuk', 'Date'])['TPM'].sum().reset_index()

# Relative expression per sampling time; only used to decide which groups are 'Rare'
total_tpm_per_hour = grouped_df.groupby('Date')['TPM'].transform('sum')
grouped_df['rel_expression_per_hour'] = grouped_df['TPM'] / total_tpm_per_hour

# Combine low-abundant groups into a single 'Rare' category
threshold = 0.05
grouped_df.loc[grouped_df['rel_expression_per_hour'] <= threshold, 'Taxogroup2_UniEuk'] = 'Rare'

# Combine 'core-Noctilucales' and 'Dinophyceae'
grouped_df['Taxogroup2_UniEuk'] = grouped_df['Taxogroup2_UniEuk'].replace({'core-Noctilucales': 'Dinophyceae'})

# Remove categories with no expression
grouped_df = grouped_df[grouped_df['rel_expression_per_hour'] > 0]

# Group once more to combine Rares; this plot shows absolute TPM sums
grouped_df = grouped_df.groupby(['Taxogroup2_UniEuk', 'Date'])['TPM'].sum().reset_index()

# Plot
fig = px.bar(grouped_df,
            x='Date',
            y='TPM',
            color='Taxogroup2_UniEuk',
            title='TPM Expression Per hour at Station 51',
            category_orders={"Taxogroup2_UniEuk": ['Arthropoda', 'Acantharea', 'Ctenophora', 'Diatomeae', 'Dinophyceae',
                                                    'Florideophyceae','Odontella', 'Phaeodarea', 'Prymnesiophyceae',
                                                    'Spirotrichea', 'Vertebrata', 'Rare',]},
            color_discrete_map=color_discrete_map
            )

fig.update_layout(
    width=800,
    height=400,
    xaxis_title='Date and Time',
    # The y values are summed TPM, not a relative share
    yaxis_title='TPM',
    xaxis_tickangle=-45,
    legend_title='Taxonomic Group'
)

# Save the figure
filename_base = "../figures/metatranscriptomics/51_expression_per_hour"
fig.write_image(filename_base + ".png", scale=1)
fig.write_image(filename_base + ".svg", scale=1)

fig.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



### Genera

In [12]:
# Colour mapping used by the genus-level plots. It replaces the class-level mapping of
# the same name: "Noctiluca" and "Phaeocystis" take the place of the
# "core-Noctilucales" and "Prymnesiophyceae" entries.
color_discrete_map = {
    "Rare": "#545454",
    "Noctiluca": "#56B4E8",
    "Odontella": "#C44601",
    "Phaeocystis": "#009E73",
    "Diatomeae": "#E69F00",
    "Dinophyceae": "#56B4E9",
    "Spirotrichea": "#F0E442",
    "Arthropoda": "#0072B2",
    "Ctenophora": "#ADA7A7",
    "Vertebrata": "#FF5349",
    "Phaeodarea": "#D55E00",
    "Florideophyceae": "#CC79A7",
    "Acantharea": "#719A10",
}

In [13]:
# Keep rows with p_ident >= 0.95 (no TPM cut-off is applied in this cell)
filtered_data = data_130[(data_130['p_ident'] >= 0.95)]

# Group by genus and sampling time, then sum TPM
grouped_df = filtered_data.groupby(['Genus_UniEuk', 'Date'])['TPM'].sum().reset_index()

# Normalize TPM sums to get relative expression per sampling time
total_tpm_per_hour = grouped_df.groupby('Date')['TPM'].transform('sum')
grouped_df['rel_expression_per_hour'] = grouped_df['TPM'] / total_tpm_per_hour

# Combine low-abundant genera into a single 'Rare' category
threshold = 0.05
grouped_df.loc[grouped_df['rel_expression_per_hour'] <= threshold, 'Genus_UniEuk'] = 'Rare'

# Remove categories with no expression
grouped_df = grouped_df[grouped_df['rel_expression_per_hour'] > 0]

# Group once more to combine Rares
grouped_df = grouped_df.groupby(['Genus_UniEuk', 'Date'])['rel_expression_per_hour'].sum().reset_index()

# Derive the display order from the genera actually present: alphabetical, 'Rare' last.
# A fixed list of class-level names would only match 'Rare' and push it to the front.
genus_order = sorted(set(grouped_df['Genus_UniEuk']) - {'Rare'}) + ['Rare']

# Plot
fig = px.bar(grouped_df,
            x='Date',
            y='rel_expression_per_hour',
            color='Genus_UniEuk',
            title='Relative Expression Per Hour at Station 130',
            category_orders={"Genus_UniEuk": genus_order},
            color_discrete_map=color_discrete_map
            )

fig.update_layout(
    width=800,
    height=400,
    xaxis_title='Date and Time',
    yaxis_title='Relative Expression',
    xaxis_tickangle=-45,
    legend_title='Taxonomic Group'
)

# Save the figure
filename_base = "../figures/metatranscriptomics/130_relative_expression_per_hour_genus"
fig.write_image(filename_base + ".png", scale=1)
fig.write_image(filename_base + ".svg", scale=1)

fig.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [14]:
# Keep rows with p_ident >= 0.95 (no TPM cut-off is applied in this cell)
filtered_data = data_51[(data_51['p_ident'] >= 0.95)]

# Group by genus and sampling time, then sum TPM
grouped_df = filtered_data.groupby(['Genus_UniEuk', 'Date'])['TPM'].sum().reset_index()

# Normalize TPM sums to get relative expression per sampling time
total_tpm_per_hour = grouped_df.groupby('Date')['TPM'].transform('sum')
grouped_df['rel_expression_per_hour'] = grouped_df['TPM'] / total_tpm_per_hour

# Combine low-abundant genera into a single 'Rare' category
threshold = 0.05
grouped_df.loc[grouped_df['rel_expression_per_hour'] <= threshold, 'Genus_UniEuk'] = 'Rare'

# Remove categories with no expression
grouped_df = grouped_df[grouped_df['rel_expression_per_hour'] > 0]

# Group once more to combine Rares
grouped_df = grouped_df.groupby(['Genus_UniEuk', 'Date'])['rel_expression_per_hour'].sum().reset_index()

# Derive the display order from the genera actually present: alphabetical, 'Rare' last.
# A fixed list of class-level names would only match 'Rare' and push it to the front.
genus_order = sorted(set(grouped_df['Genus_UniEuk']) - {'Rare'}) + ['Rare']

# Plot
fig = px.bar(grouped_df,
            x='Date',
            y='rel_expression_per_hour',
            color='Genus_UniEuk',
            title='Relative Expression Per Hour at Station 51',
            category_orders={"Genus_UniEuk": genus_order},
            color_discrete_map=color_discrete_map
            )

fig.update_layout(
    width=800,
    height=400,
    xaxis_title='Date and Time',
    yaxis_title='Relative Expression',
    xaxis_tickangle=-45,
    legend_title='Taxonomic Group'
)

# Save the figure
filename_base = "../figures/metatranscriptomics/51_relative_expression_per_hour_genus"
fig.write_image(filename_base + ".png", scale=1)
fig.write_image(filename_base + ".svg", scale=1)

fig.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



### Transcripts per L
Here, we will calculate the number of transcripts per L for each sample and plot the distribution of the number of transcripts per L for each sample.

In [15]:
# Use TPM sums to calculate Transcripts per Liter (TPL)
ERCC_normalisation = pd.read_csv('../data/ERCC92/ERCC_normalisation.csv')


def add_tpl(expression, ercc):
    """Merge the ERCC normalisation table into `expression` and add a `TPL` column.

    Parameters
    ----------
    expression : pandas.DataFrame
        Long-format expression table with `sample` and `TPM` columns.
    ercc : pandas.DataFrame
        Normalisation table with `sample_name` and `ERCC_norm_factor` columns.

    Returns
    -------
    pandas.DataFrame
        `expression` left-merged with `ercc`, plus `TPL = TPM * ERCC_norm_factor`.

    Re-running is safe: if `expression` already carries `ERCC_norm_factor` from an
    earlier run, the previously merged ERCC columns and `TPL` are dropped first, so the
    merge does not produce `_x`/`_y` duplicates and a KeyError on the factor column.
    """
    if 'ERCC_norm_factor' in expression.columns:
        stale = [col for col in [*ercc.columns, 'TPL'] if col in expression.columns]
        expression = expression.drop(columns=stale)

    merged = expression.merge(ercc, left_on='sample', right_on='sample_name', how='left')
    merged['TPL'] = merged['TPM'] * merged['ERCC_norm_factor']
    return merged


# Merge ERCC normalisation data into both station tables and calculate TPL
data_130 = add_tpl(data_130, ERCC_normalisation)
data_51 = add_tpl(data_51, ERCC_normalisation)

In [16]:
# Keep rows with p_ident >= 0.95 (no TPL cut-off is applied in this cell)
filtered_data = data_130[(data_130['p_ident'] >= 0.95)]

# Group by genus and sampling time, then sum TPL
grouped_df = filtered_data.groupby(['Genus_UniEuk', 'Date'])['TPL'].sum().reset_index()

# Relative share per sampling time; only used to decide which genera are 'Rare'
total_TPL_per_hour = grouped_df.groupby('Date')['TPL'].transform('sum')
grouped_df['rel_expression_per_hour'] = grouped_df['TPL'] / total_TPL_per_hour

# Combine low-abundant genera into a single 'Rare' category
threshold = 0.05
grouped_df.loc[grouped_df['rel_expression_per_hour'] <= threshold, 'Genus_UniEuk'] = 'Rare'

# Remove categories with no expression
grouped_df = grouped_df[grouped_df['rel_expression_per_hour'] > 0]

# Group once more to combine Rares; the plot shows absolute TPL sums
grouped_df = grouped_df.groupby(['Genus_UniEuk', 'Date'])['TPL'].sum().reset_index()

# Derive the display order from the genera actually present: alphabetical, 'Rare' last.
# A fixed list of class-level names would only match 'Rare' and push it to the front.
genus_order = sorted(set(grouped_df['Genus_UniEuk']) - {'Rare'}) + ['Rare']

# Plot
fig = px.bar(grouped_df,
            x='Date',
            y='TPL',
            color='Genus_UniEuk',
            title='TPL sum at Station 130',
            category_orders={"Genus_UniEuk": genus_order},
            color_discrete_map=color_discrete_map
            )

fig.update_layout(
    width=800,
    height=400,
    xaxis_title='Date and Time',
    yaxis_title='TPL',
    xaxis_tickangle=-45,
    legend_title='Taxonomic Group'
)

# Save the figure
filename_base = "../figures/metatranscriptomics/130_TPL_per_hour_genus"
fig.write_image(filename_base + ".png", scale=1)
fig.write_image(filename_base + ".svg", scale=1)

fig.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [17]:
# NOTE(review): this cell repeats the preceding station-130 TPL cell and writes to the
# same figure files - confirm whether another station or taxonomic level was intended.

# Keep rows with p_ident >= 0.95 (no TPL cut-off is applied in this cell)
filtered_data = data_130[(data_130['p_ident'] >= 0.95)]

# Group by genus and sampling time, then sum TPL
grouped_df = filtered_data.groupby(['Genus_UniEuk', 'Date'])['TPL'].sum().reset_index()

# Relative share per sampling time; only used to decide which genera are 'Rare'
total_TPL_per_hour = grouped_df.groupby('Date')['TPL'].transform('sum')
grouped_df['rel_expression_per_hour'] = grouped_df['TPL'] / total_TPL_per_hour

# Combine low-abundant genera into a single 'Rare' category
threshold = 0.05
grouped_df.loc[grouped_df['rel_expression_per_hour'] <= threshold, 'Genus_UniEuk'] = 'Rare'

# Remove categories with no expression
grouped_df = grouped_df[grouped_df['rel_expression_per_hour'] > 0]

# Group once more to combine Rares; the plot shows absolute TPL sums
grouped_df = grouped_df.groupby(['Genus_UniEuk', 'Date'])['TPL'].sum().reset_index()

# Derive the display order from the genera actually present: alphabetical, 'Rare' last.
# A fixed list of class-level names would only match 'Rare' and push it to the front.
genus_order = sorted(set(grouped_df['Genus_UniEuk']) - {'Rare'}) + ['Rare']

# Plot
fig = px.bar(grouped_df,
            x='Date',
            y='TPL',
            color='Genus_UniEuk',
            title='TPL sum at Station 130',
            category_orders={"Genus_UniEuk": genus_order},
            color_discrete_map=color_discrete_map
            )

fig.update_layout(
    width=800,
    height=400,
    xaxis_title='Date and Time',
    yaxis_title='TPL',
    xaxis_tickangle=-45,
    legend_title='Taxonomic Group'
)

# Save the figure
filename_base = "../figures/metatranscriptomics/130_TPL_per_hour_genus"
fig.write_image(filename_base + ".png", scale=1)
fig.write_image(filename_base + ".svg", scale=1)

fig.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [18]:
# Keep rows with p_ident >= 0.95 (no TPL cut-off is applied in this cell)
filtered_data = data_51[(data_51['p_ident'] >= 0.95)]

# Group by genus and sampling time, then sum TPL
grouped_df = filtered_data.groupby(['Genus_UniEuk', 'Date'])['TPL'].sum().reset_index()

# Relative share per sampling time; only used to decide which genera are 'Rare'
total_TPL_per_hour = grouped_df.groupby('Date')['TPL'].transform('sum')
grouped_df['rel_expression_per_hour'] = grouped_df['TPL'] / total_TPL_per_hour

# Combine low-abundant genera into a single 'Rare' category
threshold = 0.05
grouped_df.loc[grouped_df['rel_expression_per_hour'] <= threshold, 'Genus_UniEuk'] = 'Rare'

# Remove categories with no expression
grouped_df = grouped_df[grouped_df['rel_expression_per_hour'] > 0]

# Group once more to combine Rares; the plot shows absolute TPL sums
grouped_df = grouped_df.groupby(['Genus_UniEuk', 'Date'])['TPL'].sum().reset_index()

# Derive the display order from the genera actually present: alphabetical, 'Rare' last.
# A fixed list of class-level names would only match 'Rare' and push it to the front.
genus_order = sorted(set(grouped_df['Genus_UniEuk']) - {'Rare'}) + ['Rare']

# Plot
fig = px.bar(grouped_df,
            x='Date',
            y='TPL',
            color='Genus_UniEuk',
            title='TPL sum at Station 51',
            category_orders={"Genus_UniEuk": genus_order},
            color_discrete_map=color_discrete_map
            )

fig.update_layout(
    width=800,
    height=400,
    xaxis_title='Date and Time',
    yaxis_title='TPL',
    xaxis_tickangle=-45,
    legend_title='Taxonomic Group'
)

# Save the figure
filename_base = "../figures/metatranscriptomics/51_TPL_per_hour_genus"
fig.write_image(filename_base + ".png", scale=1)
fig.write_image(filename_base + ".svg", scale=1)

fig.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [19]:
# Species x sample matrices for station 130, built from rows with p_ident >= 0.9.
# One file per measure; the values are summed within each species/sample pair.
matrix_paths = {
    'TPL': '../data/analysis/130_TPL_tax_matrix.csv',
    'TPM': '../data/analysis/130_TPM_tax_matrix.csv',
}
for measure, matrix_path in matrix_paths.items():
    export = data_130[data_130['p_ident'] >= 0.9].pivot_table(
        index='Name_to_Use', columns='sample', values=measure, aggfunc='sum'
    )
    export.to_csv(matrix_path)

## Species Binning
Here, I want to extract the transcriptome of Phaeocystis to be used in subsequent analyses.

In [20]:
# Count annotation rows per genus among rows with p_ident >= 0.95
transcript_count = eukprot_annotation_130[eukprot_annotation_130['p_ident'] >= 0.95]['Genus_UniEuk'].value_counts()
# Print the first 5 entries
print(transcript_count.head())

# Get the top 5 genera with the most transcripts
top_genera = transcript_count.head(5)

# Make sure the output folder exists so the export also works on a fresh checkout
genus_bin_dir = '../data/annotation/taxonomy_eukprot/130/genus_bins/'
os.makedirs(genus_bin_dir, exist_ok=True)

# Save all annotation rows of each top genus to its own file.
# The bins are not restricted by p_ident; only the ranking above uses the 0.95 cut-off.
for genus in top_genera.index:
    genus_eukprot_annotation_130 = eukprot_annotation_130[eukprot_annotation_130['Genus_UniEuk'] == genus]
    filename = f"{genus}_transcriptome_bin.csv"
    genus_eukprot_annotation_130.to_csv(os.path.join(genus_bin_dir, filename), index=False)
    print(f"{genus}: {len(genus_eukprot_annotation_130)} transcripts - Saved")

print(f"Files have been saved for the top {len(top_genera)} genera with the most transcripts.")

Genus_UniEuk
Noctiluca        55551
Phaeocystis      27786
Hemistasia        3462
Acartia           1770
Pleurobrachia     1473
Name: count, dtype: int64
Noctiluca: 227576 transcripts - Saved
Phaeocystis: 357651 transcripts - Saved
Hemistasia: 13605 transcripts - Saved
Acartia: 20507 transcripts - Saved
Pleurobrachia: 7413 transcripts - Saved
Files have been saved for the top 5 genera with the most transcripts.


In [21]:
# Count annotation rows per genus among rows with p_ident >= 0.95
transcript_count = eukprot_annotation_51[eukprot_annotation_51['p_ident'] >= 0.95]['Genus_UniEuk'].value_counts()
# Print the first 5 entries
print(transcript_count.head())

# Get the top 5 genera with the most transcripts
top_genera = transcript_count.head(5)

# Make sure the output folder exists so the export also works on a fresh checkout
genus_bin_dir = '../data/annotation/taxonomy_eukprot/51/genus_bins/'
os.makedirs(genus_bin_dir, exist_ok=True)

# Save all annotation rows of each top genus to its own file.
# The bins are not restricted by p_ident; only the ranking above uses the 0.95 cut-off.
for genus in top_genera.index:
    genus_eukprot_annotation_51 = eukprot_annotation_51[eukprot_annotation_51['Genus_UniEuk'] == genus]
    filename = f"{genus}_transcriptome_bin.csv"
    genus_eukprot_annotation_51.to_csv(os.path.join(genus_bin_dir, filename), index=False)
    print(f"{genus}: {len(genus_eukprot_annotation_51)} transcripts - Saved")

print(f"Files have been saved for the top {len(top_genera)} genus with the most transcripts.")

Genus_UniEuk
Noctiluca      21074
Phaeocystis     5147
Calanus         3312
Acartia         3122
Eurytemora      2839
Name: count, dtype: int64
Noctiluca: 84056 transcripts - Saved
Phaeocystis: 33826 transcripts - Saved
Calanus: 29881 transcripts - Saved
Acartia: 38889 transcripts - Saved
Eurytemora: 46448 transcripts - Saved
Files have been saved for the top 5 genus with the most transcripts.


Genus_UniEuk
Noctiluca      11549
Phaeocystis     3314
Homo            3057
Calanus         2284
Acartia         2082
Name: count, dtype: int64
Noctiluca: 11549 transcripts - Saved
Phaeocystis: 3314 transcripts - Saved
Homo: 3057 transcripts - Saved
Calanus: 2284 transcripts - Saved
Acartia: 2082 transcripts - Saved
Files have been saved for the top 5 genus with the most transcripts.

## Phaeocystis
For station 130, we'll extract the transcriptome bin of Phaeocystis and export it twice: once as summed TPM values and once as TPL values.

In [22]:
# Genera whose transcriptome bins are turned into expression matrices below
genera = ['Phaeocystis']

In [23]:
# For every genus of interest, build a transcript x sample TPM matrix from its
# station-130 transcriptome bin and save it next to the bin file.
for genus in tqdm(genera, desc='Binning Genera'):
    print(f'Processing {genus}')
    genus_transcripts = pd.read_csv(f'../data/annotation/taxonomy_eukprot/130/genus_bins/{genus}_transcriptome_bin.csv', usecols=['query_id', 'p_ident'])
    # Only retain transcripts with a p_ident of 0.8 or higher
    genus_transcripts = genus_transcripts[genus_transcripts['p_ident'] >= 0.8]

    # Extract transcript counts that also belong to the genus of interest
    genus_data = tpm_130[tpm_130['transcript_id'].isin(genus_transcripts['query_id'])]

    # Print the amount of transcripts
    n_transcripts = len(genus_data['transcript_id'].unique())
    print(f'{genus} has {n_transcripts} transcripts')

    # Create the matrix; naming the value column keeps pivot from also reshaping
    # a throw-away row-number column
    genus_data = genus_data.pivot(index='transcript_id', columns='sample', values='TPM')

    # Save the data
    genus_data.to_csv(f'../data/annotation/taxonomy_eukprot/130/genus_bins/{genus}_transcript_expression_sum.csv')

Binning Genera:   0%|          | 0/1 [00:00<?, ?it/s]

Processing Phaeocystis
Phaeocystis has 118362 transcripts


Binning Genera: 100%|██████████| 1/1 [00:16<00:00, 16.54s/it]


In [24]:
# Summed TPM per sample over all station-130 rows named 'Phaeocystis globosa'
is_phaeocystis = data_130['Name_to_Use'] == 'Phaeocystis globosa'
phaeocystis_bin = (
    data_130.loc[is_phaeocystis, ['sample', 'TPM']]
    .groupby('sample')
    .sum()
    .reset_index()
)

# Write the TPM sums
phaeocystis_bin.to_csv('../data/analysis/phaeocystis_bin_tpm_130.csv', index=False)

# ERCC normalisation factors turn the TPM sums into Transcripts per Liter (TPL)
ERCC_normalisation = pd.read_csv('../data/ERCC92/ERCC_normalisation.csv')

# Attach each sample's factor, compute TPL, then discard the factor column again
phaeocystis_bin = (
    phaeocystis_bin
    .merge(ERCC_normalisation, left_on='sample', right_on='sample_name', how='left')
    .assign(TPL=lambda frame: frame['TPM'] * frame['ERCC_norm_factor'])
    .drop(columns='ERCC_norm_factor')
)

# Write the TPL values
phaeocystis_bin[['sample', 'TPL']].to_csv('../data/analysis/phaeocystis_bin_tpl_130.csv', index=False)

In [25]:
# Station 51: summed TPM per sample over all rows named 'Phaeocystis globosa'
is_phaeocystis = data_51['Name_to_Use'] == 'Phaeocystis globosa'
phaeocystis_bin = (
    data_51.loc[is_phaeocystis, ['sample', 'TPM']]
    .groupby('sample')
    .sum()
    .reset_index()
)

# Write the TPM sums
phaeocystis_bin.to_csv('../data/analysis/phaeocystis_bin_tpm_51.csv', index=False)

# Attach each sample's ERCC factor (table loaded in an earlier cell), compute
# Transcripts per Liter, then discard the factor column again
phaeocystis_bin = (
    phaeocystis_bin
    .merge(ERCC_normalisation, left_on='sample', right_on='sample_name', how='left')
    .assign(TPL=lambda frame: frame['TPM'] * frame['ERCC_norm_factor'])
    .drop(columns='ERCC_norm_factor')
)

# Write the TPL values
phaeocystis_bin[['sample', 'TPL']].to_csv('../data/analysis/phaeocystis_bin_tpl_51.csv', index=False)

For the RCM analysis, we need a dataframe which is KEGG KO IDs x samples containing summed count values.
Let's create this dataframe.

In [26]:
# Station-130 rows named 'Phaeocystis globosa' with p_ident >= 0.85
is_phaeocystis = data_130['Name_to_Use'] == 'Phaeocystis globosa'
is_confident = data_130['p_ident'] >= 0.85
phaeocystis_bin = data_130[is_phaeocystis & is_confident]

# Count table in long format (transcript_id, sample, count)
phaeocystis_counts = (
    pd.read_csv('../data/quantification/130/130_count.csv')
    .melt(id_vars='target_id', var_name='sample', value_name='count')
    .rename(columns={'target_id': 'transcript_id'})
)
phaeocystis_bin = phaeocystis_bin.merge(phaeocystis_counts, on=['transcript_id', 'sample'], how='left')

# Annotation column that defines the functional categories
functional_category = 'Description'

# Functional annotation: keep the part of the query name before the first ".",
# expose it as transcript_id, and drop rows whose category is the '-' placeholder
functional_annotation = pd.read_table('../data/annotation/functional_eggnog/130/functional_annotation.emapper.annotations')
functional_annotation['#query'] = functional_annotation['#query'].str.split(".", n=1, expand=True)[0]
functional_annotation = functional_annotation.rename(columns={'#query': 'transcript_id'})
has_category = functional_annotation[functional_category] != '-'
functional_annotation = functional_annotation[has_category]

# Attach the functional category to every Phaeocystis row
export = phaeocystis_bin.merge(
    functional_annotation[['transcript_id', functional_category]],
    left_on="transcript_id", right_on="transcript_id", how='left',
)

# Sum the counts per sample and functional category
export = (
    export[['sample', 'count', functional_category]]
    .groupby(['sample', functional_category])['count']
    .sum()
    .reset_index()
)

# Reshape to a sample x functional category matrix
export = export.pivot(index='sample', columns=functional_category, values='count').reset_index()
export.head()

Description,sample,"'FY-rich' domain, C-terminal region",'de novo' UMP biosynthetic process,( 3 oxidation state) methyltransferase,(1->6)-beta-D-glucan biosynthetic process,(ABC) transporter,(Acyl-carrier-protein) S-malonyltransferase,(R)-3-amino-2-methylpropionate-pyruvate transaminase activity,"1,2-diacylglycerol 3-beta-galactosyltransferase activity","1,3-beta-D-glucan synthase activity",...,xaa-Pro aminopeptidase,"xylan 1,4-beta-xylosidase activity",xyloglucan 6-xylosyltransferase activity,zinc finger,zinc finger CCCH domain-containing protein,zinc finger family protein,zinc ion binding,zinc ion transmembrane transporter activity,zinc-finger of a C2HC-type,zonula adherens maintenance
0,130_1,0.0,0.0,533.11,406.02746,3.0,0.0,0.0,12.0,223.268,...,0.0,0.0,356.9449,0.0,0.0,0.0,600.18475,455.0,0.0,0.0
1,130_10,0.0,375.42,155.826,380.0624,108.479,33.1902,0.0,0.0,37.0,...,50.1938,315.30349,0.0,53.0,0.0,0.0,1435.538877,101.00055,61.7623,1.38966
2,130_11,69.7959,917.241,247.0,525.00274,0.0,0.0,0.0,42.9113,77.5593,...,150.004,315.47737,133.0,6.0,0.0,0.0,1237.79123,357.6727,4.0693,0.0
3,130_12,0.0,95.0304,1.65674,7.94615,0.0,0.0,0.0,60.8076,66.06243,...,19.7302,26.0,27.2772,0.0,0.0,0.0,590.7368,35.2179,0.0,3.94323e-07
4,130_13,174.766401,550.2718,47.8271,839.3172,27.501,0.0,0.0,238.311,205.84212,...,0.0,309.138,192.96938,0.0,0.0,0.0,1683.374212,207.4751,137.1279,19.4169


In [27]:
# Persist the sample x functional-category count matrix as CSV (row index omitted)
metabolic_counts_csv = '../data/analysis/phaeocystis_metabolic_functions_counts.csv'
export.to_csv(metabolic_counts_csv, index=False)