# Transcripts per L

In [3]:
import pandas as pd
import plotly.express as px

Transcripts per L were calculated [here](../../scripts/analysis/SpikeIn_analysis_Normalisation.ipynb). They are TPM values scaled to the volume of seawater that was processed, which might reflect ecosystem productivity better than TPM values alone!
Let's start by loading the data and adding it to the dataframe.

## Data preparation

In [4]:
# Read the transcripts-per-L table from the kallisto folder, name the id column
# 'transcript_id', and reshape it from wide (one column per sample) to long
# (one row per transcript/sample pair).
# NOTE: this is the transcript read mapping, prior to protein prediction!
transcripts_per_L = (
    pd.read_csv('../../data/kallisto/transcripts_per_L.csv', engine='pyarrow')
    .rename(columns={'target_id': 'transcript_id'})
    .melt(id_vars=['transcript_id'], var_name='sample', value_name='Transcripts_per_L')
)
transcripts_per_L.head()

Unnamed: 0,transcript_id,sample,Transcripts_per_L
0,c_000000000001,1_130_S31,0.0
1,c_000000000002,1_130_S31,0.0
2,c_000000000003,1_130_S31,0.0
3,c_000000000004,1_130_S31,0.0
4,c_000000000005,1_130_S31,0.0


In [5]:
# Flag, for every transcript, whether a taxonomic match was found.
# The annotation table is read without a header row, so its columns are addressed by position.
annotation = pd.read_table('../../data/annotation/taxonomy_phyloDB_extended/phylodb_extended.firsthit.90plus_alnscore.m8',  engine='pyarrow', header=None)

# Keep only the part of each id in the first column that precedes the first '.'
# (assumed form '<transcript_id>.<suffix>' -- TODO confirm against the annotation file),
# so the ids are comparable with transcripts_per_L['transcript_id'].
# Splitting once and taking element 0 also copes with ids that contain no '.' or more
# than one '.'; the expand=True + drop(columns=1) form only works for exactly one '.'.
annotated_ids = annotation.iloc[:, 0].str.split(".", n=1).str[0]
annotation.iloc[:, 0] = annotated_ids

# Boolean flag: True when the transcript id occurs among the annotated ids
transcripts_per_L['taxonomic_match'] = transcripts_per_L['transcript_id'].isin(annotated_ids)

# View the data
transcripts_per_L.head()

Unnamed: 0,transcript_id,sample,Transcripts_per_L,taxonomic_match
0,c_000000000001,1_130_S31,0.0,False
1,c_000000000002,1_130_S31,0.0,False
2,c_000000000003,1_130_S31,0.0,False
3,c_000000000004,1_130_S31,0.0,False
4,c_000000000005,1_130_S31,0.0,False


In [6]:
# Sanity check: number of transcript/sample rows with and without a taxonomic match
match_flags = transcripts_per_L['taxonomic_match']
match_flags.value_counts()

False    381881126
True      59156432
Name: taxonomic_match, dtype: int64

In [7]:
# Read the sample sheet (semicolon separated, first column used as index)
meta = pd.read_csv('../../samples.csv', sep=';', index_col=0)

# Attach the sample metadata to every transcript row, joining on 'sample'
transcripts_per_L = pd.merge(transcripts_per_L, meta, on='sample')

transcripts_per_L.head()

Unnamed: 0,transcript_id,sample,Transcripts_per_L,taxonomic_match,date,time,month,station
0,c_000000000001,1_130_S31,0.0,False,18/01/2021,9:05,January_2021,130
1,c_000000000002,1_130_S31,0.0,False,18/01/2021,9:05,January_2021,130
2,c_000000000003,1_130_S31,0.0,False,18/01/2021,9:05,January_2021,130
3,c_000000000004,1_130_S31,0.0,False,18/01/2021,9:05,January_2021,130
4,c_000000000005,1_130_S31,0.0,False,18/01/2021,9:05,January_2021,130


## Average sum of transcripts per L per station

Let's first plot the percentage of transcripts that have been annotated with a taxonomic identifier.

In [8]:
# Prepare the rates of annotation per month.
# Only rows with more than 1 transcript per L are considered. The filter runs over the
# full long-format table, so it is evaluated once and reused for both the numerator and
# the denominator instead of being recomputed.
month_order = ["July_2020", "August_2020", "September_2020",
               "November_2020", "December_2020", "January_2021",
               "February_2021", "April_2021", "May_2021",
               "June_2021", "July_2021"]

expressed = transcripts_per_L.loc[
    transcripts_per_L['Transcripts_per_L'] > 1,
    ['month', 'taxonomic_match', 'transcript_id'],
]
expressed_by_month = expressed.groupby('month')

# Number of rows with a taxonomic match per month (each True counts as 1)
annotation_grouped = expressed_by_month[['taxonomic_match']].sum()

# Divide the sum of taxonomic matches by the total number of transcript ids per month
annotation_grouped['taxonomic_match_rate'] = (
    annotation_grouped['taxonomic_match'] / expressed_by_month['transcript_id'].count()
) * 100

# Release the large filtered copy; only the small per-month summary is needed from here on
del expressed_by_month, expressed

annotation_grouped = annotation_grouped.reset_index()

# Fix the chronological order of the months
annotation_grouped['month'] = pd.Categorical(annotation_grouped['month'], month_order)

# Plot the data (rows sorted in descending month order before plotting)
fig = px.histogram(annotation_grouped.sort_values("month", ascending=False),
                x="taxonomic_match_rate",
                y="month",
                orientation ='h'
                )

fig.update_layout(
    font=dict(
        family="Times New Roman, serif",  # Set the font family to Times New Roman
        size=12,  # Set the font size
        color="#7f7f7f"  # Set the font color
    ),
    autosize=False,
    width=500,
    height=350,
    margin=dict( # Set the margins
        l=0,  # Left margin
        r=25,  # Right margin
        b=25,  # Bottom margin
        t=25  # Top margin
    ),
    # Set x limits
    xaxis_range=[0, 100],
    xaxis_title_text='Percentage of transcripts with taxonomic match'
)

fig.show()

# Save figure
fig.write_image("../../figures/taxonomic_analysis/percent_annotated_transcripts.svg", scale=2)
fig.write_image("../../figures/taxonomic_analysis/percent_annotated_transcripts.png", scale=2)

In [9]:
# Summarise the monthly annotation rates: average and standard deviation across months
rates = annotation_grouped['taxonomic_match_rate']
mean, std = rates.mean(), rates.std()
message = 'In any given month, we have annotated {:.2f}% (+- {:.2f} SD) of the transcripts.'
print(message.format(mean, std))

In any given month, we have annotated 15.56% (+- 3.72 SD) of the transcripts.


In [10]:
# Prepare the transcripts per L data: summary statistics per month over all rows with
# more than 1 transcript per L.
month_order = ["July_2020", "August_2020", "September_2020",
               "November_2020", "December_2020", "January_2021",
               "February_2021", "April_2021", "May_2021",
               "June_2021", "July_2021"]

# Number of stations visited in each month
station_mapper = {'July_2020': 5, 'August_2020': 6, 'September_2020': 6, 'November_2020': 6, 'December_2020': 6, 'January_2021': 5, 'February_2021': 5, 'April_2021': 4, 'May_2021': 6, 'June_2021': 6, 'July_2021': 6}

grouped_df = (
    transcripts_per_L.loc[transcripts_per_L['Transcripts_per_L'] > 1, ['month', 'Transcripts_per_L']]
    .groupby('month')['Transcripts_per_L']
    .agg(['sum', 'median', 'mean', 'min', 'max'])
    .add_prefix('Transcripts_per_L_')  # -> Transcripts_per_L_sum, Transcripts_per_L_median, ...
    .reset_index()
)

# Normalize the monthly sums by dividing them by the number of stations visited in that month.
# The station counts are mapped onto the plain month labels *before* the column is made
# categorical: mapping a categorical column yields a numeric result only when the mapping
# is not one-to-one, so dividing by it afterwards would be fragile.
# Months that are absent from station_mapper end up as NaN.
stations_visited = grouped_df['month'].map(station_mapper)
grouped_df['Transcripts_per_L_sum_norm'] = grouped_df['Transcripts_per_L_sum'] / stations_visited

# Fix the chronological order of the months
grouped_df['month'] = pd.Categorical(grouped_df['month'], month_order)

In [11]:
# Horizontal chart of the station-normalised monthly sums of transcripts per L.
# Rows are sorted in descending month order before plotting.
plot_data = grouped_df.sort_values("month", ascending=False)
fig = px.histogram(plot_data, x="Transcripts_per_L_sum_norm", y="month", orientation='h')

# Shared figure styling: font, fixed canvas size, margins and x-axis title
layout = dict(
    font=dict(family="Times New Roman, serif", size=12, color="#7f7f7f"),
    autosize=False,
    width=500,
    height=350,
    margin=dict(l=0, r=25, b=25, t=25),
    xaxis_title_text='Average sum of transcripts per L',
)
fig.update_layout(**layout)

fig.show()

# Save figure
fig.write_image("../../figures/taxonomic_analysis/average_sum_transcripts_per_L_per_month.svg", scale=2)

## Spatial variance in transcripts per L

In [19]:
# Summary statistics of transcripts per L for every month/station combination,
# restricted to rows with more than 1 transcript per L
month_order = ["July_2020", "August_2020", "September_2020",
               "November_2020", "December_2020", "January_2021",
               "February_2021", "April_2021", "May_2021",
               "June_2021", "July_2021"]
station_order = ["ZG02", "330", "780", "120", "130", "700"]

grouped_df = (
    transcripts_per_L[transcripts_per_L['Transcripts_per_L'] > 1]
    .groupby(['month', 'station'])
    .agg({'Transcripts_per_L': ['sum', 'median', 'mean', 'min', 'max']})
)
# Flatten the two-level column index: ('Transcripts_per_L', 'sum') -> 'Transcripts_per_L_sum'
grouped_df.columns = grouped_df.columns.map('_'.join)
grouped_df = grouped_df.reset_index()
grouped_df['month'] = pd.Categorical(grouped_df['month'], month_order)

# Grouped bar chart: one bar per station within each month
fig = px.bar(
    grouped_df,
    x='month',
    y='Transcripts_per_L_sum',
    color='station',
    barmode='group',
    category_orders={"station": station_order, "month": month_order},
)

fig.show()

In [16]:
stations = ['ZG02', '120', '330', '130', '780', '700']
month_order = ["July_2020", "August_2020", "September_2020",
               "November_2020", "December_2020", "January_2021",
               "February_2021", "April_2021", "May_2021",
               "June_2021", "July_2021"]
# Upper x-axis limit applied to every station figure, so all figures share one scale
x_axis_max = 110000000

# The threshold filter and the month/station aggregation do not depend on the loop
# variable, so they are computed once over the full table instead of once per station.
station_month_stats = (
    transcripts_per_L.loc[transcripts_per_L['Transcripts_per_L'] > 1, ['month', 'station', 'Transcripts_per_L']]
    .groupby(['month', 'station'])['Transcripts_per_L']
    .agg(['sum', 'median', 'mean', 'min', 'max'])
    .add_prefix('Transcripts_per_L_')  # -> Transcripts_per_L_sum, Transcripts_per_L_median, ...
    .reset_index()
)
station_month_stats['month'] = pd.Categorical(station_month_stats['month'], month_order)

for station in stations:
    # Prepare the data: monthly summary rows of the current station only
    grouped_df = station_month_stats[station_month_stats['station'] == station].reset_index(drop=True)

    # Plot (rows sorted in descending month order before plotting)
    fig = px.histogram(grouped_df.sort_values("month", ascending=False),
                    x="Transcripts_per_L_sum",
                    y="month",
                    category_orders={"station": ["780", "700", "330", "130", "ZG02", "120"],
                                     "month": month_order},
                    orientation ='h'
                    )

    fig.update_layout(
        font=dict(
            family="Times New Roman, serif",  # Set the font family to Times New Roman
            size=12,  # Set the font size
            color="#7f7f7f"  # Set the font color
        ),
        autosize=False,
        width=500,
        height=350,
        margin=dict( # Set the margins
            l=0,  # Left margin
            r=25,  # Right margin
            b=25,  # Bottom margin
            t=25  # Top margin
        ),
        xaxis_title_text='Sum of transcripts per L',
        title_text='Station {}'.format(station),
        # Set x limits
        xaxis_range=[0, x_axis_max]
    )

    fig.show()

    # Save figure
    fig.write_image("../../figures/taxonomic_analysis/TPL_per_month_{}.png".format(station), scale=2)
    fig.write_image("../../figures/taxonomic_analysis/TPL_per_month_{}.svg".format(station), scale=2)