### Imports

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path 
from IPython.display import display
from pandas import DataFrame, Series
from typing import Optional
# Passing None removes pandas' display limits, so displayed DataFrames are
# rendered with all of their rows and columns instead of being truncated.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None) 

### Functions

In [2]:
def transform_rfs(
    df: DataFrame, type_p_column: str, score: str, median_from: Optional[str] = None
) -> Series:
    """
    Rescale raw scores into relative fitness scores (RFS).

    The scores are shifted by the median of the nonsense ("non") variants and
    scaled by the absolute distance between the nonsense and the synonymous
    ("syn") medians:

        rfs = ((score - non_median) / abs(non_median - syn_median)) * 2 + 1

    Consequently the nonsense median is mapped to 1, and the synonymous median
    is mapped to -1 when it lies below the nonsense median (3 when it lies
    above it).

    Parameters
    ----------
    df : DataFrame
        DataFrame with unnormalized scores.
    type_p_column : str
        Name of the column holding the variant type; rows labelled "syn" and
        "non" are used to compute the two reference medians.
    score : str
        Name of the column holding the raw scores to transform.
    median_from : Optional[str], optional
        Name of an alternative column from which the reference medians are
        taken (e.g. when transforming confidence intervals). By default the
        medians are taken from the ``score`` column itself.

    Returns
    -------
    Series
        The transformed scores, indexed like ``df``.

    Notes
    -----
    No special handling is done for identical or missing medians: the
    division is carried out as is, so such inputs yield inf/NaN values.
    """
    # column the reference medians are computed from
    reference = score if median_from is None else median_from

    variant_type = df[type_p_column]
    medians = {
        label: df.loc[variant_type == label, reference].median()
        for label in ("syn", "non")
    }

    # distance between the two reference groups; sign is discarded on purpose
    span = abs(medians["non"] - medians["syn"])
    centred = df[score] - medians["non"]
    return centred / span * 2 + 1


### Settings

In [3]:
# input and output locations (relative to the working directory)
incoming = Path("incoming")
out = Path("out")
raw_count_file = incoming / "Exon5-8_read-counts.tsv"
out_file = out / "Exon5-8_RFS.tsv"

# create the output directory (and any missing parents) if it does not exist yet
out.mkdir(parents=True, exist_ok=True)

# the column names of the raw counts
raw_counts = [
    "read_count_dmso_1",
    "read_count_dmso_2",
    "read_count_dmso_3",
    "read_count_n3a_1",
    "read_count_n3a_2",
    "read_count_n3a_3",
    "read_count_donor",
]

# column names for relative abundance, i.e. the raw counts divided by the sum
# of counts; listed in the same order as `raw_counts`
relative_abundance = [
    "abundance_dmso_1",
    "abundance_dmso_2",
    "abundance_dmso_3",
    "abundance_n3a_1",
    "abundance_n3a_2",
    "abundance_n3a_3",
    "abundance_donor",
]

# column indicating the type of the variant, e.g. synonymous ("syn"), nonsense ("non"), ...
type_p_column = "type_p"

### Calculate RFS

In [4]:
# load the raw read counts; the first column of the file becomes the index
df_counts = pd.read_csv(raw_count_file, index_col=[0], sep="\t")
display(df_counts.head())

# replicate numbers and the per-replicate column names derived from them
replicates = range(1, 4)
es_columns = [f"es_{i}" for i in replicates]
loges_columns = [f"loges_{i}" for i in replicates]
rfs_columns = [f"rfs_{i}" for i in replicates]

# collects one processed DataFrame per library
processed_dataframes = []

for exon, df_exon in df_counts.groupby("library_id"):
    # Work on an explicit copy: adding columns to the sub-frame handed out by
    # groupby is chained assignment (SettingWithCopyWarning on pandas < 3) and
    # leaves it unclear whether `df_counts` is affected.
    df_exon = df_exon.copy()

    # normalize the data per exon: every raw count is divided by the column
    # total of this exon, where only rows with `duplicated == False`
    # contribute to the total (all rows are still normalized)
    count_totals = df_exon.loc[df_exon["duplicated"].eq(False), raw_counts].sum()
    # .values: assign by position, the target columns are named differently
    df_exon[relative_abundance] = (df_exon[raw_counts] / count_totals).values

    # calculate enrichment scores: n3a abundance relative to dmso abundance,
    # replicate by replicate
    for i in replicates:
        df_exon[f"es_{i}"] = df_exon[f"abundance_n3a_{i}"].div(df_exon[f"abundance_dmso_{i}"])

    # calculate logarithmic (base 2) enrichment scores
    df_exon[loges_columns] = np.log2(df_exon[es_columns]).values

    # calculate Relative Fitness Scores (RFS) for the replicates; the reference
    # medians are computed within this exon only
    for i in replicates:
        df_exon[f"rfs_{i}"] = transform_rfs(df_exon, type_p_column, f"loges_{i}")

    # add the median RFS value across the replicates of each variant
    df_exon["rfs_median"] = df_exon[rfs_columns].median(axis=1)

    # append the processed DataFrame to the list
    processed_dataframes.append(df_exon)

# combine the processed DataFrames and turn the index back into a regular column
combined_df = pd.concat(processed_dataframes).reset_index()

# display the combined DataFrame
display(combined_df.head())
print(combined_df.shape)

# to file
combined_df.to_csv(out_file, index=False, sep="\t")


Unnamed: 0_level_0,hg38_genomic,hg38_cDNA,hg38_protein,read_count_dmso_1,read_count_dmso_2,read_count_dmso_3,read_count_n3a_1,read_count_n3a_2,read_count_n3a_3,read_count_donor,library_id,type_p,location,type_g,full_sequence,duplicated,effect,codon,codon_ref,codon_alt,aa_ref,aa_alt
mut_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,NC_000017.11:g.7675248_7675250del,NM_000546.6:c.376-12_376-10del,NP_000537.3:p.?,842,614,624,8,58,237,3767,Ex5,,Intron,del,CTCTGTCTCCTTCTTCCTACAGTACTCCCCTGCCCTCAACAAGATG...,False,,,,,,
2,NC_000017.11:g.7675247_7675248del,NM_000546.6:c.376-10_376-9del,NP_000537.3:p.?,6506,2489,3260,1103,545,776,25047,Ex5,,Intron,del,CTCTGTCTCCTTCCTTCCTACAGTACTCCCCTGCCCTCAACAAGAT...,False,,,,,,
3,NC_000017.11:g.7675249del,NM_000546.6:c.376-12del,NP_000537.3:p.?,4726,2135,2142,747,164,920,17449,Ex5,,Intron,del,CTCTGTCTCCTTCTCTTCCTACAGTACTCCCCTGCCCTCAACAAGA...,False,,,,,,
4,NC_000017.11:g.7675248G>A,NM_000546.6:c.376-12C>T,NP_000537.3:p.?,4441,1576,2528,616,51,489,14982,Ex5,,Intron,sub,CTCTGTCTCCTTCTTCTTCCTACAGTACTCCCCTGCCCTCAACAAG...,False,,,,,,
5,NC_000017.11:g.7675248G>C,NM_000546.6:c.376-12C>G,NP_000537.3:p.?,1680,937,622,375,2,113,5006,Ex5,,Intron,sub,CTCTGTCTCCTTCGTCTTCCTACAGTACTCCCCTGCCCTCAACAAG...,False,,,,,,


Unnamed: 0,mut_ID,hg38_genomic,hg38_cDNA,hg38_protein,read_count_dmso_1,read_count_dmso_2,read_count_dmso_3,read_count_n3a_1,read_count_n3a_2,read_count_n3a_3,read_count_donor,library_id,type_p,location,type_g,full_sequence,duplicated,effect,codon,codon_ref,codon_alt,aa_ref,aa_alt,abundance_dmso_1,abundance_dmso_2,abundance_dmso_3,abundance_n3a_1,abundance_n3a_2,abundance_n3a_3,abundance_donor,es_1,es_2,es_3,loges_1,loges_2,loges_3,rfs_1,rfs_2,rfs_3,rfs_median
0,1,NC_000017.11:g.7675248_7675250del,NM_000546.6:c.376-12_376-10del,NP_000537.3:p.?,842,614,624,8,58,237,3767,Ex5,,Intron,del,CTCTGTCTCCTTCTTCCTACAGTACTCCCCTGCCCTCAACAAGATG...,False,,,,,,,5.5e-05,0.000107,7e-05,3.844101e-07,8.369896e-06,2.1e-05,0.000105,0.007012,0.078028,0.29718,-7.155859,-3.679855,-1.750592,-2.188773,-0.836737,-0.012253,-0.836737
1,2,NC_000017.11:g.7675247_7675248del,NM_000546.6:c.376-10_376-9del,NP_000537.3:p.?,6506,2489,3260,1103,545,776,25047,Ex5,,Intron,del,CTCTGTCTCCTTCCTTCCTACAGTACTCCCCTGCCCTCAACAAGAT...,False,,,,,,,0.000424,0.000435,0.000367,5.300055e-05,7.864816e-05,6.8e-05,0.000698,0.125128,0.180869,0.186251,-2.99852,-2.466979,-2.424676,-0.418412,-0.284316,-0.341278,-0.341278
2,3,NC_000017.11:g.7675249del,NM_000546.6:c.376-12del,NP_000537.3:p.?,4726,2135,2142,747,164,920,17449,Ex5,,Intron,del,CTCTGTCTCCTTCTCTTCCTACAGTACTCCCCTGCCCTCAACAAGA...,False,,,,,,,0.000308,0.000373,0.000241,3.58943e-05,2.36666e-05,8.1e-05,0.000487,0.11666,0.063451,0.336066,-3.099622,-3.97821,-1.573185,-0.461465,-0.972626,0.074341,-0.461465
3,4,NC_000017.11:g.7675248G>A,NM_000546.6:c.376-12C>T,NP_000537.3:p.?,4441,1576,2528,616,51,489,14982,Ex5,,Intron,sub,CTCTGTCTCCTTCTTCTTCCTACAGTACTCCCCTGCCCTCAACAAG...,False,,,,,,,0.000289,0.000275,0.000285,2.959958e-05,7.359736e-06,4.3e-05,0.000418,0.102375,0.026731,0.151352,-3.288065,-5.225368,-2.724023,-0.541712,-1.540661,-0.487392,-0.541712
4,5,NC_000017.11:g.7675248G>C,NM_000546.6:c.376-12C>G,NP_000537.3:p.?,1680,937,622,375,2,113,5006,Ex5,,Intron,sub,CTCTGTCTCCTTCGTCTTCCTACAGTACTCCCCTGCCCTCAACAAG...,False,,,,,,,0.000109,0.000164,7e-05,1.801922e-05,2.886171e-07,1e-05,0.00014,0.164746,0.001763,0.142149,-2.601681,-9.147647,-2.814525,-0.249422,-3.327116,-0.531566,-0.531566


(9225, 40)
