### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from pandas import DataFrame, Series
from typing import Optional

### Functions

In [2]:
def transform_rfs(
    df: DataFrame, type_p_column: str, score: str, median_from: Optional[str] = None
) -> Series:
    """
    transform_rfs calculates relative fitness scores based on a set of synonmous
    and non-synonymous variants.

    Parameters
    ----------
    df : DataFrame
        DataFrame with unnormalized scores.
    type_p_column : str
        A column indicating the type of the variant, must contain "syn" and "non".
    score : str
        The column containing the raw scores.
    median_from : Optional[str], optional
        If needed, the medians can be calculated from another score column,
        e.g. for transforming confidence intervals, by default the score column
        is used.

    Returns
    -------
    Series
        A Series containing transformed scores.
    """
    median_score = median_from
    if median_from is None:
        median_score = score
    syn_median = df.loc[df[type_p_column] == "syn", median_score].median()
    non_median = df.loc[df[type_p_column] == "non", median_score].median()
    difference = abs(non_median - syn_median)
    return (((df[score] - non_median) / difference) * 2) + 1


### Settings

In [3]:
# Load the CSV file with Enrich2 scores
incoming = Path("enrich2_files")
out_folder = Path("out")
out_file = out_folder / "Exon5-8_Enrich2_RFS.tsv"
enrich2_result_file = incoming / "Exon5678_identifier_scores.tsv"   

exon_column = "library_id"
type_column = "type_p"
id_column = "mut_ID"


In [4]:
# read in the enrich2 data and clean

enrich2_data = pd.read_csv(enrich2_result_file, sep="\t")
print(enrich2_data.shape)
enrich2_data = enrich2_data.set_index(id_column)

# Drop rows with NaN values in 'rfs_median' and 'score' columns if there are any
enrich2_data_cleaned = enrich2_data.dropna(subset=['rfs_median', 'score'])
enrich2_data_cleaned = enrich2_data_cleaned.rename(columns={"score": "score_enrich2", "SE": "SE_enrich2"})
# Create new columns 'score_CIup' and 'score_CIdown'
enrich2_data_cleaned['score_CIup'] = enrich2_data_cleaned['score_enrich2'] + 1.96 * enrich2_data_cleaned['SE_enrich2']
enrich2_data_cleaned['score_CIdown'] = enrich2_data_cleaned['score_enrich2'] - 1.96 * enrich2_data_cleaned['SE_enrich2']
print(enrich2_data_cleaned.shape)
display(enrich2_data.head())

(9225, 49)
(9225, 50)


Unnamed: 0_level_0,hg38_genomic,hg38_cDNA,hg38_protein,read_count_dmso_1,read_count_dmso_2,read_count_dmso_3,read_count_n3a_1,read_count_n3a_2,read_count_n3a_3,read_count_donor,...,rfs_median,SE,epsilon,score,Rep1_SE,Rep1_score,Rep2_SE,Rep2_score,Rep3_SE,Rep3_score
mut_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,NC_000017.11:g.7675248_7675250del,NM_000546.6:c.376-12_376-10del,NP_000537.3:p.?,842,614,624,8,58,237,3767,...,-0.836737,1.019231,0.0,-3.147085,0.344723,-5.120048,0.136826,-2.758029,0.076236,-1.631491
2,NC_000017.11:g.7675247_7675248del,NM_000546.6:c.376-10_376-9del,NP_000537.3:p.?,6506,2489,3260,1103,545,776,25047,...,-0.341278,0.107962,0.0,-2.109415,0.032557,-2.298055,0.047275,-1.924381,0.039932,-2.099547
3,NC_000017.11:g.7675249del,NM_000546.6:c.376-12del,NP_000537.3:p.?,4726,2135,2142,747,164,920,17449,...,-0.461465,0.423292,0.0,-2.280297,0.039362,-2.367947,0.080916,-2.969792,0.03941,-1.509519
4,NC_000017.11:g.7675248G>A,NM_000546.6:c.376-12C>T,NP_000537.3:p.?,4441,1576,2528,616,51,489,14982,...,-0.541712,0.474068,2.220446e-16,-2.869279,0.04298,-2.49843,0.141605,-3.827627,0.049381,-2.306705
5,NC_000017.11:g.7675248G>C,NM_000546.6:c.376-12C>G,NP_000537.3:p.?,1680,937,622,375,2,113,5006,...,-0.531566,1.349988,8.881784e-16,-3.509283,0.057081,-2.022329,0.633298,-6.333172,0.102064,-2.366648


In [5]:
# Initialize an empty list to store the processed DataFrames
processed_dataframes = []

for exon, df_exon in enrich2_data_cleaned.groupby(exon_column):

    # Normalize the data per exon
    for score_column, median_from in [("score_enrich2", None), ("score_CIup", "score_enrich2"), ("score_CIdown", "score_enrich2")]:
        transformed_score_column = f"transformed_{score_column}"
    
        # Calculate RFS for the enrich score and apply transformation to confidence intervals
        df_exon[transformed_score_column] = transform_rfs(df_exon, type_column, score_column, median_from)
    
    # append the processed DataFrame to the list
    processed_dataframes.append(df_exon)

# combine the processed DataFrames
df_out = pd.concat(processed_dataframes)

# sort the frame by mut_ID
df_out = df_out.sort_values(id_column)

# recalculate the SE from the transformed confidence intervals
df_out["transformed_SE_enrich2"] = (df_out["transformed_score_CIup"] - df_out["transformed_score_enrich2"]) / 1.96

# reset index and to file
df_out = df_out.reset_index()
display(df_out.head())
df_out.to_csv(out_file, sep="\t", index=False)


Unnamed: 0,mut_ID,hg38_genomic,hg38_cDNA,hg38_protein,read_count_dmso_1,read_count_dmso_2,read_count_dmso_3,read_count_n3a_1,read_count_n3a_2,read_count_n3a_3,...,Rep2_SE,Rep2_score,Rep3_SE,Rep3_score,score_CIup,score_CIdown,transformed_score_enrich2,transformed_score_CIup,transformed_score_CIdown,transformed_SE_enrich2
0,1,NC_000017.11:g.7675248_7675250del,NM_000546.6:c.376-12_376-10del,NP_000537.3:p.?,842,614,624,8,58,237,...,0.136826,-2.758029,0.076236,-1.631491,-1.149392,-5.144778,-0.970377,0.298027,-2.238781,0.647145
1,2,NC_000017.11:g.7675247_7675248del,NM_000546.6:c.376-10_376-9del,NP_000537.3:p.?,6506,2489,3260,1103,545,776,...,0.047275,-1.924381,0.039932,-2.099547,-1.89781,-2.32102,-0.311525,-0.177169,-0.44588,0.068549
2,3,NC_000017.11:g.7675249del,NM_000546.6:c.376-12del,NP_000537.3:p.?,4726,2135,2142,747,164,920,...,0.080916,-2.969792,0.03941,-1.509519,-1.450645,-3.10995,-0.420024,0.106751,-0.946799,0.268763
3,4,NC_000017.11:g.7675248G>A,NM_000546.6:c.376-12C>T,NP_000537.3:p.?,4441,1576,2528,616,51,489,...,0.141605,-3.827627,0.049381,-2.306705,-1.940106,-3.798452,-0.793988,-0.204025,-1.383952,0.301002
4,5,NC_000017.11:g.7675248G>C,NM_000546.6:c.376-12C>G,NP_000537.3:p.?,1680,937,622,375,2,113,...,0.633298,-6.333172,0.102064,-2.366648,-0.863307,-6.155259,-1.200349,0.479672,-2.880369,0.857153
