# Fig 4c - MultiFunction Hit rate

In [None]:
# Imports 
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import Image

In [None]:
# Data
# Three input tables, read from CSV files under data/ (paths are relative to the
# notebook's working directory):
#   Uniform - supplies the 'Production' and 'Label' columns used in section [3]
#   F4F     - Fit4Function screens; supplies the per-assay columns used in section [2]
#   MuliFxn - MultiFunction library; supplies 'Label' and per-assay columns used in section [1]
Uniform, F4F, MuliFxn = map(pd.read_csv, (
    'data/modeling_library_production_fitness.csv',
    'data/fit4function_library_screens.csv',
    'data/multifunction_library.csv',
))

In [None]:
# Meta data
# Column names of the assays that a variant must pass to count as a hit
Assays = [
    'Production',
    'Liver',
    'HepG2_bind',
    'HepG2_tr',
    'THLE_bind',
    'THLE_tr',
]
# Percentile (0-100) of each Fit4Function assay that is used as its hit threshold
Threshold_percentage = 50
# Number of positive-control standard deviations subtracted from the control mean
# when deriving the per-assay threshold for the MultiFunction library
Margin_std = 2

In [None]:
#%% [1] Computing the hit rate for the MultiFunction designed library from validation assays
# A designed variant counts as a hit only if, in every assay, its log2 measurement is a
# finite number that is not below a threshold derived from the positive controls.

# Subsets
PositiveControl = MuliFxn[MuliFxn['Label'] == 'Positive Control']
MultiFxnDesigned = MuliFxn[MuliFxn['Label'] == 'MultiFunction']

# Positional indices (into MultiFxnDesigned) of variants that fail at least one assay
InxExclude = set()

for assay in Assays:
    # log2 maps zeros to -inf and NaN/negative inputs to NaN; both are handled below
    crnt_PositiveControl = np.log2(PositiveControl[assay].values)
    crnt_MultiFxnDesigned = np.log2(MultiFxnDesigned[assay].values)

    # Filtering out infinite values from crnt_PositiveControl (NaNs are skipped by the nan* functions)
    valid_indices = ~np.isinf(crnt_PositiveControl)
    valid_crnt_PositiveControl = crnt_PositiveControl[valid_indices]

    mean_PositiveControl = np.nanmean(valid_crnt_PositiveControl)
    std_PositiveControl = np.nanstd(valid_crnt_PositiveControl)
    # Threshold = mean - Margin_std * std, but never below the smallest observed control.
    # np.nanmin keeps that floor in effect when a control value is NaN; a plain .min()
    # would return NaN and max(x, nan) would silently drop the floor.
    th_PositiveControl = max(mean_PositiveControl - Margin_std * std_PositiveControl,
                             np.nanmin(valid_crnt_PositiveControl))

    # Exclude variants below the threshold, and those without a usable measurement
    # (+/-inf or NaN). NaN must be tested explicitly because `nan < threshold` is False.
    failed = (
        (crnt_MultiFxnDesigned < th_PositiveControl)
        | np.isinf(crnt_MultiFxnDesigned)
        | np.isnan(crnt_MultiFxnDesigned)
    )
    InxExclude.update(np.where(failed)[0])

# Calculate Hit_rate_MultiFxn: fraction of designed variants that passed every assay
Hit_rate_MultiFxn = (len(MultiFxnDesigned) - len(InxExclude)) / len(MultiFxnDesigned)
print(Hit_rate_MultiFxn)

In [None]:
#%% [2] Estimating the hit rate for Production-fit space from Fit4Function assays

# The 'Production' assay of the Fit4Function table is taken from its 'Production2' column
F4F['Production'] = F4F['Production2']

# One threshold per assay: the Threshold_percentage-th percentile of the measurements
# that are neither infinite nor NaN
Thresholds = [
    np.percentile(column[~(np.isinf(column) | np.isnan(column))], Threshold_percentage)
    for column in (F4F[assay].values for assay in Assays)
]

# Computing for Fit4Function
# Start from every row position and keep only the rows that exceed the threshold
# (with a usable measurement) in each assay in turn.
INX = set(range(len(F4F)))

for assay, threshold in zip(Assays, Thresholds):
    column = F4F[assay].values
    above_threshold = column > threshold
    is_usable = ~np.isinf(column) & ~np.isnan(column)
    passing_positions = np.where(above_threshold & is_usable)[0]

    # Narrow the surviving row positions to those that also pass this assay
    INX = INX.intersection(passing_positions)

# Fraction of Fit4Function rows that pass all assays
Hit_rate_F4F = len(INX) / len(F4F)
print(Hit_rate_F4F)

In [None]:
#%% [3] Estimating the hit rate for the uniform space from Fit4Function assays
# Production fitness distribution 
# Uncomment if you want to see the bimodal distributions
# X = np.log2(Uniform['Production'])
# X = X[~np.isinf(X)]
# plt.figure()
# plt.hist(X, bins=100) 
# plt.xlabel('Production Fitness (log2)')
# plt.ylabel('Frequency')
# plt.show()

# Processing and Calculations
# log2 production-fitness cutoff; presumably the boundary between the two modes of the
# bimodal distribution above (the name suggests a Gaussian-mixture fit) - TODO confirm source
GMM_midpoint = -2.5

# Work on a separate filtered frame instead of adding a column to `Uniform` and rebinding it:
# the raw table stays intact and re-running this cell no longer assigns into a filtered slice.
Uniform_designed = Uniform[Uniform['Label'] == 'Designed']
Production_log2 = np.log2(Uniform_designed['Production'])

# Designed variants whose log2 production fitness is above the cutoff and not infinite
valid_values = Production_log2[(Production_log2 > GMM_midpoint) & (~np.isinf(Production_log2))]
Production_fit_space = len(valid_values) / len(Uniform_designed)
#print(Production_fit_space)

# Scale the Fit4Function hit rate by the fraction of the uniform library that is production-fit
Hit_rate_Uniform = Hit_rate_F4F * Production_fit_space 
print(Hit_rate_Uniform)

In [None]:
# Summary table
# One-row frame holding the three hit rates (one column per library); read by the figure cell

df = pd.DataFrame({
    'Uniform': np.array([Hit_rate_Uniform]),
    'HighProdFit': [Hit_rate_F4F],
    'Designed': [Hit_rate_MultiFxn],
})


In [None]:
# Figure
# Horizontal bar chart comparing the three hit rates stored in `df`; the figure is saved
# as PNG and SVG under figures/ and the PNG is displayed as the cell output.

# Figure Configurations
# Shared colour for text, axes, ticks and patch edges (currently pure black)
_new_black = '#000'
sns.set_theme(style='ticks', font_scale=0.75, rc={
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial', 'DejaVu Sans'],
    'svg.fonttype': 'none',
    'text.usetex': False,
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'font.size': 9,
    'axes.labelsize': 9,
    'axes.titlesize': 9,
    'axes.labelpad': 2,
    'axes.linewidth': 0.5,
    'axes.titlepad': 4,
    'lines.linewidth': 0.5,
    'legend.fontsize': 9,
    'legend.title_fontsize': 9,
    'xtick.labelsize': 9,
    'ytick.labelsize': 9,
    'xtick.major.size': 2,
    'xtick.major.pad': 1,
    'xtick.major.width': 0.5,
    'ytick.major.size': 2,
    'ytick.major.pad': 1,
    'ytick.major.width': 0.5,
    'xtick.minor.size': 2,
    'xtick.minor.pad': 1,
    'xtick.minor.width': 0.5,
    'ytick.minor.size': 2,
    'ytick.minor.pad': 1,
    'ytick.minor.width': 0.5,

    # Avoid black unless necessary
    'text.color': _new_black,
    'patch.edgecolor': _new_black,
    'patch.force_edgecolor': False, # Seaborn turns on edgecolors for histograms by default and I don't like it
    'hatch.color': _new_black,
    'axes.edgecolor': _new_black,
    # 'axes.titlecolor': _new_black # should fallback to text.color
    'axes.labelcolor': _new_black,
    'xtick.color': _new_black,
    'ytick.color': _new_black

})


# Plot
fig = plt.figure(figsize=(2.3, 1.5), dpi=150)
gs = fig.add_gridspec(1, 1, left=0.2, right=0.95, top=0.9, bottom=0.25)
ax = fig.add_subplot(gs[0, 0])

# Single group of bars anchored at y = 0
y = np.arange(0, 1)

# The three bars are stacked downwards from y = 0, one bar_height apart
bar_height = 0.3
ax.barh(y - bar_height*0, df['Uniform'], height=bar_height, label='Control', color='#B3B3B3')
ax.barh(y - bar_height*1, df['HighProdFit'], height=bar_height, label='Fit4Fxn', color='#FF5E66')
ax.barh(y - bar_height*2, df['Designed'], height=bar_height, label='MultiFxn', color='#488ABA')

# Hit rates are fractions in [0, 1]; ticks every 0.2 are labelled as percentages
ax.set_xticks(np.arange(0, 1, 0.2))
ax.set_xticklabels([str(l) + '%' for l in np.arange(0, 100, 20)])
ax.tick_params(axis='x', length=2, pad=2.5)

# One y tick, centred on the middle bar
# NOTE(review): the tick is labelled 'Liver' while the x-axis label says 'All Functions' -
# confirm this label is intended
ax.set_yticks(y-bar_height*1)
ax.set_yticklabels(['Liver'])
# ax.tick_params(axis='y', length=0, pad=2, rotation=90)
ax.tick_params(axis='y', length=0, pad=4)
for ticklabel in ax.get_yticklabels():
    ticklabel.set_va('center')

ax.grid(axis='x', color='#DDD', linewidth=0.5)

# Labeling 
ax.set_xlabel('Hit Rate — All Functions', labelpad=4)

# The first two labels sit just right of their bar ends; the MultiFunction label is
# right-aligned just inside its bar and drawn in white
label_x_offset = 0.02
ax.text(df.at[0, 'Uniform'] + label_x_offset, 0, 'Uniform ({:.1%})'.format(df.at[0, 'Uniform']), 
        ha='left', va='center', color='#888', fontsize=9)
ax.text(df.at[0, 'HighProdFit'] + label_x_offset, 0-bar_height, 'Fit4Function ({:.1%})'.format(df.at[0, 'HighProdFit']), 
        ha='left', va='center', color='#FF5E66', fontsize=9)
ax.text(df.at[0, 'Designed'] - label_x_offset, 0-(bar_height*2)-0.01, 'MultiFunction ({:.1%})'.format(df.at[0, 'Designed']), 
        ha='right', va='center', color='#FFF', fontsize=9)

# Save and show 
filename = 'figures/fig4c_multifunction_hit_rates'
# Create the output directory if it is missing so savefig does not raise FileNotFoundError
Path(filename).parent.mkdir(parents=True, exist_ok=True)
fig.savefig('{}.png'.format(filename))
fig.savefig('{}_600dpi.svg'.format(filename), dpi = 600)
fig.savefig('{}_1200dpi.svg'.format(filename), dpi= 1200)

# Close this specific figure (not whichever figure happens to be current) so it is not
# rendered inline a second time; the saved PNG below is the cell output
plt.close(fig)

Image(filename + '.png')