In [None]:
from collections import Counter
from glob import glob
import os
from pathlib import Path

from loguru import logger
import numpy as np
import pandas as pd
import plotly.express as px
import toml

import single_agent_screens as sas

In [None]:
from IPython.display import HTML, display

# Inject CSS that sets the notebook's `.container` element to 100% width.
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

# Read Config

In [None]:
# Location of the TOML configuration file, relative to the working directory.
config_file_path = "sas_config.toml"
# TOML documents are UTF-8 by specification, so pass the encoding explicitly
# instead of relying on the platform default (which can mis-decode non-ASCII
# values, e.g. on Windows).
with open(config_file_path, "r", encoding="utf-8") as fp:
    config = toml.load(fp)

# Read and Analyze Single Agent Screens

In [None]:
# `sas.main(config)` returns four objects, unpacked here in order; each one is
# examined in its own section below.
df_drugs, df_missing, df_all, df_report = sas.main(config)

# Show All Drugs

One row per compound (i.e. drug) tested. 

* NCGC SID: unique compound id
* name: name of compound
* target: target of compound

In [None]:
# Display the per-compound table returned by sas.main.
df_drugs

# Examine Missingness in Response Curves

Some compounds / curves do not have fit parameters (R2, AC50, LAC50) or targets. 

In [None]:
# Grouped bars: fraction missing per cell line, one bar color per variable.
px.bar(
    df_missing,
    x="cell_line",
    y="frac_missing",
    color="variable",
    barmode="group",
)

# Examine All Curves (`df_all`)

One row per (cell line, compound) tuple. There are 1,912 compounds. 

In [None]:
# Display the full (cell line, compound) table returned by sas.main.
df_all

In [None]:
# Number of rows in df_all for each distinct value of "Cell line".
cell_line_labels = df_all["Cell line"]
cell_line_labels.value_counts()

In [None]:
# 1,912 compounds times 9 -- presumably the number of cell lines (TODO confirm);
# compare the product with the row count of df_all.
1912 * 9

In [None]:
# All rows for a single compound (GDC-0068), ordered by cell line
is_gdc_0068 = df_all["name"] == "GDC-0068"
df_all[is_gdc_0068].sort_values(by="Cell line")

In [None]:
# All rows for a single cell line (ipNF02.8), ordered by compound name
is_ipnf02_8 = df_all["Cell line"] == "ipNF02.8"
df_all[is_ipnf02_8].sort_values(by="name")

# Examine Distributions and Response Curves

### Example Response Curves

In [None]:
# Column names of the 11 points of each curve: C0..C10 are plotted on the
# (log-scaled) x axis and DATA0..DATA10 on the y axis in the cells below.
point_ids = range(11)
xcols = ["C{}".format(k) for k in point_ids]
ycols = ["DATA{}".format(k) for k in point_ids]

In [None]:
# Store each row's positional index (0..n-1) in an "irow" column so that rows
# found by filtering can be looked up again with .iloc.
n_rows = len(df_all)
df_all["irow"] = range(n_rows)

In [None]:
# Rows whose R2 value is below 0.4.
low_r2 = df_all["R2"] < 0.4
df_all.loc[low_r2]

In [None]:
# Print R2 and plot the curve (log-scaled x) for the row at position 47.
ii = 47
curve = df_all.iloc[ii]
print(curve["R2"])
px.line(x=curve[xcols], y=curve[ycols], log_x=True)

In [None]:
# Print R2 and plot the curve (log-scaled x) for the row at position 0.
ii = 0
curve = df_all.iloc[ii]
print(curve["R2"])
px.line(x=curve[xcols], y=curve[ycols], log_x=True)

In [None]:
# Print R2 and plot the curve (log-scaled x) for the row at position 1001.
ii = 1001
curve = df_all.iloc[ii]
print(curve["R2"])
px.line(x=curve[xcols], y=curve[ycols], log_x=True)

In [None]:
# Print R2 and plot the curve (log-scaled x) for the row at position 1005.
ii = 1005
curve = df_all.iloc[ii]
print(curve["R2"])
px.line(x=curve[xcols], y=curve[ycols], log_x=True)

In [None]:
# Print R2 and plot the curve (log-scaled x) for the row at position 10.
ii = 10
curve = df_all.iloc[ii]
print(curve["R2"])
px.line(x=curve[xcols], y=curve[ycols], log_x=True)

### R2 Distribution

In [None]:
# Histogram of the R2 column over all rows of df_all.
r2_values = df_all["R2"]
px.histogram(r2_values)

### LAC50 Distribution

In [None]:
# Histogram of the LAC50 column over all rows of df_all.
lac50_values = df_all["LAC50"]
px.histogram(lac50_values)

# Resistance vs Sensitivity

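Note that resistance and sensitivity are reciprocals of each other:

resistance = AC50_probe_line / AC50_norm_line

sensitivity = AC50_norm_line / AC50_probe_line = 1 / resistance

so that

log(resistance) = -log(sensitivity)

Because the two quantities carry the same information, the rest of the analysis looks only at log(resistance).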


In [None]:
# Sanity check: for every normalizing cell line, resis_log and sensi_log must
# cancel (log(resistance) = -log(sensitivity)) to within TOL.
TOL = 1e-9
for norm_line in ["ipnNF95.11C", "HFF-1"]:
    resis_log = df_all[f"resis_log_{norm_line}"]
    sensi_log = df_all[f"sensi_log_{norm_line}"]
    # abs() so that deviations of either sign are caught; a plain `> TOL`
    # would let any negative residual pass unnoticed. Rows where either value
    # is NaN compare False here and are therefore skipped.
    residual = (resis_log + sensi_log).abs()
    n_bad = int((residual > TOL).sum())
    if n_bad:
        print(f"ouch: {n_bad} row(s) violate resis_log + sensi_log == 0 for {norm_line}")

In [None]:
# Histogram of log resistance with HFF-1 as the normalizing line.
resis_log_hff1 = df_all["resis_log_HFF-1"]
px.histogram(resis_log_hff1)

In [None]:
# Histogram of log resistance with ipnNF95.11C as the normalizing line.
resis_log_ipn = df_all["resis_log_ipnNF95.11C"]
px.histogram(resis_log_ipn)

# Examine Report

One row per (compound, quantity, norm_cell_line) where quantity is in ("resis", "sensi")

In [None]:
# Display the report table returned by sas.main.
df_report

In [None]:
# Report rows for GDC-0068, ordered by normalizing cell line.
# In log space the "resis" values are the negatives of the "sensi" values.
is_gdc_0068 = df_report["name"].eq("GDC-0068")
df_report.loc[is_gdc_0068].sort_values(by="norm_cell_line")

In [None]:
# Narrow the report to one (quantity, normalizing line, compound) combination.
is_resis = df_report["quantity"] == "resis"
is_ipn_norm = df_report["norm_cell_line"] == "ipnNF95.11C"
is_gdc_0068 = df_report["name"] == "GDC-0068"
df = df_report[is_resis & is_ipn_norm & is_gdc_0068]

In [None]:
# Display the filtered report rows selected in the previous cell.
df

# Report Plots

In [None]:
# Identifier columns, plus every report column whose name starts with "log"
# (the value columns that get plotted).
id_cols = ["NCGC SID", "name", "target", "norm_cell_line"]
val_cols = list(filter(lambda name: name.startswith("log"), df_report.columns))
cols = [*id_cols, *val_cols]

In [None]:
# Display the selected column names (identifier columns first, then value columns).
cols

In [None]:
# Keep only the "resis" rows of the report, restricted to the plotting columns.
is_resis = df_report["quantity"] == "resis"
df_plt = df_report.loc[is_resis, cols]

In [None]:
# Display the plotting table while it is still in wide form (before melt).
df_plt

In [None]:
# Wide -> long: one row per (identifier columns, value column), with the
# former column name in "variable" and its entry in "value".
df_plt = pd.melt(df_plt, id_vars=id_cols, value_vars=val_cols)

In [None]:
# Display the plotting table after melting to long form.
df_plt

In [None]:
# Replace the generic melt column names with the labels shown on the plots.
plot_labels = {
    "norm_cell_line": "norm cell line",
    "variable": "probe cell line",
    "value": "Log10 (AC50_probe/AC50_norm)",
}
df_plt = df_plt.rename(columns=plot_labels)

In [None]:
# Display the plotting table after the columns were renamed to plot labels.
df_plt

In [None]:
# Spot-check: all plotting rows for the compound named "SAG".
is_sag = df_plt["name"].eq("SAG")
df_plt.loc[is_sag]

In [None]:
# Axis label "<name> | <target>"; the target is cast to str before concatenating.
target_label = df_plt["target"].astype(str)
df_plt["drug"] = df_plt["name"] + " | " + target_label

In [None]:
# Mean log ratio per (compound id, normalizing line), broadcast back onto every row.
ratio_col = "Log10 (AC50_probe/AC50_norm)"
per_drug = df_plt.groupby(["NCGC SID", "norm cell line"])[ratio_col]
df_plt["drug mean"] = per_drug.transform("mean")

In [None]:
# Mean log ratio per (target, normalizing line), broadcast back onto every row.
ratio_col = "Log10 (AC50_probe/AC50_norm)"
per_target = df_plt.groupby(["target", "norm cell line"])[ratio_col]
df_plt["target mean"] = per_target.transform("mean")

In [None]:
# Order rows by normalizing line first, then by the per-target mean.
sort_keys = ["norm cell line", "target mean"]
df_plt = df_plt.sort_values(by=sort_keys)

In [None]:
# Spot-check the "SAG" rows again, now with the label and mean columns added.
is_sag = df_plt["name"].eq("SAG")
df_plt.loc[is_sag]

In [None]:
# Display the final plotting table (labels and means added, rows sorted).
df_plt

In [None]:
# One box per normalizing cell line over all log ratios, with every point drawn.
fig = px.box(
    df_plt,
    x="Log10 (AC50_probe/AC50_norm)",
    color="norm cell line",
    points="all",
    hover_data=["probe cell line", "drug"],
)

# Dashed guides at +/- log10 of the minimum AC50 ratio.
ac50_ratio_min = 1.5
x = np.log10(ac50_ratio_min)
for guide_x in (x, -x):
    fig = fig.add_vline(x=guide_x, line_dash="dash", line_color="black", line_width=1)

In [None]:
# Show the figure, then write it to the working directory as an HTML page and
# as a PNG image.
# NOTE(review): write_image presumably needs plotly's static-image backend
# (e.g. kaleido) to be installed -- confirm in the target environment.
fig.show()
fig.write_html("hts_box_no_groups.html")
fig.write_image("hts_box_no_groups.png")

In [None]:
# One row of boxes per target, colored by normalizing cell line.
target_count = df_plt["target"].nunique()

fig = px.box(
    df_plt,
    x="Log10 (AC50_probe/AC50_norm)",
    y="target",
    color="norm cell line",
    points="all",
    hover_data=["probe cell line", "drug"],
    width=1000,
    height=10000,
)

fig.update_layout(yaxis_range=[-1, target_count])

# Horizontal rules halfway between adjacent target rows.
for row_idx in range(target_count - 1):
    fig = fig.add_hline(y=row_idx + 0.5, line_width=1)

# Dashed guides at +/- log10 of the minimum AC50 ratio.
ac50_ratio_min = 1.5
x = np.log10(ac50_ratio_min)
for guide_x in (x, -x):
    fig = fig.add_vline(x=guide_x, line_dash="dash", line_color="black", line_width=1)

fig = fig.update_yaxes(categoryorder="category descending")

In [None]:
# Show the figure, then write it to the working directory as an HTML page and
# as a PNG image.
# NOTE(review): write_image presumably needs plotly's static-image backend
# (e.g. kaleido) to be installed -- confirm in the target environment.
fig.show()
fig.write_html("hts_box_target_groups.html")
fig.write_image("hts_box_target_groups.png")

In [None]:
# Distinct target values, in order of first appearance in df_plt.
pd.unique(df_plt["target"])

In [None]:
# One figure per target: a row of boxes for every drug with that target,
# colored by normalizing cell line.

# Position of the dashed guides (+/- log10 of the minimum AC50 ratio). It does
# not depend on the target, so compute it once; `ac50_ratio_min` is set in the
# box-plot cells above.
x = np.log10(ac50_ratio_min)

for ii, target in enumerate(df_plt["target"].unique()):
    # `NaN == NaN` is False, so an equality mask selects nothing for compounds
    # without a target and their figure would come out empty. Use isna() there.
    if pd.isna(target):
        target_mask = df_plt["target"].isna()
    else:
        target_mask = df_plt["target"] == target
    dfp = df_plt[target_mask]
    n_drugs = dfp["drug"].nunique()

    fig = px.box(
        dfp,
        x="Log10 (AC50_probe/AC50_norm)",
        y="drug",
        hover_data=["probe cell line", "drug mean", "target mean"],
        color="norm cell line",
        points="all",
        # 80 px per drug row, but never shorter than 250 px.
        height=max(250, n_drugs * 80),
        width=1000,
    )
    # Horizontal rules halfway between adjacent drug rows.
    for y in range(n_drugs - 1):
        fig = fig.add_hline(y=y + 0.5, line_width=1)

    # Same x range on every figure so the targets can be compared visually.
    fig.update_layout(xaxis_range=[-4, 4])

    fig = fig.add_vline(x=x, line_dash="dash", line_color='black')
    fig = fig.add_vline(x=-x, line_dash="dash", line_color='black')
    fig.show()

    # Stop after the figure with index 81 (82 figures) to bound the output size.
    if ii > 80:
        break

In [None]:
# Strip plot of every log ratio, one row per drug, colored by normalizing line.
# The column names must be the ones df_plt has at this point: "value",
# "variable" and "norm_cell_line" were renamed earlier, and the per-drug mean
# is stored as "drug mean" (no "mean" column exists).
fig = px.strip(
    df_plt,
    x="Log10 (AC50_probe/AC50_norm)",
    y="drug",
    hover_data=["probe cell line", "drug mean"],
    color="norm cell line",
    height=35000,
    width=1000,
)

# Indices of the separator lines; each one is drawn halfway between two drug rows.
ys = list(range(df_plt['drug'].nunique() - 1))
for y in ys:
    fig = fig.add_hline(y=y + 0.5)

In [None]:
# Inspect the integer indices used for the separator lines (each line is drawn
# at index + 0.5).
print(ys)

In [None]:
# Show the current figure (the strip plot built above).
fig.show()

In [None]:
# Write the current figure to an HTML file in the working directory.
fig.write_html("hts_strip_all.html")

In [None]:
# IPython shell escape: list the working directory (e.g. to check the exported
# files). Needs an `ls` command on the host, so it is not portable to every OS.
!ls