Using the reduction filters with 2 log unit difference

In [1]:
import pandas as pd
import numpy as np
import os
import ast

import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Absolute path of the kernel's current working directory.
root = os.path.abspath(os.curdir)
root

In [3]:
# Load the two reduced intrinsic-solubility tables (one file per pH, per the file names).
ph2_intrinsic, ph7_intrinsic = (
    pd.read_csv(csv_name)
    for csv_name in ('reduced_ph2_intrinsic_sol.csv', 'reduced_ph7_intrinsic_sol.csv')
)

In [6]:
def cal_str_to_list(df):
    """Parse the ``basic_pka`` and ``acidic_pka`` columns from strings into Python objects.

    Each string cell is evaluated with ``ast.literal_eval`` (e.g. ``'[8.95]'``
    becomes ``[8.95]``). Cells that are not strings are passed through
    unchanged, so calling this on an already-parsed frame (for example when the
    notebook cell is re-run) is a no-op instead of an error.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``basic_pka`` and ``acidic_pka`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the two columns parsed; ``df`` itself is not modified.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a string cell is not a valid
        Python literal.
    """
    def _parse(cell):
        # Only raw strings need parsing; lists from an earlier call are kept as-is.
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    return df.assign(
        basic_pka=df['basic_pka'].map(_parse),
        acidic_pka=df['acidic_pka'].map(_parse),
    )

# Convert the pKa columns of both tables from strings to lists.
ph2_intrinsic, ph7_intrinsic = map(cal_str_to_list, (ph2_intrinsic, ph7_intrinsic))

In [7]:
def add_from(df, name):
    """Return ``df`` with a constant ``'from'`` column recording its source table.

    The column is added through ``DataFrame.assign`` so the frame passed in is
    never written to. In this notebook the argument is always a filtered slice
    of another frame; assigning a column to such a slice in place triggers
    pandas' SettingWithCopyWarning.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to tag.
    name : str
        Value stored in every row of the ``'from'`` column (``'ph2'`` or
        ``'ph7'`` in this notebook).

    Returns
    -------
    pandas.DataFrame
        A new frame with the ``'from'`` column appended, or overwritten if it
        already exists.
    """
    # 'from' is a Python keyword, so it has to be passed via dict unpacking.
    return df.assign(**{'from': name})

In [8]:
# Acids are taken from the pH 2 table and bases from the pH 7 table; neutral
# compounds are kept from both. Each subset is tagged with the table it came from.
ph2_category = ph2_intrinsic['category']
ph7_category = ph7_intrinsic['category']

acid = add_from(ph2_intrinsic[ph2_category == 'acid'], 'ph2')
base = add_from(ph7_intrinsic[ph7_category == 'base'], 'ph7')
neutral_from_ph2 = add_from(ph2_intrinsic[ph2_category == 'neutral'], 'ph2')
neutral_from_ph7 = add_from(ph7_intrinsic[ph7_category == 'neutral'], 'ph7')

In [None]:
# Acids from the pH 7 table: keep only rows whose lowest acidic pKa is at least 9.
# NOTE(review): presumably these acids are un-ionised at pH 7 - confirm the rationale.
ph7_acids = ph7_intrinsic[ph7_intrinsic['category'] == 'acid']
lowest_acidic_pka_ok = [min(pka_values) >= 9 for pka_values in ph7_acids['acidic_pka']]
acid_cal_ph7 = add_from(ph7_acids[lowest_acidic_pka_ok], 'ph7')

# Bases from the pH 2 table: keep only rows whose highest basic pKa is at most 0.
ph2_bases = ph2_intrinsic[ph2_intrinsic['category'] == 'base']
highest_basic_pka_ok = [max(pka_values) <= 0 for pka_values in ph2_bases['basic_pka']]
base_cal_ph2 = add_from(ph2_bases[highest_basic_pka_ok], 'ph2')
base_cal_ph2

In [None]:
# Amphoteric compounds are pulled from both the pH 2 and the pH 7 tables,
# each tagged with its source table.
amphoteric_from_ph2 = add_from(
    ph2_intrinsic[ph2_intrinsic['category'] == 'amphoteric'], 'ph2'
)
amphoteric_from_ph7 = add_from(
    ph7_intrinsic[ph7_intrinsic['category'] == 'amphoteric'], 'ph7'
)

#Selection criteria
def cal_min_acidic_max_basic_pka(df):
    """Add the extreme pKa values of each row and the gap between them.

    Three columns are added:

    * ``min_acidic_pka`` - smallest value in the row's ``acidic_pka`` list
    * ``max_basic_pka``  - largest value in the row's ``basic_pka`` list
    * ``diff``           - ``min_acidic_pka - max_basic_pka``

    The columns are added with ``DataFrame.assign`` so the frame passed in
    (a filtered slice in this notebook) is not written to.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``acidic_pka`` and ``basic_pka`` cells are non-empty lists.

    Returns
    -------
    pandas.DataFrame
        A new frame with the three columns added.

    Raises
    ------
    ValueError
        If any ``acidic_pka`` or ``basic_pka`` list is empty (``min``/``max``
        of an empty sequence).
    """
    min_acidic = df['acidic_pka'].map(min)
    max_basic = df['basic_pka'].map(max)
    return df.assign(
        min_acidic_pka=min_acidic,
        max_basic_pka=max_basic,
        diff=min_acidic - max_basic,
    )

# Attach min acidic pKa, max basic pKa and their difference to both amphoteric subsets.
amphoteric_from_ph2, amphoteric_from_ph7 = (
    cal_min_acidic_max_basic_pka(subset)
    for subset in (amphoteric_from_ph2, amphoteric_from_ph7)
)

In [None]:

def _filter_amphoteric_and_pick_ph(amphoteric, min_pka_gap=2):
    """Filter one amphoteric subset by pKa gap and label each row with the closer pH.

    Expects the ``diff``, ``min_acidic_pka`` and ``max_basic_pka`` columns
    produced by ``cal_min_acidic_max_basic_pka``.

    Steps:
    1. Keep rows with ``diff > min_pka_gap`` (the printed counts label this as
       removing zwitterions).
    2. Add ``mid_point``, the mean of ``max_basic_pka`` and ``min_acidic_pka``.
    3. Add ``cal_with``: ``'ph2'`` when ``mid_point`` is strictly closer to 2
       than to 7, otherwise ``'ph7'`` (a tie goes to ``'ph7'``).

    Row counts before/after filtering and the ``cal_with`` distribution are
    printed. Returns a new frame; the input is not modified.
    """
    print('with zwitter ion', len(amphoteric))
    # Explicit copy: the columns below are added to an independent frame rather
    # than to a slice, which avoids SettingWithCopyWarning.
    kept = amphoteric[amphoteric['diff'] > min_pka_gap].copy()
    print('without zwitter ion', len(kept))
    kept['mid_point'] = (kept['max_basic_pka'] + kept['min_acidic_pka']) / 2
    closer_to_ph2 = (kept['mid_point'] - 2).abs() < (kept['mid_point'] - 7).abs()
    kept['cal_with'] = np.where(closer_to_ph2, 'ph2', 'ph7')
    print(kept['cal_with'].value_counts())
    return kept


# pH 7 table: keep the amphoteric rows whose mid-point is closer to pH 7.
amphoteric_from_ph7 = _filter_amphoteric_and_pick_ph(amphoteric_from_ph7)
ph7_amphoteric_cal_ph7_s0 = amphoteric_from_ph7[amphoteric_from_ph7['cal_with'] == 'ph7']

# pH 2 table: keep the amphoteric rows whose mid-point is closer to pH 2.
amphoteric_from_ph2 = _filter_amphoteric_and_pick_ph(amphoteric_from_ph2)
ph2_amphoteric_cal_ph2_s0 = amphoteric_from_ph2[amphoteric_from_ph2['cal_with'] == 'ph2']

# Combine both selections and drop the helper columns used only for the selection.
df = pd.concat([ph7_amphoteric_cal_ph7_s0, ph2_amphoteric_cal_ph2_s0], axis=0)
df = df.drop(columns=['diff', 'mid_point', 'min_acidic_pka', 'max_basic_pka', 'cal_with'])

In [None]:
# Stack every selected subset into one table with a fresh 0..n-1 index.
# NOTE(review): ph7_amphoteric_cal_ph2_s0 and ph2_amphoteric_cal_ph7_s0 are not
# assigned in any cell visible here; unless they are defined elsewhere this cell
# raises NameError on a fresh kernel. Confirm where they are meant to come from.
frames_to_stack = [
    df,
    acid,
    base,
    neutral_from_ph2,
    neutral_from_ph7,
    acid_cal_ph7,
    base_cal_ph2,
    ph7_amphoteric_cal_ph2_s0,
    ph2_amphoteric_cal_ph7_s0,
]
df_full = pd.concat(frames_to_stack, axis=0).reset_index(drop=True)

In [37]:
# Split df_full by whether a row's Identifier occurs once or more than once.
has_repeated_id = df_full.duplicated(subset=['Identifier'], keep=False)
no_dup = df_full[~has_repeated_id]
df_dup = df_full[has_repeated_id].sort_values(by=['Identifier'])
id_groups = df_dup.groupby('Identifier')
# Author's note (2025-01-15): this filter is effectively a no-op - a group always
# has at least one distinct category, so the result is the same as df_dup.
id_diff_category = id_groups.filter(lambda rows: rows['category'].nunique() >= 1)
id_diff_category

In [44]:
def calculate_S0_difference(group):
    """Absolute S0 difference between the two rows of one Identifier group.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows sharing one ``Identifier``; must have ``Identifier`` and ``S0`` columns.

    Returns
    -------
    pandas.Series or None
        For a group of exactly two rows, a Series with ``Identifier`` (taken
        from the first row) and ``S0_diff`` (``|S0[0] - S0[1]|``). Any other
        group size is not handled and yields ``None``.
    """
    if len(group) != 2:
        # Only pairs are supported; other group sizes are left to the caller.
        return None

    s0_values = group['S0']
    identifier = group['Identifier'].iloc[0]
    gap = abs(s0_values.iloc[0] - s0_values.iloc[1])
    return pd.Series({'Identifier': identifier, 'S0_diff': gap})


In [None]:
# Compute the S0 difference for every duplicated Identifier.
# The groups are iterated explicitly instead of using groupby.apply:
# calculate_S0_difference reads the 'Identifier' column of each group, and
# groupby.apply passing the grouping column to the function is deprecated in
# recent pandas. Iterating the GroupBy always yields the full rows.
# Groups for which calculate_S0_difference returns None (not exactly two rows)
# are left out; the result has one row per evaluated Identifier and a plain
# 0..n-1 index.
per_identifier = (
    calculate_S0_difference(rows)
    for _identifier, rows in id_diff_category.groupby('Identifier')
)
S0_differences = pd.DataFrame(
    [result for result in per_identifier if result is not None],
    columns=['Identifier', 'S0_diff'],
)
print(S0_differences)

In [None]:
# Accept only the Identifiers whose two S0 values differ by less than 0.7.
within_tolerance = S0_differences['S0_diff'].abs() < 0.7
diff_accept = S0_differences[within_tolerance]
diff_accept

In [None]:
# Pull the duplicated rows that belong to an accepted Identifier.
accepted_ids = diff_accept['Identifier']
selected_neutral = id_diff_category.loc[id_diff_category['Identifier'].isin(accepted_ids)]
selected_neutral

In [48]:
# For each accepted Identifier, find the index label of the row with the smallest S0 ...
s0_by_identifier = selected_neutral.groupby('Identifier')['S0']
idx = s0_by_identifier.idxmin()

# ... and keep exactly those rows.
selected_rows = selected_neutral.loc[idx]

In [50]:
# Final table: rows with a unique Identifier plus the single row retained for
# each accepted duplicated Identifier.
data_final = pd.concat((no_dup, selected_rows))
data_final

Unnamed: 0,SMILES,JNJNUMBER,Contract,JNJSALT,JNJBATCH,Conc (uM),solubility_uM,comments,PLM,acidic_pka,basic_pka,sol_log_M,S0,cat,category,from
0,COc1nccc(C(=O)N2CC3CN(CC3C2)c4ccc5ccc(O)cc5n4)c1F,81481920,2-Limited,81481920-AAA,57235570,0.01,31.27,,Amorphous,[8.95],[5.947],-4.504872,-4.546159,acid_1_base_1,amphoteric,ph7
1,O=C(C[C@@H]1OB(O)c2cc(ccc12)NS(=O)(=O)c3ccc(cc...,88971558,1-Free,88971558-AAA,61137454,0.01,489.56,,Amorphous,[8.986],[6.27],-3.310194,-3.388120,acid_1_base_1,amphoteric,ph7
2,CNC(=O)C[C@@H]1OB(O)c2cc(ccc12)NS(=O)(=O)c3ccc...,89250356,1-Free,89250356-AAA,61292945,0.01,538.40,,Amorphous,[8.987],[5.12],-3.268895,-3.278977,acid_1_base_1,amphoteric,ph7
3,O=C(Nc1cccc(c1)Cn2ccnc2)c3ccc(cc3)S(=O)(=O)Nc4...,88421853,1-Free,88421853-AAA,60773055,0.01,2.94,,Amorphous,[8.875],[6.593],-5.531653,-5.679353,acid_1_base_1,amphoteric,ph7
4,Cc1cnc(O)cc1Nc2nc(cc(n2)C3CCN(CC3)C(=O)OC(C)(C...,89853166,1-Free,89853166-AAA,61785640,0.01,275.52,,Amorphous,[7.857],[5.542],-3.559847,-3.629452,acid_1_base_1,amphoteric,ph7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15059,O=C(NC(CF)c1ccc(F)cc1)N2C[C@@H]3[C@@H](C[C@H]3...,95841759,2-Limited,95841759-AAA,65202487,0.01,0.93,,Amorphous,[],[],-6.031517,-6.031517,neutral,neutral,ph7
8769,CC(=O)N(C)[C@@H]1Cc2ccc(cc2C1)Oc3ccc(cc3)OCC(F...,95883138,1-Free,95883138-AAA,65227267,0.01,0.24,,Amorphous,[],[],-6.619789,-6.619789,neutral,neutral,ph2
8856,CC(=O)Nc1ccc(C)c(c1)C(=O)N2CCC(CC2)Oc3ccc(C)c(...,95884126,1-Free,95884126-AAA,65227827,0.01,1.08,,Amorphous,[],[],-5.966576,-5.966576,neutral,neutral,ph2
8855,N#Cc1ccc(cc1)CNC(=O)c2ccc(CC3CCOCC3)c4ccoc24,95886024,1-Free,95886024-AAA,65228898,0.01,6.66,,Amorphous,[],[],-5.176526,-5.176526,neutral,neutral,ph2


In [51]:
# Count how many rows share each exact SMILES string, then tabulate those counts.
# NOTE(review): this detects repeated SMILES strings; whether they correspond to
# stereoisomers cannot be told from this check alone - confirm.
rows_per_smiles = data_final.groupby(['SMILES']).size()
final_dataset_stereo = rows_per_smiles.rename('counts').reset_index()
print('stereoisomers check:\n', final_dataset_stereo['counts'].value_counts())

stereoisomers check:
 counts
1    18461
2      119
3        2
Name: count, dtype: int64


In [52]:
# Persist the final table without the row index. The list-valued pKa columns are
# written as text, so a reader of this file has to parse them again (as
# cal_str_to_list does for the input files).
data_final.to_csv('S0_pka_a_2024.csv', index = False)