In [1]:
import pandas as pd
import numpy as np
import os
import ast

import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

import plotly.express as px
import plotly.graph_objects as go

In [2]:
root = os.path.abspath('')

'/domino/datasets/local/Sol_pred_Jiaxi/data_processing'

In [6]:
ph2 = pd.read_csv(os.path.join(root, 'your/ph2/data/.csv'))
ph7 = pd.read_csv(os.path.join(root, 'your/ph7/data/.csv.csv'))
reduce = True
print('#ph2:', len(ph2),
     '\n#ph7:', len(ph7))

#ph2: 12184 
#ph7: 14026


In [7]:
def cal_log_sol_and_str_to_list(df):
    df['sol_log_M']= np.log10(df['solubility_uM']/1000000)
    df['basic_pka'] = df['basic_pka'].apply(ast.literal_eval)
    df['acidic_pka'] = df['acidic_pka'].apply(ast.literal_eval)
    return df

ph2 = cal_log_sol_and_str_to_list(ph2)
ph7 = cal_log_sol_and_str_to_list(ph7)

In [8]:
def reduce_pka(ph2, ph7):
    if reduce == True:
        ph2['acidic_pka'] = ph2['acidic_pka'].apply(lambda x: [i for i in x if i < 4])
        ph2['basic_pka'] = ph2['basic_pka'].apply(lambda x: [i for i in x if i > 0])
        
        ph7['acidic_pka'] = ph7['acidic_pka'].apply(lambda x: [i for i in x if i < 9])
        ph7['basic_pka'] = ph7['basic_pka'].apply(lambda x: [i for i in x if i > 5])
        return ph2, ph7
    else:
        return ph2, ph7

ph2, ph7 = reduce_pka(ph2, ph7)

In [9]:
def seperate(df):
    pred_acid = df[df['basic_pka'].apply(lambda x: len(x) == 0) & df['acidic_pka'].apply(lambda x: len(x) > 0)]
    pred_base = df[df['basic_pka'].apply(lambda x: len(x) > 0) & df['acidic_pka'].apply(lambda x: len(x) == 0)]
    pred_amphoteric = df[df['basic_pka'].apply(lambda x: len(x) > 0) & df['acidic_pka'].apply(lambda x: len(x) > 0)]
    pred_neutral = df[df['basic_pka'].apply(lambda x: len(x) == 0) & df['acidic_pka'].apply(lambda x: len(x) == 0)]
    return pred_acid, pred_base, pred_amphoteric, pred_neutral

ph2_acid, ph2_base, ph2_amphoteric, ph2_neutral = seperate(ph2)
ph7_acid, ph7_base, ph7_amphoteric, ph7_neutral = seperate(ph7)

print("ph2_acid:", len(ph2_acid))
print("ph2_base:", len(ph2_base))
print("ph2_amphoteric:", len(ph2_amphoteric))
print("ph2_neutral:", len(ph2_neutral))

print("\nph7_acid:", len(ph7_acid))
print("ph7_base:", len(ph7_base))
print("ph7_amphoteric:", len(ph7_amphoteric))
print("ph7_neutral:", len(ph7_neutral))

ph2_acid: 47
ph2_base: 10740
ph2_amphoteric: 243
ph2_neutral: 1154

ph7_acid: 880
ph7_base: 4435
ph7_amphoteric: 209
ph7_neutral: 8502


In [10]:
def pka_3(acid, base, amphoteric):
    base_3pka = base[base['basic_pka'].apply(len) <= 3]
    acid_3pka = acid[acid['acidic_pka'].apply(len) <= 3]
    amphoteric['total_pka'] = amphoteric['acidic_pka'].apply(len) + amphoteric['basic_pka'].apply(len)
    amphoteric_3pka = amphoteric[amphoteric['total_pka'] <= 3]
    print('base_3pka:', len(base_3pka))
    print('acid_3pka:', len(acid_3pka))
    print('amphoteric_3pk:', len(amphoteric_3pka))
    return acid_3pka, base_3pka, amphoteric_3pka

In [11]:
ph7_acid_pka_3, ph7_base_pka_3, ph7_amphoteric_pka_3 = pka_3(ph7_acid, ph7_base, ph7_amphoteric)

base_3pka: 4434
acid_3pka: 879
amphoteric_3pk: 208


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  amphoteric['total_pka'] = amphoteric['acidic_pka'].apply(len) + amphoteric['basic_pka'].apply(len)


In [12]:
ph2_acid_pka_3, ph2_base_pka_3, ph2_amphoteric_pka_3 = pka_3(ph2_acid, ph2_base, ph2_amphoteric)

base_3pka: 9154
acid_3pka: 47
amphoteric_3pk: 155


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  amphoteric['total_pka'] = amphoteric['acidic_pka'].apply(len) + amphoteric['basic_pka'].apply(len)


In [13]:
def ph7_base_intrinsic_sol_cal(row):
    pka = row['basic_pka']
    sol = row['solubility_uM']
    if len(pka) == 1:
        pka1 = pka[0]
        return np.log10(sol/1000000) - np.log10(10**(+pka1 - 7) + 1)
    elif len(pka) == 2:
        pka1 = pka[0]
        pka2 = pka[1]
        return np.log10(sol/1000000) - np.log10(10**(+pka2 + pka1 - 2*7) + 10**(+pka2 - 7) + 1)
    elif len(pka) == 3:
        pka1 = pka[0]
        pka2 = pka[1]
        pka3 = pka[2]
        return np.log10(sol/1000000) - np.log10(10**(pka3 +pka2 + pka1 - 3*7) + 10**(+pka3 + pka2 - 2*7) + 10**(+pka3 - 7) + 1)


def ph7_acid_intrinsic_sol_cal(row):
    pka = row['acidic_pka']
    sol = row['solubility_uM']
    if len(pka) == 1:
        pka1 = pka[0]
        return np.log10(sol/1000000) - np.log10(10**(-pka1 + 7) + 1)
    elif len(pka) == 2:
        pka1 = pka[0]
        pka2 = pka[1]
        return np.log10(sol/1000000) - np.log10(10**(-pka2 - pka1 + 2*7) + 10**(- pka2 + 7) + 1)
    elif len(pka) == 3:
        pka1 = pka[0]
        pka2 = pka[1]
        pka3 = pka[2]
        return np.log10(sol/1000000) - np.log10(10**(- pka3 - pka2 - pka1 + 3*7) + 10**(- pka2 - pka1 + 2*7) + 10**(- pka1 + 7) + 1)

def ph7_amphoteric_intrinsic_sol_cal(row):
    pka_acid = row['acidic_pka']
    pka_base = row['basic_pka']
    sol = row['solubility_uM']
    
    if len(pka_acid) == 1 and len(pka_base) == 1:
        pka1 = pka_base[0]
        pka2 = pka_acid[0]
        return np.log10(sol/1000000) - np.log10(10**(+ pka1 - 7) + 10**(- pka2 + 7) + 1)
    elif len(pka_acid) == 1 and len(pka_base) == 2:
        pka1 = pka_base[0]
        pka2 = pka_base[1]
        pka3 = pka_acid[0]
        return np.log10(sol/1000000) - np.log10(10**(+ pka2 + pka1 - 2*7) + 10**(+ pka2 - 7) + 10**(- pka3 + 7) + 1)
    elif len(pka_acid) == 2 and len(pka_base) == 1:
        pka1 = pka_base[0]
        pka2 = pka_acid[0]
        pka3 = pka_acid[1]
        return np.log10(sol/1000000) - np.log10(10**(- pka3 - pka2 + 2*7) + 10**(- pka2 + 7) + 10**(+ pka1 - 7) + 1)

In [14]:
def ph2_base_intrinsic_sol_cal(row):
    pka = row['basic_pka']
    sol = row['solubility_uM']
    if len(pka) == 1:
        pka1 = pka[0]
        return np.log10(sol/1000000) - np.log10(10**(+pka1 - 2) + 1)
    elif len(pka) == 2:
        pka1 = pka[0]
        pka2 = pka[1]
        return np.log10(sol/1000000) - np.log10(10**(+pka2 + pka1 - 2*2) + 10**(+pka2 - 2) + 1)
    elif len(pka) == 3:
        pka1 = pka[0]
        pka2 = pka[1]
        pka3 = pka[2]
        return np.log10(sol/1000000) - np.log10(10**(pka3 +pka2 + pka1 - 3*2) + 10**(+pka3 + pka2 - 2*2) + 10**(+pka3 - 2) + 1)


def ph2_acid_intrinsic_sol_cal(row):
    pka = row['acidic_pka']
    sol = row['solubility_uM']
    if len(pka) == 1:
        pka1 = pka[0]
        return np.log10(sol/1000000) - np.log10(10**(-pka1 + 2) + 1)
    elif len(pka) == 2:
        pka1 = pka[0]
        pka2 = pka[1]
        return np.log10(sol/1000000) - np.log10(10**(-pka2 - pka1 + 2*2) + 10**(- pka2 + 2) + 1)
    elif len(pka) == 3:
        pka1 = pka[0]
        pka2 = pka[1]
        pka3 = pka[2]
        return np.log10(sol/1000000) - np.log10(10**(- pka3 - pka2 - pka1 + 3*2) + 10**(- pka2 - pka1 + 2*2) + 10**(- pka1 + 2) + 1)

def ph2_amphoteric_intrinsic_sol_cal(row):
    pka_acid = row['acidic_pka']
    pka_base = row['basic_pka']
    sol = row['solubility_uM']
    
    if len(pka_acid) == 1 and len(pka_base) == 1:
        pka1 = pka_base[0]
        pka2 = pka_acid[0]
        return np.log10(sol/1000000) - np.log10(10**(+ pka1 - 2) + 10**(- pka2 + 2) + 1)
    elif len(pka_acid) == 1 and len(pka_base) == 2:
        pka1 = pka_base[0]
        pka2 = pka_base[1]
        pka3 = pka_acid[0]
        return np.log10(sol/1000000) - np.log10(10**(+ pka2 + pka1 - 2*2) + 10**(+ pka2 - 2) + 10**(- pka3 + 2) + 1)
    elif len(pka_acid) == 2 and len(pka_base) == 1:
        pka1 = pka_base[0]
        pka2 = pka_acid[0]
        pka3 = pka_acid[1]
        return np.log10(sol/1000000) - np.log10(10**(- pka3 - pka2 + 2*2) + 10**(- pka2 + 2) + 10**(+ pka1 - 2) + 1)

In [15]:
def ph7_base_intrinsic_cat(row):
    pka = row['basic_pka']
    if len(pka) == 1:
        return 'base_1'
    elif len(pka) == 2:
        return 'base_2'
    elif len(pka) == 3:
        return 'base_3'

def ph7_acid_intrinsic_cat(row):
    pka = row['acidic_pka']
    if len(pka) == 1:
        return 'acid_1'
    elif len(pka) == 2:
        return 'acid_2'
    elif len(pka) == 3:
        return 'acid_3'

def ph7_amphoteric_intrinsic_cat(row):
    pka_acid = row['acidic_pka']
    pka_base = row['basic_pka']
    
    if len(pka_acid) == 1 and len(pka_base) == 1:
        return 'acid_1_base_1'
    elif len(pka_acid) == 1 and len(pka_base) == 2:
        return 'acid_1_base_2'
    elif len(pka_acid) == 2 and len(pka_base) == 1:
        return 'acid_2_base_1'

In [16]:
def ph2_base_intrinsic_cat(row):
    pka = row['basic_pka']
    if len(pka) == 1:
        return 'base_1'
    elif len(pka) == 2:
        return 'base_2'
    elif len(pka) == 3:
        return 'base_3'


def ph2_acid_intrinsic_cat(row):
    pka = row['acidic_pka']
    if len(pka) == 1:
        return 'acid_1'
    elif len(pka) == 2:
        return 'acid_2'
    elif len(pka) == 3:
        return 'acid_3'

def ph2_amphoteric_intrinsic_cat(row):
    pka_acid = row['acidic_pka']
    pka_base = row['basic_pka']
    
    if len(pka_acid) == 1 and len(pka_base) == 1:
        return 'acid_1_base_1'
    elif len(pka_acid) == 1 and len(pka_base) == 2:
        return 'acid_1_base_2'
    elif len(pka_acid) == 2 and len(pka_base) == 1:
        return 'acid_2_base_1'

In [17]:
def calculate_intrinsic_solubility(acid_3pka, base_3pka, amphoteric_3pka,ph):
    if ph == 7: 
        amphoteric_3pka['S0'] = amphoteric_3pka.apply(ph7_amphoteric_intrinsic_sol_cal, axis=1)
        base_3pka['S0'] = base_3pka.apply(ph7_base_intrinsic_sol_cal, axis=1)
        acid_3pka['S0'] = acid_3pka.apply(ph7_acid_intrinsic_sol_cal, axis=1)
        amphoteric_3pka['cat'] = amphoteric_3pka.apply(ph7_amphoteric_intrinsic_cat, axis=1)
        base_3pka['cat'] = base_3pka.apply(ph7_base_intrinsic_cat, axis=1)
        acid_3pka['cat'] = acid_3pka.apply(ph7_acid_intrinsic_cat, axis=1)
    elif ph == 2:
        amphoteric_3pka['S0'] = amphoteric_3pka.apply(ph2_amphoteric_intrinsic_sol_cal, axis=1)
        base_3pka['S0'] = base_3pka.apply(ph2_base_intrinsic_sol_cal, axis=1)
        acid_3pka['S0'] = acid_3pka.apply(ph2_acid_intrinsic_sol_cal, axis=1)
        amphoteric_3pka['cat'] = amphoteric_3pka.apply(ph2_amphoteric_intrinsic_cat, axis=1)
        base_3pka['cat'] = base_3pka.apply(ph2_base_intrinsic_cat, axis=1)
        acid_3pka['cat'] = acid_3pka.apply(ph2_acid_intrinsic_cat, axis=1)
        
    return acid_3pka, base_3pka, amphoteric_3pka

In [18]:
ph7_acid_pka_3, ph7_base_pka_3, ph7_amphoteric_pka_3 = calculate_intrinsic_solubility(ph7_acid_pka_3, ph7_base_pka_3, ph7_amphoteric_pka_3, ph = 7)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  amphoteric_3pka['S0'] = amphoteric_3pka.apply(ph7_amphoteric_intrinsic_sol_cal, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  base_3pka['S0'] = base_3pka.apply(ph7_base_intrinsic_sol_cal, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  acid_3pka['S0'] = acid_3pka.apply(ph7_acid_intr

In [19]:
ph2_acid_pka_3, ph2_base_pka_3, ph2_amphoteric_pka_3 = calculate_intrinsic_solubility(ph2_acid_pka_3, ph2_base_pka_3, ph2_amphoteric_pka_3, ph=2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  amphoteric_3pka['S0'] = amphoteric_3pka.apply(ph2_amphoteric_intrinsic_sol_cal, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  base_3pka['S0'] = base_3pka.apply(ph2_base_intrinsic_sol_cal, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  amphoteric_3pka['cat'] = amphoteric_3pka.apply(

In [20]:
def concat_intrinsic_dataset(acid_3pka,base_3pka,amphoteric_3pka, neutral):
    acid_3pka['category'] = 'acid'
    base_3pka['category'] = 'base'
    amphoteric_3pka['category'] = 'amphoteric'
    amphoteric_3pka = amphoteric_3pka.drop(['total_pka'],axis = 1)
    neutral['cat'] = 'neutral'
    neutral['category'] = 'neutral'
    neutral['S0'] = neutral['sol_log_M']
    
    intrinsic_sol_full = pd.concat([acid_3pka,base_3pka, amphoteric_3pka, neutral], axis = 0)
    print('intrinsic solubility dataset:', len(intrinsic_sol_full))
    return intrinsic_sol_full

In [21]:
ph2_intrinsic = concat_intrinsic_dataset(ph2_acid_pka_3, ph2_base_pka_3, ph2_amphoteric_pka_3, ph2_neutral)
ph7_intrinsic = concat_intrinsic_dataset(ph7_acid_pka_3, ph7_base_pka_3, ph7_amphoteric_pka_3, ph7_neutral)

intrinsic solubility dataset: 10510
intrinsic solubility dataset: 14023


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  base_3pka['category'] = 'base'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  amphoteric_3pka['category'] = 'amphoteric'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  neutral['cat'] = 'neutral'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexe

In [25]:
ph2_intrinsic.to_csv('reduced_ph2_intrinsic_sol.csv', index = False) 
ph7_intrinsic.to_csv('reduced_ph7_intrinsic_sol.csv', index = False) 

In [28]:
# Merge the DataFrames
pred_compare = pd.merge(ph7_intrinsic, ph2_intrinsic, how='inner', on=['JNJNUMBER'])
print(len(pred_compare))

17001
