In [2]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, hamming_loss, make_scorer # type: ignore


## Read the fingerprint and experimental data files

In [3]:
# Input CSV locations: the experimental receptor table and the MACCS
# fingerprint table.
# NOTE(review): both are absolute paths on one machine; the notebook cannot be
# re-run elsewhere until they are made configurable.
receptors_csv = "/Users/xiaomuou620/Desktop/PRIVATE_DATA/hallucinome.csv"
fingerprints_csv = '/Users/xiaomuou620/Library/CloudStorage/OneDrive-UniversityofCopenhagen/courses/Project1/Fingerprint/MACCS_fingerprints.csv'

# Experimental measurements; per the original note this file presumably keeps
# only receptors with existing binding data — TODO confirm.
receptors = pd.read_csv(receptors_csv)
# One row per drug: a "Name" column followed by the fingerprint columns.
fingerprints = pd.read_csv(fingerprints_csv)

# Quick look at the first experimental rows.
print(receptors.head())



  receptor gprotein        drug    Emax  Emax_SE    TCoeff  TCoeff_SE  \
0    5HT1A      Gi1   25C-NBOMe  999.00   999.00       NaN        NaN   
1    5HT1A      Gi1    25I-NBMD   18.62     2.06  8.878197   0.201061   
2    5HT1A      Gi1   25I-NBOMe   24.53     3.28  7.947086   0.195392   
3    5HT1A      Gi1  25T7-NBOMe   22.00     3.87  7.457872   0.217788   
4    5HT1A      Gi1  4-AcO-MALT   38.93     5.16  7.417719   0.148907   

    logEmEC  logEmEc_Lower  logEmEc_Upper   pEC50  pEC50_SE  N  
0       NaN            NaN            NaN  999.00    999.00  3  
1  8.879980       8.529060       9.225551    7.61      0.30  3  
2  7.949698       7.607359       8.284201    6.56      0.28  3  
3  7.462423       7.078398       7.832796    6.12      0.30  3  
4  7.420284       7.158531       7.674340    5.83      0.20  3  


### Experimental data processing

In [5]:
# Emax uses 999 as a placeholder value (presumably "no measurable response" —
# TODO confirm with the data owner); treat it as 0.
receptors["Emax"] = receptors["Emax"].replace(999, 0)

# One Emax per (drug, receptor) pair: keep the largest value across all rows
# that share the pair (e.g. several rows for the same drug and receptor).
receptor_emax_mapping = (
    receptors.groupby(["drug", "receptor"])["Emax"].max().reset_index()
)

# Wide table: one row per drug, one column per receptor. Drug/receptor pairs
# with no value become 0. The fill is chained (not `inplace=True` after the
# preview) so the frame printed below is exactly the frame used downstream.
receptor_emax_mapping_pivot = (
    receptor_emax_mapping.pivot(index="drug", columns="receptor", values="Emax")
    .fillna(0)
    .reset_index()
)
print(receptor_emax_mapping_pivot.head())


receptor        drug      5HT1A       5HT1B   5HT1D   5HT1E       5HT1F  \
0          25C-NBOMe  13.047601    0.000000   61.56    0.00    0.000000   
1           25I-NBMD  27.200000   84.239297   91.94  105.26  102.099230   
2          25I-NBOMe  45.370000    0.000000   53.65   33.92   64.596780   
3         25T7-NBOMe  31.991315    0.000000   31.81    0.00   69.116817   
4         4-AcO-MALT  50.677696  126.170618  101.04  115.53  102.552716   

receptor   5HT2A  5HT2B   5HT2C  5HT5A  ...  Alpha2B  Alpha2C  Beta1AR  \
0         126.52  67.19   98.15   0.00  ...      0.0      0.0      0.0   
1         102.02  45.49   79.79   0.00  ...      0.0      0.0      0.0   
2         108.36  72.04  104.43  27.97  ...      0.0      0.0      0.0   
3          99.15  53.73   94.85  36.81  ...      0.0      0.0      0.0   
4         102.61  68.11  110.35  43.88  ...      0.0      0.0      0.0   

receptor  Beta2AR  Beta3AR  DRD1   DRD2   DRD3  DRD4   DRD5  
0             0.0      0.0   0.0  50.60   

### Fingerprint data processing

In [6]:
# Split the fingerprint table into its drug-name column and the remaining
# fingerprint columns.
drug_names_fingerprints = fingerprints["Name"]
fingerprints_detail = fingerprints.drop(columns=["Name"])  # dropped by label, not by position

# Optional name -> fingerprint-list lookup, currently disabled:
# csv_dict = {name: fingerprint.tolist() for name, fingerprint in zip(drug_names_fingerprints, fingerprints_detail.values)}


### Merge features and labels

In [7]:
# Attach the fingerprint columns to each drug's receptor Emax row by matching
# "drug" (labels) against "Name" (fingerprints). This is an inner join (the
# pandas default, spelled out here), so a drug missing from either table is
# dropped from the result. The redundant "Name" key is removed afterwards.
data_merged = receptor_emax_mapping_pivot.merge(
    fingerprints, how="inner", left_on="drug", right_on="Name"
).drop(columns="Name")

# Optional export of the merged table, currently disabled:
# data_merged.to_csv(
#     "/Users/xiaomuou620/Desktop/PRIVATE_DATA/selected_file_realval.csv", index=False
# )
print(data_merged.head())

         drug      5HT1A       5HT1B   5HT1D   5HT1E       5HT1F   5HT2A  \
0   25C-NBOMe  13.047601    0.000000   61.56    0.00    0.000000  126.52   
1    25I-NBMD  27.200000   84.239297   91.94  105.26  102.099230  102.02   
2   25I-NBOMe  45.370000    0.000000   53.65   33.92   64.596780  108.36   
3  25T7-NBOMe  31.991315    0.000000   31.81    0.00   69.116817   99.15   
4  4-AcO-MALT  50.677696  126.170618  101.04  115.53  102.552716  102.61   

   5HT2B   5HT2C  5HT5A  ...  157  158  159  160  161  162  163  164  165  166  
0  67.19   98.15   0.00  ...    1    1    1    1    1    1    1    1    1    0  
1  45.49   79.79   0.00  ...    1    1    1    1    1    1    1    1    1    0  
2  72.04  104.43  27.97  ...    1    1    1    1    1    1    1    1    1    0  
3  53.73   94.85  36.81  ...    1    1    1    1    1    1    1    1    1    0  
4  68.11  110.35  43.88  ...    1    1    1    1    1    1    1    1    1    0  

[5 rows x 193 columns]


## Training

In [8]:
# Drop the "drug" identifier column so that only the receptor Emax columns and
# the fingerprint columns remain. Because this cell reassigns `data_merged`,
# re-running it used to raise KeyError ("drug" already gone);
# errors="ignore" makes the cell safe to execute more than once.
data_merged = data_merged.drop(columns=["drug"], errors="ignore")


In [9]:
# The merged frame lists the receptor (label) columns first, followed by the
# fingerprint (feature) columns. The number of label columns is derived from
# the pivot table (all of its columns except "drug") instead of a hard-coded
# 25, so the split stays correct if the set of receptors changes.
n_receptor_columns = receptor_emax_mapping_pivot.shape[1] - 1

X = data_merged.iloc[:, n_receptor_columns:]  # fingerprint features
y = data_merged.iloc[:, :n_receptor_columns]  # per-receptor Emax targets

print(X.shape, y.shape)
print(X.head())
print(y.head())


(40, 167) (40, 25)
   0  1  2  3  4  5  6  7  8  9  ...  157  158  159  160  161  162  163  164  \
0  0  0  0  0  0  0  0  0  0  0  ...    1    1    1    1    1    1    1    1   
1  0  0  0  0  0  0  0  0  0  0  ...    1    1    1    1    1    1    1    1   
2  0  0  0  0  0  0  0  0  0  0  ...    1    1    1    1    1    1    1    1   
3  0  0  0  0  0  0  0  0  0  0  ...    1    1    1    1    1    1    1    1   
4  0  0  0  0  0  0  0  0  0  0  ...    1    1    1    1    1    1    1    1   

   165  166  
0    1    0  
1    1    0  
2    1    0  
3    1    0  
4    1    0  

[5 rows x 167 columns]
       5HT1A       5HT1B   5HT1D   5HT1E       5HT1F   5HT2A  5HT2B   5HT2C  \
0  13.047601    0.000000   61.56    0.00    0.000000  126.52  67.19   98.15   
1  27.200000   84.239297   91.94  105.26  102.099230  102.02  45.49   79.79   
2  45.370000    0.000000   53.65   33.92   64.596780  108.36  72.04  104.43   
3  31.991315    0.000000   31.81    0.00   69.116817   99.15  53.73   94.85 

In [14]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.multioutput import MultiOutputRegressor

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# 3. Random-forest regression: MultiOutputRegressor fits one clone of the
#    base forest per target (receptor) column.
rf_model = RandomForestRegressor(random_state=42)
multioutput_model = MultiOutputRegressor(rf_model)

# Fit on the training split.
multioutput_model.fit(X_train, y_train)

# 4. Predict Emax for the held-out rows; columns follow the order of y.columns.
y_pred = multioutput_model.predict(X_test)

# Report MSE and R^2 separately for every receptor column.
for i, receptor in enumerate(y.columns):
    observed, predicted = y_test.iloc[:, i], y_pred[:, i]
    mse = mean_squared_error(observed, predicted)
    r2 = r2_score(observed, predicted)
    print(f"{receptor} - MSE: {mse}, R2: {r2}")

# 5. Turn predictions into bind / no-bind calls. The targets are Emax values
#    (not IC50/Ki), so a drug counts as binding a receptor when its predicted
#    Emax is strictly greater than the threshold.
binding_threshold = 0  # Emax cut-off
bind_predictions = np.greater(y_pred, binding_threshold).astype(int)

# Bit vectors: one row per held-out sample (in X_test order, fresh 0..n-1
# index), one column per receptor.
bit_vectors = pd.DataFrame(bind_predictions, columns=y.columns)
print(bit_vectors)

# Binarise the observed values with the same cut-off so both sides are
# comparable multilabel indicator matrices.
y_test_bit = y_test.gt(binding_threshold).astype(int)
y_pred_bit = bind_predictions.copy()

# Hamming loss: fraction of (sample, receptor) labels predicted incorrectly.
hl = hamming_loss(y_test_bit, y_pred_bit)
print(f"Hamming Loss: {hl:.4f}")

# Scorer for cross-validation, currently disabled:
#sf = make_scorer(hamming_loss, greater_is_better=False)


5HT1A - MSE: 501.2340823596505, R2: 0.0640275266406446
5HT1B - MSE: 244.5208546958326, R2: -2.831306265770787
5HT1D - MSE: 261.1945048049991, R2: -0.6359468989087809
5HT1E - MSE: 482.0566265675001, R2: -0.6646466923385026
5HT1F - MSE: 349.17163828985525, R2: -3.649014372849228
5HT2A - MSE: 117.99829144375005, R2: -0.25887086356707933
5HT2B - MSE: 364.4545430200004, R2: 0.4000435677719961
5HT2C - MSE: 255.0917413162486, R2: -0.2668892401169325
5HT5A - MSE: 964.0292145675003, R2: -0.48914074995780243
5HT6 - MSE: 507.41441546625117, R2: 0.6998984018957134
5HT7 - MSE: 1064.074213717499, R2: -0.070515209340418
Alpha1A - MSE: 308.6376772450002, R2: 0.7543942164869653
Alpha1B - MSE: 585.4360737912485, R2: -0.20968123548569562
Alpha1D - MSE: 349.4719455112489, R2: 0.6375493860946374
Alpha2A - MSE: 866.8793735212498, R2: 0.16489915224686502
Alpha2B - MSE: 172.22913681499972, R2: 0.8640443820273143
Alpha2C - MSE: 1133.8817451137506, R2: 0.033100534157212924
Beta1AR - MSE: 0.0, R2: 1.0
Beta2AR - 