In [None]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate

from mordred import Calculator, descriptors
import sklearn.preprocessing as preprocessing

# Read the data

In [None]:
# Both splits are stored in one workbook, one sheet per split.
train_test_path = "../../data_for_modeling/filter_data/v1/clean_data/HDAC2_train_test_clean_data.xlsx"
train_dataset, test_dataset = (
    pd.read_excel(train_test_path, sheet_name=sheet)
    for sheet in ('train_dataset', 'test_dataset')
)

In [None]:
print(len(train_dataset), len(test_dataset))

In [None]:
train_dataset.head()

# 2. Xây dựng mô hình với Mordred descriptors

## 2.1. Tính toán mordred descriptors

__Mã hóa cấu trúc phân tử bằng Mordred descriptors__

<b>Use this for new data</b>

In [None]:
def process_modred(data):
    """Compute 2D Mordred descriptors for a collection of SMILES strings.

    Args:
        data: Iterable of SMILES strings (e.g. a pandas Series).

    Returns:
        The DataFrame produced by ``Calculator.pandas`` for the parsed
        molecules, in the same order as ``data``.

    Raises:
        ValueError: If any SMILES cannot be parsed. Failing here keeps the
            descriptor rows aligned with the label rows instead of handing
            ``None`` molecules to Mordred.
    """
    smiles_list = list(data)
    mols = [Chem.MolFromSmiles(smi) for smi in smiles_list]

    # RDKit reports an unparsable SMILES by returning None rather than raising.
    unparsable = [
        (position, smi)
        for position, (smi, mol) in enumerate(zip(smiles_list, mols))
        if mol is None
    ]
    if unparsable:
        preview = ", ".join(f"#{position}: {smi!r}" for position, smi in unparsable[:5])
        raise ValueError(
            f"{len(unparsable)} SMILES could not be parsed by RDKit (first entries: {preview})"
        )

    calc = Calculator(descriptors, ignore_3D=True)
    return calc.pandas(mols)

# Encode the SMILES column of each split with the same descriptor routine.
train_modred_descriptors, test_mordred_descriptors = (
    process_modred(split['SMILES']) for split in (train_dataset, test_dataset)
)

In [None]:
train_modred_descriptors.head()

<b>Use this when the descriptor file already exists</b>

In [None]:
# train_modred_descriptors = pd.read_excel('../data_for_modeling/filter_data/v1/modred_descriptors_out.xlsx', sheet_name="train_modred_descriptors")
# test_mordred_descriptors = pd.read_excel('../data_for_modeling/filter_data/v1/modred_descriptors_out.xlsx', sheet_name="test_modred_descriptor")
# train_modred_descriptors = pd.DataFrame(train_modred_descriptors)
# test_mordred_descriptors = pd.DataFrame(test_mordred_descriptors)

__Làm sạch dữ liệu__

- Do thuật toán mã hóa Mordred không thể tính được mọi features của SMILES nên sẽ có một số chỗ không phải là số thực mà là một object báo lỗi, ta sẽ loại bỏ tất cả object báo lỗi này.
- Các giá trị sau khi xử lý có kiểu khác các kiểu số (int, float, numpy.float32, numpy.float64, numpy.int32, numpy.int64) thì cho bằng 0.
- Toàn bộ dữ liệu ta xử lý đều là dữ liệu số, vì vậy ta sẽ đặt những object này với giá trị bằng 0.

In [None]:
# Work on plain arrays so individual cells can be inspected and overwritten.
train_np = np.array(train_modred_descriptors)
test_np = np.array(test_mordred_descriptors)

# Any cell whose exact class is not one of the listed numeric types is
# replaced by 0; the training array is processed first, then the test array.
for descriptor_values in (train_np, test_np):
    for (row, col), value in np.ndenumerate(descriptor_values):
        if value.__class__ in [int, float, np.float64, np.float32, np.int64, np.int32]:
            continue
        descriptor_values[row, col] = 0

# Rebuild the frames with their original column names.
train_modred_descriptors = pd.DataFrame(train_np, columns=train_modred_descriptors.columns)
test_mordred_descriptors = pd.DataFrame(test_np, columns=test_mordred_descriptors.columns)

# Row labels are kept as-is, so the stacked frame repeats the labels of both parts.
all_mordred_descriptors = pd.concat(
    [train_modred_descriptors, test_mordred_descriptors],
    ignore_index=False,
)
# Write to file
# train_modred_descriptors.to_csv("../output/modred_des/train_modred_des_unspec_removed(1).csv", index=False)
# test_mordred_descriptors.to_csv("../output/modred_des/test_modred_des_unspec_removed.csv", index=False)

In [None]:
train_modred_descriptors.head()

In [None]:
all_mordred_descriptors.head()

<b>Import and encode y</b>

In [None]:
# Raw activity labels for each split, plus both splits back to back.
y_train, y_test = (
    np.array(split['ACTIVITY']) for split in (train_dataset, test_dataset)
)
y_all = np.concatenate((y_train.ravel(), y_test.ravel()))

In [None]:
print(y_train[0:5])
print(y_test[0:5])

In [None]:
# Fit the encoder once, on the training labels, and reuse it for the test
# labels. Two independently fitted encoders are not guaranteed to give a
# label the same integer in both splits (e.g. when a class is absent from
# one split); `transform` raises on a test label never seen in training.
activity_encoder = preprocessing.LabelEncoder()
y_train = activity_encoder.fit_transform(y_train)
y_test = activity_encoder.transform(y_test)

In [None]:
print(y_train[0:5])
print(y_test[0:5])

In [None]:
print(len(y_train), len(y_test), len(y_all))

## 2.2 Tiền xử lý dữ liệu

## Tạo ma trận features

In [None]:
# IC50 column of the training split as an array; entries whose exact class is
# not one of the listed numeric types are overwritten with 0.
ic50_train = np.array(train_dataset['IC50 (uM)'])
for i, value in np.ndenumerate(ic50_train):
    if value.__class__ in [int, float, np.float64, np.float32, np.int64, np.int32]:
        continue
    ic50_train[i] = 0

# Quick sanity check: number of entries and the first few values.
print(len(ic50_train))
print(ic50_train[:5])

In [None]:
from sklearn.preprocessing import StandardScaler

# Build the correlation input on a COPY of the cleaned training descriptors.
# Overwriting `train_modred_descriptors` with standardized values (and an
# extra IC50 column) would mean the scaler fitted later on the selected
# features sees already-standardized data, which would leave the test
# features effectively unscaled. Working on a copy also makes this cell safe
# to re-run.
corr_input = train_modred_descriptors.copy()
corr_input['IC50'] = ic50_train

sc = StandardScaler()
train_modred_np = sc.fit_transform(corr_input)
corr_scaled_descriptors = pd.DataFrame(train_modred_np, columns=corr_input.columns)

# Pairwise Pearson correlation between all descriptors and IC50.
corr_matrix = corr_scaled_descriptors.corr(method="pearson")

In [None]:
train_modred_descriptors.head()

In [None]:
corr_matrix.head()

In [None]:
sns.heatmap(corr_matrix)

In [None]:
len(corr_matrix)

In [None]:
corr_matrix.to_excel("../output/corr_matrix/unclean_data/HDAC2_corr_matrix_with_unclean_data.xlsx")

In [None]:
# Keep only the rows whose correlation with IC50 is defined (not NaN).
has_ic50_corr = corr_matrix['IC50'].notna()
corr_matrix = corr_matrix[has_ic50_corr]
print(len(corr_matrix))
sns.heatmap(corr_matrix)

In [None]:
# Features whose absolute correlation with IC50 exceeds 0.05.
ic50_corrs_abs = corr_matrix['IC50'].abs()
features = corr_matrix.index[ic50_corrs_abs > 0.05].tolist()

# Square sub-matrix of those features, rows ordered by decreasing
# correlation with IC50.
sorted_corr_matrix = (
    corr_matrix
    .loc[features, features]
    .sort_values(by='IC50', ascending=False)
)

# Put the columns in the same order as the sorted rows.
index_order = sorted_corr_matrix.index.tolist()
sorted_corr_matrix = sorted_corr_matrix[index_order].reindex(index=index_order)

In [None]:
sorted_corr_matrix.head()

In [None]:
def set_zero_diagonal(sorted_corr_matrix):
    """Overwrite the positional main diagonal of a DataFrame with 0, in place.

    Args:
        sorted_corr_matrix: DataFrame whose cells at (row i, column i) are
            set to 0. The frame does not have to be square; only the first
            ``min(n_rows, n_cols)`` diagonal cells exist and are touched.

    Returns:
        The same DataFrame object that was passed in, after modification.
    """
    # Assign through `iloc` on the frame itself. Writing into the row objects
    # yielded by `iterrows()` is unreliable: those rows may be copies, in
    # which case the assignment never reaches the frame.
    diagonal_length = min(sorted_corr_matrix.shape)
    for position in range(diagonal_length):
        sorted_corr_matrix.iloc[position, position] = 0
    return sorted_corr_matrix

# Keep the upper triangle of the correlation matrix, put 0 everywhere else,
# then clear the diagonal as well.
keep_mask = np.triu(np.ones(sorted_corr_matrix.shape)).astype(bool)
upper_triangle = set_zero_diagonal(
    sorted_corr_matrix.where(keep_mask).fillna(0)
)

In [None]:
sns.heatmap(upper_triangle)

In [None]:
len(upper_triangle)

In [None]:
upper_triangle.head()

In [None]:
upper_triangle.to_excel("../output/corr_matrix/HDAC2_upper_triangle_unspec_removed(1).xlsx")

In [None]:
def remove_correlated_features(sorted_df, pcc_point):
    """Greedily drop features that are strongly correlated with an earlier one.

    Rows are visited in their given order. For every row that has not been
    dropped yet, each remaining column whose value in that row is above
    ``pcc_point`` or below ``-pcc_point`` is removed from both axes.

    Args:
        sorted_df: Square DataFrame of correlation coefficients with
            identical row and column labels. The caller's frame is not
            modified.
        pcc_point: Absolute correlation threshold.

    Returns:
        A tuple ``(reduced, shapes, features)``: the reduced frame, the
        frame's shape after each processed row, and the remaining row labels
        after each processed row.
    """
    above_pcc_point = (sorted_df > pcc_point) | (sorted_df < -pcc_point)
    shapes = []
    features = []
    dropped_features = set()

    # Iterate over the original labels; `sorted_df` is rebound to a smaller
    # frame inside the loop, which does not affect this iteration.
    for idx in sorted_df.index:
        if idx in dropped_features:
            continue
        # f-string rather than `+` so non-string labels do not raise TypeError.
        print(f"[+] Working with row: {idx}")

        # One row-wise lookup restricted to the columns that are still
        # present, instead of one `.loc` call per cell.
        row_flags = above_pcc_point.loc[idx, sorted_df.columns]
        cols_rows_to_drop = row_flags.index[row_flags.to_numpy(dtype=bool)].tolist()

        print("[-] Detected in this row:")
        print(cols_rows_to_drop)
        dropped_features.update(cols_rows_to_drop)

        # Remove the flagged labels from both the columns and the rows.
        sorted_df = sorted_df.drop(cols_rows_to_drop, axis=1).drop(cols_rows_to_drop, axis=0)
        shapes.append(sorted_df.shape)
        features.append(sorted_df.index.to_list())

        # Nothing left to compare against.
        if len(sorted_df) == 0:
            break
    return sorted_df, shapes, features

In [None]:
# Absolute Pearson threshold above which two descriptors count as redundant.
pcc_point = 0.95

# The first row/column is left out of the pruning
# (presumably the IC50 entry, which sorts first -- TODO confirm).
matrix_before_processing = upper_triangle.iloc[1:, 1:]
result_matrix, shapes, features = remove_correlated_features(
    sorted_df=matrix_before_processing,
    pcc_point=pcc_point,
)

In [None]:
len(result_matrix)

In [None]:
result_features = result_matrix.columns.to_list()

# Cumulative prefixes of the surviving features: the k-th entry holds the
# first k features. Comprehensions are used so that no loop variable named
# `features` overwrites the list returned by remove_correlated_features.
features_to_file = [
    result_features[:end] for end in range(1, len(result_features) + 1)
]
lengths = [len(prefix) for prefix in features_to_file]

In [None]:
# Output locations; the threshold is embedded in both file names.
processed_pcc_matrix_fp = f"../output/pcc_processed_matrix/{pcc_point}_pcc_processed_matrix_unspec_removed(1).xlsx"
shapes_and_features_fp = f"../output/shapes_and_features/{pcc_point}_shapes_and_features_unspec_removed(1).xlsx"

# Attach the first row of the upper-triangle matrix as an 'IC50' column
# (assignment aligns on the feature labels), then save the reduced matrix.
result_matrix['IC50'] = upper_triangle.iloc[0]
result_matrix.to_excel(processed_pcc_matrix_fp, index=True)

# One row per cumulative feature subset: its size and its members.
df_shapes = pd.DataFrame({'Len': lengths, 'Features': features_to_file})
df_shapes.to_excel(shapes_and_features_fp, index=False)

## Choosing the optimal features

### Read from features file

In [None]:
# features_data = pd.read_excel('../output/shapes_and_features.xlsx', sheet_name='Sheet1')
# features_strings = features_data['Features']

In [None]:
# import ast
# list_of_features = []
# for features_string in features_strings:
#     list_of_features.append(ast.literal_eval(features_string))

# features = list_of_features[0]
# len(list_of_features[0])

In [None]:
# authors_features = ['Xc-4dv', 'MATS4s', 'GATS1dv', 'SdssC', 'BCUTd-1h', 'GATS1s', 'GATS1are', 'AATSC3d'
#                     , 'PEOE_VSA2', 'AATS4v', 'AATS4Z', 'GATS3d', 'MATS3m', 'AXp-5dv', 'EState_VSA4',
#                     'ETA_eta_L', 'ATSC5m', 'RotRatio', 'SsNH2', 'ETA_epsilon_5', 'SlogP_VSA2',
#                     'MID_N', 'EState_VSA5', 'Xc-5dv', 'JGI5', 'GATS5pe', 'AATSC4d']
# Descriptor subset used for every model below (order matters for column order).
authors_features = [
    'nSpiro', 'C4SP3', 'GATS8s', 'n5ARing', 'SssCH2', 'nARing',
    'Xpc-5dv', 'ATSC3d', 'Xc-4dv', 'ATSC5d', 'SMR_VSA4', 'GATS8se',
    'n5AHRing', 'MDEC-24', 'ATSC6v', 'GATS2d', 'n6ARing', 'ATSC3v',
    'nAHRing', 'ATSC6m', 'nHRing', 'ATSC2d', 'PEOE_VSA7', 'Xc-3dv',
    'SMR_VSA6', 'GATS4dv', 'MATS4s',
]
len(authors_features)

In [None]:
# Restrict both splits to the selected descriptors, in the same column order.
X_train = train_modred_descriptors.loc[:, authors_features]
X_test = test_mordred_descriptors.loc[:, authors_features]

for selected in (X_train, X_test):
    print(selected.columns)
print(len(X_train.columns))

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn mean/variance from the training features only, then apply the same
# transformation to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train_np = sc.transform(X_train)
X_test_np = sc.transform(X_test)

# Back to DataFrames so the column names survive.
X_train = pd.DataFrame(X_train_np, columns=X_train.columns)
X_test = pd.DataFrame(X_test_np, columns=X_test.columns)

In [None]:
sns.heatmap(X_train.corr())

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

## 2.3. Xây dựng mô hình

#### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 100 entropy-criterion trees with a fixed seed.
rf_des_author = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=0,
)
rf_des_author.fit(X_train, y_train)

#### K-nearest neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5 neighbours, Minkowski metric with p=2.
knn_des_author = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
knn_des_author.fit(X_train, y_train)

#### Support Vector Machine

In [None]:
from sklearn.svm import SVC

# RBF kernel; probability=True so predict_proba is available later.
svm_des_author = SVC(
    kernel='rbf',
    probability=True,
    random_state=0,
)
svm_des_author.fit(X_train, y_train)

#### XGBoost

In [None]:
from xgboost import XGBClassifier

# 100 boosting rounds with the binary logistic objective.
bst_des_author = XGBClassifier(
    n_estimators=100,
    objective='binary:logistic',
)
bst_des_author.fit(X_train, y_train)

# 3. Đánh giá mô hình

### 10-fold-cross validation

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [None]:
# Stack the (scaled) train and test splits into one data set for cross-validation.
X_Total = np.vstack((X_train, X_test))
y_Total = np.hstack((y_train, y_test))

In [None]:
cv = KFold(n_splits=10, random_state=1, shuffle=True)

# Same 10-fold accuracy evaluation for every classifier, reported in the
# order KNN, Random Forest, SVM, XGBoost.
for estimator in (knn_des_author, rf_des_author, svm_des_author, bst_des_author):
    scores = cross_val_score(estimator, X_Total, y_Total, scoring='accuracy', cv=cv, n_jobs=-1)
    print('Độ chính xác của 10-fold cross validation: %.3f (%.3f)' % (scores.mean(), scores.std()))

### Accuracy, Sensitivity, Specificity

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
from tabulate import tabulate
import math

In [None]:
def model_evaluation_calculation(cm, positive_index=0):
    """Derive accuracy, sensitivity, specificity and MCC from a 2x2 confusion matrix.

    The matrix is read with rows as true classes and columns as predicted
    classes, which is how the callers in this notebook build it
    (``confusion_matrix(y_test, y_pred)``).

    Args:
        cm: 2x2 confusion matrix, indexable as ``cm[true][predicted]``.
        positive_index: Row/column index (0 or 1) of the class treated as
            positive. Defaults to 0, matching the original ``tp = cm[0][0]``.
            NOTE(review): confirm which encoded label is the "positive"
            class for this data set.

    Returns:
        Tuple ``(accuracy, sensitivity, specificity, mcc)`` of floats. A
        ratio with a zero denominator is returned as NaN; MCC is returned as
        0.0 when its denominator is zero.

    Raises:
        ValueError: If ``positive_index`` is not 0 or 1.
    """
    if positive_index not in (0, 1):
        raise ValueError("positive_index must be 0 or 1")
    negative_index = 1 - positive_index

    # Row = true class, column = predicted class. A false negative is a true
    # positive-class sample predicted negative (same row as tp); a false
    # positive is a true negative-class sample predicted positive.
    tp = float(cm[positive_index][positive_index])
    fn = float(cm[positive_index][negative_index])
    fp = float(cm[negative_index][positive_index])
    tn = float(cm[negative_index][negative_index])

    def _ratio(numerator, denominator):
        # Undefined ratios become NaN instead of raising ZeroDivisionError.
        return numerator / denominator if denominator else float("nan")

    ac = _ratio(tp + tn, tp + tn + fp + fn)
    se = _ratio(tp, tp + fn)
    sp = _ratio(tn, tn + fp)

    mcc_denominator = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    mcc = (tp * tn - fp * fn) / mcc_denominator if mcc_denominator else 0.0
    return ac, se, sp, mcc

In [None]:
def me_result(cm):
    """Print a confusion matrix followed by a table of its derived metrics.

    Args:
        cm: 2x2 confusion matrix, passed unchanged to
            ``model_evaluation_calculation``.
    """
    print("Confusion matrix:")
    print(cm)
    ac, se, sp, mcc = model_evaluation_calculation(cm)
    print("Comparison:")
    # Five headers for five columns: the first one is intentionally blank
    # because it sits above the row label ('My model').
    table = [
        ['', 'Accuracy', 'Sensitivity', 'Specificity', 'MCC'],
        ['My model', ac, se, sp, mcc],
    ]
    print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

KNN

In [None]:
# KNN: hold-out predictions and their confusion matrix
y_knn_pred = knn_des_author.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_knn_pred)
me_result(cm)

RF

In [None]:
# Random Forest: hold-out predictions and their confusion matrix
y_rf_pred = rf_des_author.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_rf_pred)
me_result(cm)

SVM

In [None]:
# SVM: hold-out predictions and their confusion matrix
y_svm_pred = svm_des_author.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_svm_pred)
me_result(cm)

XGboost

In [None]:
# XGBoost: hold-out predictions and their confusion matrix
y_bst_pred = bst_des_author.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_bst_pred)
me_result(cm)

### AUC

In [None]:
from sklearn.metrics import roc_auc_score

# Second column of predict_proba for each model
# (presumably the probability of the class encoded as 1 -- TODO confirm).
knn_y_proba, rf_y_proba, svm_y_proba, bst_y_proba = (
    model.predict_proba(X_test)[:, 1]
    for model in (knn_des_author, rf_des_author, svm_des_author, bst_des_author)
)

# Area under the ROC curve on the hold-out split, same model order.
knn_auc_score, rf_auc_score, svm_auc_score, bst_auc_score = (
    roc_auc_score(y_test, proba)
    for proba in (knn_y_proba, rf_y_proba, svm_y_proba, bst_y_proba)
)
print(knn_auc_score, rf_auc_score, svm_auc_score, bst_auc_score)

In [None]:
from sklearn.metrics import RocCurveDisplay
fig, ax = plt.subplots(figsize=(6, 6))

# (estimator, legend name, line colour), drawn in this order on the same axes.
roc_series = [
    (rf_des_author, "ROC curve for RF", 'cornflowerblue'),
    (knn_des_author, "ROC curve for KNN", 'darkorange'),
    (bst_des_author, "ROC curve for XGBoost", 'aqua'),
    (svm_des_author, "ROC curve for SVM", 'red'),
]
for roc_estimator, curve_name, curve_color in roc_series:
    RocCurveDisplay.from_estimator(
        estimator=roc_estimator,
        X=X_test,
        y=y_test,
        name=curve_name,
        color=curve_color,
        ax=ax)

# Diagonal reference line for a random classifier, then axis cosmetics.
plt.plot([0, 1], [0, 1], "k--", label="chance level (AUC = 0.5)")
plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Modred Descriptors ROC Curves")
plt.legend()
plt.show()