In [56]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import svm, metrics
from sklearn.decomposition import KernelPCA, PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
import xgboost as xgb

In [57]:
# 1 Load the training data: the molecular descriptor table and the ADMET label
#   table, both read from the 'training' sheet of their workbooks.
train_X_df = pd.read_excel(
    r"D:\GitHub\modeling\D\ans3\Molecular_Descriptor.xlsx",
    sheet_name='training',
    engine='openpyxl',
)
train_Y_df = pd.read_excel(
    r"D:\GitHub\modeling\D\ans3\ADMET.xlsx",
    sheet_name='training',
    engine='openpyxl',
)

In [58]:
# Names of the 20 descriptors selected in question 1.
# NOTE: the subset is only defined here; the selection line below is disabled,
# so the model is currently trained on every descriptor column.
var = [
    'MDEC-23', 'minsssN', 'LipoaffinityIndex', 'maxHsOH', 'maxssO',
    'C1SP2', 'minHsOH', 'BCUTc-1l', 'minsOH', 'minHBint5',
    'MLFER_A', 'nHBAcc', 'VC-5', 'MDEO-12', 'ndssC',
    'TopoPSA', 'ATSc3', 'SHBint10', 'MDEC-33', 'XLogP',
]
var_top20 = pd.Index(var)
# To restrict training to the top-20 subset, use instead:
# X_train_df = train_X_df[var_top20]
X_train_df = train_X_df

# Feature matrix: every column except the first one.
X = X_train_df.iloc[:, 1:].values
# Label vector: the column at positional index 5 of the label table
# (presumably the 'MN' endpoint exported later in this notebook - TODO confirm).
Y = train_Y_df.iloc[:, 5].values

# TODO: find the columns that are all zeros and drop them

In [59]:
# 2 Dimensionality reduction (currently disabled)
# scikit_kpca = KernelPCA()
#
# # Reduce the dimensionality with KPCA and get the projected coordinates directly
# X = scikit_kpca.fit_transform(X)


In [60]:
# 3 Hold out 20% of the samples for evaluation; random_state=33 keeps the
#   split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y,
    test_size=0.2,
    random_state=33,
)

# Z-score standardization so that every feature has zero mean and unit
# variance. The scaler statistics are learned from the training part only and
# then applied to both parts, so the hold-out data does not leak into them.
ss = StandardScaler()
ss.fit(x_train)
x_train = ss.transform(x_train)
x_test = ss.transform(x_test)

In [61]:
# 4 SVM classifier with scikit-learn's default hyper-parameters.
# Alternative that was tried:
# model_svm = svm.SVC(kernel='poly', degree=8, gamma='auto')
# Fit on the training split (fit() returns the estimator itself).
model_svm = svm.SVC().fit(x_train, y_train)
# Predict the hold-out split and report its accuracy.
p_svm = model_svm.predict(x_test)
print('svm准确率: ', metrics.accuracy_score(y_test, p_svm))

svm准确率:  0.9088607594936708


In [62]:
def modelfit(alg, x_train, y_train, x_test, y_test, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit an XGBoost classifier and print its accuracy / AUC on hold-out data.

    When ``useTrainCV`` is true, ``xgb.cv`` is first run on the training data
    with AUC as the metric and early stopping, and ``alg``'s ``n_estimators``
    is replaced by the number of rows in the CV result (the number of
    boosting rounds that were kept).

    Parameters
    ----------
    alg : estimator exposing ``get_xgb_params``, ``get_params``, ``set_params``,
        ``fit``, ``predict`` and ``predict_proba`` (e.g. ``XGBClassifier``).
        It is modified in place (``n_estimators``, ``eval_metric``) and fitted.
    x_train, y_train : training features and labels.
    x_test, y_test : hold-out features and labels used for the printed report.
    useTrainCV : whether to tune ``n_estimators`` with ``xgb.cv`` first.
    cv_folds : number of folds passed to ``xgb.cv``.
    early_stopping_rounds : early-stopping patience passed to ``xgb.cv``.

    Returns
    -------
    None. The report is printed to stdout.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(x_train, label=y_train)
        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics='auc',
            early_stopping_rounds=early_stopping_rounds,
        )
        # One row per boosting round kept by early stopping.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the training data.
    # eval_metric is configured on the estimator instead of being passed to
    # fit(): the fit() keyword is deprecated in recent xgboost releases and
    # rejected by the newest ones.
    alg.set_params(eval_metric='auc')
    alg.fit(x_train, y_train)

    # Predict the hold-out set (class labels and positive-class probability).
    test_predictions = alg.predict(x_test)
    test_predprob = alg.predict_proba(x_test)[:, 1]

    # Print model report. Both numbers are computed on (x_test, y_test),
    # so they are labelled as test metrics.
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(y_test, test_predictions))
    print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, test_predprob))

    # feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)[:20]
    # feat_imp.plot(kind='bar', title='Feature Importances')
    # plt.ylabel('Feature Importance Score')


In [63]:
# Baseline XGBoost configuration (depth 5, 80% row / column subsampling).
# n_estimators is only an upper bound: modelfit() replaces it with the number
# of rounds kept by cross-validated early stopping.
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
modelfit(xgb1, x_train, y_train, x_test, y_test)





Model Report
Accuracy : 0.9468
AUC Score (Train): 0.985990


In [64]:
#Choose all predictors except target & IDcols
xgb_test = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=7,
 min_child_weight=1,
 gamma=0,
 subsample=0.7,
 colsample_bytree=0.7,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

# param_test1 = {
#  'max_depth':range(3,10,2),
#  'min_child_weight':range(1,6,2)
# }
# param_test3 = {
#  'gamma':[i/10.0 for i in range(0,5)]
# }
param_test4 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch1 = GridSearchCV(estimator = xgb_test, param_grid = param_test4 , scoring='roc_auc',n_jobs=4, cv=5)
# gsearch1.fit(x_train,y_train)
# gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_


In [65]:
# Second XGBoost configuration: deeper trees (depth 7) with 70% row / column
# subsampling. As with xgb1, modelfit() overrides n_estimators via CV.
xgb2 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=7,
    min_child_weight=1,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
modelfit(xgb2, x_train, y_train, x_test, y_test)





Model Report
Accuracy : 0.9443
AUC Score (Train): 0.987169


In [66]:
# 5 XGBoost with library defaults (only the booster type is set explicitly).
# Hand-tuned configuration that was tried earlier (disabled):
# model_xgboost = XGBClassifier(learning_rate=0.01,
#                       n_estimators=10,           # number of trees
#                       max_depth=4,               # tree depth
#                       min_child_weight = 1,      # minimum leaf weight
#                       gamma=0.,                  # penalty coefficient on the number of leaves
#                       subsample=1,               # use all samples for every tree
#                       colsample_btree=1,         # use all features for every tree
#                       scale_pos_weight=1,        # class-imbalance weighting
#                       random_state=27,           # random seed
#                       slient = 0
#                       )
# Fit on the training split (fit() returns the estimator itself).
model_xgboost = XGBClassifier(booster='gbtree').fit(x_train, y_train)
# Predict the hold-out split and report its accuracy.
p_xgboost = model_xgboost.predict(x_test)
print('xgboost准确率: ', metrics.accuracy_score(y_test, p_xgboost))





xgboost准确率:  0.9493670886075949


In [67]:
# Predict labels for the 'test' sheet and export them to an Excel file.
test_X_df = pd.read_excel(r"D:\GitHub\modeling\D\ans3\Molecular_Descriptor.xlsx", engine='openpyxl',
                           sheet_name='test')
test_X = test_X_df.iloc[:, 1:].values  # drop the first column
# model_xgboost was fitted on features standardized by `ss`, so the test
# features must go through the same fitted scaler before prediction;
# otherwise the learned split thresholds are applied on the wrong scale.
test_X = ss.transform(test_X)
pred = model_xgboost.predict(test_X)
# Name of the predicted endpoint; used as the column header and the file name.
# col_name = 'CYP3A4'
# col_name = 'hERG'
# col_name = 'HOB'
col_name = 'MN'

# Wrap the predictions in a single-column DataFrame.
df = pd.DataFrame(pred, columns=[col_name])
# Save to a local Excel file named after the endpoint.
df.to_excel(col_name + ".xlsx", index=False)

In [68]:
# 6 Random forest with scikit-learn's default hyper-parameters.
# NOTE: no random_state is set, so the reported accuracy can vary between runs.
# Alternative that was tried:
# model_rf = RandomForestClassifier(n_estimators = 1000, max_depth=None, random_state=0)
# Fit on the training split (fit() returns the estimator itself).
model_rf = RandomForestClassifier().fit(x_train, y_train)
# Predict the hold-out split and report its accuracy.
p_rf = model_rf.predict(x_test)
print('随机森林准确率: ', metrics.accuracy_score(y_test, p_rf))

随机森林准确率:  0.9392405063291139
