In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import auc,roc_auc_score,roc_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from numpy import savetxt,loadtxt,save,load
from sklearn.linear_model import Lasso, LassoCV
from sklearn.metrics import r2_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
import seaborn as sns
# Notebook-wide display option: never truncate rows when rendering a DataFrame/Series.
pd.set_option('display.max_rows', None)

In [None]:
# Load the abundance table and encode the two-class target as 0/1.
A_B = pd.read_csv('abundance.csv')

# Validate labels before mapping: Series.map() turns anything outside the
# mapping (including missing values) into NaN without complaint, which would
# only surface later as an obscure error in stratification or training.
class_codes = {'A': 0, 'B': 1}
unexpected_labels = A_B.loc[~A_B['class'].isin(list(class_codes)), 'class'].unique()
if len(unexpected_labels) > 0:
    raise ValueError(
        f"Unexpected values in 'class' column (expected only {list(class_codes)}): "
        f"{list(unexpected_labels)}"
    )
A_B['class'] = A_B['class'].map(class_codes)

# Feature matrix = every column except the sample identifier and the target.
A_B_X = A_B.drop(['Sample', 'class'], axis=1)
A_B_y = A_B['class']

# Stratified 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    A_B_X, A_B_y, test_size=0.2, stratify=A_B_y, random_state=2024
)

# Plain NumPy views of the training split (all features), used by the Boruta cell.
X_value = X_train.values
y_value = y_train.values

In [None]:
# Boruta feature selection on the training split only (the hold-out split is
# never used to decide which features are kept).
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1, perc=90)
feat_selector.fit(X_value, y_value)

# Take feature names from the unfiltered feature frame (A_B_X) rather than from
# X_train: X_train is narrowed at the end of this cell, so reading its columns
# here would make a re-run of the cell fail with a length mismatch against
# support_/ranking_, which always describe the full feature set in X_value.
columns_result_1 = pd.DataFrame(data={
    'feature': A_B_X.columns,
    'support': feat_selector.support_,
    'ranking': feat_selector.ranking_,
})
print(len(columns_result_1[columns_result_1['support']]))
columns_after_drop_1 = columns_result_1[columns_result_1['support']]
columns_list_1 = list(columns_after_drop_1['feature'])

# Fail early with a clear message instead of handing an empty frame to LightGBM.
if not columns_list_1:
    raise ValueError('Boruta confirmed no features; cannot continue with an empty feature set.')

# Keep train and test in the same feature space: a model fitted on the selected
# columns cannot score a frame that still carries every original column.
X_train = X_train[columns_list_1]
X_test = X_test[columns_list_1]

In [None]:
# ---------------------------------------------------------------------------
# Stage 0: choose the number of boosting rounds with cross-validated early
# stopping at a fixed learning rate.
# ---------------------------------------------------------------------------
params = {'boosting_type': 'gbdt',
          'objective': 'binary',
          'metric': 'auc',
          'nthread': 4,
          'learning_rate': 0.1,
          'num_leaves': 30,
          'max_depth': 5,
          'subsample': 0.8,
          'colsample_bytree': 0.8,
}

data_train = lgb.Dataset(data=X_train, label=y_train)
# Early stopping is requested through the callback API: the
# `early_stopping_rounds=` keyword of lgb.cv was deprecated in LightGBM 3.3 and
# removed in 4.0, while the callback is accepted by both.
cv_results = lgb.cv(params, data_train, num_boost_round=1000,
                    nfold=10, stratified=True, shuffle=True,
                    metrics='auc', seed=0,
                    callbacks=[lgb.early_stopping(stopping_rounds=50)])
# The result key is 'auc-mean' in LightGBM 3.x and 'valid auc-mean' in 4.x,
# so look it up by suffix instead of hard-coding either spelling.
auc_mean_key = next(key for key in cv_results if key.endswith('auc-mean'))
n_estimators = len(cv_results[auc_mean_key])
print('best n_estimators:', n_estimators)
print('best cv score:', pd.Series(cv_results[auc_mean_key]).max())


def _tune_stage(param_grid, fixed_params, cv):
    """Grid-search one group of LightGBM hyper-parameters on the training split.

    Parameters
    ----------
    param_grid : dict
        Candidate values for the parameters tuned in this stage.
    fixed_params : dict
        Parameters held constant in this stage (typically the winners of the
        previous stages). Entries also present in ``param_grid`` are overridden
        by the grid.
    cv : int
        Number of cross-validation folds.

    Returns
    -------
    GridSearchCV
        The fitted search object; its best parameters and best ROC-AUC are
        printed before returning.
    """
    estimator = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary',
                                   metrics='auc', learning_rate=0.1,
                                   n_estimators=n_estimators, **fixed_params)
    search = GridSearchCV(estimator=estimator, param_grid=param_grid,
                          scoring='roc_auc', cv=cv, n_jobs=-1)
    search.fit(X_train, y_train)
    print(search.best_params_)
    print(search.best_score_)
    return search


# Parameters carried from one stage to the next. The two sampling fractions
# start at 0.8 and are replaced once stage 3 has tuned them.
# NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes
# effect when bagging_freq > 0, so it is presumably inert until stage 3 - confirm.
tuned = {'bagging_fraction': 0.8, 'feature_fraction': 0.8}

# Stage 1: tree shape.
params_test1 = {'max_depth': range(3, 8, 1), 'num_leaves': range(5, 100, 5)}
gsearch1 = _tune_stage(params_test1, tuned, cv=10)
max_depth = gsearch1.best_params_['max_depth']
num_leaves = gsearch1.best_params_['num_leaves']
tuned.update(gsearch1.best_params_)

# Stage 2: histogram resolution and minimum leaf size.
params_test2 = {'max_bin': range(5, 256, 10), 'min_data_in_leaf': range(1, 102, 10)}
gsearch2 = _tune_stage(params_test2, tuned, cv=5)
max_bin = gsearch2.best_params_['max_bin']
min_data_in_leaf = gsearch2.best_params_['min_data_in_leaf']
tuned.update(gsearch2.best_params_)

# Stage 3: row / column subsampling (the grid overrides the 0.8 placeholders).
params_test3 = {'feature_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
                'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
                'bagging_freq': range(0, 81, 10)}
gsearch3 = _tune_stage(params_test3, tuned, cv=10)
feature_fraction = gsearch3.best_params_['feature_fraction']
bagging_fraction = gsearch3.best_params_['bagging_fraction']
bagging_freq = gsearch3.best_params_['bagging_freq']
tuned.update(gsearch3.best_params_)

# Stage 4: L1 / L2 regularisation. The original lists contained both 1e-1 and
# 0.1, which are the same float; the later duplicate is dropped (order of the
# remaining values is unchanged) so identical models are not fitted twice.
regularisation_values = [1e-5, 1e-3, 1e-1, 0.0, 0.3, 0.5, 0.7, 0.9, 1.0]
params_test4 = {'lambda_l1': regularisation_values,
                'lambda_l2': regularisation_values}
gsearch4 = _tune_stage(params_test4, tuned, cv=5)
lambda_l1 = gsearch4.best_params_['lambda_l1']
lambda_l2 = gsearch4.best_params_['lambda_l2']
tuned.update(gsearch4.best_params_)

# Stage 5: minimum gain required to make a split.
params_test5 = {'min_split_gain': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
gsearch5 = _tune_stage(params_test5, tuned, cv=10)
min_split_gain = gsearch5.best_params_['min_split_gain']

# Final parameter set handed to lgb.train in the next cell.
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metrics': 'auc',
    'learning_rate': 0.1,
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'num_leaves': num_leaves,
    'max_bin': max_bin,
    'min_data_in_leaf': min_data_in_leaf,
    'bagging_fraction': bagging_fraction,
    'bagging_freq': bagging_freq,
    'feature_fraction': feature_fraction,
    'lambda_l1': lambda_l1,
    'lambda_l2': lambda_l2,
    'min_split_gain': min_split_gain
}

In [None]:
# Fit the final booster on the (feature-selected) training split and compare
# train vs. hold-out ROC curves.
dtrain = lgb.Dataset(data=X_train,
                     label=y_train,
                     free_raw_data=False)
clf = lgb.train(params=lgb_params, train_set=dtrain)

y_train_pred = clf.predict(X_train)
fpr_train, tpr_train, _ = roc_curve(y_train, y_train_pred)
train_auc = auc(fpr_train, tpr_train)

# Score the hold-out split on exactly the columns the booster was trained on.
# X_train was reduced to the Boruta-selected features; passing a frame with a
# different number of columns makes LightGBM reject the input with a
# feature-count mismatch.
X_test_selected = X_test[list(X_train.columns)]
y_test_pred = clf.predict(X_test_selected)
fpr_test, tpr_test, _ = roc_curve(y_test, y_test_pred)
test_auc = auc(fpr_test, tpr_test)

# Explicit figure/axes interface so the plot does not depend on pyplot's
# implicit "current axes" state.
fig, ax = plt.subplots(figsize=(7, 5.5))
ax.set_title('Result Analysis')
ax.plot(fpr_train, tpr_train, color='red',
        lw=2, label='train ROC curve (area = %0.2f)' % train_auc)
ax.plot(fpr_test, tpr_test, color='blue',
        lw=2, label='test ROC curve (area = %0.2f)' % test_auc)
ax.legend()
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
plt.show()