In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score,cross_validate,train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc,accuracy_score,precision_recall_curve
from sklearn import neighbors,preprocessing
from sklearn import tree
from sklearn import ensemble
from sklearn import svm
import sklearn.linear_model as LM
from sklearn.model_selection import GridSearchCV
import sklearn.neural_network as net
import scipy.stats as st
from scipy.optimize import root,fsolve
from sklearn.feature_selection import VarianceThreshold, SelectKBest,f_classif,chi2
from sklearn.feature_selection import RFE,RFECV,SelectFromModel
from sklearn.linear_model import Lasso, LassoCV, lasso_path,Ridge,RidgeCV
from sklearn.linear_model import enet_path,ElasticNetCV,ElasticNet
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from boruta import BorutaPy
from sklearn.decomposition import PCA
import itertools

In [2]:
# Fetch Dataset
data=pd.read_excel('dataset5.xlsx')
X=data.iloc[:,1:86]
Y=data.iloc[:,86]

# Feature selection

In [21]:
# Method 1: Low Variance Filtering
selector=VarianceThreshold(threshold=0.05)
selector.fit(X)
print("Low Variance Filtering: The number of remaining variables is %d"%len(selector.get_support(True)))

colIndex = list(selector.get_support(indices=True))
X_useFeature1 = X.iloc[:,colIndex]
print(X_useFeature1.shape)

Low Variance Filtering: The number of remaining variables is 29
(1628, 29)


In [22]:
# Method 2: Lasso Regression
model = LassoCV()
model.fit(X,Y)
print('The number of remaining variables after Lasso regression: %d'%sum(model.coef_==0))
print('The best alpha is: ',model.alpha_)
lassoAlpha=model.alpha_

estimator = Lasso(alpha=lassoAlpha) 
selector=SelectFromModel(estimator=estimator)
selector.fit(X,Y)
print("The threshold is: %s"%selector.threshold_)
print("Lasso Regression: The number of remaining variables is %d"%len(selector.get_support(indices=True)))

colIndex = list(selector.get_support(indices=True))
X_useFeature2 = X.iloc[:,colIndex]  # get selected features
print(X_useFeature2.shape)

The number of remaining variables after Lasso regression: 61
The best alpha is:  0.0020137797634826008
The threshold is: 1e-05
Lasso Regression: The number of remaining variables is 24
(1628, 24)


In [23]:
# Method 3: Rridge Regression
modelRidge = RidgeCV()
modelRidge.fit(X,Y)
print('The number of remaining variables after Rridge regression: %d'%sum(modelRidge.coef_==0))
print('The best alpha is: ',modelRidge.alpha_) 
ridgeAlpha=modelRidge.alpha_

estimator = Ridge(alpha=ridgeAlpha) 
selector=SelectFromModel(estimator=estimator)
selector.fit(X,Y)
print("The threshold is: %s"%selector.threshold_)
print("Rridge Regression: The number of remaining variables is: %d"%len(selector.get_support(indices=True)))

colIndex = list(selector.get_support(indices=True))
X_useFeature3 = X.iloc[:,colIndex]  # get selected features
print(X_useFeature3.shape)

The number of remaining variables after Rridge regression: 10
The best alpha is:  10.0
The threshold is: 0.0372450007999867
Rridge Regression: The number of remaining variables is: 26
(1628, 26)


In [24]:
# Division of training set and testing set
# Without feature selection
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,train_size=0.70, random_state=123) 

# After Low Variance Filtering
X1_train, X1_test, Y1_train, Y1_test = train_test_split(X_useFeature1,Y,train_size=0.70, random_state=123) 

# After Lasso Regression
X2_train, X2_test, Y2_train, Y2_test = train_test_split(X_useFeature2,Y,train_size=0.70, random_state=123) 

# After Ridge Regression
X3_train, X3_test, Y3_train, Y3_test = train_test_split(X_useFeature3,Y,train_size=0.70, random_state=123) 


In [25]:
# Feature selection function
def useFeatureSelection(model, X_train, Y_train, X_test, Y_test):
    model.fit(X_train,Y_train)
    Y_train_pre=model.predict(X_train)
    print('Model training result: \n',classification_report(Y_train,Y_train_pre, digits=4))

    Y_test_pre=model.predict(X_test)
    print('Model prediction result: \n',classification_report(Y_test,Y_test_pre, digits=4))

# Model 1：KNN

## Without Feature Selection

In [26]:
# Guassain kernel function
def guass(x):
    x=preprocessing.scale(x)
    output=1/np.sqrt(2*np.pi)*np.exp(-x*x/2)
    return output

modelKNN=neighbors.KNeighborsClassifier(n_neighbors=10,weights=guass)
modelKNN.fit(X_train,Y_train)
Y_train_pre=modelKNN.predict(X_train)
print('Model training result: \n',classification_report(Y_train,Y_train_pre, digits=4))

Y_test_pre=modelKNN.predict(X_test)
print('Model prediction result: \n',classification_report(Y_test,Y_test_pre, digits=4))

Model training result: 
               precision    recall  f1-score   support

           0     0.5865    0.6186    0.6021       603
           1     0.5427    0.5093    0.5255       536

    accuracy                         0.5672      1139
   macro avg     0.5646    0.5640    0.5638      1139
weighted avg     0.5659    0.5672    0.5661      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.5526    0.6562    0.6000       256
           1     0.5243    0.4163    0.4641       233

    accuracy                         0.5419       489
   macro avg     0.5385    0.5363    0.5321       489
weighted avg     0.5391    0.5419    0.5353       489



## Using Feature Selection

In [28]:
# Low Variance Filtering
modelKNN1=neighbors.KNeighborsClassifier(n_neighbors=10,weights=guass)
useFeatureSelection(modelKNN1, X1_train, Y1_train, X1_test, Y1_test)

Model training result: 
               precision    recall  f1-score   support

           0     0.6335    0.6650    0.6489       603
           1     0.6008    0.5672    0.5835       536

    accuracy                         0.6190      1139
   macro avg     0.6171    0.6161    0.6162      1139
weighted avg     0.6181    0.6190    0.6181      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.5545    0.6562    0.6011       256
           1     0.5269    0.4206    0.4678       233

    accuracy                         0.5440       489
   macro avg     0.5407    0.5384    0.5344       489
weighted avg     0.5413    0.5440    0.5376       489



In [29]:
# Lasso Regression
modelKNN2=neighbors.KNeighborsClassifier(n_neighbors=10,weights=guass)
useFeatureSelection(modelKNN2, X2_train, Y2_train, X2_test, Y2_test)

Model training result: 
               precision    recall  f1-score   support

           0     0.6210    0.6468    0.6336       603
           1     0.5832    0.5560    0.5692       536

    accuracy                         0.6040      1139
   macro avg     0.6021    0.6014    0.6014      1139
weighted avg     0.6032    0.6040    0.6033      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.5537    0.6641    0.6039       256
           1     0.5275    0.4120    0.4627       233

    accuracy                         0.5440       489
   macro avg     0.5406    0.5380    0.5333       489
weighted avg     0.5412    0.5440    0.5366       489



In [30]:
# Ridge Regression
modelKNN3=neighbors.KNeighborsClassifier(n_neighbors=10,weights=guass)
useFeatureSelection(modelKNN3, X3_train, Y3_train, X3_test, Y3_test)

Model training result: 
               precision    recall  f1-score   support

           0     0.8529    0.9420    0.8952       603
           1     0.9260    0.8172    0.8682       536

    accuracy                         0.8832      1139
   macro avg     0.8894    0.8796    0.8817      1139
weighted avg     0.8873    0.8832    0.8825      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.8063    0.8945    0.8481       256
           1     0.8683    0.7639    0.8128       233

    accuracy                         0.8323       489
   macro avg     0.8373    0.8292    0.8305       489
weighted avg     0.8359    0.8323    0.8313       489



# Model 2: Naive Bayse

## Without Feature Selection

In [31]:
modelNB = GaussianNB()
modelNB.fit(X_train, Y_train)
print('Model training result: \n',classification_report(Y_train,modelNB.predict(X_train), digits=4))
print('Model prediction result: \n',classification_report(Y_test,modelNB.predict(X_test), digits=4))

Model training result: 
               precision    recall  f1-score   support

           0     0.9274    0.2753    0.4246       603
           1     0.5448    0.9757    0.6992       536

    accuracy                         0.6049      1139
   macro avg     0.7361    0.6255    0.5619      1139
weighted avg     0.7473    0.6049    0.5538      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.8800    0.2578    0.3988       256
           1     0.5411    0.9614    0.6924       233

    accuracy                         0.5930       489
   macro avg     0.7105    0.6096    0.5456       489
weighted avg     0.7185    0.5930    0.5387       489



## Using Feature Selection

In [32]:
# Low Variance Filtering
modelNB1=GaussianNB()
useFeatureSelection(modelNB1, X1_train, Y1_train, X1_test, Y1_test)

Model training result: 
               precision    recall  f1-score   support

           0     0.7981    0.9768    0.8784       603
           1     0.9651    0.7220    0.8260       536

    accuracy                         0.8569      1139
   macro avg     0.8816    0.8494    0.8522      1139
weighted avg     0.8767    0.8569    0.8538      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.8058    0.9727    0.8814       256
           1     0.9611    0.7425    0.8378       233

    accuracy                         0.8630       489
   macro avg     0.8835    0.8576    0.8596       489
weighted avg     0.8798    0.8630    0.8606       489



In [33]:
# Lasso Regression
modelNB2=GaussianNB()
useFeatureSelection(modelNB2, X2_train, Y2_train, X2_test, Y2_test)

Model training result: 
               precision    recall  f1-score   support

           0     0.7891    0.9801    0.8743       603
           1     0.9692    0.7052    0.8164       536

    accuracy                         0.8507      1139
   macro avg     0.8791    0.8427    0.8453      1139
weighted avg     0.8738    0.8507    0.8470      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.8091    0.9766    0.8850       256
           1     0.9667    0.7468    0.8426       233

    accuracy                         0.8671       489
   macro avg     0.8879    0.8617    0.8638       489
weighted avg     0.8842    0.8671    0.8648       489



In [34]:
# Ridge Regression
modelNB3=GaussianNB()
useFeatureSelection(modelNB3, X3_train, Y3_train, X3_test, Y3_test)

Model training result: 
               precision    recall  f1-score   support

           0     0.9263    0.8342    0.8778       603
           1     0.8322    0.9254    0.8763       536

    accuracy                         0.8771      1139
   macro avg     0.8793    0.8798    0.8771      1139
weighted avg     0.8820    0.8771    0.8771      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.9159    0.8086    0.8589       256
           1     0.8137    0.9185    0.8629       233

    accuracy                         0.8609       489
   macro avg     0.8648    0.8635    0.8609       489
weighted avg     0.8672    0.8609    0.8608       489



# Model 3: Decision Tree

## Without Feature Selection

In [35]:
modelDTC = tree.DecisionTreeClassifier(max_depth=10,random_state=123)
modelDTC.fit(X_train, Y_train)
print('Model training result: \n',classification_report(Y_train,modelDTC.predict(X_train), digits=4))
print('Model prediction result: \n',classification_report(Y_test,modelDTC.predict(X_test), digits=4))

Model training result: 
               precision    recall  f1-score   support

           0     0.9633    1.0000    0.9813       603
           1     1.0000    0.9571    0.9781       536

    accuracy                         0.9798      1139
   macro avg     0.9816    0.9785    0.9797      1139
weighted avg     0.9805    0.9798    0.9798      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.9020    0.8984    0.9002       256
           1     0.8889    0.8927    0.8908       233

    accuracy                         0.8957       489
   macro avg     0.8954    0.8956    0.8955       489
weighted avg     0.8957    0.8957    0.8957       489



## Using Feature Selection

In [36]:
# Low Variance Filtering
modelDTC1 = tree.DecisionTreeClassifier(max_depth=10,random_state=123)
useFeatureSelection(modelDTC1, X1_train, Y1_train, X1_test, Y1_test)

Model training result: 
               precision    recall  f1-score   support

           0     0.9742    1.0000    0.9869       603
           1     1.0000    0.9701    0.9848       536

    accuracy                         0.9860      1139
   macro avg     0.9871    0.9851    0.9859      1139
weighted avg     0.9863    0.9860    0.9859      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.8880    0.8984    0.8932       256
           1     0.8870    0.8755    0.8812       233

    accuracy                         0.8875       489
   macro avg     0.8875    0.8870    0.8872       489
weighted avg     0.8875    0.8875    0.8875       489



In [37]:
# Lasso  Regression
modelDTC2 = tree.DecisionTreeClassifier(max_depth=10,random_state=123)
useFeatureSelection(modelDTC2, X2_train, Y2_train, X2_test, Y2_test)

Model training result: 
               precision    recall  f1-score   support

           0     0.9773    1.0000    0.9885       603
           1     1.0000    0.9739    0.9868       536

    accuracy                         0.9877      1139
   macro avg     0.9887    0.9869    0.9876      1139
weighted avg     0.9880    0.9877    0.9877      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.8837    0.8906    0.8872       256
           1     0.8788    0.8712    0.8750       233

    accuracy                         0.8814       489
   macro avg     0.8813    0.8809    0.8811       489
weighted avg     0.8814    0.8814    0.8814       489



In [38]:
# Rridge Regression
modelDTC3 = tree.DecisionTreeClassifier(max_depth=10,random_state=123)
useFeatureSelection(modelDTC3, X3_train, Y3_train, X3_test, Y3_test)

Model training result: 
               precision    recall  f1-score   support

           0     0.9436    0.9983    0.9702       603
           1     0.9980    0.9328    0.9643       536

    accuracy                         0.9675      1139
   macro avg     0.9708    0.9656    0.9673      1139
weighted avg     0.9692    0.9675    0.9674      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.8839    0.9219    0.9025       256
           1     0.9099    0.8670    0.8879       233

    accuracy                         0.8957       489
   macro avg     0.8969    0.8944    0.8952       489
weighted avg     0.8963    0.8957    0.8955       489



# Model 4: Random Forest

In [40]:
# Feature Selection
def useFeatureSelection2(model, X_train, Y_train, X_test, Y_test):
    model.fit(X_train,Y_train)
    Y_train_pre=model.predict(X_train).round().astype(int)
    print('Model training result: \n',classification_report(np.array(Y_train),Y_train_pre, digits=4))

    Y_test_pre=model.predict(X_test).round().astype(int)
    print('Model prediction result: \n',classification_report(np.array(Y_test),Y_test_pre, digits=4))

## Without Feature Selection

In [41]:
RF=ensemble.RandomForestClassifier(n_estimators=120,oob_score=True,random_state=123,
                                      bootstrap=True)
RF.fit(X_train,Y_train)    
pre_rf_train=RF.predict(X_train).round().astype(int)
pre_rf_test=RF.predict(X_test).round().astype(int)
print('Model training result: \n',classification_report(np.array(Y_train),pre_rf_train, digits=4))
print('Model prediction result: \n',classification_report(np.array(Y_test),pre_rf_test, digits=4))

Model training result: 
               precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       603
           1     1.0000    1.0000    1.0000       536

    accuracy                         1.0000      1139
   macro avg     1.0000    1.0000    1.0000      1139
weighted avg     1.0000    1.0000    1.0000      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.9067    0.9492    0.9275       256
           1     0.9412    0.8927    0.9163       233

    accuracy                         0.9223       489
   macro avg     0.9239    0.9210    0.9219       489
weighted avg     0.9231    0.9223    0.9222       489



## Using Feature Selection

In [42]:
# Low Variance Filtering
RF1=ensemble.RandomForestClassifier(n_estimators=120,oob_score=True,random_state=123,
                                      bootstrap=True)
useFeatureSelection2(RF1, X1_train, Y1_train, X1_test, Y1_test)

Model training result: 
               precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       603
           1     1.0000    1.0000    1.0000       536

    accuracy                         1.0000      1139
   macro avg     1.0000    1.0000    1.0000      1139
weighted avg     1.0000    1.0000    1.0000      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.9011    0.9609    0.9301       256
           1     0.9537    0.8841    0.9176       233

    accuracy                         0.9243       489
   macro avg     0.9274    0.9225    0.9238       489
weighted avg     0.9262    0.9243    0.9241       489



In [43]:
# Lasso Regression
RF2=ensemble.RandomForestClassifier(n_estimators=120,
                                  oob_score=True,random_state=123,bootstrap=True)
useFeatureSelection2(RF2, X2_train, Y2_train, X2_test, Y2_test)

Model training result: 
               precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       603
           1     1.0000    1.0000    1.0000       536

    accuracy                         1.0000      1139
   macro avg     1.0000    1.0000    1.0000      1139
weighted avg     1.0000    1.0000    1.0000      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.9124    0.9766    0.9434       256
           1     0.9721    0.8970    0.9330       233

    accuracy                         0.9387       489
   macro avg     0.9423    0.9368    0.9382       489
weighted avg     0.9408    0.9387    0.9385       489



In [44]:
# Ridge Regression
RF3=ensemble.RandomForestClassifier(n_estimators=120,
                                  oob_score=True,random_state=123,bootstrap=True)
useFeatureSelection2(RF3, X3_train, Y3_train, X3_test, Y3_test)

Model training result: 
               precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       603
           1     1.0000    1.0000    1.0000       536

    accuracy                         1.0000      1139
   macro avg     1.0000    1.0000    1.0000      1139
weighted avg     1.0000    1.0000    1.0000      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.9067    0.9492    0.9275       256
           1     0.9412    0.8927    0.9163       233

    accuracy                         0.9223       489
   macro avg     0.9239    0.9210    0.9219       489
weighted avg     0.9231    0.9223    0.9222       489



# Model 5: AdaBoost

## Without Feature Selection

In [45]:
# Building weak model
dt_stump = tree.DecisionTreeClassifier(max_depth=8, min_samples_leaf=1)
adaBoost = ensemble.AdaBoostClassifier(estimator=dt_stump,n_estimators=150,
                                      random_state=123)
adaBoost.fit(X_train,Y_train)

pre_adaBoost_train=adaBoost.predict(X_train).round().astype(int)
pre_adaBoost_test=adaBoost.predict(X_test).round().astype(int)
print('Model training result: \n',classification_report(np.array(Y_train),pre_adaBoost_train, digits=4))
print('Model prediction result: \n',classification_report(np.array(Y_test),pre_adaBoost_test, digits=4))



Model training result: 
               precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       603
           1     1.0000    1.0000    1.0000       536

    accuracy                         1.0000      1139
   macro avg     1.0000    1.0000    1.0000      1139
weighted avg     1.0000    1.0000    1.0000      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.8826    0.9688    0.9236       256
           1     0.9615    0.8584    0.9070       233

    accuracy                         0.9162       489
   macro avg     0.9221    0.9136    0.9153       489
weighted avg     0.9202    0.9162    0.9157       489



## Using Feature Selection 

In [50]:
# Low Variance Filtering
adaBoost1 = ensemble.AdaBoostClassifier(estimator=dt_stump,n_estimators=150,
                                      random_state=123)
useFeatureSelection2(adaBoost1, X1_train, Y1_train, X1_test, Y1_test)



Model training result: 
               precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       603
           1     1.0000    1.0000    1.0000       536

    accuracy                         1.0000      1139
   macro avg     1.0000    1.0000    1.0000      1139
weighted avg     1.0000    1.0000    1.0000      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.8821    0.9648    0.9216       256
           1     0.9569    0.8584    0.9050       233

    accuracy                         0.9141       489
   macro avg     0.9195    0.9116    0.9133       489
weighted avg     0.9178    0.9141    0.9137       489



In [52]:
# Lasso Regression
adaBoost2 = ensemble.AdaBoostClassifier(estimator=dt_stump,n_estimators=150,
                                      random_state=123)
useFeatureSelection2(adaBoost2, X2_train, Y2_train, X2_test, Y2_test)



Model training result: 
               precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       603
           1     1.0000    1.0000    1.0000       536

    accuracy                         1.0000      1139
   macro avg     1.0000    1.0000    1.0000      1139
weighted avg     1.0000    1.0000    1.0000      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.8885    0.9648    0.9251       256
           1     0.9573    0.8670    0.9099       233

    accuracy                         0.9182       489
   macro avg     0.9229    0.9159    0.9175       489
weighted avg     0.9213    0.9182    0.9179       489



In [53]:
# Ridge Regression
adaBoost3 = ensemble.AdaBoostClassifier(estimator=dt_stump,n_estimators=150,
                                      random_state=123)
useFeatureSelection2(adaBoost3, X3_train, Y3_train, X3_test, Y3_test)



Model training result: 
               precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       603
           1     1.0000    1.0000    1.0000       536

    accuracy                         1.0000      1139
   macro avg     1.0000    1.0000    1.0000      1139
weighted avg     1.0000    1.0000    1.0000      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.8713    0.9258    0.8977       256
           1     0.9124    0.8498    0.8800       233

    accuracy                         0.8896       489
   macro avg     0.8919    0.8878    0.8889       489
weighted avg     0.8909    0.8896    0.8893       489



# Model 6: MLP

## Without Feature Selection 

In [55]:
NeuNet=net.MLPClassifier(activation='logistic',random_state=123,hidden_layer_sizes=(60,),max_iter=2000)
NeuNet.fit(X_train,Y_train)
pre_net_train=NeuNet.predict(X_train)
pre_net_test=NeuNet.predict(X_test)
print('Model traing Result: \n',classification_report(Y_train, pre_net_train, digits=4))
print('Model prediction Result: \n',classification_report(Y_test, pre_net_test, digits=4))

Model traing Result: 
               precision    recall  f1-score   support

           0     0.9215    0.9536    0.9372       603
           1     0.9456    0.9086    0.9267       536

    accuracy                         0.9324      1139
   macro avg     0.9336    0.9311    0.9320      1139
weighted avg     0.9328    0.9324    0.9323      1139

Model prediction Result: 
               precision    recall  f1-score   support

           0     0.8806    0.9219    0.9008       256
           1     0.9095    0.8627    0.8855       233

    accuracy                         0.8937       489
   macro avg     0.8950    0.8923    0.8931       489
weighted avg     0.8944    0.8937    0.8935       489



## Using Feature Selection 

In [56]:
# Low Varaince Filtering
NeuNet1=net.MLPClassifier(activation='logistic',random_state=123,hidden_layer_sizes=(60,),max_iter=2000)
useFeatureSelection(NeuNet1, X1_train, Y1_train, X1_test, Y1_test)

Model training result: 
               precision    recall  f1-score   support

           0     0.9221    0.9420    0.9319       603
           1     0.9331    0.9104    0.9216       536

    accuracy                         0.9271      1139
   macro avg     0.9276    0.9262    0.9268      1139
weighted avg     0.9273    0.9271    0.9271      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.8962    0.9102    0.9031       256
           1     0.8996    0.8841    0.8918       233

    accuracy                         0.8978       489
   macro avg     0.8979    0.8971    0.8974       489
weighted avg     0.8978    0.8978    0.8977       489



In [57]:
# Lasso Regression
NeuNet2=net.MLPClassifier(activation='logistic',random_state=123,hidden_layer_sizes=(60,),max_iter=2000)
useFeatureSelection(NeuNet2, X2_train, Y2_train, X2_test, Y2_test)

Model training result: 
               precision    recall  f1-score   support

           0     0.9141    0.9701    0.9413       603
           1     0.9639    0.8974    0.9295       536

    accuracy                         0.9359      1139
   macro avg     0.9390    0.9338    0.9354      1139
weighted avg     0.9375    0.9359    0.9357      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.8843    0.9258    0.9046       256
           1     0.9140    0.8670    0.8899       233

    accuracy                         0.8978       489
   macro avg     0.8992    0.8964    0.8972       489
weighted avg     0.8985    0.8978    0.8976       489



In [58]:
# Ridge Regression
NeuNet3=net.MLPClassifier(activation='logistic',random_state=123,hidden_layer_sizes=(60,),max_iter=2000)
useFeatureSelection(NeuNet3, X3_train, Y3_train, X3_test, Y3_test)

Model training result: 
               precision    recall  f1-score   support

           0     0.9168    0.9320    0.9243       603
           1     0.9221    0.9049    0.9134       536

    accuracy                         0.9192      1139
   macro avg     0.9194    0.9184    0.9189      1139
weighted avg     0.9193    0.9192    0.9192      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.9144    0.9180    0.9162       256
           1     0.9095    0.9056    0.9075       233

    accuracy                         0.9121       489
   macro avg     0.9119    0.9118    0.9119       489
weighted avg     0.9121    0.9121    0.9121       489



# Model 7: SVM

## Without Feature Selection

In [59]:
parameters = {'C':[1, 3, 5, 7, 9, 11, 13, 15, 17, 19],
              'gamma':[0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000]}# pre-setting some parameters


modelSVC=svm.SVC(kernel='rbf')
clf = GridSearchCV(modelSVC, parameters, cv=5, n_jobs=8)# Grid search with 5-fold cross validation
clf.fit(X_train,Y_train)

pre_svm_train=clf.predict(X_train)
pre_svm_test=clf.predict(X_test)
print('The best hyper-parameters: ', clf.best_params_)
print('Model training result: \n',classification_report(Y_train, pre_svm_train, digits=4))
print('Model prediction result: \n',classification_report(Y_test, pre_svm_test, digits=4))

The best hyper-parameters:  {'C': 19, 'gamma': 0.0001}
Model training result: 
               precision    recall  f1-score   support

           0     0.8320    0.8706    0.8509       603
           1     0.8465    0.8022    0.8238       536

    accuracy                         0.8385      1139
   macro avg     0.8392    0.8364    0.8373      1139
weighted avg     0.8388    0.8385    0.8381      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.6854    0.7148    0.6998       256
           1     0.6712    0.6395    0.6549       233

    accuracy                         0.6789       489
   macro avg     0.6783    0.6772    0.6774       489
weighted avg     0.6786    0.6789    0.6784       489



## Using Feature Selection

In [60]:
# Low Variance Filtering
clf1 = GridSearchCV(modelSVC, parameters, cv=5, n_jobs=8)# Grid search with 5-fold cross validation
useFeatureSelection(clf1, X1_train, Y1_train, X1_test, Y1_test)

Model training result: 
               precision    recall  f1-score   support

           0     0.8317    0.8690    0.8500       603
           1     0.8448    0.8022    0.8230       536

    accuracy                         0.8376      1139
   macro avg     0.8383    0.8356    0.8365      1139
weighted avg     0.8379    0.8376    0.8373      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.6854    0.7148    0.6998       256
           1     0.6712    0.6395    0.6549       233

    accuracy                         0.6789       489
   macro avg     0.6783    0.6772    0.6774       489
weighted avg     0.6786    0.6789    0.6784       489



In [61]:
# Lasso Regression
clf2 = GridSearchCV(modelSVC, parameters, cv=5, n_jobs=8)# Grid search with 5-fold cross validation
useFeatureSelection(clf2, X2_train, Y2_train, X2_test, Y2_test)

Model training result: 
               precision    recall  f1-score   support

           0     0.8167    0.8574    0.8366       603
           1     0.8300    0.7836    0.8061       536

    accuracy                         0.8227      1139
   macro avg     0.8234    0.8205    0.8214      1139
weighted avg     0.8230    0.8227    0.8223      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.6766    0.7109    0.6933       256
           1     0.6636    0.6266    0.6446       233

    accuracy                         0.6708       489
   macro avg     0.6701    0.6688    0.6690       489
weighted avg     0.6704    0.6708    0.6701       489



In [62]:
# Ridge Regression
clf3 = GridSearchCV(modelSVC, parameters, cv=5, n_jobs=8)# Grid search with 5-fold cross validation
useFeatureSelection(clf3, X3_train, Y3_train, X3_test, Y3_test)

Model training result: 
               precision    recall  f1-score   support

           0     0.9147    0.9071    0.9109       603
           1     0.8965    0.9049    0.9006       536

    accuracy                         0.9061      1139
   macro avg     0.9056    0.9060    0.9058      1139
weighted avg     0.9061    0.9061    0.9061      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.9109    0.9180    0.9144       256
           1     0.9091    0.9013    0.9052       233

    accuracy                         0.9100       489
   macro avg     0.9100    0.9096    0.9098       489
weighted avg     0.9100    0.9100    0.9100       489



# Model 8: XGBoost

## Without Feature Selection

In [65]:
XGBmodel = XGBClassifier(learning_rate=0.3,
                          n_estimators=150,
                          max_depth=12)

XGBmodel.fit(X_train, Y_train)

pre_XGBoost_train=XGBmodel.predict(X_train)
pre_XGBoost_test=XGBmodel.predict(X_test)
print('Model training result: \n',classification_report(Y_train,pre_XGBoost_train, digits=4))
print('Model prediction result: \n',classification_report(Y_test,pre_XGBoost_test, digits=4))


Model training result: 
               precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       603
           1     1.0000    1.0000    1.0000       536

    accuracy                         1.0000      1139
   macro avg     1.0000    1.0000    1.0000      1139
weighted avg     1.0000    1.0000    1.0000      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.9067    0.9492    0.9275       256
           1     0.9412    0.8927    0.9163       233

    accuracy                         0.9223       489
   macro avg     0.9239    0.9210    0.9219       489
weighted avg     0.9231    0.9223    0.9222       489



## Using Feature Selection

In [68]:
# Low Variance Filtering
XGBmodel1 = XGBClassifier(learning_rate=0.3,
                          n_estimators=150,
                          max_depth=12)
useFeatureSelection(XGBmodel1, X1_train, Y1_train, X1_test, Y1_test)

Model training result: 
               precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       603
           1     1.0000    1.0000    1.0000       536

    accuracy                         1.0000      1139
   macro avg     1.0000    1.0000    1.0000      1139
weighted avg     1.0000    1.0000    1.0000      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.9064    0.9453    0.9254       256
           1     0.9369    0.8927    0.9143       233

    accuracy                         0.9202       489
   macro avg     0.9217    0.9190    0.9199       489
weighted avg     0.9209    0.9202    0.9201       489



In [69]:
# Lasso Regression
XGBmodel2 = XGBClassifier(learning_rate=0.3,
                          n_estimators=150,
                          max_depth=12)
useFeatureSelection(XGBmodel2, X2_train, Y2_train, X2_test, Y2_test)

Model training result: 
               precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       603
           1     1.0000    1.0000    1.0000       536

    accuracy                         1.0000      1139
   macro avg     1.0000    1.0000    1.0000      1139
weighted avg     1.0000    1.0000    1.0000      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.9074    0.9570    0.9316       256
           1     0.9498    0.8927    0.9204       233

    accuracy                         0.9264       489
   macro avg     0.9286    0.9249    0.9260       489
weighted avg     0.9276    0.9264    0.9262       489



In [70]:
# Ridge Regression
XGBmodel3 = XGBClassifier(learning_rate=0.3,
                          n_estimators=150,
                          max_depth=12)
useFeatureSelection(XGBmodel3, X3_train, Y3_train, X3_test, Y3_test)

Model training result: 
               precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       603
           1     1.0000    1.0000    1.0000       536

    accuracy                         1.0000      1139
   macro avg     1.0000    1.0000    1.0000      1139
weighted avg     1.0000    1.0000    1.0000      1139

Model prediction result: 
               precision    recall  f1-score   support

           0     0.8787    0.9336    0.9053       256
           1     0.9217    0.8584    0.8889       233

    accuracy                         0.8978       489
   macro avg     0.9002    0.8960    0.8971       489
weighted avg     0.8992    0.8978    0.8975       489

