
# Appendix 3. Optimisation of XGBClassifier with data features selected based on the K highest Chi-squared scores

## Dissertation, CP70017E, June 2022
### Supervisor: Professor Konstantin Nikolic
### Student: Mariya Ivanova, 21435612


In [1]:
import warnings
# Ignore every warning from here on so the cell outputs stay uncluttered.
# NOTE(review): this also hides deprecation and fit-failure warnings raised
# by the libraries used in the cells below.
warnings.filterwarnings('ignore')

In [2]:
# Restore the train/test splits stored with %store in Appendix 1, In[76]
%store -r X_train
%store -r X_test
%store -r y_train
%store -r y_test

In [None]:
# XGBClassifier (eXtreme Gradient Boosting)
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

xgbc = XGBClassifier(verbosity=0, use_label_encoder=False)

# Candidate values for the exhaustive search. GridSearchCV tries every listed
# value literally, so each list holds numeric settings only: a string such as
# the parameter's own name is not a valid value and would make every fit that
# uses it fail (and be scored as NaN) while wasting most of the search time.
# NOTE(review): triples such as [3, 18, 1] or [40, 180, 1] look like
# (low, high, step) bounds of a sampled search space; here they are three
# literal candidates. They are kept unchanged so the reported best parameters
# stay reproducible - confirm whether a range was intended.
parameters = {
    'max_depth': [3, 18, 1],
    'gamma': [1, 9],
    'reg_alpha': [40, 180, 1],
    'reg_lambda': [0, 1],
    'colsample_bytree': [0.5, 1],
    'min_child_weight': [0, 10, 1],
    'n_estimators': [180, 100],
    'seed': [0],  # fixed seed so repeated runs give the same result
}

# Instantiate the GridSearchCV object (10-fold cross-validation) and run the search
searcher = GridSearchCV(xgbc, parameters, cv=10)
searcher.fit(X_train, y_train)


# Report the best parameters and the corresponding score
print("Best CV params", searcher.best_params_)
print("Best CV accuracy", searcher.best_score_)
print("Test accuracy of best grid search hypers:",
      searcher.score(X_test, y_test))

In [None]:
# Fit a single classifier with fixed hyperparameters
# (presumably the best combination reported by the grid search).
best_hyperparameters = dict(
    colsample_bytree=0.5,
    gamma=1,
    max_depth=3,
    min_child_weight=0,
    n_estimators=180,
    reg_alpha=1,
    reg_lambda=0,
    seed=0,
)
xgbc = XGBClassifier(**best_hyperparameters)

xgbc.fit(X_train, y_train)
print(xgbc)

In [None]:
from sklearn.metrics import accuracy_score

# Predict both splits with the fitted model, then report the accuracy of each.
y_test_pred = xgbc.predict(X_test)
y_train_pred = xgbc.predict(X_train)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy train: ", train_accuracy)
print("Accuracy test: ", test_accuracy)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns
# Set the seaborn figure size used by the plots.
sns.set(rc={'figure.figsize': (15, 8)})

# Confusion matrix of the test-set predictions, labelled with the model's classes.
class_labels = xgbc.classes_
cm = confusion_matrix(y_test, y_test_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()
plt.show()

Observation: on the test set the model makes 94 correct predictions (44 + 50) and 8 incorrect predictions (7 + 1).

In [None]:
from sklearn.metrics import classification_report

# Build the text report for the test-set predictions, then print it under a heading.
report_text = classification_report(y_test, y_test_pred)
print('XGBClassifier Classification Report :\n\n', report_text)

In [None]:
from xgboost import plot_tree

# Draw a tree of the fitted model (no tree index is passed, so xgboost's
# default selection is used) and display the figure.
plot_tree(xgbc)
plt.show()

In [None]:
# ROC
from sklearn.metrics import roc_curve  

def plot_roc_cur(fper, tper):
    """Plot a ROC curve and show the figure.

    fper -- false positive rates, used as x values.
    tper -- true positive rates, used as y values.

    A dashed diagonal from (0, 0) to (1, 1) is drawn alongside the curve
    as a reference line.
    """
    diagonal = [0, 1]
    plt.plot(fper, tper, color='orange', label='ROC')
    plt.plot(diagonal, diagonal, color='darkblue', linestyle='--')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()
    
# Keep only column 1 of predict_proba (the probability of the second class)
# as the score for the ROC curve on the test set.
probs = xgbc.predict_proba(X_test)[:, 1]
fper, tper, thresholds = roc_curve(y_test, probs)
plot_roc_cur(fper, tper)

[<a href="#content">Back to top</a>]

## Predicting the bio-applicability of a new nanorobot

In [None]:
# Use the model to predict a new nanorobot's bio-applicability
import pandas as pd

# Pickle the fitted classifier to disk and load it back from the same file.
model_path = r'C:\Users\ivano\Dissertation\new_model.pickle'
model = xgbc
pd.to_pickle(model, model_path)
model = pd.read_pickle(model_path)

In [None]:
# New nanorobot (NR) feature values entered by the user.
# Mass, parsed as a float (a non-numeric entry raises ValueError).
Mass = float(input("Enter mass: "))

In [None]:
# Zeta potential of the new nanorobot, parsed as a float.
Zeta_potential = float(input("Enter zeta_potential: "))

In [None]:
# Concentration of the new nanorobot, parsed as a float.
Concentration = float(input("Enter concentration: "))

In [None]:
# eV value of the new nanorobot, parsed as a float.
eV = float(input("Enter eV: "))

In [None]:
# Toxicity of the new nanorobot, parsed as a float. The prompt names the
# quantity actually being read so the user does not enter the zeta potential twice.
Toxicity = float(input("Enter toxicity: "))

In [None]:
# Indicator for "no energy source needed", parsed as a float.
# NOTE(review): presumably 1 for yes and 0 for no - confirm against the training data encoding.
Energy_source_no = float(input("No need of energy source: "))

In [None]:
# Indicator for a Janus-shaped nanorobot, parsed as a float.
# NOTE(review): presumably 1 for yes and 0 for no - confirm against the training data encoding.
Shape_Janus = float(input("Is it a Janus shape: "))

In [None]:
# Create a one-row DataFrame from the new values (one column per entered feature)
new_values = {
    'mass': Mass,
    'zeta_potential': Zeta_potential,
    'concentration': Concentration,
    'ev': eV,
    'toxicity': Toxicity,
    'energy_source_no': Energy_source_no,
    'shape_Janus': Shape_Janus,
}

# Each scalar is wrapped in a single-element list so pandas builds exactly one row.
dataf = pd.DataFrame({column: [value] for column, value in new_values.items()})
dataf

In [None]:
# Metadata: one row per column of dataf with its role, variable type,
# preserve flag and pandas dtype.

# Columns treated as real-valued model inputs; any column whose name
# contains 'mass' is treated the same way.
real_input_names = {'zeta_potential', 'concentration', 'ev', 'toxicity',
                    'energy_source_no', 'shape_Janus'}

data = []
for feature in dataf.columns:
    is_real_input = 'mass' in feature or feature in real_input_names

    # Role and type are set explicitly on every iteration, so a column that
    # is not recognised gets None instead of raising NameError or silently
    # reusing the previous column's values.
    use = 'input' if is_real_input else None
    # Named var_type so the built-in type() is not shadowed for later cells.
    var_type = 'real' if is_real_input else None

    data.append({
        'varname': feature,
        'use': use,
        'type': var_type,
        'preserve': True,  # preserve is True for all variables
        'dtype': dataf[feature].dtype,
    })


meta = pd.DataFrame(data, columns=['varname', 'use', 'type', 'preserve', 'dtype'])
meta.set_index('varname', inplace=True)
# Raise the display limit so every metadata row is printed.
pd.set_option('display.max_rows', meta.shape[0]+1)
print(meta)

In [None]:
# Create interaction variables (all degree-2 terms of the preserved real features)
from sklearn.preprocessing import PolynomialFeatures

is_preserved_real = (meta.type == 'real') & (meta.preserve)
v = meta[is_preserved_real].index

poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
expanded_values = poly.fit_transform(dataf[v])
interactions = pd.DataFrame(data=expanded_values,
                            columns=poly.get_feature_names_out(v))

# Keep only the generated terms: the original columns are already in dataf
interactions = interactions.drop(columns=v)

# Append the interaction variables to the new nanorobot's data
n_before = dataf.shape[1]
print('Before creating interactions we have {} variables'.format(n_before))
dataf = pd.concat([dataf, interactions], axis=1)
n_after = dataf.shape[1]
print('After creating interactions we have {} variables'.format(n_after))

In [None]:
# Display the DataFrame, now including the interaction columns.
dataf

In [None]:
# Using the 14 variables determined as the most influential.
# 'zeta_potential concentration' is included so that 14 values are passed,
# matching the 14 features echoed in the summary printout.
# NOTE(review): the order must match the column order of X_train from
# Appendix 1 - confirm, in particular the position of
# 'zeta_potential concentration'.
selected_features = [
    'mass^2', 'mass zeta_potential',
    'mass concentration', 'mass ev',
    'concentration^2', 'zeta_potential^2',
    'zeta_potential concentration',
    'concentration toxicity', 'zeta_potential toxicity',
    'mass', 'energy_source_no', 'shape_Janus',
    'concentration ev', 'toxicity',
]

# dataf holds a single row, so the mean of each column is simply its value;
# the model receives one sample as a list of 14 numbers.
result = model.predict([[dataf[name].mean() for name in selected_features]])

In [None]:
# Show the predicted class for the new nanorobot together with the label legend.
print("Prediction is {}".format(result))
print("Legend:\n[1] is 'Yes', it is bio-applicable\n[0] is 'No', it is not bio-applicable")

In [None]:
# Echo each of the 14 feature values for the new nanorobot, one per line,
# as "<column name>: <column mean>".
reported_features = (
    'mass^2',
    'mass zeta_potential',
    'mass concentration',
    'mass ev',
    'concentration^2',
    'zeta_potential^2',
    'zeta_potential concentration',
    'concentration toxicity',
    'zeta_potential toxicity',
    'mass',
    'energy_source_no',
    'shape_Janus',
    'concentration ev',
    'toxicity',
)
for feature_name in reported_features:
    print("{}: {}".format(feature_name, dataf[feature_name].mean()))

[<a href="#content">Back to top</a>]