# Introduction to Data Science

Authors: Lior Tondovski, Ilan Vasilevski, Maya Vilenko

---

### imports

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
#visualizations
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
#preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from scipy.stats import shapiro
#classifiers
from xgboost import XGBClassifier
#Dimensionality reduction
from sklearn.decomposition import PCA
#Anomaly detection
from sklearn.ensemble import IsolationForest
#Feature selection
from sklearn.feature_selection import RFECV
#Hyperparameter tuning
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
#metrics and model evaluation
import shap
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score, precision_recall_curve, roc_curve, roc_auc_score
#utils
from utils import *
#config
from Config import *
#set seed
np.random.seed(12)

In [1]:
# Load the preprocessed test set and the resampled training sets from pickle files.
# NOTE(review): unpickling can execute arbitrary code - only load pickles that come from a trusted source.
# NOTE(review): the train/test split cell further down reads `train_data`, which is not assigned in this
# cell - confirm which of these frames (if any) is meant to become `train_data`.
test = pd.read_pickle('processed_test_data.pkl')
train_undersampled_sm = pd.read_pickle('processed_train_data_undersampled_sm.pkl')
# NOTE(review): the variable is named "oversampled" but the path says "undersampled" and ends with a
# doubled ".pkl.pkl" extension - this looks like a copy/paste slip; confirm the intended file name.
train_oversampled= pd.read_pickle('processed_train_data_undersampled.pkl.pkl')

NameError: name 'pd' is not defined

In [None]:
# Hold out 20% of train_data as an evaluation split (fixed random_state so the split is repeatable),
# then separate the feature columns from the target column for each part.
Train, Test = train_test_split(train_data, test_size=0.2, random_state=42)
X_train, y_train = Train[all_features], Train[target_column].squeeze()
X_test, y_test = Test[all_features], Test[target_column].squeeze()

Conclusion: Based on the PCA reduction to three dimensions, the data appear to be fairly linearly separable, so a classifier fitted to them should perform well on the evaluation metrics. The three PCA components preserve over 95% of the variance.

##### Feature Selection

In [None]:
# Feature selection with RFECV: recursive feature elimination in which the number of
# features to keep is chosen by 5-fold cross-validation on ROC AUC.
xgb_classifier = XGBClassifier()
rfecv = RFECV(estimator=xgb_classifier, step=1, cv=5, scoring='roc_auc')
rfecv.fit(X_train, y_train)
print(f'Optimal number of features : {rfecv.n_features_}')
print(f'Best features : {X_train.columns[rfecv.support_]}')

# Mean cross-validation score for each candidate number of features.
# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2, so read the
# per-subset means from `cv_results_` when it exists and fall back to averaging
# `grid_scores_` only on older releases that do not provide `cv_results_`.
if hasattr(rfecv, 'cv_results_'):
    avrage_cv_scores = list(rfecv.cv_results_['mean_test_score'])
else:
    avrage_cv_scores = [np.mean(cv_scores) for cv_scores in rfecv.grid_scores_]

# Plot the averaged cross-validation score against the number of selected features.
# With step=1 and the default minimum of one feature, the i-th score corresponds to i features.
plt.figure(figsize=(10, 5))
plt.title('Recursive Feature Elimination with Cross-Validation')
plt.xlabel('Number of features selected')
plt.ylabel(' Abveraged Cross validation score ')
plt.plot(range(1, len(avrage_cv_scores) + 1), avrage_cv_scores)
plt.grid()
plt.show()

In [None]:
# Restrict both splits to the columns flagged by the RFECV support mask.
# Note: this overwrites X_train / X_test, so the cell is meant to be run once after the RFECV fit.
X_train = X_train.loc[:, rfecv.support_]
X_test = X_test.loc[:, rfecv.support_]

### Hyperparameter Tuning

In [None]:
# Hyperparameter tuning with scikit-optimize's BayesSearchCV.
# Bayesian optimization tunes an expensive objective by iteratively fitting a cheap surrogate
# model of it and using that surrogate to choose the next candidate to evaluate.

# Base classifier whose hyperparameters are searched.
xgb_classifier = XGBClassifier()

# Search space: one entry per XGBClassifier parameter.
# `n_jobs` is a single-value list, so it is held fixed at -1 rather than tuned.
params = dict(
    n_estimators=Integer(100, 1000),
    max_depth=Integer(3, 10),
    learning_rate=Real(0.01, 1.0, prior='log-uniform'),
    subsample=Real(0.01, 1.0, prior='uniform'),
    colsample_bytree=Real(0.01, 1.0, prior='uniform'),
    min_child_weight=Integer(1, 10),
    gamma=Real(0.0, 50.0, prior='uniform'),
    reg_alpha=Real(0.0, 50.0, prior='uniform'),
    reg_lambda=Real(0.0, 50.0, prior='uniform'),
    scale_pos_weight=Real(0.01, 50.0, prior='uniform'),
    n_jobs=[-1],
)

# 50 search iterations, each scored by ROC AUC with 2-fold cross-validation;
# refit=True retrains the best configuration on the full training data afterwards.
opt = BayesSearchCV(
    estimator=xgb_classifier,
    search_spaces=params,
    n_iter=50,
    scoring='roc_auc',
    cv=2,
    n_jobs=-1,
    verbose=0,
    refit=True,
    random_state=42,
)

# Run the search, report the winning configuration and its CV score,
# and keep the refitted model for the following cells.
opt.fit(X_train, y_train)
print(f'the best parameters are: {opt.best_params_}')
print(f'the best auc score: {opt.best_score_}')
best_estimator = opt.best_estimator_

In [None]:
# Further analysis of the differences between train and test scores showed that the model
# overfits slightly, so gamma (the parameter that controls tree pruning) is increased to
# regularize it.
# set_params only updates the hyperparameter stored on the estimator; the already-trained
# booster is left untouched, so the model must be refit for the new gamma to affect
# predictions, SHAP values and the evaluation below.
# NOTE(review): if gamma=20 was picked by looking at test-set scores, the reported test metrics
# are optimistic - consider choosing it on a separate validation split.
best_estimator.set_params(gamma=20)
best_estimator.fit(X_train, y_train)

##### Feature importance with SHAP

In [None]:
# Explain the tuned model with SHAP: build a tree explainer and compute SHAP values on the training features.
explainer = shap.TreeExplainer(best_estimator)
shap_values = explainer.shap_values(X_train)

# Draw two summary plots: first the bar chart, then the default summary plot (beeswarm).
for plot_type in ('bar', None):
    shap.summary_plot(shap_values, X_train, plot_type=plot_type)

Conclusion: .....

### Evaluation on Test Set

In [None]:
# Test-set evaluation: hard class predictions plus the predicted probability of class 1.
y_pred = best_estimator.predict(X_test)
y_pred_proba = best_estimator.predict_proba(X_test)[:, 1]

# Metrics computed from the hard predictions, printed in a fixed order.
label_metrics = {
    'Accuracy Score': accuracy_score,
    'Precision Score': precision_score,
    'Recall Score': recall_score,
    'F1 Score': f1_score,
}
for label, metric in label_metrics.items():
    print(f'{label} : {metric(y_test, y_pred)}')

# AUC is computed from the class-1 probabilities rather than the hard predictions.
print(f'AUC : {roc_auc_score(y_test, y_pred_proba)}')

In [None]:
# Plot the ROC curve from the test labels and the predicted class-1 probabilities.
# NOTE(review): plot_roc_curve is not defined in this notebook - presumably it comes from
# `from utils import *`; confirm its expected arguments there.
plot_roc_curve(y_test, y_pred_proba)

In [None]:
# Plot the precision-recall curve from the test labels and the predicted class-1 probabilities.
# NOTE(review): plot_precision_recall_curve is not defined in this notebook - presumably it comes
# from `from utils import *`; confirm its expected arguments there.
plot_precision_recall_curve(y_test, y_pred_proba)


In [None]:
# Plot the confusion matrix of the test labels against the hard class predictions.
# NOTE(review): plot_confusion_matrix is not defined in this notebook - presumably it comes from
# `from utils import *`; confirm its expected arguments there.
plot_confusion_matrix(y_test, y_pred)

In [None]:
# Overfitting check: compare the model's scores on the training split with its scores on the test split.
# NOTE(review): plot_comparison_bar_chart is not defined in this notebook - presumably it comes from
# `from utils import *`; which metrics it compares cannot be seen from here.
plot_comparison_bar_chart(best_estimator, X_train, X_test, y_train, y_test)

Conclusion: