# Packages and data reading

In [None]:
!pip install plotly==5.11.0
!pip install torch

In [None]:
# Colab-specific: mount Google Drive at /content/drive; the dataset CSV read below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [4]:
# NumPy for math operations, and Pandas for processing tabular data.
import numpy as np
import pandas as pd

# Plotly plotting package
import plotly.graph_objects as go
import plotly.express as px

# Metrics and tools from sklearn
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    f1_score,
    roc_curve,
    precision_recall_curve,
    confusion_matrix,
    classification_report,
    accuracy_score,
)
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import SequentialFeatureSelector, SelectKBest, f_classif
from sklearn.impute import KNNImputer

# XGBoost classifier
from xgboost import XGBClassifier

# PyTorch package and submodules
import torch
import torch.nn as nn
from torch.optim import Adam, SGD
from torch.nn import BCELoss

In [None]:
# Read the assignment dataset from the mounted Google Drive
data = pd.read_csv('/content/drive/MyDrive/sph6004_assignment1_data.csv')
print(data.head())

# Predictors: every column except the outcome 'aki' and 'id' (presumably a row identifier - confirm)
x_raw = data.drop(columns=['aki', 'id'])
y_df = data['aki']
combined_df = pd.concat([x_raw, y_df], axis=1)

# 50920 observations, 160 columns
x_raw.info()
# Only the last expression of a cell is rendered, so the intermediate preview is printed explicitly
print(x_raw.head(3))
y_df.value_counts()


# Data cleaning

In [None]:
# Numeric codes for the race labels. Matching is exact after lower-casing,
# so any label not listed here ends up as NaN in `race_code`.
race_codes = dict(zip(
    ['white', 'black', 'asian', 'hispanic/latino', 'unknown', 'other'],
    range(1, 7),
))

# Binary code for gender; any value other than 'M'/'F' ends up as NaN.
gender_mapping = {'M': 0, 'F': 1}

# Add both encoded columns in one step
x_raw = x_raw.assign(
    race_code=x_raw['race'].str.lower().map(race_codes),
    gender_encoded=x_raw['gender'].map(gender_mapping),
)
print(x_raw['gender_encoded'])

# Remove the original text columns and store both codes as floats
x_raw = x_raw.drop(columns=['gender', 'race']).astype(
    {'race_code': float, 'gender_encoded': float}
)

x_raw.describe()

In [None]:
# Clean outliers: values more than `threshold` standard deviations from the column mean are set to NaN
# (they are imputed later together with the other missing values).
# NOTE(review): the z-scores use statistics of the full dataset, i.e. before the train/test split.
x_df = x_raw.copy()

# Encoded categorical columns hold labels, not magnitudes, so a z-score is meaningless for them;
# masking them would turn rare categories into NaN. They are excluded from outlier removal.
categorical_columns = [col for col in ('race_code', 'gender_encoded') if col in x_df.columns]

# Calculate z-scores for each column
z_scores = (x_df - x_df.mean()) / x_df.std()

# Define threshold for outliers
threshold = 3

# Identify outliers (NaN z-scores compare as False, so missing values are never flagged)
outliers = np.abs(z_scores) > threshold
outliers.loc[:, categorical_columns] = False

# Report a per-column count instead of dumping the full boolean frame once per column
outlier_counts = outliers.sum().sort_values(ascending=False)
print(f"Number of outliers per column (|z| > {threshold}):")
print(outlier_counts[outlier_counts > 0])

# Remove outliers by replacing them with NaN
x_df = x_df.mask(outliers)
x_df.describe()

In [None]:
# Delete columns whose share of missing values exceeds 90%
NA_THRESHOLD_PCT = 90
na_percentage = x_df.isna().mean() * 100

# Show every column's missing percentage; option_context restores the display setting afterwards
with pd.option_context('display.max_rows', None):
    print(na_percentage)

# 134 columns left
columns_to_keep = na_percentage[na_percentage <= NA_THRESHOLD_PCT].index
x_df = x_df[columns_to_keep]
print("Columns kept:", len(columns_to_keep))

# Check the correlation between variables.
# DataFrame.corr() builds the pairwise Pearson matrix in one call, instead of one corrwith() pass per column.
x_df.corr().round(2)


In [None]:
# Collapse the outcome into two classes: 0 stays 0, every other value becomes 1
y_df_binary = (y_df != 0).astype(int).to_numpy()

print("Unique values of y_df_binary:", np.unique(y_df_binary))
print("Data type of y_df_binary:", y_df_binary.dtype)

# Split

In [None]:
# Split the data into training and testing sets (stratified on the binary outcome).
# A fixed random_state makes the split, and every metric computed from it, reproducible across runs.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    x_df, y_df_binary, stratify=y_df_binary, random_state=SPLIT_SEED
)

# Print the shapes of the resulting datasets
for label, part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(label, part.shape)

# Standardization and imputation





In [103]:
# Standardization
# The scaling statistics are estimated on the training split only and reused for the test split,
# so both splits share one scale and nothing from the test set leaks into preprocessing.
columns_to_normalize = list(X_train.columns)
train_mean = X_train[columns_to_normalize].mean()
# A constant column has std 0; dividing by 1 keeps it at 0 instead of producing NaN/inf
train_std = X_train[columns_to_normalize].std().replace(0, 1.0)

X_train = (X_train[columns_to_normalize] - train_mean) / train_std
X_test = (X_test[columns_to_normalize] - train_mean) / train_std


In [None]:
# KNN imputation
# Initialize KNNImputer with desired number of neighbors
knn_imputer = KNNImputer(n_neighbors=5)

# Fit the imputer on the training split only, then reuse it for the test split:
# test rows are imputed from training neighbours, so the test set never influences preprocessing.
X_train = knn_imputer.fit_transform(X_train)
X_test = knn_imputer.transform(X_test)

In [None]:
# The imputer returns plain arrays; wrap them in DataFrames again,
# labelling the columns with the names the feature table had before imputation.
column_names = x_df.columns

X_train, X_test = (
    pd.DataFrame(imputed, columns=column_names) for imputed in (X_train, X_test)
)

In [None]:
# Count the remaining missing values per column in each split
missing_values_train = X_train.isna().sum(axis=0)
missing_values_test = X_test.isna().sum(axis=0)

for header, counts in (
    ("Missing values in X_train:", missing_values_train),
    ("Missing values in X_test:", missing_values_test),
):
    print(header)
    print(counts)

In [None]:
# Report the dimensions of every split after preprocessing
for label, part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(label, part.shape)

# Feature selection - Forward and backward selection (the three models that follow are built using the features chosen by forward selection)

In [17]:
# L1-penalised logistic regression used for feature selection and as the baseline classifier.
# LogisticRegression is the name imported at the top of the notebook; the alias `logit` was never defined.
# C is the inverse of the L1 regularization strength; random_state fixes liblinear's internal shuffling.
model = LogisticRegression(penalty='l1', C=1, solver='liblinear', random_state=42)

In [None]:
# Forward feature selection: start from no features and greedily add one at a time until 20 are kept.
# SequentialFeatureSelector is the name imported at the top of the notebook; the alias `SFS` was never defined.
# n_jobs=-1 evaluates candidate features in parallel without changing the result.
forward_selection = SequentialFeatureSelector(
    model, n_features_to_select=20, direction="forward", n_jobs=-1
).fit(X_train, y_train)

In [None]:
# Backward feature selection: start from all features and greedily remove one at a time until 20 remain.
# SequentialFeatureSelector is the name imported at the top of the notebook; the alias `SFS` was never defined.
# n_jobs=-1 evaluates candidate features in parallel without changing the result.
backward_selection = SequentialFeatureSelector(
    model, n_features_to_select=20, direction="backward", n_jobs=-1
).fit(X_train, y_train)

In [None]:
# Names of the features kept by forward selection
forward_selection.get_feature_names_out()

In [None]:
# Names of the features kept by backward selection
backward_selection.get_feature_names_out()

In [None]:
def _fit_predict_proba(train_features, test_features):
    """Refit the shared `model` on `train_features` and return its class probabilities for `test_features`."""
    return model.fit(train_features, y_train).predict_proba(test_features)


# Full model: all features
y_pred_full = _fit_predict_proba(X_train, X_test)

# Model restricted to the forward-selected features
y_pred_FS = _fit_predict_proba(
    forward_selection.transform(X_train), forward_selection.transform(X_test)
)

# Model restricted to the backward-selected features (`model` stays fitted on these afterwards)
y_pred_BS = _fit_predict_proba(
    backward_selection.transform(X_train), backward_selection.transform(X_test)
)

In [None]:
# ROC coordinates from the positive-class probability of each fitted variant
(fpr_full, tpr_full, _), (fpr_FS, tpr_FS, _), (fpr_BS, tpr_BS, _) = (
    roc_curve(y_test, proba[:, 1]) for proba in (y_pred_full, y_pred_FS, y_pred_BS)
)

# Long-format table: one row per ROC point, tagged with the variant it belongs to
roc_curves = [
    ('full_model', fpr_full, tpr_full),
    ('FS', fpr_FS, tpr_FS),
    ('BS', fpr_BS, tpr_BS),
]
roc_df = pd.DataFrame(
    {
        'False Positive Rate': np.concatenate([fpr for _label, fpr, _tpr in roc_curves]),
        'True Positive Rate': np.concatenate([tpr for _label, _fpr, tpr in roc_curves]),
        'method': [label for label, fpr, _tpr in roc_curves for _point in fpr],
    }
)

In [None]:
# One ROC panel per model variant, coloured by variant
fig = px.line(
    roc_df,
    x='False Positive Rate',
    y='True Positive Rate',
    color='method',
    facet_col='method',
)
fig


In [None]:
# Predicted probabilities paired with a display name for each variant
models = [
    (y_pred_full, 'Full Model'),
    (y_pred_FS, 'Forward Selected Model'),
    (y_pred_BS, 'Backward Selected Model'),
]

# AUC from the positive-class probability (column 1) of each variant
for proba, name in models:
    auc = roc_auc_score(y_test, proba[:, 1])
    print(f"AUC Score for {name}: {auc}")

In [None]:
# Precision/recall coordinates from the positive-class probability of each variant
(p_full, r_full, _), (p_FS, r_FS, _), (p_BS, r_BS, _) = (
    precision_recall_curve(y_test, proba[:, 1]) for proba in (y_pred_full, y_pred_FS, y_pred_BS)
)

# Long-format table: one row per curve point, tagged with the variant it belongs to
pr_curves = [
    ('Full Model', p_full, r_full),
    ('Forward Selection', p_FS, r_FS),
    ('Backward Selection', p_BS, r_BS),
]
pr_df = pd.DataFrame(
    {
        'Precision': np.concatenate([prec for _label, prec, _rec in pr_curves]),
        'Recall': np.concatenate([rec for _label, _prec, rec in pr_curves]),
        'method': [label for label, prec, _rec in pr_curves for _point in prec],
    }
)

In [None]:
# One precision-recall panel per model variant, coloured by variant
fig = px.line(
    pr_df,
    x='Recall',
    y='Precision',
    color='method',
    facet_col='method',
)
fig

In [34]:
# Restrict both splits to the columns chosen by forward selection
X_train_selected, X_test_selected = (
    forward_selection.transform(split) for split in (X_train, X_test)
)

# Positional indices of the chosen columns
selected_feature_indices = forward_selection.get_support(indices=True)

In summary, the full model has the highest AUC score, followed by the forward-selection model. There is little difference between the full model, which contains 134 variables, and the forward-selection model, which contains 20 variables. Therefore, the 20 forward-selected variables were used in the subsequent predictive models.

# Logistic Regression Estimation

In [None]:
# Refit the shared logistic regression on the forward-selected features (fit returns the estimator itself)
model_selected = model.fit(X_train_selected, y_train)

# Class probabilities on the held-out split
y_pred_logistic = model_selected.predict_proba(X_test_selected)

# ROC coordinates from the positive-class probability
fpr_logistic, tpr_logistic, _ = roc_curve(y_test, y_pred_logistic[:, 1])
roc_df = pd.DataFrame(
    {
        'False Positive Rate': fpr_logistic,
        'True Positive Rate': tpr_logistic,
        'method': 'Logistic Regression',
    }
)
fig = px.line(
    roc_df,
    x='False Positive Rate',
    y='True Positive Rate',
    color='method',
    facet_col='method',
)
fig

In [None]:
# Predicted probabilities paired with a display name
models = [
    (y_pred_logistic, 'Logistic Regression'),
]

# AUC from the positive-class probability (column 1)
for proba, name in models:
    auc = roc_auc_score(y_test, proba[:, 1])
    print(f"AUC Score for {name}: {auc}")

In [None]:
# Precision/recall coordinates from the positive-class probability
p_logistic, r_logistic, _ = precision_recall_curve(y_test, y_pred_logistic[:, 1])

pr_logistic = pd.DataFrame(
    {
        'Precision': p_logistic,
        'Recall': r_logistic,
        'method': 'Logistic Regression',
    }
)

# Plot the precision-recall curve
fig = px.line(
    pr_logistic,
    x='Recall',
    y='Precision',
    color='method',
    facet_col='method',
)
fig

In [None]:
# Threshold the positive-class probability at 0.5 to obtain 0/1 labels
positive_proba = y_pred_logistic[:, 1]
y_pred_binary = (positive_proba > 0.5).astype(int)

# F1 of those labels on the held-out split
f1 = f1_score(y_test, y_pred_binary)
print('F1 score on test set: {:.4f}'.format(f1))


In [None]:
# Accuracy of the logistic regression on both splits
# Class predictions first ...
predict_train_logistic = model_selected.predict(X_train_selected)
predict_test_logistic = model_selected.predict(X_test_selected)

# ... then the share of correct predictions per split
accuracy_train_logistic = accuracy_score(y_train, predict_train_logistic)
accuracy_test_logistic = accuracy_score(y_test, predict_test_logistic)

print('\nTarget on train data', predict_train_logistic)
print('\naccuracy_score on train dataset : ', accuracy_train_logistic)
print('\nTarget on test data', predict_test_logistic)
print('\naccuracy_score on test dataset : ', accuracy_test_logistic)

# XGBoost

In [None]:
# Hyper-parameter grid searched for XGBoost (stop values are exclusive)
parameters = dict(
    n_estimators=np.arange(2, 20, 2),
    max_depth=np.arange(2, 6, 1),
    learning_rate=np.arange(0.05, 0.4, 0.05),
)

parameters

In [None]:
# 3-fold stratified cross-validation for the grid search
stratifiedCV = StratifiedKFold(n_splits=3)

# XGBClassifier is the class imported from xgboost at the top of the notebook;
# the alias `XGBC` was never defined and raised NameError on a fresh kernel.
XGBoostModel = XGBClassifier()

# Exhaustive search over `parameters`, ranking candidates by cross-validated F1
BestXGBoost = GridSearchCV(
    XGBoostModel,
    param_grid=parameters,
    scoring='f1',
    cv=stratifiedCV,
    verbose=1,
    n_jobs=-1 # use all cpu cores to speedup grid search
)
BestXGBoost.fit(X_train_selected,y_train)

In [None]:
# Hyper-parameter combination that scored best in the grid search
BestXGBoost.best_params_

In [None]:
# Cross-validated score (scoring='f1') of that best combination
BestXGBoost.best_score_

In [None]:
# Class predictions of the tuned XGBoost model on the held-out split
y_pred_XG = BestXGBoost.predict(X_test_selected)

xgb_test_f1 = f1_score(y_test, y_pred_XG)
print('F1 score on test set: {:.4f}'.format(xgb_test_f1))

# Cross-tabulation: rows are true labels, columns are predicted labels
pd.crosstab(y_test, y_pred_XG)

In [None]:
# Accuracy of the tuned XGBoost model on both splits
# Class predictions first ...
predict_train = BestXGBoost.predict(X_train_selected)
predict_test = BestXGBoost.predict(X_test_selected)

# ... then the share of correct predictions per split
accuracy_train = accuracy_score(y_train, predict_train)
accuracy_test = accuracy_score(y_test, predict_test)

print('\nTarget on train data', predict_train)
print('\naccuracy_score on train dataset : ', accuracy_train)
print('\nTarget on test data', predict_test)
print('\naccuracy_score on test dataset : ', accuracy_test)

In [None]:
# From here on `y_pred_XG` holds class probabilities (two columns), not labels
y_pred_XG = BestXGBoost.predict_proba(X_test_selected)

# ROC coordinates from the positive-class probability
fpr_XG, tpr_XG, _ = roc_curve(y_test, y_pred_XG[:, 1])
roc_df = pd.DataFrame(
    {
        'False Positive Rate': fpr_XG,
        'True Positive Rate': tpr_XG,
        'method': 'XGBoost',
    }
)
fig = px.line(
    roc_df,
    x='False Positive Rate',
    y='True Positive Rate',
    color='method',
    facet_col='method',
)
fig

In [None]:
# Predicted probabilities paired with a display name
models = [
    (y_pred_XG, 'XGBoost Model'),
]

# AUC from the positive-class probability (column 1)
for proba, name in models:
    auc = roc_auc_score(y_test, proba[:, 1])
    print(f"AUC Score for {name}: {auc}")

# SVM

In [None]:
# Grid of regularisation strengths C for the linear SVM (stop value is exclusive)
parameters = dict(C=np.arange(0.1, 1.0, 0.3))

# 3-fold stratified cross-validation
stratifiedCV = StratifiedKFold(n_splits=3)

# probability=True enables predict_proba on the fitted SVM
SVCModel = SVC(kernel='linear', probability=True)

# Exhaustive search over `parameters`, ranking candidates by cross-validated F1 on all CPU cores
BestSVC = GridSearchCV(
    estimator=SVCModel,
    param_grid=parameters,
    cv=stratifiedCV,
    scoring='f1',
    n_jobs=-1,
    verbose=1,
)

BestSVC.fit(X_train_selected, y_train)

In [None]:
# Estimator configured with the best C found by the grid search
BestSVC.best_estimator_

In [None]:
# Cross-validated score (scoring='f1') of that best configuration
BestSVC.best_score_

In [None]:
# Class predictions of the tuned SVM on the held-out split
y_pred_SVC = BestSVC.predict(X_test_selected)

# Sanity check: list the distinct labels the model actually predicted
unique_values = np.unique(y_pred_SVC)
print("Unique values in predicted labels:", unique_values)

In [None]:
# F1 of the SVM class predictions on the held-out split
svc_test_f1 = f1_score(y_test, y_pred_SVC)
print('F1 score on test set: {:.4f}'.format(svc_test_f1))

# Cross-tabulation: rows are true labels, columns are predicted labels
pd.crosstab(y_test, y_pred_SVC)

In [None]:
# Accuracy of the tuned SVM on both splits
# Class predictions first ...
predict_train = BestSVC.predict(X_train_selected)
predict_test = BestSVC.predict(X_test_selected)

# ... then the share of correct predictions per split
accuracy_train = accuracy_score(y_train, predict_train)
accuracy_test = accuracy_score(y_test, predict_test)

print('\nTarget on train data', predict_train)
print('\naccuracy_score on train dataset : ', accuracy_train)
print('\nTarget on test data', predict_test)
print('\naccuracy_score on test dataset : ', accuracy_test)

In [None]:
# ROC curve for the tuned SVM.
# roc_curve needs continuous scores: hard 0/1 labels collapse the curve to a single operating point,
# so the positive-class probability is used (the SVC was built with probability=True).
y_score_SVC = BestSVC.predict_proba(X_test_selected)[:, 1]
fpr_SVC, tpr_SVC, _ = roc_curve(y_test, y_score_SVC)
roc_df = pd.DataFrame(
    {
        'False Positive Rate': np.hstack([fpr_SVC]),
        'True Positive Rate': np.hstack([tpr_SVC]),
        # Label the curve with the model it belongs to (it was mislabelled 'full_model')
        'method': ['SVM'] * len(fpr_SVC)
    }
)

# Plot ROC curve
fig = px.line(roc_df, y='True Positive Rate', x='False Positive Rate', facet_col='method', color='method')
fig.show()

In [None]:
# Calculate AUC score.
# AUC is a ranking metric, so it is computed from the positive-class probability rather than from
# thresholded 0/1 labels; this keeps it comparable with the AUCs reported for the other models.
auc = roc_auc_score(y_test, BestSVC.predict_proba(X_test_selected)[:, 1])
print(f"AUC Score for SVM Model: {auc}")

In [None]:
# Class probabilities of the tuned SVM on the held-out split.
# predict() returns a 1-D label array, so the [:, 1] indexing below raised IndexError;
# predict_proba() returns the two-column probability array that indexing expects.
y_pred_SVC = BestSVC.predict_proba(X_test_selected)

# ROC coordinates from the positive-class probability
fpr_SVC, tpr_SVC, _ = roc_curve(y_test,y_pred_SVC[:,1])
roc_df = pd.DataFrame(
    {
        'False Positive Rate':np.hstack([fpr_SVC]),
        'True Positive Rate':np.hstack([tpr_SVC]),
        # Label the curve with the model it belongs to (it was mislabelled 'full_model')
        'method':['SVM']*len(fpr_SVC)
    }
)
fig = px.line(roc_df,y='True Positive Rate',x='False Positive Rate',facet_col='method',color='method')
# The figure is not the last expression of the cell, so it has to be shown explicitly
fig.show()

# List of model predictions and their names
models = [(y_pred_SVC, 'SVM Model')]
# Calculate and display AUC scores for each model
for pred, name in models:
    auc = roc_auc_score(y_test, pred[:, 1])
    print(f"AUC Score for {name}: {auc}")

In [63]:
# Stack the ROC points of the three classifiers into one long-format table
roc_curves = [
    ('logistic_model', fpr_logistic, tpr_logistic),
    ('XG', fpr_XG, tpr_XG),
    ('SVC', fpr_SVC, tpr_SVC),
]
roc_df = pd.DataFrame(
    {
        'False Positive Rate': np.concatenate([fpr for _label, fpr, _tpr in roc_curves]),
        'True Positive Rate': np.concatenate([tpr for _label, _fpr, tpr in roc_curves]),
        'method': [label for label, fpr, _tpr in roc_curves for _point in fpr],
    }
)

In [None]:
# One ROC panel per classifier, coloured by classifier
fig = px.line(
    roc_df,
    x='False Positive Rate',
    y='True Positive Rate',
    color='method',
    facet_col='method',
)
fig

# Feature selection - Genetic selection (the three models that follow are built using the features chosen by the genetic algorithm)

In [None]:
!pip install sklearn-genetic

In [None]:
from genetic_selection import GeneticSelectionCV

In [None]:
# Base estimator scored inside the genetic search
estimator = LogisticRegression()

# Settings of the genetic feature search, gathered in one place
ga_settings = dict(
    cv=3,
    verbose=0,
    scoring="roc_auc",
    max_features=None,
    n_population=50,
    crossover_proba=0.5,
    mutation_proba=0.2,
    n_generations=50,
    crossover_independent_proba=0.5,
    mutation_independent_proba=0.04,
    tournament_size=3,
    n_gen_no_change=10,
    caching=True,
    n_jobs=-1,
)
models = GeneticSelectionCV(estimator, **ga_settings)

# Run the search on the training split
model_LR = models.fit(X_train, y_train)

# `support_` is used as a mask over the training columns to list the selected features
print('Feature Selection:', X_train.columns[model_LR.support_])

In [66]:
# Get the selected column names
# NOTE(review): this list is hard-coded rather than read from `model_LR.support_`; presumably it was
# copied from one run of the genetic search above - confirm it still matches before reusing.
# Names ending in '.1' / '.2' look like repeated column headers de-duplicated on CSV load - TODO confirm.
selected_columns = ['admission_age', 'heart_rate_min', 'heart_rate_max', 'heart_rate_mean', 'sbp_min', 'sbp_max', 'sbp_mean', 'dbp_min', 'dbp_max', 'dbp_mean', 'mbp_min', 'mbp_mean', 'resp_rate_min', 'resp_rate_max', 'temperature_min', 'temperature_max', 'temperature_mean', 'spo2_min', 'spo2_max', 'spo2_mean', 'glucose_min', 'glucose_max', 'glucose_mean', 'lactate_min', 'ph_max', 'so2_min', 'so2_max', 'po2_min', 'pco2_min', 'pco2_max', 'aado2_calc_min', 'aado2_calc_max', 'pao2fio2ratio_min', 'baseexcess_min', 'totalco2_max', 'hematocrit_min', 'hematocrit_max', 'hemoglobin_min', 'hemoglobin_max', 'temperature_max.1', 'chloride_min', 'calcium_min', 'glucose_min.1', 'glucose_max.1', 'potassium_max', 'sodium_min', 'sodium_max', 'hematocrit_min.1', 'hematocrit_max.1', 'hemoglobin_min.1', 'hemoglobin_max.1', 'platelets_min', 'wbc_max', 'albumin_min', 'aniongap_min', 'bicarbonate_min.1', 'bicarbonate_max.1', 'bun_min', 'bun_max', 'calcium_min.1', 'calcium_max.1', 'chloride_min.1', 'chloride_max.1', 'glucose_min.2', 'sodium_min.1', 'sodium_max.1', 'potassium_max.1', 'abs_basophils_min', 'abs_basophils_max', 'abs_eosinophils_max', 'abs_lymphocytes_max', 'atyps_min', 'atyps_max', 'bands_min', 'bands_max', 'metas_min', 'metas_max', 'fibrinogen_min', 'fibrinogen_max', 'inr_max', 'ptt_min', 'ptt_max', 'alt_min', 'alp_max', 'ast_min', 'ast_max', 'bilirubin_total_min', 'ck_mb_min', 'ck_mb_max', 'ld_ldh_min', 'gcs_min', 'gcs_motor', 'gcs_verbal', 'gcs_eyes', 'weight_admit', 'gender_encoded']

# Filter the selected columns in X_train and X_test
X_train_selected_GS = X_train[selected_columns]
X_test_selected_GS = X_test[selected_columns]


In [None]:
# Column counts of the genetically selected feature tables (second element of `.shape`)
_, num_columns_X_train = X_train_selected_GS.shape
_, num_columns_X_test = X_test_selected_GS.shape

print("Number of columns kept in X_train_selected:", num_columns_X_train)
print("Number of columns kept in X_test_selected:", num_columns_X_test)

# Logistic Regression Estimation

In [None]:
# Refit the shared logistic regression on the genetically selected features (fit returns the estimator itself)
model_selected = model.fit(X_train_selected_GS, y_train)

# Class probabilities on the held-out split
y_pred_logistic = model_selected.predict_proba(X_test_selected_GS)

# ROC coordinates from the positive-class probability
fpr_logistic, tpr_logistic, _ = roc_curve(y_test, y_pred_logistic[:, 1])
roc_df = pd.DataFrame(
    {
        'False Positive Rate': fpr_logistic,
        'True Positive Rate': tpr_logistic,
        'method': 'Logistic Regression',
    }
)
fig = px.line(
    roc_df,
    x='False Positive Rate',
    y='True Positive Rate',
    color='method',
    facet_col='method',
)
fig

In [None]:
# Predicted probabilities paired with a display name
models = [
    (y_pred_logistic, 'Logistic Regression'),
]

# AUC from the positive-class probability (column 1)
for proba, name in models:
    auc = roc_auc_score(y_test, proba[:, 1])
    print(f"AUC Score for {name}: {auc}")

In [None]:
# Precision/recall coordinates from the positive-class probability
p_logistic, r_logistic, _ = precision_recall_curve(y_test, y_pred_logistic[:, 1])

pr_logistic = pd.DataFrame(
    {
        'Precision': p_logistic,
        'Recall': r_logistic,
        'method': 'Logistic Regression',
    }
)

# Plot the precision-recall curve
fig = px.line(
    pr_logistic,
    x='Recall',
    y='Precision',
    color='method',
    facet_col='method',
)
fig

In [None]:
# Threshold the positive-class probability at 0.5 to obtain 0/1 labels
positive_proba = y_pred_logistic[:, 1]
y_pred_binary = (positive_proba > 0.5).astype(int)

# F1 of those labels on the held-out split
f1 = f1_score(y_test, y_pred_binary)
print('F1 score on test set: {:.4f}'.format(f1))

In [None]:
# Accuracy of the logistic regression on both splits
# Class predictions first ...
predict_train_logistic = model_selected.predict(X_train_selected_GS)
predict_test_logistic = model_selected.predict(X_test_selected_GS)

# ... then the share of correct predictions per split
accuracy_train_logistic = accuracy_score(y_train, predict_train_logistic)
accuracy_test_logistic = accuracy_score(y_test, predict_test_logistic)

print('\nTarget on train data', predict_train_logistic)
print('\naccuracy_score on train dataset : ', accuracy_train_logistic)
print('\nTarget on test data', predict_test_logistic)
print('\naccuracy_score on test dataset : ', accuracy_test_logistic)

# XGBoost

In [None]:
# Hyper-parameter grid searched for XGBoost (stop values are exclusive)
parameters = dict(
    n_estimators=np.arange(2, 20, 2),
    max_depth=np.arange(2, 6, 1),
    learning_rate=np.arange(0.05, 0.4, 0.05),
)

parameters

In [None]:
# 3-fold stratified cross-validation for the grid search
stratifiedCV = StratifiedKFold(n_splits=3)

# XGBClassifier is the class imported from xgboost at the top of the notebook;
# the alias `XGBC` was never defined and raised NameError on a fresh kernel.
XGBoostModel = XGBClassifier()

# Exhaustive search over `parameters`, ranking candidates by cross-validated F1
BestXGBoost = GridSearchCV(
    XGBoostModel,
    param_grid=parameters,
    scoring='f1',
    cv=stratifiedCV,
    verbose=1,
    n_jobs=-1 # use all cpu cores to speedup grid search
)
BestXGBoost.fit(X_train_selected_GS,y_train)

In [None]:
# Hyper-parameter combination that scored best in the grid search
BestXGBoost.best_params_

In [None]:
# Cross-validated score (scoring='f1') of that best combination
BestXGBoost.best_score_

In [None]:
# Class predictions of the tuned XGBoost model on the held-out split
y_pred_XG = BestXGBoost.predict(X_test_selected_GS)

xgb_test_f1 = f1_score(y_test, y_pred_XG)
print('F1 score on test set: {:.4f}'.format(xgb_test_f1))

# Cross-tabulation: rows are true labels, columns are predicted labels
pd.crosstab(y_test, y_pred_XG)


In [None]:
# Accuracy of the tuned XGBoost model on both splits
# Class predictions first ...
predict_train = BestXGBoost.predict(X_train_selected_GS)
predict_test = BestXGBoost.predict(X_test_selected_GS)

# ... then the share of correct predictions per split
accuracy_train = accuracy_score(y_train, predict_train)
accuracy_test = accuracy_score(y_test, predict_test)

print('\nTarget on train data', predict_train)
print('\naccuracy_score on train dataset : ', accuracy_train)
print('\nTarget on test data', predict_test)
print('\naccuracy_score on test dataset : ', accuracy_test)

In [None]:
# From here on `y_pred_XG` holds class probabilities (two columns), not labels
y_pred_XG = BestXGBoost.predict_proba(X_test_selected_GS)

# ROC coordinates from the positive-class probability
fpr_XG, tpr_XG, _ = roc_curve(y_test, y_pred_XG[:, 1])
roc_df = pd.DataFrame(
    {
        'False Positive Rate': fpr_XG,
        'True Positive Rate': tpr_XG,
        'method': 'XGBoost',
    }
)
fig = px.line(
    roc_df,
    x='False Positive Rate',
    y='True Positive Rate',
    color='method',
    facet_col='method',
)
fig

In [None]:
# Predicted probabilities paired with a display name
models = [
    (y_pred_XG, 'XGBoost Model'),
]

# AUC from the positive-class probability (column 1)
for proba, name in models:
    auc = roc_auc_score(y_test, proba[:, 1])
    print(f"AUC Score for {name}: {auc}")

# SVM

In [None]:
# Grid of regularisation strengths C for the linear SVM (stop value is exclusive)
parameters = dict(C=np.arange(0.1, 1.0, 0.3))

# 3-fold stratified cross-validation
stratifiedCV = StratifiedKFold(n_splits=3)

# probability=True enables predict_proba on the fitted SVM
SVCModel = SVC(kernel='linear', probability=True)

# Exhaustive search over `parameters`, ranking candidates by cross-validated F1 on all CPU cores
BestSVC = GridSearchCV(
    estimator=SVCModel,
    param_grid=parameters,
    cv=stratifiedCV,
    scoring='f1',
    n_jobs=-1,
    verbose=1,
)

BestSVC.fit(X_train_selected_GS, y_train)

In [None]:
# Estimator configured with the best C found by the grid search
BestSVC.best_estimator_

In [None]:
# Cross-validated score (scoring='f1') of that best configuration
BestSVC.best_score_

In [None]:
# Class predictions of the tuned SVM on the held-out split
y_pred_SVC = BestSVC.predict(X_test_selected_GS)

# Sanity check: list the distinct labels the model actually predicted
unique_values = np.unique(y_pred_SVC)
print("Unique values in predicted labels:", unique_values)

In [None]:
# F1 of the SVM class predictions on the held-out split
svc_test_f1 = f1_score(y_test, y_pred_SVC)
print('F1 score on test set: {:.4f}'.format(svc_test_f1))

# Cross-tabulation: rows are true labels, columns are predicted labels
pd.crosstab(y_test, y_pred_SVC)

In [None]:
# Accuracy of the tuned SVM on both splits
# Class predictions first ...
predict_train = BestSVC.predict(X_train_selected_GS)
predict_test = BestSVC.predict(X_test_selected_GS)

# ... then the share of correct predictions per split
accuracy_train = accuracy_score(y_train, predict_train)
accuracy_test = accuracy_score(y_test, predict_test)

print('\nTarget on train data', predict_train)
print('\naccuracy_score on train dataset : ', accuracy_train)
print('\nTarget on test data', predict_test)
print('\naccuracy_score on test dataset : ', accuracy_test)

In [None]:
# ROC curve for the tuned SVM.
# roc_curve needs continuous scores: hard 0/1 labels collapse the curve to a single operating point,
# so the positive-class probability is used (the SVC was built with probability=True).
y_score_SVC = BestSVC.predict_proba(X_test_selected_GS)[:, 1]
fpr_SVC, tpr_SVC, _ = roc_curve(y_test, y_score_SVC)
roc_df = pd.DataFrame(
    {
        'False Positive Rate': np.hstack([fpr_SVC]),
        'True Positive Rate': np.hstack([tpr_SVC]),
        # Label the curve with the model it belongs to (it was mislabelled 'full_model')
        'method': ['SVM'] * len(fpr_SVC)
    }
)

# Plot ROC curve
fig = px.line(roc_df, y='True Positive Rate', x='False Positive Rate', facet_col='method', color='method')
fig.show()

In [None]:
# Calculate AUC score.
# AUC is a ranking metric, so it is computed from the positive-class probability rather than from
# thresholded 0/1 labels; this keeps it comparable with the AUCs reported for the other models.
auc = roc_auc_score(y_test, BestSVC.predict_proba(X_test_selected_GS)[:, 1])
print(f"AUC Score for SVC Model: {auc}")

In [91]:
# Stack the ROC points of the three classifiers into one long-format table
roc_curves = [
    ('logistic_model', fpr_logistic, tpr_logistic),
    ('XG', fpr_XG, tpr_XG),
    ('SVC', fpr_SVC, tpr_SVC),
]
roc_df = pd.DataFrame(
    {
        'False Positive Rate': np.concatenate([fpr for _label, fpr, _tpr in roc_curves]),
        'True Positive Rate': np.concatenate([tpr for _label, _fpr, tpr in roc_curves]),
        'method': [label for label, fpr, _tpr in roc_curves for _point in fpr],
    }
)

In [None]:
# One ROC panel per classifier, coloured by classifier
fig = px.line(
    roc_df,
    x='False Positive Rate',
    y='True Positive Rate',
    color='method',
    facet_col='method',
)
fig