In [1]:
import pandas as pd
import numpy as np
data_reduced=pd.read_csv("final_data_v3.csv")

In [2]:
data_reduced.shape

(146231, 146)

In [3]:
X=data_reduced.drop(['loan_status'],axis=1)
y=data_reduced[['loan_status']]

In [4]:
correlation_matrix=X.corr()

In [5]:
# Filter the correlation matrix to show only values greater than 0.6
filtered_corr = correlation_matrix.where(correlation_matrix > 0.6, np.nan)

# Set the diagonal values to NaN
np.fill_diagonal(filtered_corr.values, np.nan)

np.fill_diagonal(filtered_corr.values, np.nan)

# Drop rows and columns where all elements are NaN
filtered_corr = filtered_corr.dropna(how='all').dropna(axis=1, how='all')

# Melt the DataFrame to show only the non-NaN values
filtered_corr_melted = filtered_corr.stack().reset_index()
filtered_corr_melted.columns = ['Variable 1', 'Variable 2', 'Correlation']

print(filtered_corr_melted)

              Variable 1            Variable 2  Correlation
0            installment       total_rec_prncp     0.765573
1            delinq_2yrs    num_tl_90g_dpd_24m     0.667981
2                pub_rec  pub_rec_bankruptcies     0.609783
3                pub_rec             tax_liens     0.739624
4              revol_bal      total_rev_hi_lim     0.813389
5              total_acc             num_il_tl     0.695882
6              total_acc         num_rev_accts     0.740337
7              total_acc              num_sats     0.697688
8        total_rec_prncp           installment     0.765573
9        total_rec_prncp       last_pymnt_amnt     0.659399
10       last_pymnt_amnt       total_rec_prncp     0.659399
11  last_fico_range_high   last_fico_range_low     0.841422
12   last_fico_range_low  last_fico_range_high     0.841422
13        acc_now_delinq          num_tl_30dpd     0.820706
14      total_rev_hi_lim             revol_bal     0.813389
15      total_rev_hi_lim        total_bc

In [6]:
col_2_drop=filtered_corr_melted['Variable 2'].unique()

In [7]:
col_2_drop=list(col_2_drop)

In [8]:
col_2_drop

['total_rec_prncp',
 'num_tl_90g_dpd_24m',
 'pub_rec_bankruptcies',
 'tax_liens',
 'total_rev_hi_lim',
 'num_il_tl',
 'num_rev_accts',
 'num_sats',
 'installment',
 'last_pymnt_amnt',
 'last_fico_range_low',
 'last_fico_range_high',
 'num_tl_30dpd',
 'revol_bal',
 'total_bc_limit',
 'num_tl_op_past_12m',
 'tot_hi_cred_lim',
 'percent_bc_gt_75',
 'num_bc_sats',
 'num_op_rev_tl',
 'num_rev_tl_bal_gt_0',
 'num_actv_bc_tl',
 'num_bc_tl',
 'total_acc',
 'acc_now_delinq',
 'delinq_2yrs',
 'acc_open_past_24mths',
 'bc_util',
 'pub_rec',
 'avg_cur_bal',
 'bc_open_to_buy']

In [9]:
X_mod=X.drop(col_2_drop,axis=1)

In [10]:
X_mod.shape

(146231, 114)

In [11]:
X_mod.columns

Index(['int_rate', 'annual_inc', 'dti', 'fico_range_high', 'inq_last_6mths',
       'total_rec_int', 'total_rec_late_fee', 'collection_recovery_fee',
       'collections_12_mths_ex_med', 'tot_coll_amt',
       ...
       'hardship_flag_N', 'disbursement_method_Cash',
       'disbursement_method_DirectPay', 'debt_settlement_flag_N',
       'debt_settlement_flag_Y', 'sector_Education',
       'sector_Finance and Banking', 'sector_Healthcare', 'sector_IT',
       'sector_Manufacturing'],
      dtype='object', length=114)

In [12]:
col2=list(X_mod.filter(regex='Sentiment Score', axis=1).columns)
col2.append(list(X_mod.filter(regex='sector_', axis=1).columns))

In [13]:
col2=['Sentiment Score_2007',
 'Sentiment Score_2008',
 'Sentiment Score_2009',
 'Sentiment Score_2010',
 'Sentiment Score_2011',
 'Sentiment Score_2012',
 'Sentiment Score_2013',
 'Sentiment Score_2014',
 'Sentiment Score_2015',
 'Sentiment Score_2016',
 'Sentiment Score_2017',
 'Sentiment Score_2018','sector_Education',
  'sector_Finance and Banking',
  'sector_Healthcare',
  'sector_IT',
  'sector_Manufacturing']

## Remove Sentiment Columns

In [14]:
X_mod=X_mod.drop(col2,axis=1)

In [15]:
X_mod.shape

(146231, 97)

### Model Building

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [17]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_mod, y, test_size=0.2, random_state=42)

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both the training and testing data
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Check the shapes of the resulting sets
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

X_train shape: (116984, 97), y_train shape: (116984, 1)
X_test shape: (29247, 97), y_test shape: (29247, 1)


In [18]:
y_test.value_counts()

loan_status
1              23909
0               5338
Name: count, dtype: int64

## Implementing Smote for Class imbalance

In [19]:
from imblearn.over_sampling import SMOTE
from collections import Counter
# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Check the distribution of the target after SMOTE
print(f"SMOTE target distribution: {Counter(y_train_smote)}")

SMOTE target distribution: Counter({'loan_status': 1})


In [20]:
y_train.value_counts()

loan_status
1              95558
0              21426
Name: count, dtype: int64

In [21]:
y_train_smote.value_counts()

loan_status
0              95558
1              95558
Name: count, dtype: int64

In [22]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [24]:
import warnings
warnings.filterwarnings("ignore")
models = []
models.append(('LR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('RF', RandomForestClassifier()))
models.append(('NB',GaussianNB()))
models.append(('XGB', XGBClassifier()))


## Basic model for all data (No Smote)

In [25]:
for name, model in models:
    #model_name, _ = model
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred = model.predict(X_test)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    accuracy_test = accuracy_score(y_test, y_pred)
    print(f"{name} Train Accuracy: {accuracy_train}")
    print(f"{name} Test Accuracy: {accuracy_test}")

LR Train Accuracy: 0.9419322300485536
LR Test Accuracy: 0.940780250965911
KNN Train Accuracy: 0.8859673117691308
KNN Test Accuracy: 0.8593360002735323
CART Train Accuracy: 1.0
CART Test Accuracy: 0.8939720313194516
RF Train Accuracy: 0.9999914518224714
RF Test Accuracy: 0.9433446165418675
NB Train Accuracy: 0.9162278602202011
NB Test Accuracy: 0.9141450405169761
XGB Train Accuracy: 0.9486938384736374
XGB Test Accuracy: 0.9422846787704722


## Basic Model for SMOTE Data

In [26]:
for name, model in models:
    #model_name, _ = model
    model.fit(X_train_smote, y_train_smote)
    y_pred_train = model.predict(X_train_smote)
    y_pred = model.predict(X_test)
    accuracy_train = accuracy_score(y_train_smote, y_pred_train)
    accuracy_test = accuracy_score(y_test, y_pred)
    print(f"{name} Train Accuracy: {accuracy_train}")
    print(f"{name} Test Accuracy: {accuracy_test}")

LR Train Accuracy: 0.8994694321773163
LR Test Accuracy: 0.9354805621089343
KNN Train Accuracy: 0.902629816446556
KNN Test Accuracy: 0.7149793141176873
CART Train Accuracy: 1.0
CART Test Accuracy: 0.8859370191814545
RF Train Accuracy: 1.0
RF Test Accuracy: 0.9426607857216125
NB Train Accuracy: 0.8976433161012161
NB Test Accuracy: 0.9108968441207645
XGB Train Accuracy: 0.9670828188116118
XGB Test Accuracy: 0.943139467295791


## Feature Importance for Normal Data 

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import numpy as np

# Setting a random seed for reproducibility
random_seed = 42

# List of models to evaluate
models = []
models.append(('LR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('RF', RandomForestClassifier()))
models.append(('NB', GaussianNB()))
models.append(('XGB', XGBClassifier()))

# Function to print top 20 feature importances
def print_feature_importance(model, X_train, name, top_n=20):
    if hasattr(model, "feature_importances_"):
        importances = model.feature_importances_
        feature_names = X_mod.columns
        sorted_indices = np.argsort(importances)[::-1]
        print(f"{name} Top {top_n} Feature Importances:")
        for idx in sorted_indices[:top_n]:
            print(f"{feature_names[idx]}: {importances[idx]}")
        print()
    elif hasattr(model, "coef_"):
        importances = model.coef_[0]
        feature_names = X_mod.columns
        sorted_indices = np.argsort(importances)[::-1]
        print(f"{name} Top {top_n} Feature Importances:")
        for idx in sorted_indices[:top_n]:
            print(f"{feature_names[idx]}: {importances[idx]}")
        print()
    else:
        print(f"{name} does not support feature importance.\n")

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Calculate classification metrics
    report = classification_report(y_test, y_pred)
    
    # Calculate confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Calculate ROC-AUC score if the model supports predict_proba
    if hasattr(model, "predict_proba"):
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_pred_proba)
    else:
        roc_auc = "N/A"
    
    print(f"{name} Classification Report:")
    print(report)
    print(f"ROC-AUC Score: {roc_auc}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print()
    
    # Print top 20 feature importances
    print_feature_importance(model, X_train, name, top_n=20)


LR Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.68      0.81      5338
           1       0.93      1.00      0.97     23909

    accuracy                           0.94     29247
   macro avg       0.96      0.84      0.89     29247
weighted avg       0.94      0.94      0.94     29247

ROC-AUC Score: 0.9147847352584431
Confusion Matrix:
[[ 3626  1712]
 [   20 23889]]

LR Top 20 Feature Importances:
debt_settlement_flag_N: 0.8556169954742144
term_36 months: 0.17675968427547895
total_rec_int: 0.16321437567816205
mort_acc: 0.13584388881363268
mths_since_recent_bc: 0.08693596734952116
fico_range_high: 0.06816192727075535
sub_grade_A1: 0.06624706903751133
grade_A: 0.06285403682934755
home_ownership_MORTGAGE: 0.05763263753758867
sub_grade_A3: 0.036102605680432505
total_il_high_credit_limit: 0.03267837278547872
verification_status_Not Verified: 0.030921211551812988
mths_since_recent_inq: 0.030776007858181298
mo_sin_old_rev_tl_o

## Feature Importance for SMOTE Data

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import numpy as np

# Setting a random seed for reproducibility
random_seed = 42

# List of models to evaluate
models = []
models.append(('LR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('RF', RandomForestClassifier()))
models.append(('NB', GaussianNB()))
models.append(('XGB', XGBClassifier()))

# Function to print top 20 feature importances
def print_feature_importance(model, X_train_smote, name, top_n=20):
    if hasattr(model, "feature_importances_"):
        importances = model.feature_importances_
        feature_names = X_mod.columns
        sorted_indices = np.argsort(importances)[::-1]
        print(f"{name} Top {top_n} Feature Importances:")
        for idx in sorted_indices[:top_n]:
            print(f"{feature_names[idx]}: {importances[idx]}")
        print()
    elif hasattr(model, "coef_"):
        importances = model.coef_[0]
        feature_names = X_mod.columns
        sorted_indices = np.argsort(importances)[::-1]
        print(f"{name} Top {top_n} Feature Importances:")
        for idx in sorted_indices[:top_n]:
            print(f"{feature_names[idx]}: {importances[idx]}")
        print()
    else:
        print(f"{name} does not support feature importance.\n")

for name, model in models:
    model.fit(X_train_smote, y_train_smote)
    y_pred = model.predict(X_test)
    
    # Calculate classification metrics
    report = classification_report(y_test, y_pred)
    
    # Calculate confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Calculate ROC-AUC score if the model supports predict_proba
    if hasattr(model, "predict_proba"):
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_pred_proba)
    else:
        roc_auc = "N/A"
    
    print(f"{name} Classification Report:")
    print(report)
    print(f"ROC-AUC Score: {roc_auc}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print()
    
    # Print top 20 feature importances
    print_feature_importance(model, X_train_smote, name, top_n=20)


LR Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.71      0.80      5338
           1       0.94      0.99      0.96     23909

    accuracy                           0.94     29247
   macro avg       0.93      0.85      0.88     29247
weighted avg       0.93      0.94      0.93     29247

ROC-AUC Score: 0.9141399070576722
Confusion Matrix:
[[ 3786  1552]
 [  335 23574]]

LR Top 20 Feature Importances:
debt_settlement_flag_N: 1.0885574810919374
term_36 months: 0.2043286855305284
total_rec_int: 0.1844530445278727
mths_since_recent_bc: 0.153976635070817
mort_acc: 0.14415549089453092
fico_range_high: 0.09890245101418421
mths_since_recent_inq: 0.0774565443165469
total_il_high_credit_limit: 0.0708008561931619
mo_sin_old_rev_tl_op: 0.059671267617766366
annual_inc: 0.057611157411051224
num_accts_ever_120_pd: 0.05523768068933652
purpose_vacation: 0.04803453131321315
home_ownership_OWN: 0.047011757060760294
sub_grade_A3: 0.0377349668

## Hyperparameter Tuning - Normal

In [29]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import numpy as np
warnings.filterwarnings("ignore")

# List of models and their hyperparameters to tune
models = []
models.append(('LR', LogisticRegression(), {'C': [0.01, 0.1, 1, 10, 100], 'solver': ['liblinear', 'lbfgs']}))
models.append(('KNN', KNeighborsClassifier(), {'n_neighbors': np.arange(5, 35, 5), 'weights': ['uniform', 'distance']}))
models.append(('CART', DecisionTreeClassifier(), {'max_depth': np.arange(5, 25, 5), 'min_samples_split': np.arange(5, 25, 5)}))
models.append(('RF', RandomForestClassifier(), {'n_estimators': np.arange(5, 25, 5), 'max_features': ['auto', 'sqrt', 'log2']}))
models.append(('NB', GaussianNB(), {}))  # GaussianNB doesn't have hyperparameters to tune
models.append(('XGB', XGBClassifier(), {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]}))


# Iterate over each model, perform hyperparameter tuning, and evaluate
for name, model, params in models:
    if params:
        grid_search = GridSearchCV(model, param_grid=params, cv=5, scoring='roc_auc')
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
    else:
        # If there are no hyperparameters to tune
        best_model = model
        best_model.fit(X_train, y_train)
        best_params = "N/A"
    
    # Perform cross-validation
    cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='roc_auc')
    mean_cv_score = np.mean(cv_scores)
    std_cv_score = np.std(cv_scores)
    
    # Fit the best model on the full training data
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)
    
    # Calculate classification metrics
    report = classification_report(y_test, y_pred)
    
    # Calculate confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Calculate ROC-AUC score if the model supports predict_proba
    if hasattr(best_model, "predict_proba"):
        y_pred_proba = best_model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_pred_proba)
    else:
        roc_auc = "N/A"
    
    print(f"{name} Classification Report:")
    print(report)
    print(f"Cross-Validation ROC-AUC Score: {mean_cv_score} ± {std_cv_score}")
    print(f"Test ROC-AUC Score: {roc_auc}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print(f"Model Params{best_params}")
    print()


LR Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.69      0.81      5338
           1       0.93      1.00      0.97     23909

    accuracy                           0.94     29247
   macro avg       0.96      0.84      0.89     29247
weighted avg       0.95      0.94      0.94     29247

Cross-Validation ROC-AUC Score: 0.9159145247282302 ± 0.0013988666061179578
Test ROC-AUC Score: 0.9153402009596114
Confusion Matrix:
[[ 3670  1668]
 [   20 23889]]
Model Params{'C': 100, 'solver': 'liblinear'}

KNN Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.25      0.40      5338
           1       0.86      1.00      0.92     23909

    accuracy                           0.86     29247
   macro avg       0.89      0.62      0.66     29247
weighted avg       0.87      0.86      0.82     29247

Cross-Validation ROC-AUC Score: 0.797083840589295 ± 0.003533152503781959
Test ROC-A

## Hyperparameter Tuning - Smote

In [30]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import numpy as np
warnings.filterwarnings("ignore")

# List of models and their hyperparameters to tune
models = []
models.append(('LR', LogisticRegression(), {'C': [0.01, 0.1, 1, 10, 100], 'solver': ['liblinear', 'lbfgs']}))
models.append(('KNN', KNeighborsClassifier(), {'n_neighbors': np.arange(5, 35, 5), 'weights': ['uniform', 'distance']}))
models.append(('CART', DecisionTreeClassifier(), {'max_depth': np.arange(5, 25, 5), 'min_samples_split': np.arange(5, 25, 5)}))
models.append(('RF', RandomForestClassifier(), {'n_estimators': np.arange(5, 25, 5), 'max_features': ['auto', 'sqrt', 'log2']}))
models.append(('NB', GaussianNB(), {}))  # GaussianNB doesn't have hyperparameters to tune
models.append(('XGB', XGBClassifier(), {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]}))


# Iterate over each model, perform hyperparameter tuning, and evaluate
for name, model, params in models:
    if params:
        grid_search = GridSearchCV(model, param_grid=params, cv=5, scoring='roc_auc')
        grid_search.fit(X_train_smote, y_train_smote)
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
    else:
        # If there are no hyperparameters to tune
        best_model = model
        best_model.fit(X_train_smote, y_train_smote)
        best_params = "N/A"
    
    # Perform cross-validation
    cv_scores = cross_val_score(best_model, X_train_smote, y_train_smote, cv=5, scoring='roc_auc')
    mean_cv_score = np.mean(cv_scores)
    std_cv_score = np.std(cv_scores)
    
    # Fit the best model on the full training data
    best_model.fit(X_train_smote, y_train_smote)
    y_pred = best_model.predict(X_test)
    
    # Calculate classification metrics
    report = classification_report(y_test, y_pred)
    
    # Calculate confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Calculate ROC-AUC score if the model supports predict_proba
    if hasattr(best_model, "predict_proba"):
        y_pred_proba = best_model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_pred_proba)
    else:
        roc_auc = "N/A"
    
    print(f"{name} Classification Report:")
    print(report)
    print(f"Cross-Validation ROC-AUC Score: {mean_cv_score} ± {std_cv_score}")
    print(f"Test ROC-AUC Score: {roc_auc}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print(f"Model Params{best_params}")
    print()

LR Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.71      0.80      5338
           1       0.94      0.99      0.96     23909

    accuracy                           0.94     29247
   macro avg       0.93      0.85      0.88     29247
weighted avg       0.94      0.94      0.93     29247

Cross-Validation ROC-AUC Score: 0.9498457436256658 ± 0.017695987890047612
Test ROC-AUC Score: 0.9143995245115814
Confusion Matrix:
[[ 3784  1554]
 [  301 23608]]
Model Params{'C': 100, 'solver': 'lbfgs'}

KNN Classification Report:
              precision    recall  f1-score   support

           0       0.35      0.67      0.46      5338
           1       0.91      0.72      0.80     23909

    accuracy                           0.71     29247
   macro avg       0.63      0.69      0.63     29247
weighted avg       0.81      0.71      0.74     29247

Cross-Validation ROC-AUC Score: 0.9594500003621974 ± 0.007488783313168847
Test ROC-AUC S

## Deep Learning

In [31]:
import tensorflow as tf
from tensorflow import keras




In [32]:
model = keras.models.Sequential([
keras.layers.Flatten(input_shape=[97]),
keras.layers.Dense(1000, activation="relu"),
keras.layers.Dense(450, activation="relu"),
keras.layers.Dense(100, activation="relu"),
keras.layers.Dense(1, activation="softmax")
])




In [33]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
train_data = scaler.fit_transform(X_train_smote)
val_data = scaler.transform(X_test)#train_data

In [34]:
train_labels_mc = y_train_smote
val_labels_mc = y_test

In [35]:
# Since it is binary class classification categorical crossentropy and accuracy as metric is utilized to compile the model
opt = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(loss="binary_crossentropy",
optimizer=opt,
metrics=["accuracy"])

In [36]:
history = model.fit(train_data, train_labels_mc, epochs=100, validation_data=(val_data, val_labels_mc), verbose=1)# Turn verbose=1 to printing epochs

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 52/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 65/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [38]:
import numpy as np
from sklearn.metrics import classification_report

# Train your model (already done)
#history = model.fit(train_data, train_labels_mc, epochs=100, validation_data=(val_data, val_labels_mc), verbose=1)

# Make predictions on the validation set
predictions = model.predict(val_data)

# Convert predictions to binary class labels
predicted_labels = (predictions > 0.5).astype(int).flatten()

# Generate and print the classification report
report = classification_report(val_labels_mc, predicted_labels, target_names=['Class 0', 'Class 1'])
print(report)


              precision    recall  f1-score   support

     Class 0       0.00      0.00      0.00      5338
     Class 1       0.82      1.00      0.90     23909

    accuracy                           0.82     29247
   macro avg       0.41      0.50      0.45     29247
weighted avg       0.67      0.82      0.74     29247



In [39]:
import numpy as np
from sklearn.metrics import classification_report, roc_auc_score

# Train your model (already done)
# history = model.fit(train_data, train_labels_mc, epochs=100, validation_data=(val_data, val_labels_mc), verbose=1)

# Make predictions on the validation set
predictions = model.predict(val_data)

# Convert predictions to binary class labels
predicted_labels = (predictions > 0.5).astype(int).flatten()

# Generate and print the classification report
report = classification_report(val_labels_mc, predicted_labels, target_names=['Class 0', 'Class 1'])
print("Classification Report:\n", report)

# Calculate and print the ROC AUC score
roc_auc = roc_auc_score(val_labels_mc, predictions)
print("ROC AUC Score:", roc_auc)


Classification Report:
               precision    recall  f1-score   support

     Class 0       0.00      0.00      0.00      5338
     Class 1       0.82      1.00      0.90     23909

    accuracy                           0.82     29247
   macro avg       0.41      0.50      0.45     29247
weighted avg       0.67      0.82      0.74     29247

ROC AUC Score: 0.5
