# Dissertation Experiment
# Dataset Model Building and Evaluation Functions
Ciaran Finnegan October 2023

In [1]:
# display libraries
from IPython.display import display, HTML

In [2]:
# Compute additional evaluation metrics
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

In [3]:
# Graph enhancements
import matplotlib.ticker as ticker

In [4]:
def display_banner(text):
    """Show ``text`` as a centred banner with a white ``<h2>`` heading.

    ``text`` is inserted into the markup as-is (no HTML escaping) and the
    result is rendered through IPython's ``display(HTML(...))``.
    """
    markup = f"""
    <div style="background-color: #4CAF50; padding: 7px; text-align: center; border-radius: 3px;">
        <h2 style="color: white;">{text}</h2>
    </div>
    """
    rendered = HTML(markup)
    display(rendered)

In [5]:
def display_text(text):
    """Show ``text`` in bold 20px type in the notebook output.

    ``text`` is inserted into the markup as-is (no HTML escaping) and the
    result is rendered through IPython's ``display(HTML(...))``.
    """
    markup = f"""
    <div style="font-size: 20px; font-weight: bold;">
        {text}
    </div>
    """
    rendered = HTML(markup)
    display(rendered)

In [6]:
def display_model_metrics_tabular(model, X_test, y_test):
    """Evaluate ``model`` on the test set and display its metrics as a table.

    Shows a banner, the test accuracy as a percentage, and then a table with
    accuracy, ROC AUC and the per-class precision / recall / F1-score taken
    from ``classification_report``.

    Args:
        model: Fitted model exposing ``evaluate(X, y, verbose=...)`` and
            ``predict(X)``. ``evaluate`` must return exactly two values,
            unpacked here as (loss, accuracy) -- presumably a Keras model
            compiled with a single accuracy metric; confirm against caller.
        X_test: Test features passed to ``evaluate`` and ``predict``.
        y_test: True test labels. The report is indexed with the keys
            ``'0'`` and ``'1'``, so labels are assumed to be the integers
            0 and 1 -- TODO confirm.

    Returns:
        list[int]: Binary predictions, 1 where the predicted probability is
        greater than 0.5 and 0 otherwise.
    """
    display_banner("This is the Model Accuracy")

    # Only the accuracy is reported; the loss value is not used.
    _, test_accuracy = model.evaluate(X_test, y_test, verbose=1)
    display_text(f'Test Accuracy: {test_accuracy * 100:.2f}%')

    # Predict probabilities
    y_pred_probs = model.predict(X_test)

    # Convert probabilities to binary predictions
    y_pred = [1 if prob > 0.5 else 0 for prob in y_pred_probs]

    # ROC AUC is computed from the raw probabilities, not the thresholded labels
    roc_auc = roc_auc_score(y_test, y_pred_probs)

    # Extract per-class metrics from classification_report in a structured format;
    # zero_division=0 reports 0 instead of warning when a class is never predicted
    report_dict = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

    # Organize the metrics into a dataframe
    metrics_df = pd.DataFrame({
        'Metric': ['Accuracy', 'ROC AUC Score', 'Precision (Class 0)', 'Recall (Class 0)', 'F1-Score (Class 0)',
                   'Precision (Class 1)', 'Recall (Class 1)', 'F1-Score (Class 1)'],
        'Value': [test_accuracy, roc_auc,
                  report_dict['0']['precision'], report_dict['0']['recall'], report_dict['0']['f1-score'],
                  report_dict['1']['precision'], report_dict['1']['recall'], report_dict['1']['f1-score']]
    })

    # Display the dataframe in a tabular format
    display_text("Model Performance Metrics")
    display(HTML(metrics_df.to_html(index=False, classes="table table-striped table-bordered")))

    print("Tablular Done!")

    return y_pred

In [7]:
def generate_confusion_matrix(y_test, y_pred):
    """Plot the confusion matrix of ``y_pred`` against ``y_test`` as a heatmap.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.

    Returns:
        str: The literal string ``"Confusion Matrix!"``.
    """
    display_banner("Confusion Matrix")

    # Counts of (true label, predicted label) pairs
    matrix = confusion_matrix(y_test, y_pred)

    # Annotated heatmap without a colour bar; the axes are labelled below
    plt.figure(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, fmt='g', cmap='Blues', cbar=False)
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

    return "Confusion Matrix!"

In [8]:
def scale_the_features(X_train_downsampled, X_test_downsampled, df_downsampled):
    """Scale the train/test features and box-plot them before and after.

    A ``StandardScaler`` is fitted on the training features only and then
    applied to both the training and the test features. A box plot of the
    training features is shown before and after scaling.

    Args:
        X_train_downsampled: Training features (target column excluded).
        X_test_downsampled: Test features with the same columns.
        df_downsampled: DataFrame holding the features plus the target
            column ``'default'``; used only to obtain the feature names.

    Returns:
        tuple: ``(X_train_scaled, X_test_scaled)`` as returned by the scaler.
    """
    # Feature names, excluding the target variable 'default'. The same list
    # labels both plots so the "before" plot does not gain a spurious, empty
    # 'default' column that the "after" plot lacks.
    feature_names = df_downsampled.drop('default', axis=1).columns

    # ----------------------------------------
    # Visualise before scaling
    # ----------------------------------------
    display_text("Visualise before scaling...")

    X_train_downsampled_df = pd.DataFrame(X_train_downsampled, columns=feature_names)

    plt.figure(figsize=(12, 6))
    sns.boxplot(data=X_train_downsampled_df)
    plt.title("Feature Distributions Before Scaling")
    plt.xticks(rotation=90)
    # Disable the offset notation on the y-axis so raw values are readable
    plt.gca().yaxis.set_major_formatter(ticker.ScalarFormatter(useOffset=False))
    plt.show()

    # ----------------------------------------
    # Scaling the features
    # ----------------------------------------
    scaler = StandardScaler()

    # Fit on the training data only, then reuse the fitted scaler on the
    # test data so no test-set statistics leak into the transformation.
    X_train_downsampled = scaler.fit_transform(X_train_downsampled)
    X_test_downsampled = scaler.transform(X_test_downsampled)

    # ----------------------------------------
    # Visualise after scaling
    # ----------------------------------------
    display_text("Visualise after scaling...")

    # Convert the scaled data back to a DataFrame for plotting
    X_train_scaled_df = pd.DataFrame(X_train_downsampled, columns=feature_names)

    plt.figure(figsize=(12, 6))
    sns.boxplot(data=X_train_scaled_df)
    plt.title("Feature Distributions After Scaling")
    plt.xticks(rotation=90)
    plt.show()

    return X_train_downsampled, X_test_downsampled
