# Dissertation Experiment
# Dataset Model Building and Evaluation Functions
Ciaran Finnegan February 2024

In [1]:
# display libraries
from IPython.display import display, HTML

In [2]:
# Compute additional evaluation metrics
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

In [3]:
# Graph enhancements
import matplotlib.ticker as ticker

In [4]:
def display_banner(text):
    """Render ``text`` as a centred white heading on a green banner.

    The markup is passed to IPython's ``display(HTML(...))``; ``text`` is
    interpolated as-is (no HTML escaping is applied).
    """
    markup_lines = [
        '',
        '    <div style="background-color: #4CAF50; padding: 7px; text-align: center; border-radius: 3px;">',
        f'        <h2 style="color: white;">{text}</h2>',
        '    </div>',
        '    ',
    ]
    display(HTML("\n".join(markup_lines)))

In [5]:
def display_text(text):
    """Render ``text`` in a bold 20px block via IPython's HTML display.

    ``text`` is interpolated as-is (no HTML escaping is applied).
    """
    opening = '\n    <div style="font-size: 20px; font-weight: bold;">'
    content = f'\n        {text}'
    closing = '\n    </div>\n    '
    display(HTML(opening + content + closing))

In [6]:
def display_model_metrics_tabular(model, X_test, y_test):
    """Evaluate a fitted binary classifier on the test set and display its metrics.

    Shows a banner and the test accuracy, then a table holding accuracy,
    ROC AUC and the per-class precision / recall / F1 values taken from
    scikit-learn's ``classification_report``.

    Args:
        model: Fitted model. ``model.evaluate(X_test, y_test, verbose=1)`` must
            return a ``(loss, accuracy)`` pair and ``model.predict(X_test)``
            must return one score per row (presumably a Keras model compiled
            with a single accuracy metric -- confirm against the caller).
        X_test: Test features, passed straight to the model.
        y_test: True labels; the per-class rows assume they are encoded 0 / 1.

    Returns:
        list[int]: Class predictions obtained by thresholding the predicted
        scores at 0.5.
    """
    # Evaluate the model on the test set
    display_banner("This is the Model Accuracy")

    _, test_accuracy = model.evaluate(X_test, y_test, verbose=1)
    display_text(f'Test Accuracy: {test_accuracy * 100:.2f}%')

    # Predicted scores, then binary predictions at the 0.5 threshold
    y_pred_probs = model.predict(X_test)
    y_pred = [1 if prob > 0.5 else 0 for prob in y_pred_probs]

    roc_auc = roc_auc_score(y_test, y_pred_probs)

    # Per-class metrics in a structured form. Passing labels explicitly keeps
    # the report keys at '0' and '1' regardless of how the label dtype would
    # otherwise be stringified (e.g. 0.0 -> '0.0').
    report_dict = classification_report(
        y_test, y_pred, labels=[0, 1], output_dict=True, zero_division=0
    )
    class_0 = report_dict['0']
    class_1 = report_dict['1']

    # Organize the metrics into a dataframe
    metrics_df = pd.DataFrame({
        'Metric': ['Accuracy', 'ROC AUC Score', 'Precision (Class 0)', 'Recall (Class 0)', 'F1-Score (Class 0)',
                   'Precision (Class 1)', 'Recall (Class 1)', 'F1-Score (Class 1)'],
        'Value': [test_accuracy, roc_auc,
                  class_0['precision'], class_0['recall'], class_0['f1-score'],
                  class_1['precision'], class_1['recall'], class_1['f1-score']]
    })

    # Display the dataframe in a tabular format
    display_text("Model Performance Metrics")
    display(HTML(metrics_df.to_html(index=False, classes="table table-striped table-bordered")))

    print("Tablular Done!")

    return y_pred

In [7]:
def generate_confusion_matrix(y_test, y_pred):
    """Plot the confusion matrix of ``y_pred`` against ``y_test`` as a heatmap.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.

    Returns:
        str: The fixed status message ``"Confusion Matrix!"``.
    """
    display_banner("Confusion Matrix")

    # Annotated heatmap of the raw counts, without a colour bar
    counts = confusion_matrix(y_test, y_pred)

    plt.figure(figsize=(8, 6))
    axes = sns.heatmap(counts, annot=True, fmt='g', cmap='Blues', cbar=False)
    axes.set_xlabel('Predicted Label')
    axes.set_ylabel('True Label')
    axes.set_title('Confusion Matrix')

    plt.show()

    return "Confusion Matrix!"

In [8]:
def display_CC_binary_cols_cnt(df):
    # Identify categorical columns
    categorical_features = df.select_dtypes(include=['object', 'category']).columns

    # Count the unique values in each categorical column
    unique_counts = df[categorical_features].nunique()

    # Create a DataFrame for better visualization
    visualization_df = pd.DataFrame({
        "Categorical Column": categorical_features,
        "Unique Values Count": unique_counts
    })

    # Use the style property to better present the DataFrame
    styled_df = visualization_df.style.set_table_styles(
        [{
            'selector': 'th',
            'props': [('font-size', '12pt'), ('background-color', 'lightblue')]
        },
        {
            'selector': 'td',
            'props': [('font-size', '12pt')]
        }]
    ).set_properties(**{
        'text-align': 'left',
    }).set_caption("Categorical Columns and Unique Value Counts")

    # Display the styled DataFrame
    return unique_counts, styled_df, categorical_features

In [9]:
def scale_the_features(X_train_downsampled, X_test_downsampled, df_downsampled, sLabel='default'):
    """Standardise the non-binary features of a train/test split, with before/after plots.

    Columns of the training frame whose non-null values are all 0 or 1 are
    treated as binary and left untouched. The remaining columns are scaled
    with a ``StandardScaler`` that is fitted on the training data only and
    then applied to the test data.

    Args:
        X_train_downsampled: Training features as a DataFrame.
        X_test_downsampled: Test features as a DataFrame with the same columns.
        df_downsampled: Full dataset (features plus label); only its column
            order is used.
        sLabel: Name of the label column in ``df_downsampled``.

    Returns:
        tuple: ``(X_train_scaled, X_test_scaled, scaler)`` -- the two frames
        with columns in the order of ``df_downsampled`` (label excluded) and
        the fitted scaler.

    Raises:
        KeyError: If ``sLabel`` is not a column of ``df_downsampled`` or a
            feature column is missing from the train/test frames.
    """
    # Feature columns in the order of the full dataset, label excluded
    cols = df_downsampled.drop(columns=[sLabel]).columns

    # ----------------------------------------
    # Visualise before scaling
    # ----------------------------------------
    display_text("Visualise before scaling...")

    # Plot the feature columns only: reindexing against *all* columns of
    # df_downsampled would add an empty (all-NaN) label column to the plot.
    plt.figure(figsize=(12, 6))
    sns.boxplot(data=X_train_downsampled[cols])
    plt.title("Feature Distributions Before Scaling")
    plt.xticks(rotation=90)
    # Disable scientific notation for y-axis
    plt.gca().yaxis.set_major_formatter(ticker.ScalarFormatter(useOffset=False))

    plt.show()

    # ----------------------------------------
    # Isolate Non-Binary features for scaling
    # ----------------------------------------

    # Binary features are identified on the training data only
    binary_features = [col for col in X_train_downsampled if
                       X_train_downsampled[col].dropna().isin([0, 1]).all()]
    print("\n\nAll CC Fraud Binary features (NOT to be scaled):", binary_features)
    print("\n\n")

    # Split dataframes into binary and non-binary features
    X_train_binary = X_train_downsampled[binary_features]
    X_train_non_binary = X_train_downsampled.drop(columns=binary_features)

    X_test_binary = X_test_downsampled[binary_features]
    X_test_non_binary = X_test_downsampled.drop(columns=binary_features)

    # ----------------------------------------
    # Scaling the features
    # ----------------------------------------
    # Fit on the training data only, then reuse the fitted scaler on the test data
    scaler = StandardScaler()
    X_train_non_binary_scaled = scaler.fit_transform(X_train_non_binary)
    X_test_non_binary_scaled = scaler.transform(X_test_non_binary)

    # Convert scaled arrays back to dataframes
    X_train_non_binary_scaled_df = pd.DataFrame(X_train_non_binary_scaled,
                                                index=X_train_non_binary.index,
                                                columns=X_train_non_binary.columns)
    X_test_non_binary_scaled_df = pd.DataFrame(X_test_non_binary_scaled,
                                               index=X_test_non_binary.index,
                                               columns=X_test_non_binary.columns)

    # Concatenate the binary and scaled non-binary features
    X_train_downsampled_scaled = pd.concat([X_train_binary, X_train_non_binary_scaled_df], axis=1)
    X_test_downsampled_scaled = pd.concat([X_test_binary, X_test_non_binary_scaled_df], axis=1)

    # Ensure the sequence of columns matches the original dataframe (excluding the label)
    X_train_downsampled_scaled = X_train_downsampled_scaled[cols]
    X_test_downsampled_scaled = X_test_downsampled_scaled[cols]

    # ----------------------------------------
    # Visualise after scaling
    # ----------------------------------------
    display_text("Visualise after scaling...")

    plt.figure(figsize=(12, 6))
    sns.boxplot(data=X_train_downsampled_scaled)
    plt.title("Feature Distributions After Scaling")
    plt.xticks(rotation=90)
    plt.show()

    return X_train_downsampled_scaled, X_test_downsampled_scaled, scaler

In [10]:
def scale_the_DiCE_df(df_downsampled, scaler, sLabel='Fraud'):
    """Scale the full dataset for the DiCE explainer with an already-fitted scaler.

    The DiCE explainer is built using the entire dataset, which is scaled
    separately from within that Notebook.

    The columns handed to ``scaler.transform`` are chosen as follows:

    * If the scaler exposes ``feature_names_in_`` and every one of those
      names is a column of ``df_downsampled``, exactly those columns are
      scaled, in the order the scaler recorded. This keeps the column set
      and order consistent with what the scaler recorded.
    * Otherwise, columns whose non-null values are all 0 or 1 are treated as
      binary and every other column is scaled.

    Args:
        df_downsampled: DataFrame holding the dataset to scale.
        scaler: Fitted transformer exposing ``transform``.
        sLabel: Name of the label column. Currently unused; kept so existing
            callers keep working.

    Returns:
        DataFrame with the same index and column order as ``df_downsampled``
        in which the selected columns are replaced by their scaled values.
        If there is nothing to scale, an unmodified copy is returned.
    """
    all_cols = list(df_downsampled.columns)

    # ----------------------------------------
    # Isolate the features to scale
    # ----------------------------------------
    fitted_cols = getattr(scaler, "feature_names_in_", None)
    if fitted_cols is not None and all(col in df_downsampled.columns for col in fitted_cols):
        # Trust the scaler: same columns, same order as at fit time
        cols_to_scale = list(fitted_cols)
    else:
        # Fall back to detecting binary (0/1-only) columns on this frame
        binary_features = [col for col in df_downsampled if
                           df_downsampled[col].dropna().isin([0, 1]).all()]
        cols_to_scale = [col for col in all_cols if col not in binary_features]

    scale_set = set(cols_to_scale)
    not_scaled = [col for col in all_cols if col not in scale_set]
    print("\n\nAll CC Fraud Binary features (NOT to be scaled):", not_scaled)
    print("\n\n")

    # Nothing to transform: avoid calling the scaler on a zero-column frame
    if not cols_to_scale:
        return df_downsampled.copy()

    # ----------------------------------------
    # Scaling the features
    # ----------------------------------------
    scaled_values = scaler.transform(df_downsampled[cols_to_scale])

    # Convert the scaled array back to a dataframe aligned with the input
    scaled_df = pd.DataFrame(scaled_values,
                             index=df_downsampled.index,
                             columns=cols_to_scale)

    # Concatenate the untouched and scaled features, then restore the
    # original column order
    df_downsampled_scaled = pd.concat([df_downsampled[not_scaled], scaled_df], axis=1)
    return df_downsampled_scaled[df_downsampled.columns]

In [11]:
def store_CC_train_test_data(X_test, y_test, X_train, y_train, df_downsampled, lCatCols, sLabel='_Fraud'):
    """Write the train/test split, the sample dataset and the categorical column list to CSV.

    Six files are written to the current working directory, each named
    ``<stem><sLabel>.csv`` with stems ``X_test``, ``y_test``, ``X_train``,
    ``y_train``, ``df_downsampled`` and ``df_CatCols``. All files are written
    without the index.

    Args:
        X_test, y_test, X_train, y_train: Split data created by the calling
            Notebook; each is wrapped in a DataFrame before saving.
        df_downsampled: Sample dataset, saved as-is.
        lCatCols: List of categorical column names in the dataset.
        sLabel: Suffix appended to every file stem.

    Returns:
        tuple: ``("CC Data stored!", X_test_df, y_test_df, X_train_df, y_train_df)``.
    """
    def _csv_name(stem):
        return stem + sLabel + '.csv'

    # Wrap the four split parts in DataFrames (test first, then training)
    split_frames = (
        pd.DataFrame(X_test),
        pd.DataFrame(y_test),
        pd.DataFrame(X_train),
        pd.DataFrame(y_train),
    )

    # Resolve every file name before anything is written
    split_targets = [_csv_name(stem) for stem in ('X_test', 'y_test', 'X_train', 'y_train')]
    sample_target = _csv_name('df_downsampled')
    cat_cols_target = _csv_name('df_CatCols')

    # Save the test data, then the training data
    for frame, target in zip(split_frames, split_targets):
        frame.to_csv(target, index=False)

    # Save the sample dataset
    df_downsampled.to_csv(sample_target, index=False)

    # Save the list of categorical columns in the dataset
    pd.DataFrame(lCatCols).to_csv(cat_cols_target, index=False)

    return ("CC Data stored!", *split_frames)

In [12]:
def load_CC_train_test_data(sLabel='_Fraud'):
    """Read back the six CSV files named ``<stem><sLabel>.csv`` from the working directory.

    Args:
        sLabel: Suffix appended to every file stem.

    Returns:
        tuple: ``(X_test, y_test, X_train, y_train, df_downsampled, dfCatCols)``,
        each a DataFrame produced by ``pd.read_csv``.
    """
    # Stems in the order the frames are read and returned: test data,
    # training data, the downsampled dataset, the categorical column list.
    stems = ('X_test', 'y_test', 'X_train', 'y_train', 'df_downsampled', 'df_CatCols')
    return tuple(pd.read_csv(stem + sLabel + '.csv') for stem in stems)