In [1]:
import pandas as pd
import sklearn
from sklearn.preprocessing import StandardScaler
import os
import sys

In [2]:
"""
Read in the data, we are only interested in headlines and category. 
One hot encode the categories
"""
if not os.path.exists("processedData.csv"):

    df = pd.read_csv("data/train.csv")

    # Downsample the data to 50000 samples
    df = df.sample(n=50000, random_state=42)

    target = ['Response']
    boolean_vars = ['Gender', 'Driving_License', 'Previously_Insured', 
                    'Vehicle_Damage']
    num_vars = ['Age', 'Annual_Premium', 'Vintage']
    cat_vars = ['Region_Code', 'Vehicle_Age', 'Policy_Sales_Channel']

    # Turn the boolean variables into 0 and 1
    df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
    df['Vehicle_Damage'] = df['Vehicle_Damage'].map({'Yes': 1, 'No': 0})
    df[boolean_vars] = df[boolean_vars].astype('float16')

    # Standardize The numerical variables
    scaler = StandardScaler()
    df[num_vars] = scaler.fit_transform(df[num_vars]).astype('float16')

    # One hot encode the categorical variables
    df = pd.get_dummies(df, columns=cat_vars, dtype='float16')

    # Downscale Majority class 10 times to even out the classes 
    majorityClass = df.where(df['Response'] == 0).dropna()
    minorityClass = df.where(df['Response'] == 1).dropna()
    minorityCount = len(minorityClass)
    downSampled = majorityClass.sample(n=minorityCount, random_state=42)
    df = pd.concat([downSampled, minorityClass]) 

    # Shuffle the data
    df = df.sample(frac=1, random_state=42)

    # Save the data
    df.to_csv("processedData.csv", index=False)


In [3]:
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from keras.models import Sequential
from keras.initializers import HeNormal
from keras.regularizers import l2
from keras.callbacks import LearningRateScheduler
from keras.optimizers.schedules import ExponentialDecay
from keras.layers import Dropout
from sklearn.base import BaseEstimator, ClassifierMixin
from keras.losses import BinaryCrossentropy 
from sklearn.metrics import recall_score
import numpy as np
import keras


class B3D3AD_Classifier(BaseEstimator, ClassifierMixin):
    def __init__(self):
        self.classes_ = np.array([0, 1])  
        self.model = Sequential([
            self.DenseLayer(4096, activation='relu'),
            self.DenseLayer(512, activation='relu'),
            self.DenseLayer(512, activation='relu'),
            self.DenseLayer(512, activation='relu'),
            self.DropoutLayer(0.3),
            self.DenseLayer(1, activation='sigmoid'),
        ])

    # Customer Dense layer
    def DenseLayer(self, nodes, activation='relu'):
        return Dense(
            nodes, activation=activation, 
            kernel_initializer=HeNormal(), bias_initializer=HeNormal(),
            kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)
        )

    # Custom dropout layer
    def DropoutLayer(self, rate):
        return Dropout(rate)

    # Resets weights to HeNormal
    def reset_weights(self):
        initial_weights = self.model.get_weights()
        self.model.set_weights(initial_weights)

    def predict(self, X, threshold=0.5):
        # Predict probabilities
        probabilities = self.model.predict(X)
        # Convert probabilities to binary predictions using the threshold
        predictions = (probabilities >= threshold).astype(int)
        return predictions

    # compile the model
    def compile(self):
        lr_scheduler = ExponentialDecay(initial_learning_rate=0.001, decay_steps=1, decay_rate=.1)
        self.model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr_scheduler),
                           loss=BinaryCrossentropy(), metrics=['accuracy'])

    # Calculate recall
    def recall(self, X, y):
        predictions = self.predict(X)
        return recall_score(y, predictions)

    # Run the model. Forward fit using a learning rate scheduler
    def fit(self, X, training_labels, epochs=32, batch_size=64):
        self.compile()
        self.model.fit(X, training_labels, epochs=epochs, batch_size=batch_size)

2024-07-12 14:22:10.413150: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-12 14:22:10.429584: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-12 14:22:10.436067: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-12 14:22:10.453580: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
import seaborn as sns
from sklearn.model_selection import train_test_split


def train():
    # Load the data
    data = pd.read_csv("processedData.csv")

    # Split features and target columns
    columns = data.columns.drop('Response')
    X = data[columns]
    y = data['Response'].astype(np.int8)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    display(X_train.head(4))
    display(y_train.head(4))

    # Initialize lists to store evaluation results
    accuracies = []
    recalls = []

    # Train the model on a custom Model Class (NLTK_Binary_Classifier)
    model = B3D3AD_Classifier()
    model.fit(X_train, y_train)  # Assuming your fit method works with your custom class

    # Evaluate the model
    accuracy = model.score(X_test, y_test)
    accuracies.append(accuracy)
    recall = model.recall(X_test, y_test)
    recalls.append(recall)

    # Calculate and print the mean accuracy
    mean_accuracy = np.mean(accuracies)
    print(f"Mean Accuracy: {mean_accuracy}")
    print(f"Mean recall: {np.mean(recalls)}")

     # Display the Confusion Matrix
    conf_matrix = confusion_matrix(y_train,  model.model.predict(X_train).round()) 
    plt.figure(figsize=(10, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted Loan Default')
    plt.ylabel('Actual Load Default')
    plt.title('Confusion Matrix Loan Default Prediction (0-Non-Default, 1-Default)')
    plt.show()

train()

Unnamed: 0,id,Gender,Age,Driving_License,Previously_Insured,Vehicle_Damage,Annual_Premium,Vintage,Region_Code_0.0,Region_Code_1.0,...,Policy_Sales_Channel_152.0,Policy_Sales_Channel_153.0,Policy_Sales_Channel_154.0,Policy_Sales_Channel_155.0,Policy_Sales_Channel_156.0,Policy_Sales_Channel_157.0,Policy_Sales_Channel_158.0,Policy_Sales_Channel_159.0,Policy_Sales_Channel_160.0,Policy_Sales_Channel_163.0
2406,10849176.0,1.0,1.039,1.0,0.0,1.0,-0.03693,-0.1021,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29,370506.0,0.0,-0.893,1.0,1.0,0.0,0.1646,-0.1646,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8181,8822662.0,0.0,-1.159,1.0,1.0,1.0,-1.701,0.2852,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5580,519699.0,1.0,1.239,1.0,1.0,0.0,0.522,-0.927,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


2406    1
29      0
8181    0
5580    0
Name: Response, dtype: int8

Epoch 1/32


2024-07-12 14:22:12.062767: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error
2024-07-12 14:22:12.062790: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:135] retrieving CUDA diagnostic information for host: rashaka-MS-7A38
2024-07-12 14:22:12.062796: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:142] hostname: rashaka-MS-7A38
2024-07-12 14:22:12.062876: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:166] libcuda reported version is: 555.42.2
2024-07-12 14:22:12.062897: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:170] kernel reported version is: 555.42.2
2024-07-12 14:22:12.062902: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:249] kernel version seems to match DSO: 555.42.2
