In [22]:
import pandas as pd
import sklearn
from sklearn.preprocessing import StandardScaler
import os
import sys

In [23]:
"""
Read in the data, we are only interested in headlines and category. 
One hot encode the categories
"""
if not os.path.exists("processedData.csv"):

    df = pd.read_csv("data/train.csv")

    # Downsample the data to 50000 samples
    df = df.sample(n=100000, random_state=42)

    target = ['Response']
    boolean_vars = ['Gender', 'Driving_License', 'Previously_Insured', 
                    'Vehicle_Damage']
    num_vars = ['Age', 'Annual_Premium', 'Vintage']
    cat_vars = ['Region_Code', 'Vehicle_Age', 'Policy_Sales_Channel']

    # Turn the boolean variables into 0 and 1
    df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
    df['Vehicle_Damage'] = df['Vehicle_Damage'].map({'Yes': 1, 'No': 0})
    df['Vehicle_Age'] = df['Vehicle_Age'].map({'< 1 Year': 0, '1-2 Year': 0.5, '> 2 Years': 1})
    df[boolean_vars] = df[boolean_vars].astype('float16')

    # Standardize The numerical variables
    scaler = StandardScaler()
    df[num_vars] = scaler.fit_transform(df[num_vars]).astype('float16')

    # One hot encode the categorical variables
    df = pd.get_dummies(df, columns=cat_vars, dtype='float16')

    # Downscale Majority class 10 times to even out the classes 
    majorityClass = df.where(df['Response'] == 0).dropna()
    minorityClass = df.where(df['Response'] == 1).dropna()
    minorityCount = len(minorityClass)
    downSampled = majorityClass.sample(n=minorityCount, random_state=42)
    df = pd.concat([downSampled, minorityClass]) 

    # Shuffle the data
    df = df.sample(frac=1, random_state=42)

    # display(df.columns)
    # We are not going to use Region_Code or Policy_Sales_Channel
    df = df[['Response', 'Age', 'Annual_Premium', 'Vintage',
             'Gender', 'Vehicle_Damage', 'Previously_Insured', 'Driving_License']]
    df['Response'] = df['Response'].astype('int8')

    # Save the data
    df.to_csv("processedData.csv", index=False)


In [24]:
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from keras.models import Sequential
from keras.initializers import HeNormal
from keras.regularizers import l2
from keras.callbacks import LearningRateScheduler
from keras.optimizers.schedules import ExponentialDecay
from keras.layers import Dropout
from sklearn.base import BaseEstimator, ClassifierMixin
from keras.losses import BinaryCrossentropy 
from sklearn.metrics import recall_score
import numpy as np
import keras


class B3D3AD_Classifier(BaseEstimator, ClassifierMixin):
    """Binary classifier wrapping a dense Keras network behind a
    scikit-learn style ``fit`` / ``predict`` interface.

    Architecture: Dense(5000, relu) -> Dense(500, relu) -> Dropout(0.3)
    -> Dense(1, sigmoid), trained with Adam on binary cross-entropy.
    """

    def __init__(self):
        self.classes_ = np.array([0, 1])
        self.model = self._build_model()
        self.compile()

    def _build_model(self):
        """Return a fresh, not-yet-trained network with newly created layers."""
        return Sequential([
            self.DenseLayer(5000, activation='relu'),
            self.DenseLayer(500, activation='relu'),
            self.DropoutLayer(0.3),
            self.DenseLayer(1, activation='sigmoid'),
        ])

    # Custom Dense layer
    def DenseLayer(self, nodes, activation='relu'):
        """Dense layer with HeNormal initializers and L2(0.01) regularization
        on both kernel and bias."""
        return Dense(
            nodes, activation=activation, 
            kernel_initializer=HeNormal(), bias_initializer=HeNormal(),
            kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)
        )

    # Custom dropout layer
    def DropoutLayer(self, rate):
        """Dropout layer with the given drop rate."""
        return Dropout(rate)

    # Resets weights to HeNormal
    def reset_weights(self):
        """Discard everything learned so far and start from a fresh network.

        Reading the current weights and writing them back changes nothing, so
        the network is rebuilt instead: the new layers draw new initial weights
        from their initializers. Recompiling also resets the optimizer and the
        learning-rate schedule, using the settings of the last ``compile``.
        """
        self.model = self._build_model()
        self.compile(**self._compile_args)

    def predict(self, X, threshold=0.5):
        """Return 0/1 predictions: 1 where the predicted probability is at
        least ``threshold``. The result keeps the shape of the network output.
        """
        # Predict probabilities
        probabilities = self.model.predict(X)
        # Convert probabilities to binary predictions using the threshold
        predictions = (probabilities >= threshold).astype(int)
        return predictions

    # compile the model
    def compile(self, initial_learning_rate=0.001, decay_steps=1000, decay_rate=.1):
        """Compile the network with Adam and an exponentially decaying
        learning rate.

        The schedule is ``initial_learning_rate * decay_rate ** (step / decay_steps)``
        where ``step`` counts optimizer updates (batches), not epochs. With the
        former ``decay_steps=1`` the rate was divided by 10 after every single
        batch and training stalled after a handful of updates, so the decay is
        now spread over ``decay_steps`` batches. Tune it to the number of
        batches per epoch of your data.
        """
        # Remembered so reset_weights() can recompile with the same settings.
        self._compile_args = {
            'initial_learning_rate': initial_learning_rate,
            'decay_steps': decay_steps,
            'decay_rate': decay_rate,
        }
        lr_scheduler = ExponentialDecay(initial_learning_rate=initial_learning_rate,
                                        decay_steps=decay_steps, decay_rate=decay_rate)
        self.model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr_scheduler),
                           loss=BinaryCrossentropy(), metrics=['accuracy'])

    # Calculate recall
    def recall(self, X, y):
        """Return the recall of the thresholded predictions for ``X`` against ``y``."""
        predictions = self.predict(X)
        return recall_score(y, predictions)

    # Run the model. Forward fit using a learning rate scheduler
    def fit(self, X, training_labels, epochs=6, batch_size=32):
        """Train the network and return ``self`` (scikit-learn convention, so
        calls can be chained and the estimator works inside sklearn tooling).
        """
        self.model.fit(X, training_labels, epochs=epochs, batch_size=batch_size)
        return self


In [32]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
import numpy as np


def train(data_path="processedData.csv"):
    """Grid-search an XGBoost classifier on the processed data.

    Loads the CSV at ``data_path``, holds out 20% of the rows as a test set,
    runs a 3-fold grid search over ``param_grid`` on the remaining rows, then
    prints the best parameters and the hold-out accuracy.

    Returns:
        (best_model, accuracy): the refitted best estimator and its accuracy
        on the held-out test set, so the result of the search is not lost.
    """
    # Load the data
    data = pd.read_csv(data_path)

    # Split features and target columns
    X = data.drop(columns='Response')
    y = data['Response'].astype(np.int8)
    # An explicit integer seed: the former `random_state=True` was silently
    # treated as seed 1, so this keeps the same split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

    # Define the parameter grid
    # NOTE: 8 parameters x 3 values = 6561 combinations, each fitted on 3 folds
    # (19683 fits plus the final refit). Trim the grid for quick experiments.
    param_grid = {
        'max_depth': [10, 15, 20],
        'min_child_weight': [1, 5, 10],
        'gamma': [0, 1, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'eta': [0.01, 0.1, 0.2],
        'lambda': [1, 2, 3],
        'alpha': [0, 0.5, 1]
    }

    # Initialize the XGBoost classifier
    xgb_clf = xgb.XGBClassifier(
        objective='binary:logistic',
        booster='dart',
        eval_metric='logloss',
        tree_method='hist',
        scale_pos_weight=1,
        nthread=8,
        use_label_encoder=False
    )

    # Perform grid search
    grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, scoring='accuracy', cv=3, verbose=1)
    grid_search.fit(X_train, y_train)

    # Get the best parameters and model
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    print(f'Best parameters found: {best_params}')

    # Make predictions with the best model
    y_pred_prob = best_model.predict_proba(X_test)[:, 1]
    y_pred = (y_pred_prob > 0.5).astype(int)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy}')

    return best_model, accuracy

# Keep the fitted model and its score so later cells can reuse them
best_model, accuracy = train()

Accuracy: 0.7879901960784313
