In [10]:
import pandas as pd
import sklearn
from sklearn.preprocessing import StandardScaler
import os
import sys

In [11]:
"""
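Read in the insurance training data (data/train.csv) and build a
class-balanced table of 'Response' plus the selected features.
The result is cached in processedData.csv; this cell does nothing when that file already exists.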
"""
if not os.path.exists("processedData.csv"):

    df = pd.read_csv("data/train.csv")

    # Work on a random subset of at most 100000 rows (capped so that smaller
    # files do not make sample() raise)
    df = df.sample(n=min(100000, len(df)), random_state=42)

    target = ['Response']
    boolean_vars = ['Gender', 'Driving_License', 'Previously_Insured',
                    'Vehicle_Damage']
    num_vars = ['Age', 'Annual_Premium', 'Vintage']
    # Categorical columns are not used as features, so they are dropped below
    # without being encoded.
    cat_vars = ['Region_Code', 'Vehicle_Age', 'Policy_Sales_Channel']

    # Turn the boolean variables into 0 and 1
    df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
    df['Vehicle_Damage'] = df['Vehicle_Damage'].map({'Yes': 1, 'No': 0})
    df[boolean_vars] = df[boolean_vars].astype('float16')

    # Standardize the numerical variables
    scaler = StandardScaler()
    df[num_vars] = scaler.fit_transform(df[num_vars]).astype('float16')

    # Keep only the label and the features that are written out. The column
    # order is the order of the saved CSV.
    df = df[['Response', 'Age', 'Annual_Premium', 'Vintage',
             'Gender', 'Vehicle_Damage', 'Previously_Insured', 'Driving_License']]

    # Drop rows with a missing value in any retained column (this includes
    # Gender / Vehicle_Damage values that the mappings above did not recognise)
    df = df.dropna()

    # Undersample the majority class (Response == 0) so that both classes end
    # up with the same number of rows
    majorityClass = df[df['Response'] == 0]
    minorityClass = df[df['Response'] == 1]
    minorityCount = len(minorityClass)
    downSampled = majorityClass.sample(n=minorityCount, random_state=42)
    df = pd.concat([downSampled, minorityClass])

    # Shuffle the data
    df = df.sample(frac=1, random_state=42)

    # astype() returns a new frame, which avoids assigning into a slice
    df = df.astype({'Response': 'int8'})

    # Save the data
    df.to_csv("processedData.csv", index=False)


In [17]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
import numpy as np


def train(data_path="processedData.csv", device="cuda"):
    """
    Fit an XGBoost classifier on the preprocessed data and print its accuracy.

    Loads the CSV, holds out 20% of the rows as a test set, runs a 2-fold
    grid search (scored by accuracy) over ``param_grid`` on the remaining
    rows, then prints the best parameters and the accuracy of the best
    estimator on the held-out rows. Nothing is returned.

    Parameters
    ----------
    data_path : str
        CSV file containing a 'Response' column (the label); every other
        column is used as a feature.
    device : str
        Forwarded to ``XGBClassifier(device=...)``. Pass 'cpu' on machines
        without a CUDA-capable GPU.
    """
    # Load the data
    data = pd.read_csv(data_path)

    # Split features and target columns
    columns = data.columns.drop('Response')
    X = data[columns]
    y = data['Response'].astype(np.int8)
    # Explicit integer seed so the train/test split is reproducible
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

    # Wider search space kept for reference (3**7 = 2187 candidates):
    # param_grid = {
    #     'max_depth': [10, 15, 20],
    #     'min_child_weight': [1, 5, 10],
    #     'gamma': [0, 1, 5],
    #     'subsample': [0.6, 0.8, 1.0],
    #     'colsample_bytree': [0.6, 0.8, 1.0],
    #     'eta': [0.01, 0.1, 0.2],
    #     'lambda': [1],
    #     'alpha': [0, 0.5, 1]
    # }

    # Parameter grid actually searched: a single candidate
    param_grid = {
        'max_depth': [10],
        'min_child_weight': [1],
        'gamma': [1],
        'subsample': [0.8],
        'colsample_bytree': [0.8],
        'eta': [0.1],
        'lambda': [1],
        'alpha': [1]
    }

    # Initialize the XGBoost classifier
    xgb_clf = xgb.XGBClassifier(
        device=device,
        objective='binary:logistic',
        booster='dart',
        eval_metric='logloss',
        tree_method='hist',
        scale_pos_weight=1,
        nthread=8,
    )

    # Perform grid search
    grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, scoring='accuracy', cv=2, verbose=1)
    grid_search.fit(X_train, y_train)

    # Get the best parameters and model
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    print(f'Best parameters found: {best_params}')

    # Predict the positive-class probability and threshold it at 0.5
    y_pred_prob = best_model.predict_proba(X_test)[:, 1]
    y_pred = (y_pred_prob > 0.5).astype(int)

    # Evaluate the model on the held-out rows
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy}')

# Run training; prints the best parameters found and the held-out accuracy
train()

Fitting 2 folds for each of 2187 candidates, totalling 4374 fits


KeyboardInterrupt: 