# Model Training


In [None]:
#Import Data and Required Packages
import pandas as pd

In [None]:
import numpy as np

In [None]:
#Import the CSV Data as Pandas DataFrame
df = pd.read_csv('./data/fraud_oracle.csv')

# Data processing


In [None]:
from sklearn.model_selection import train_test_split
# Encoding
from sklearn.preprocessing import LabelEncoder

In [None]:
# Assuming 'df' is your DataFrame and 'target_column' is the column you're predicting
X = df.drop('FraudFound_P', axis=1)  # Features
y = df['FraudFound_P']  # Target

# Split the data into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
Xtrain_v1=X_train.copy()
Xtest_v1=X_test.copy()

In [None]:
binary_columns = [col for col in X.columns if X[col].nunique() == 2]

In [None]:
print('The number of Binary Columns: ', len(binary_columns))
print('Binary Columns: ', binary_columns)

The number of Binary Columns:  6
Binary Columns:  ['AccidentArea', 'Sex', 'Fault', 'PoliceReportFiled', 'WitnessPresent', 'AgentType']


In [None]:
#train set processing
# Encoding Categorical Variables (Binary Variable) - Label Encoding¶
le = LabelEncoder()
for col  in binary_columns:
    Xtrain_v1[col] = le.fit_transform(Xtrain_v1[col])
    print('Label Mapping: ', dict(zip(le.classes_, le.transform(le.classes_))))

Label Mapping:  {'Rural': 0, 'Urban': 1}
Label Mapping:  {'Female': 0, 'Male': 1}
Label Mapping:  {'Policy Holder': 0, 'Third Party': 1}
Label Mapping:  {'No': 0, 'Yes': 1}
Label Mapping:  {'No': 0, 'Yes': 1}
Label Mapping:  {'External': 0, 'Internal': 1}


In [None]:
#test set processing
le = LabelEncoder()
for col  in binary_columns:
    Xtest_v1[col] = le.fit_transform(Xtest_v1[col])
    print('Label Mapping: ', dict(zip(le.classes_, le.transform(le.classes_))))

Label Mapping:  {'Rural': 0, 'Urban': 1}
Label Mapping:  {'Female': 0, 'Male': 1}
Label Mapping:  {'Policy Holder': 0, 'Third Party': 1}
Label Mapping:  {'No': 0, 'Yes': 1}
Label Mapping:  {'No': 0, 'Yes': 1}
Label Mapping:  {'External': 0, 'Internal': 1}


An Ordinal Categorical Variable is a categorical variable where the categories have a logical order or ranking
A Nominal Categorical Variable is a categorical variable where the categories do not have a logical order or ranking


In [None]:
for col in ['VehiclePrice', 'DriverRating', 'AgeOfVehicle', 'BasePolicy']:
    print(f"The unique values in {col}: {Xtrain_v1[col].unique()}")

The unique values in VehiclePrice: ['20000 to 29000' '30000 to 39000' 'more than 69000' '40000 to 59000'
 'less than 20000' '60000 to 69000']
The unique values in DriverRating: [4 1 2 3]
The unique values in AgeOfVehicle: ['7 years' 'more than 7' '6 years' '5 years' 'new' '3 years' '4 years'
 '2 years']
The unique values in BasePolicy: ['Liability' 'Collision' 'All Perils']


In [None]:
vehicleprice_label = {'more than 69000': 1, '20000 to 29000': 0,  '30000 to 39000': 0, 'less than 20000': 1, '40000 to 59000': 1, '60000 to 69000': 0}
ageofvehicle_label = {'new': 2, '2 years': 0, '3 years': 2, '4 years': 2, '5 years': 1, '6 years': 1, '7 years': 0, 'more than 7': 0}
basepolicy_label = {'Liability': 0, 'Collision': 1, 'All Perils': 2}

In [None]:
#train set processing
Xtrain_v1['VehiclePrice'] = Xtrain_v1['VehiclePrice'].map(vehicleprice_label)
Xtrain_v1['AgeOfVehicle'] = Xtrain_v1['AgeOfVehicle'].map(ageofvehicle_label)
Xtrain_v1['BasePolicy'] = Xtrain_v1['BasePolicy'].map(basepolicy_label)

In [None]:
#test set processing
Xtest_v1['VehiclePrice'] = Xtest_v1['VehiclePrice'].map(vehicleprice_label)
Xtest_v1['AgeOfVehicle'] = Xtest_v1['AgeOfVehicle'].map(ageofvehicle_label)
Xtest_v1['BasePolicy'] = Xtest_v1['BasePolicy'].map(basepolicy_label)

In [None]:
dtype_change_string = ['RepNumber', 'Deductible', 'Year']
#train set processing
for col in dtype_change_string:
    Xtrain_v1[col] = Xtrain_v1[col].astype(str)

In [None]:
#test set processing
for col in dtype_change_string:
    Xtest_v1[col] = Xtest_v1[col].astype(str)

In [None]:
onehot_encoding_columns = ['Make', 'MonthClaimed', 'MaritalStatus', 'PolicyType', 'VehicleCategory', 'RepNumber', 'Deductible', 'Days_Policy_Accident', 'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfPolicyHolder', 'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars', 'Year']
print("The number of one-hot encoding target features: ", len(onehot_encoding_columns))

The number of one-hot encoding target features:  15


In [None]:
#dummification
Xtrain_v1= pd.get_dummies(Xtrain_v1, columns=onehot_encoding_columns)

In [None]:
Xtest_v1= pd.get_dummies(Xtest_v1, columns=onehot_encoding_columns)

Now, all features' datatypes are "Integer" or "Boolean

# Data Reduction

In [None]:
useless_columns = ['Month', 'WeekOfMonth', 'DayOfWeek', 'DayOfWeekClaimed', 'WeekOfMonthClaimed', 'PolicyNumber']
Xtrain_v2 = Xtrain_v1.drop(columns=useless_columns, axis=1)

In [None]:
Xtest_v2 = Xtest_v1.drop(columns=useless_columns, axis=1)

In [None]:
onehot_encoded_columns = [col for col in Xtrain_v2.columns if '_' in col]
print("The Number of One-hot Encoded Columns: ", len(onehot_encoded_columns))

The Number of One-hot Encoded Columns:  104


In [None]:
constant_features = []
for col in onehot_encoded_columns:
    if Xtrain_v2 [col].sum() <= 5:
        constant_features.append(col)
print("The Number of Constant Features: ", len(constant_features))

The Number of Constant Features:  8


In [None]:
constant_features

['Make_Ferrari',
 'Make_Jaguar',
 'Make_Lexus',
 'Make_Mecedes',
 'Make_Porche',
 'Deductible_300',
 'AddressChange_Claim_under 6 months',
 'NumberOfCars_more than 8']

In [None]:
Xtrain_v2.drop(columns=constant_features, axis=1, inplace=True)
Xtrain_v2.shape

(12336, 107)

# Outlier Handeling

In [None]:
import lightgbm as lgb

In [None]:
# Function02: Outlier Replacement using LightGBM
def lightgbm_imputation(dataframe, feature):
    df_copy = dataframe.copy()
    train_data = df_copy[df_copy[feature].notnull()]
    test_data = df_copy[df_copy[feature].isnull()]

    X_train = train_data.drop(columns=[feature], axis=1)
    y_train = train_data[feature]
    X_test = test_data.drop(columns=[feature], axis=1)

    train_dataset = lgb.Dataset(X_train, label=y_train)

    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'verbose': -1
    }

    model = lgb.train(params, train_dataset, num_boost_round=100)

    predictions = model.predict(X_test)

    df_copy.loc[df_copy[feature].isnull(), feature] = predictions

    return df_copy
# Change 0s and outliers i

In [None]:
# Change 0s and outliers into NullValues
Xtrain_v2['Age'] = Xtrain_v2['Age'].apply(lambda x: np.nan if x == 0 or x > 74 else x)

In [None]:
# Change 0s and outliers into NullValues
Xtest_v2['Age'] = Xtest_v2['Age'].apply(lambda x: np.nan if x == 0 or x > 74 else x)

In [None]:
# Apply LightGBM Imputation
Xtrain_v3 = lightgbm_imputation(Xtrain_v2, 'Age')
# Round up floats
Xtrain_v3['Age'] = Xtrain_v3["Age"].apply(lambda x: round(x))

In [None]:
# Apply LightGBM Imputation
Xtest_v3 = lightgbm_imputation(Xtest_v2, 'Age')
# Round up floats
Xtest_v3['Age'] = Xtest_v3["Age"].apply(lambda x: round(x))

# Oversampling

In [None]:
# SMOTE Oversampling
from imblearn.over_sampling import SMOTE

In [None]:
smote = SMOTE(random_state=0)
X_train_over, y_train_over = smote.fit_resample(Xtrain_v3, y_train)

print("Before SMOTE: ", Xtrain_v3.shape, y_train.shape)
print("After Smote: ", X_train_over.shape, y_train_over.shape)
print()
print("After SMOTE Label Distribution: ", pd.Series(y_train_over).value_counts())

Before SMOTE:  (12336, 107) (12336,)
After Smote:  (23220, 107) (23220,)

After SMOTE Label Distribution:  0    11610
1    11610
Name: FraudFound_P, dtype: int64


# Feature Selection - Boruta

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Use a Random Forest model as the base
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_over, y_train_over)

In [None]:
#pip install BorutaShap

In [None]:
from BorutaShap import BorutaShap

In [None]:
import xgboost as xgb

# Create an XGBoost model with GPU support
xgboost_model = xgb.XGBClassifier(tree_method='gpu_hist', predictor='gpu_predictor', use_label_encoder=False, eval_metric='logloss')

# Apply BorutaShap using XGBoost
feat_selector = BorutaShap(model=xgboost_model, importance_measure='shap', classification=True)

# Fit BorutaShap (with sampling to speed it up)
feat_selector.fit(X=X_train_over, y=y_train_over, n_trials=50, sample=0.1, train_or_test='train', normalize=True)

  0%|          | 0/50 [00:00<?, ?it/s]

54 attributes confirmed important: ['PastNumberOfClaims_1', 'NumberOfSuppliments_1 to 2', 'MonthClaimed_May', 'Fault', 'RepNumber_14', 'BasePolicy', 'NumberOfSuppliments_3 to 5', 'PastNumberOfClaims_2 to 4', 'PolicyType_Sedan - Collision', 'MonthClaimed_Jan', 'MonthClaimed_Mar', 'Make_Chevrolet', 'MaritalStatus_Single', 'RepNumber_16', 'RepNumber_8', 'MonthClaimed_Jun', 'NumberOfSuppliments_none', 'RepNumber_11', 'RepNumber_2', 'RepNumber_9', 'MonthClaimed_Dec', 'VehicleCategory_Sport', 'Year_1994', 'Make_Toyota', 'Age', 'Make_Pontiac', 'MonthClaimed_Aug', 'MonthClaimed_Feb', 'Year_1995', 'RepNumber_15', 'MonthClaimed_Sep', 'RepNumber_5', 'AddressChange_Claim_2 to 3 years', 'Make_Mazda', 'Year_1996', 'MonthClaimed_Nov', 'RepNumber_1', 'MaritalStatus_Married', 'RepNumber_13', 'PolicyType_Sedan - Liability', 'RepNumber_4', 'NumberOfSuppliments_more than 5', 'DriverRating', 'RepNumber_7', 'RepNumber_10', 'RepNumber_3', 'PastNumberOfClaims_none', 'Make_Honda', 'PastNumberOfClaims_more than

In [None]:
# Get the selected features
selected_features = feat_selector.accepted

In [1]:
selected_features

NameError: name 'selected_features' is not defined

In [None]:
# Filter the training set to keep only the selected features
# Use 'loc' to filter your training set with selected feature names
X_train_selected = X_train_over[selected_features]

In [None]:
x_test_selected= Xtest_v3[selected_features]

# Modelling

In [None]:
# Machine Learning - Preparation

# Machine Learning - Algorithm
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier
# Machine Learning - Optuna
import optuna
from optuna.samplers import TPESampler


In [None]:
# Function04: Objective
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score

def objective(trial):
    classifier_name = trial.suggest_categorical('classifier', [
        'LogisticRegression', 'SVM', 'RandomForest', 'LightGBM', 'CatBoost'
    ])

    if classifier_name == 'LogisticRegression':
        C = trial.suggest_float('lr_C', 0.1, 10)
        solver = trial.suggest_categorical('lr_solver', ['liblinear'])
        classifier_obj = LogisticRegression(C=C, solver=solver)

    elif classifier_name == 'SVM':
        C = trial.suggest_float('svm_C', 0.1, 10)
        kernel = trial.suggest_categorical('svm_kernel', ['linear', 'rbf'])
        classifier_obj = SVC(C=C, kernel=kernel, probability=True)

    elif classifier_name == 'RandomForest':
        n_estimators = trial.suggest_int('rf_n_estimators', 10, 100)
        max_depth = trial.suggest_categorical('rf_max_depth', [10, 20, None])
        classifier_obj = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)

    elif classifier_name == 'LightGBM':
        num_leaves = trial.suggest_int('lgb_num_leaves', 31, 50)
        learning_rate = trial.suggest_float('lgb_learning_rate', 0.01, 0.2)
        n_estimators = trial.suggest_int('lgb_n_estimators', 100, 200)
        classifier_obj = lgb.LGBMClassifier(num_leaves=num_leaves, learning_rate=learning_rate, n_estimators=n_estimators, verbose=-1)

    elif classifier_name == 'CatBoost':
        iterations = trial.suggest_int('cat_iterations', 100, 200)
        learning_rate = trial.suggest_float('cat_learning_rate', 0.01, 0.2)
        depth = trial.suggest_int('cat_depth', 3, 9)
        classifier_obj = CatBoostClassifier(iterations=iterations, learning_rate=learning_rate, depth=depth, verbose=0)

    # Cross-validation setup
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    f1_scores = []
    roc_auc_scores = []

    for train_index, valid_index in skf.split(X_train_selected, y_train_over):
        X_train_cv, X_valid_cv = X_train_selected.iloc[train_index], X_train_selected.iloc[valid_index]
        y_train_cv, y_valid_cv = y_train_over.iloc[train_index], y_train_over.iloc[valid_index]


        classifier_obj.fit(X_train_cv, y_train_cv)
        y_prob = classifier_obj.predict_proba(X_valid_cv)[:, 1]

        # Threshold optimization on validation set
        thresholds = np.arange(0.1, 0.9, 0.01)
        best_threshold = 0.5
        best_score = 0.0

        for threshold in thresholds:
            y_pred = (y_prob >= threshold).astype(int)
            score = f1_score(y_valid_cv, y_pred)
            if score > best_score:
                best_score = score
                best_threshold = threshold

        # Record the best scores
        f1_scores.append(best_score)
        roc_auc_scores.append(roc_auc_score(y_valid_cv, (y_prob >= best_threshold).astype(int)))

    # Store the best threshold for this trial
    trial.set_user_attr('best_threshold', best_threshold)
    trial.set_user_attr('classifier_obj', classifier_obj)

    return np.mean(roc_auc_scores)  # Return the mean ROC AUC score from cross-validation


In [None]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:
# Optuna Study
study = optuna.create_study(direction='maximize', sampler=TPESampler())
study.optimize(objective, n_trials=100)

In [None]:
best_trial = study.best_trial
print('Best trial: ', best_trial.values)
print('Best hyperparameters: ', best_trial.params)

Best trial:  [0.9715331610680448]
Best hyperparameters:  {'classifier': 'RandomForest', 'rf_n_estimators': 81, 'rf_max_depth': None}


In [None]:
# Optimized Alogrithm Combination
best_algorithms = [trial.params['classifier'] for trial in study.trials]
algorithm_counts = pd.Series(best_algorithms).value_counts()
print('Optimal algorithm proportions: ', algorithm_counts)

Optimal algorithm proportions:  RandomForest          68
CatBoost               9
LogisticRegression     8
SVM                    8
LightGBM               7
Name: count, dtype: int64


# Evaluation


In [None]:
# Function05: Evaluation
def evaluate_model(model, X_train_selected, y_train_over, X_test_selected, y_test, threshold):
    model.fit(X_train_selected, y_train_over)
    y_prob = model.predict_proba(X_test_selected)[:, 1]
    y_pred = (y_prob >= threshold).astype(int)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob)

    return accuracy, precision, recall, f1, roc_auc


In [None]:
top_5_trials = study.trials_dataframe().sort_values(by='value', ascending=False).head(5)
results = []

for i, row in top_5_trials.iterrows():
    model_name = row['params_classifier']
    threshold = study.trials[row['number']].user_attrs['best_threshold']
    model_params = study.trials[row['number']].params

    if model_name == 'LogisticRegression':
        params = {'C': model_params['lr_C'], 'solver': model_params['lr_solver']}
        final_model = LogisticRegression(**params)
    elif model_name == 'SVM':
        params = {'C': model_params['svm_C'], 'kernel': model_params['svm_kernel']}
        final_model = SVC(**params, probability=True)
    elif model_name == 'RandomForest':
        params = {'n_estimators': model_params['rf_n_estimators'], 'max_depth': model_params['rf_max_depth']}
        final_model = RandomForestClassifier(**params)
    elif model_name == 'LightGBM':
        params = {'num_leaves': model_params['lgb_num_leaves'], 'learning_rate': model_params['lgb_learning_rate'], 'n_estimators': model_params['lgb_n_estimators']}
        final_model = lgb.LGBMClassifier(**params, verbose=-1)
    elif model_name == 'CatBoost':
        params = {'iterations': model_params['cat_iterations'], 'learning_rate': model_params['cat_learning_rate'], 'depth': model_params['cat_depth']}
        final_model = CatBoostClassifier(**params, verbose=0)

    accuracy, precision, recall, f1, roc_auc = evaluate_model(final_model, X_train_selected,y_train_over, X_test_selected, y_test, threshold)

    results.append({
        'model': model_name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'roc_auc': roc_auc,
        'best_params': params,
        'best_threshold': threshold
    })

results_df = pd.DataFrame(results)

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


In [None]:
results_df

Unnamed: 0,model,accuracy,precision,recall,f1_score,roc_auc,best_params,best_threshold
0,RandomForest,0.929637,0.25,0.050761,0.084388,0.821092,"{'n_estimators': 81, 'max_depth': None}",0.5
1,RandomForest,0.931258,0.258065,0.040609,0.070175,0.821104,"{'n_estimators': 72, 'max_depth': None}",0.49
2,RandomForest,0.932555,0.333333,0.055838,0.095652,0.825572,"{'n_estimators': 91, 'max_depth': None}",0.48
3,RandomForest,0.932231,0.3125,0.050761,0.087336,0.819483,"{'n_estimators': 81, 'max_depth': None}",0.5
4,RandomForest,0.934501,0.380952,0.040609,0.073394,0.830279,"{'n_estimators': 80, 'max_depth': None}",0.53
