In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import read_csv
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
import lightgbm as lgb
from sklearn import svm
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from lazypredict.Supervised import LazyClassifier, LazyRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, SelectPercentile
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTEN
from imblearn.under_sampling import RandomUnderSampler
import re

In [2]:
df = pd.read_csv('insurance_claims.csv')

In [3]:
to_drop = [
    'policy_number',
    'policy_bind_date',
    'insured_zip',
    'incident_date', 
    'incident_location',
    'incident_hour_of_the_day',
    'auto_model',
    'auto_year',
    '_c39'
]

df = df.drop(to_drop, axis = 1)

In [4]:
df.replace('?', np.nan, inplace = True)

In [5]:
to_drop = [
    'property_damage',
    'police_report_available'
]

df = df.drop(to_drop, axis = 1)

In [6]:
for col in df.columns:
    if df[col].dtype in [np.float64, np.int64]:  
        median_val = df[col].median()
        df[col].fillna(median_val, inplace=True)
    else: 
        mode_val = df[col].mode()[0]   
        df[col].fillna(mode_val, inplace=True)

In [7]:
df['insured_hobbies']=df['insured_hobbies'].apply(lambda x :'Other' if x!='chess' and x!='cross-fit' else x)

In [8]:
x = df.drop('fraud_reported', axis = 1)
y = df['fraud_reported']

In [9]:
le = LabelEncoder()
y = le.fit_transform(y)

In [10]:
cat_df = x.select_dtypes(include = ['object'])
x.drop(columns = cat_df.columns, inplace = True)

cat_df = pd.get_dummies(cat_df, drop_first = True)
x = pd.concat([cat_df, x], axis = 1)

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = 42)

In [12]:
#RUS
# from collections import Counter

# print("Trước khi undersampling:", Counter(y_train))

# rus = RandomUnderSampler(sampling_strategy = {0: 250, 1: 198}, random_state = 42)
# x_train, y_train = rus.fit_resample(x_train, y_train)

# print("Sau khi undersampling:", Counter(y_train))

In [13]:
num = x_train.select_dtypes(include = ['int64', 'float64'])
scaler = StandardScaler()
scaled_num = scaler.fit_transform(num)
scaled_num_df = pd.DataFrame(data = scaled_num, columns = num.columns, index = x_train.index)
x_train.drop(columns = scaled_num_df.columns, inplace = True)
x_train = pd.concat([scaled_num_df, x_train], axis = 1)

In [14]:
num = x_test.select_dtypes(include = ['int64', 'float64'])
scaled_num = scaler.transform(num)
scaled_num_df = pd.DataFrame(data = scaled_num, columns = num.columns, index = x_test.index)
x_test.drop(columns = scaled_num_df.columns, inplace = True)
x_test = pd.concat([scaled_num_df, x_test], axis = 1)

In [15]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import BernoulliNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import Perceptron

In [16]:
def AUC(test, proba):
    auc = roc_auc_score(test, proba)
    return round(auc, 2)

In [17]:
class_weight = 'balanced'

DECISION TREE

In [None]:
model = DecisionTreeClassifier(random_state = 42, class_weight = class_weight)
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
y_proba = model.predict_proba(x_test)[:, 1]
print(classification_report(y_test, y_predict, digits = 2))
print('_______________________________________________________________')
print('AUC =', AUC(y_test, y_proba))

              precision    recall  f1-score   support

           0       0.82      0.83      0.83       151
           1       0.46      0.45      0.45        49

    accuracy                           0.73       200
   macro avg       0.64      0.64      0.64       200
weighted avg       0.73      0.73      0.73       200

_______________________________________________________________
AUC = 0.64


In [19]:
param_grid = {
    "criterion": ["gini", "entropy", "log_loss"],   # tiêu chí chia nhánh
    "max_depth": [None, 3, 5, 10, 20, 30],          # độ sâu cây
    "min_samples_split": [2, 5, 10, 20],            # số mẫu tối thiểu để chia một node
    "min_samples_leaf": [1, 2, 5, 10],              # số mẫu tối thiểu tại một lá
    "max_features": [None, "sqrt", "log2"],         # số lượng đặc trưng khi chia
}

grid = GridSearchCV(estimator = model, param_grid = param_grid, scoring = 'accuracy', cv = 4, verbose = 1)
grid.fit(x_train, y_train)
y_predict = grid.predict(x_test)
print(grid.best_params_)
print(grid.best_score_)
print(classification_report(y_test, y_predict, digits = 2))

best_model = grid.best_estimator_
print(best_model)
best_model.fit(x_train, y_train)
y_proba = best_model.predict_proba(x_test)[:, 1]
print('_______________________________________________________________')
print('AUC =', AUC(y_test, y_proba))

Fitting 4 folds for each of 864 candidates, totalling 3456 fits
{'criterion': 'gini', 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2}
0.8349999999999999
              precision    recall  f1-score   support

           0       0.92      0.81      0.87       151
           1       0.58      0.80      0.67        49

    accuracy                           0.81       200
   macro avg       0.75      0.81      0.77       200
weighted avg       0.84      0.81      0.82       200

DecisionTreeClassifier(class_weight='balanced', max_depth=5, min_samples_leaf=2,
                       random_state=42)
_______________________________________________________________
AUC = 0.78


RANDOM FOREST CLASSIFIER

In [20]:
model = RandomForestClassifier(random_state = 42, class_weight = class_weight)
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
y_proba = model.predict_proba(x_test)[:, 1]
print(classification_report(y_test, y_predict, digits = 2))
print('_______________________________________________________________')
print('AUC =', AUC(y_test, y_proba))

              precision    recall  f1-score   support

           0       0.77      0.95      0.85       151
           1       0.50      0.14      0.22        49

    accuracy                           0.76       200
   macro avg       0.64      0.55      0.54       200
weighted avg       0.71      0.76      0.70       200

_______________________________________________________________
AUC = 0.84


In [21]:
param_grid = {
    "n_estimators": [200, 500, 800],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 4, 8],
    "max_features": ["sqrt", "log2"],  # 0.5 = dùng 50% số đặc trưng
    "bootstrap": [True],
    "max_samples": [None, 0.8]
}

grid = GridSearchCV(estimator = model, param_grid = param_grid, scoring = 'accuracy', cv = 4, verbose = 1)
grid.fit(x_train, y_train)
y_predict = grid.predict(x_test)
print(grid.best_params_)
print(grid.best_score_)
print(classification_report(y_test, y_predict, digits = 2))

best_model = grid.best_estimator_
print(best_model)
best_model.fit(x_train, y_train)
y_proba = best_model.predict_proba(x_test)[:, 1]
print('_______________________________________________________________')
print('AUC =', AUC(y_test, y_proba))

Fitting 4 folds for each of 768 candidates, totalling 3072 fits
{'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'max_samples': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}
0.8200000000000001
              precision    recall  f1-score   support

           0       0.89      0.85      0.87       151
           1       0.60      0.67      0.63        49

    accuracy                           0.81       200
   macro avg       0.74      0.76      0.75       200
weighted avg       0.82      0.81      0.81       200

RandomForestClassifier(class_weight='balanced', max_depth=10,
                       min_samples_leaf=2, min_samples_split=10,
                       n_estimators=200, random_state=42)
_______________________________________________________________
AUC = 0.82


RIDGE CLASSIFIER CV

In [22]:
model = RidgeClassifierCV(class_weight = class_weight)
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
y_proba = model.decision_function(x_test)
print(classification_report(y_test, y_predict, digits = 2))
print('_______________________________________________________________')
print('AUC =', AUC(y_test, y_proba))

              precision    recall  f1-score   support

           0       0.94      0.85      0.89       151
           1       0.64      0.84      0.73        49

    accuracy                           0.84       200
   macro avg       0.79      0.84      0.81       200
weighted avg       0.87      0.84      0.85       200

_______________________________________________________________
AUC = 0.83


In [23]:
param_grid = {
    "alphas": [
        [0.1, 1, 10],
        [1, 10, 100],
        [0.01, 0.1, 1, 10, 100]
    ],
    "fit_intercept": [True, False],
}

grid = GridSearchCV(estimator = model, param_grid = param_grid, scoring = 'accuracy', cv = 4, verbose = 1)
grid.fit(x_train, y_train)
y_predict = grid.predict(x_test)
print(grid.best_params_)
print(grid.best_score_)
print(classification_report(y_test, y_predict, digits = 2))

best_model = grid.best_estimator_
print(best_model)
best_model.fit(x_train, y_train)
y_proba = best_model.decision_function(x_test)
print('_______________________________________________________________')
print('AUC =', AUC(y_test, y_proba))

Fitting 4 folds for each of 6 candidates, totalling 24 fits
{'alphas': [0.1, 1, 10], 'fit_intercept': True}
0.855
              precision    recall  f1-score   support

           0       0.94      0.85      0.89       151
           1       0.64      0.84      0.73        49

    accuracy                           0.84       200
   macro avg       0.79      0.84      0.81       200
weighted avg       0.87      0.84      0.85       200

RidgeClassifierCV(alphas=[0.1, 1, 10], class_weight='balanced')
_______________________________________________________________
AUC = 0.83


LOGISTIC REGRESSION

In [24]:
model = LogisticRegression(random_state = 42, class_weight = class_weight)
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
y_proba = model.predict_proba(x_test)[:, 1]
print(classification_report(y_test, y_predict, digits = 2))
print('_______________________________________________________________')
print('AUC =', AUC(y_test, y_proba))

              precision    recall  f1-score   support

           0       0.92      0.85      0.89       151
           1       0.63      0.78      0.70        49

    accuracy                           0.83       200
   macro avg       0.78      0.81      0.79       200
weighted avg       0.85      0.83      0.84       200

_______________________________________________________________
AUC = 0.84


In [25]:
param_grid = [
    {
        "solver": ["lbfgs", "newton-cg", "sag"],
        "penalty": ["l2"],
        "C": [0.01, 0.1, 1, 10, 100],
        "fit_intercept": [True, False],
        "multi_class": ["ovr", "multinomial"],
        "max_iter": [500, 1000]
    },
    {
        "solver": ["liblinear"],
        "penalty": ["l1", "l2"],
        "C": [0.01, 0.1, 1, 10, 100],
        "fit_intercept": [True, False],
        "multi_class": ["ovr"],
        "max_iter": [500, 1000]
    },
    {
        "solver": ["saga"],
        "penalty": ["l1", "l2", "elasticnet"],
        "C": [0.01, 0.1, 1, 10, 100],
        "l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9],  # dùng khi elasticnet
        "fit_intercept": [True, False],
        "multi_class": ["ovr", "multinomial"],
        "max_iter": [500, 1000]
    }
]


grid = GridSearchCV(estimator = model, param_grid = param_grid, scoring = 'accuracy', cv = 4, verbose = 1)
grid.fit(x_train, y_train)
y_predict = grid.predict(x_test)
print(grid.best_params_)
print(grid.best_score_)
print(classification_report(y_test, y_predict, digits = 2))

best_model = grid.best_estimator_
print(best_model)
best_model.fit(x_train, y_train)
y_proba = best_model.predict_proba(x_test)[:, 1]
print('_______________________________________________________________')
print('AUC =', AUC(y_test, y_proba))

Fitting 4 folds for each of 760 candidates, totalling 3040 fits
{'C': 0.1, 'fit_intercept': True, 'l1_ratio': 0.7, 'max_iter': 500, 'multi_class': 'multinomial', 'penalty': 'elasticnet', 'solver': 'saga'}
0.85
              precision    recall  f1-score   support

           0       0.95      0.84      0.89       151
           1       0.64      0.86      0.73        49

    accuracy                           0.84       200
   macro avg       0.79      0.85      0.81       200
weighted avg       0.87      0.84      0.85       200

LogisticRegression(C=0.1, class_weight='balanced', l1_ratio=0.7, max_iter=500,
                   multi_class='multinomial', penalty='elasticnet',
                   random_state=42, solver='saga')
_______________________________________________________________
AUC = 0.84


SVC

In [26]:
model = svm.SVC(random_state = 42, class_weight = class_weight)
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
y_proba = model.decision_function(x_test)
print(classification_report(y_test, y_predict, digits = 2))
print('_______________________________________________________________')
print('AUC =', AUC(y_test, y_proba))

              precision    recall  f1-score   support

           0       0.88      0.84      0.86       151
           1       0.57      0.65      0.61        49

    accuracy                           0.80       200
   macro avg       0.73      0.75      0.74       200
weighted avg       0.81      0.80      0.80       200

_______________________________________________________________
AUC = 0.82


In [27]:
param_grid = [
    {
        "kernel": ["rbf"],
        "C": [0.01, 0.1, 1, 10, 100, 1000],
        "gamma": ["scale", "auto", 0.001, 0.01, 0.1, 1],
        "shrinking": [True, False],
    },
    {
        "kernel": ["linear"],
        "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        "shrinking": [True, False],
    },
    {
        "kernel": ["poly"],
        "C": [0.01, 0.1, 1, 10, 100],
        "degree": [2, 3, 4, 5],
        "gamma": ["scale", "auto", 0.001, 0.01, 0.1],
        "coef0": [0.0, 0.5, 1.0],
        "shrinking": [True, False],
    },
    {
        "kernel": ["sigmoid"],
        "C": [0.01, 0.1, 1, 10, 100],
        "gamma": ["scale", "auto", 0.001, 0.01, 0.1],
        "coef0": [0.0, 0.5, 1.0],
        "shrinking": [True, False],
    },
]
grid = GridSearchCV(estimator = model, param_grid = param_grid, scoring = 'accuracy', cv = 4, verbose = 2)
grid.fit(x_train, y_train)
y_predict = grid.predict(x_test)
print(grid.best_params_)
print(grid.best_score_)
print(classification_report(y_test, y_predict, digits = 2))

best_model = grid.best_estimator_
print(best_model)
best_model.fit(x_train, y_train)
y_proba = best_model.decision_function(x_test)
print('_______________________________________________________________')
print('AUC =', AUC(y_test, y_proba))

Fitting 4 folds for each of 836 candidates, totalling 3344 fits
[CV] END ....C=0.01, gamma=scale, kernel=rbf, shrinking=True; total time=   0.0s
[CV] END ....C=0.01, gamma=scale, kernel=rbf, shrinking=True; total time=   0.0s
[CV] END ....C=0.01, gamma=scale, kernel=rbf, shrinking=True; total time=   0.0s
[CV] END ....C=0.01, gamma=scale, kernel=rbf, shrinking=True; total time=   0.0s
[CV] END ...C=0.01, gamma=scale, kernel=rbf, shrinking=False; total time=   0.0s
[CV] END ...C=0.01, gamma=scale, kernel=rbf, shrinking=False; total time=   0.0s
[CV] END ...C=0.01, gamma=scale, kernel=rbf, shrinking=False; total time=   0.0s
[CV] END ...C=0.01, gamma=scale, kernel=rbf, shrinking=False; total time=   0.0s
[CV] END .....C=0.01, gamma=auto, kernel=rbf, shrinking=True; total time=   0.0s
[CV] END .....C=0.01, gamma=auto, kernel=rbf, shrinking=True; total time=   0.0s
[CV] END .....C=0.01, gamma=auto, kernel=rbf, shrinking=True; total time=   0.0s
[CV] END .....C=0.01, gamma=auto, kernel=rbf,