In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import read_csv
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
import lightgbm as lgb
from sklearn import svm
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from lazypredict.Supervised import LazyClassifier, LazyRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, SelectPercentile
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTEN
import re

In [None]:
# Load the claims table from a CSV file given by a relative path.
df = pd.read_csv('insurance_claims.csv')

In [None]:
# Keep a copy of the frame as loaded; `df` itself is modified by the cells below.
df_original = df.copy()

In [None]:
# DataFrame.info() writes its summary to stdout and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df.info()

In [None]:
# Summarise every object-dtype column of `df` by its number of distinct values.
object_cols = [col for col in df if df[col].dtype == 'object']
colum_name = [str(col) for col in object_cols]
unique_value = [df[col].nunique() for col in object_cols]

# One row per object column, highest distinct-value count first.
table = pd.DataFrame({'Col_name': colum_name, 'Value': unique_value})
table = table.sort_values('Value', ascending=False)
table

In [None]:
# Columns removed from `df` before any further processing.
# NOTE(review): presumably identifiers, date/location fields and an empty trailing
# column ('_c39') — confirm against the dataset.
to_drop = [
    'policy_number',
    'policy_bind_date',
    'insured_zip',
    'incident_date',
    'incident_location',
    'incident_hour_of_the_day',
    'auto_model',
    'auto_year',
    '_c39',
]

df = df.drop(columns=to_drop)

In [None]:
# info() prints its own report and returns None; calling it directly avoids the extra
# "None" line that print() produced.
df.info()

In [None]:
# Cells holding the literal string '?' become NaN so they are counted as missing.
df = df.replace('?', np.nan)

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Fill the gaps in these three columns with each column's most frequent value.
for col_name in ('collision_type', 'property_damage', 'police_report_available'):
    most_frequent = df[col_name].mode()[0]
    df[col_name] = df[col_name].fillna(most_frequent)

In [None]:
# Re-count missing values per column after the mode imputation.
df.isna().sum()

In [None]:
# Keep 'chess' and 'cross-fit' as their own levels; every other hobby becomes 'Other'.
kept_hobbies = ['chess', 'cross-fit']
is_kept = df['insured_hobbies'].isin(kept_hobbies)
df['insured_hobbies'] = df['insured_hobbies'].where(is_kept, 'Other')

In [None]:
# Target is the 'fraud_reported' column; the features are all remaining columns.
y = df['fraud_reported']
x = df.drop(columns='fraud_reported')

In [None]:
# Object-dtype (string) feature columns only.
cat_df = x.select_dtypes(include = ['object'])

In [None]:
# Show the distinct values of every categorical column.
for name, values in cat_df.items():
    print(f"{name}: \n{values.unique()}\n")

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first level of
# each column so that level is represented by all-zero indicators.
cat_df = pd.get_dummies(cat_df, drop_first = True)

In [None]:
# info() prints directly and returns None; no print() wrapper, so no stray "None".
cat_df.info()

In [None]:
# int64 / float64 feature columns only.
num_df = x.select_dtypes(include = ['int64', 'float64'])

In [None]:
# Rebuild the feature matrix: numeric columns first, then the one-hot columns.
x = pd.concat([num_df, cat_df], axis = 1)

In [None]:
# info() prints directly and returns None; no print() wrapper, so no stray "None".
x.info()

In [None]:
# Encode the target labels as integers; `y` becomes a NumPy array and the fitted
# encoder stays available as `le`.
le = LabelEncoder()
y = le.fit_transform(y)

In [None]:
# Stratified 80/20 split. random_state is fixed (as in the other splits in this notebook)
# so the split, and every metric computed from it, is reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42, stratify = y)

In [None]:
# int64 / float64 columns of the training split (the ones that will be standardised).
num_df = x_train.select_dtypes(include = ['int64', 'float64'])

In [None]:
# Fit the scaler on the training numeric columns only, and transform them.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(num_df)

In [None]:
# Wrap the scaled array back into a DataFrame, reusing the training index and the
# numeric column names so it can be concatenated with the other columns.
scaled_num_df = pd.DataFrame(data = scaled_data, columns = num_df.columns, index = x_train.index)

In [None]:
# Remove the unscaled numeric columns; their scaled versions live in scaled_num_df.
x_train = x_train.drop(columns=scaled_num_df.columns)

In [None]:
# Put the scaled numeric columns in front of the remaining columns of x_train.
x_train = pd.concat([scaled_num_df, x_train], axis = 1)

In [None]:
# Standardise the test split with the scaler already fitted above
# (transform only — the scaler is not re-fitted here).
num_test = x_test.select_dtypes(include=['int64', 'float64'])
scaled_test_data = scaler.transform(num_test)
scaled_num_test = pd.DataFrame(scaled_test_data, index=x_test.index, columns=num_test.columns)

# Scaled numeric columns first, followed by the remaining columns.
x_test = pd.concat([scaled_num_test, x_test.drop(columns=num_test.columns)], axis=1)

In [None]:
# info() prints its own report and returns None, so it is called directly; wrapping it
# in print() added a "None" line after each report.
x_train.info()
x_test.info()
print(x_train.shape)
print(x_test.shape)

TRAIN

In [None]:
# Run LazyClassifier on the prepared splits. `fit` returns two objects, bound here to
# `models` and `predictions`; only `models` is printed.
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(x_train, x_test, y_train, y_test)
print(models)

In [None]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import BernoulliNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import Perceptron

In [None]:
# MLP with default hyper-parameters (only the seed is fixed), trained on the train split.
model = MLPClassifier(random_state = 42)
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
# Second column of predict_proba: probability of the class encoded as 1.
y_proba = model.predict_proba(x_test)[:, 1]
print(classification_report(y_test, y_predict, digits = 4))

# ROC AUC is computed from the class-1 probabilities, not from the hard predictions.
auc = roc_auc_score(y_test, y_proba)
print("AUC =", round(auc, 4))

In [None]:
# Base estimator and search space for the logistic-regression grid search.
model = LogisticRegression(random_state=42)

# 3 x 2 x 2 x 6 = 72 parameter combinations.
param = {
    'C': [0.5, 1.0, 1.5],
    'tol': [1e-3, 1e-4], 
    'fit_intercept': [True, False],
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
}

In [None]:
# Exhaustive search over `param`, scored by 4-fold cross-validated accuracy.
grid = GridSearchCV(estimator = model, param_grid = param, scoring = 'accuracy', cv = 4, verbose = 2)
grid.fit(x_train, y_train)
# predict / predict_proba delegate to the best estimator refitted on the full train split.
y_predict = grid.predict(x_test)
y_proba = grid.predict_proba(x_test)[:, 1]
print(grid.best_params_)
print(grid.best_score_)
print(classification_report(y_test, y_predict, digits = 4))
# Compute the AUC of THIS model. Previously the cell printed whatever `auc` was left
# over from an earlier cell, i.e. another model's score.
auc = roc_auc_score(y_test, y_proba)
print("AUC =", round(auc, 4))

OK SMOTENC

In [None]:
# Independent copy of the cleaned frame for the SMOTENC experiment.
df2 = df.copy()

In [None]:
# Features (every column except the target) and the 'fraud_reported' target.
x2 = df2.drop('fraud_reported', axis = 1)
y2 = df2['fraud_reported']

In [None]:
# Encode the target labels as integers (rebinds `le` to a freshly fitted encoder).
le = LabelEncoder()
y2 = le.fit_transform(y2)

In [None]:
# info() prints directly and returns None; no print() wrapper, so no stray "None".
x2.info()

In [None]:
# Sanity check: feature and target row counts should agree.
print(x2.shape)
print(y2.shape)

In [None]:
# Stratify on this experiment's own target (y2). The previous code stratified on `y`
# from the first experiment, which only worked while that variable was still defined
# and happened to hold the same labels.
x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2, test_size = 0.2, random_state=42, stratify = y2)

In [None]:
from imblearn.over_sampling import SMOTENC

# Positional indices of the object-dtype columns, passed to SMOTENC as its categorical features.
categorical_features = [i for i, col_type in enumerate(x2_train.dtypes) if col_type == 'object']

# Oversample the training split only; the test split is left untouched.
smt =  SMOTENC(random_state = 42, k_neighbors = 2, categorical_features=categorical_features)
x2_train, y2_train = smt.fit_resample(x2_train, y2_train)

In [None]:
# Shapes of the training split after resampling.
print(x2_train.shape)
print(y2_train.shape)

In [None]:
# Reset the scratch names to empty lists; each is rebound to a DataFrame by the cells
# that follow.
cat_df2 = []
num_df2 = []
cat = []

In [None]:
# One-hot encode the object columns of the (resampled) training split, dropping the
# first level of each column.
cat_df2 = x2_train.select_dtypes(include=['object'])
cat_df2 = pd.get_dummies(cat_df2, drop_first=True)

In [None]:
# Replace the raw object columns of x2_train with their one-hot encoding (dummies first).
cat = x2_train.select_dtypes(include=['object'])
x2_train = pd.concat([cat_df2, x2_train.drop(columns=cat.columns)], axis=1)

In [None]:
# Fit a new StandardScaler on the int64/float64 columns of the training split
# (rebinds the global `scaler`) and keep the result as a DataFrame with the same
# index and column names.
num_df2 = x2_train.select_dtypes(include = ['int64', 'float64'])
scaler = StandardScaler()
scaled__train2 = scaler.fit_transform(num_df2)
scaled_train2_df = pd.DataFrame(data = scaled__train2, columns = num_df2.columns, index = x2_train.index)

In [None]:
# Swap the unscaled numeric columns for their scaled versions (scaled columns first).
remaining_train_cols = x2_train.drop(columns=scaled_train2_df.columns)
x2_train = pd.concat([scaled_train2_df, remaining_train_cols], axis=1)

Apply the same preprocessing to the test set (x2_test)

In [None]:
# Reset the scratch names before they are reused for the test split; each is rebound
# by the cells that follow.
cat_df2 = []
num_df2 = []
cat = []

In [None]:
# One-hot encode the test categoricals using the TRAINING dummy layout.
# Encoding the test split on its own with drop_first=True can yield different columns
# (a level missing from, or only present in, the test rows) and can drop a different
# reference level than training did, so the model would see misaligned features.
cat_df2 = x2_test.select_dtypes(include=['object'])

# At this point x2_train already holds scaled numeric columns plus its dummy columns,
# while x2_test still holds the raw columns; names found only in x2_train are therefore
# the training dummy columns, in training order.
train_dummy_cols = [c for c in x2_train.columns if c not in x2_test.columns]

# Encode every level (no drop_first), then keep exactly the training columns: levels
# unseen in training are discarded, levels absent from the test rows become all-zero.
# Casting to the training dtypes keeps the filled columns out of the later
# int64/float64 selection used for scaling.
cat_df2 = (
    pd.get_dummies(cat_df2)
    .reindex(columns=train_dummy_cols, fill_value=0)
    .astype(x2_train[train_dummy_cols].dtypes.to_dict())
)

In [None]:
# Replace the raw object columns of x2_test with their one-hot encoding (dummies first).
cat = x2_test.select_dtypes(include=['object'])
x2_test = pd.concat([cat_df2, x2_test.drop(columns=cat.columns)], axis=1)

In [None]:
# Scale the test numeric columns with the already-fitted `scaler` (transform only).
# Note: the `scaled__train2` / `scaled_train2_df` names are reused here but now hold
# TEST data.
num_df2 = x2_test.select_dtypes(include = ['int64', 'float64'])
scaled__train2 = scaler.transform(num_df2)
scaled_train2_df = pd.DataFrame(data = scaled__train2, columns = num_df2.columns, index = x2_test.index)

In [None]:
# Swap the unscaled numeric columns of x2_test for their scaled versions (scaled first).
remaining_test_cols = x2_test.drop(columns=scaled_train2_df.columns)
x2_test = pd.concat([scaled_train2_df, remaining_test_cols], axis=1)

In [None]:
# Default MLP (fixed seed) trained on the SMOTENC-resampled training split and
# evaluated on the untouched test split.
model = MLPClassifier(random_state=42)
model.fit(x2_train, y2_train)
y_predict = model.predict(x2_test)
# Second column of predict_proba: probability of the class encoded as 1.
y_proba = model.predict_proba(x2_test)[:, 1]
print(classification_report(y2_test, y_predict, digits = 4))

# ROC AUC from the class-1 probabilities.
auc = roc_auc_score(y2_test, y_proba)
print("AUC =", round(auc, 4))

RANDOM OVER SAMPLER

In [None]:
# Features and target for the random-oversampling experiment, taken from the cleaned `df`.
x3 = df.drop('fraud_reported', axis = 1)
y3 = df['fraud_reported']

In [None]:
# Encode the target labels as integers (rebinds `le` to a freshly fitted encoder).
le = LabelEncoder()
y3 = le.fit_transform(y3)

In [None]:
# One-hot encode the object columns on the full feature frame (before splitting), so
# the train and test splits share the same dummy columns.
cat_df = x3.select_dtypes(include = ['object'])
cat_df = pd.get_dummies(cat_df, drop_first = True)

# Numeric columns first, then the one-hot columns.
num_df = x3.select_dtypes(include = ['int64', 'float64'])
x3 = pd.concat([num_df, cat_df], axis = 1)

In [None]:
# Seeded, stratified 80/20 split. This rebinds x_train / x_test / y_train / y_test,
# replacing the splits created earlier in the notebook.
x_train, x_test, y_train, y_test = train_test_split(x3, y3, test_size = 0.2, random_state=42, stratify = y3)

In [None]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Class counts before oversampling (the printed label is Vietnamese for "Before oversampling").
print("Trước khi oversampling:", Counter(y_train))

# Resample the training split only; the test split is left as-is.
ros = RandomOverSampler(random_state=42)
x_train, y_train = ros.fit_resample(x_train, y_train)

# Class counts after oversampling (label: "After oversampling").
print("Sau khi oversampling:", Counter(y_train))

In [None]:
# Fit a new StandardScaler on the int64/float64 columns of the oversampled training
# split (rebinds the global `scaler`) and wrap the result in a DataFrame with the same
# index and column names.
num_df = x_train.select_dtypes(include = ['int64', 'float64'])
scaler = StandardScaler()
scaled__train = scaler.fit_transform(num_df)
scaled_train_df = pd.DataFrame(data = scaled__train, columns = num_df.columns, index = x_train.index)

In [None]:
# Swap the unscaled numeric columns of x_train for their scaled versions (scaled first).
unscaled_removed = x_train.drop(columns=scaled_train_df.columns)
x_train = pd.concat([scaled_train_df, unscaled_removed], axis=1)

In [None]:
# Reset the scratch name; it is rebound to the test-split numeric columns in the next cell.
num_df = []

In [None]:
# Scale the test numeric columns with the already-fitted `scaler` (transform only).
num_df = x_test.select_dtypes(include = ['int64', 'float64'])

scaled__test = scaler.transform(num_df)
scaled_test_df = pd.DataFrame(data = scaled__test, columns = num_df.columns, index = x_test.index)

In [None]:
# Swap the unscaled numeric columns of x_test for their scaled versions (scaled first).
unscaled_removed_test = x_test.drop(columns=scaled_test_df.columns)
x_test = pd.concat([scaled_test_df, unscaled_removed_test], axis=1)

In [None]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifierCV, RidgeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.naive_bayes import BernoulliNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import Perceptron, PassiveAggressiveClassifier
from sklearn.svm import LinearSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
# Base XGBoost classifier (default hyper-parameters, fixed seed) and its search space.
model = XGBClassifier(random_state = 42)

# 3*3*3*2*2*3*3*3 = 2,916 parameter combinations — an exhaustive search over this grid
# is expensive.
param_grid = {
    "n_estimators": [100, 200, 500],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
    "gamma": [0, 0.1, 0.3],
    "reg_alpha": [0, 0.01, 0.1],
    "reg_lambda": [1, 1.5, 2.0]
}

In [None]:
# Exhaustive search over `param_grid`, scored by 4-fold cross-validated accuracy.
grid = GridSearchCV(estimator = model, param_grid = param_grid, scoring = 'accuracy', cv = 4, verbose = 1)
grid.fit(x_train, y_train)
# predict / predict_proba delegate to the best estimator refitted on the full train split.
y_predict = grid.predict(x_test)
y_proba = grid.predict_proba(x_test)[:, 1]
print(grid.best_params_)
print(grid.best_score_)
print(classification_report(y_test, y_predict, digits = 4))
# ROC AUC needs scores, not hard 0/1 labels: the class-1 probability is used here (as
# in the other evaluation cells) instead of y_predict, which collapsed the ROC curve to
# a single operating point.
auc = roc_auc_score(y_test, y_proba)
print("AUC =", round(auc, 4))

In [None]:

# Fit and evaluate `model` itself — the XGBClassifier with default hyper-parameters —
# not the tuned estimator held by `grid`.
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
# Second column of predict_proba: probability of the class encoded as 1.
y_proba = model.predict_proba(x_test)[:, 1]
print(classification_report(y_test, y_predict, digits = 4))

# ROC AUC from the class-1 probabilities.
auc = roc_auc_score(y_test, y_proba)
print("AUC =", round(auc, 4))