In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the three source tables; the tuple order below matches the unpacking order.
insurance, employee, vendor = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/Dataset/insurance_data.csv",
        "/content/drive/MyDrive/Dataset/employee_data.csv",
        "/content/drive/MyDrive/Dataset/vendor_data.csv",
    )
)

In [None]:
insurance.info()


In [None]:
insurance.head()

In [None]:
employee.info()


In [None]:
employee.head()

In [None]:
vendor.head()

In [None]:
vendor.info()

In [None]:
# Full outer join on AGENT_ID with `employee` as the LEFT table: column names that
# exist in both tables come back as <name>_x (from employee) and <name>_y (from insurance).
insurance_employee = pd.merge(employee, insurance, on='AGENT_ID', how='outer')
insurance_employee.head()

In [None]:
# insurance_employee was built as employee.merge(insurance, ...), so pandas tagged the
# overlapping address columns from `employee` (the agent) with `_x` and those from
# `insurance` (the customer side of the claim) with `_y`. The previous mapping had the
# two roles swapped, which mislabelled every address column.
insurance_employee = insurance_employee.rename(columns={
    'ADDRESS_LINE1_x': 'AGENT_ADDRESS_LINE1', 'ADDRESS_LINE2_x': 'AGENT_ADDRESS_LINE2',
    'CITY_x': 'AGENT_CITY', 'STATE_x': 'AGENT_STATE', 'POSTAL_CODE_x': 'AGENT_POSTAL_CODE',
    'ADDRESS_LINE1_y': 'CUSTOMER_ADDRESS_LINE1', 'ADDRESS_LINE2_y': 'CUSTOMER_ADDRESS_LINE2',
    'CITY_y': 'CUSTOMER_CITY', 'STATE_y': 'CUSTOMER_STATE', 'POSTAL_CODE_y': 'CUSTOMER_POSTAL_CODE',
})

insurance_employee.columns.tolist()

In [None]:
df = insurance_employee.merge(vendor, how= 'left', on= 'VENDOR_ID')

In [None]:
df.dtypes

In [None]:
df.to_csv('Insurance.csv',index=False)

In [None]:
#checking null values in dataset
df.isnull().sum()

In [None]:
#checking for duplicate rows
df.duplicated().sum()

In [None]:
#dropping duplicates if any
# drop_duplicates() returns a new frame; without the assignment nothing was removed.
df = df.drop_duplicates()
df.shape

In [None]:
print("Size of Dataset: {} rows , {} columns".format(df.shape[0],df.shape[1]))

In [None]:
# Give the remaining un-prefixed address columns a VENDOR_ prefix.
vendor_column_names = {
    'ADDRESS_LINE1': 'VENDOR_ADDRESS_LINE1',
    'ADDRESS_LINE2': 'VENDOR_ADDRESS_LINE2',
    'CITY': 'VENDOR_CITY',
    'STATE': 'VENDOR_STATE',
    'POSTAL_CODE': 'VENDOR_POSTAL_CODE',
}
df.rename(columns=vendor_column_names, inplace=True)

df.columns.tolist()

In [None]:
df.head(10)

In [None]:
df.info()

In [None]:
#A means "Approved" and D means "Denied"
df.CLAIM_STATUS.unique()


In [None]:
df['INSURANCE_TYPE'].value_counts().sort_values()

In [None]:
#to get top three values
df['INSURANCE_TYPE'].value_counts().sort_values(ascending=False)[:3]

In [None]:
df['CLAIM_STATUS'].value_counts()

In [None]:
df.RISK_SEGMENTATION.unique()

In [None]:
#dropping unnecessary columns
# Every name listed here must exist in df, otherwise drop() raises KeyError
# (so this cell cannot be run twice on the same frame).
cols_to_drop = [
    'AGENT_NAME', 'AGENT_ID', 'TRANSACTION_ID', 'EMP_ROUTING_NUMBER', 'CUSTOMER_NAME', 'DATE_OF_JOINING',
    'ACCT_NUMBER', 'TXN_DATE_TIME', 'EMP_ACCT_NUMBER', 'POLICY_NUMBER', 'POLICY_EFF_DT', 'LOSS_DT', 'REPORT_DT',
    'VENDOR_NAME', 'CUSTOMER_ID', 'CUSTOMER_ADDRESS_LINE1', 'CUSTOMER_ADDRESS_LINE2', 'CUSTOMER_CITY',
    'CUSTOMER_STATE', 'VENDOR_ID', 'CUSTOMER_POSTAL_CODE', 'AGENT_ADDRESS_LINE1', 'AGENT_ADDRESS_LINE2',
    'AGENT_CITY', 'AGENT_STATE', 'AGENT_POSTAL_CODE', 'INCIDENT_STATE', 'INCIDENT_CITY', 'VENDOR_ADDRESS_LINE1',
    'ROUTING_NUMBER', 'SSN', 'VENDOR_ADDRESS_LINE2', 'VENDOR_CITY', 'VENDOR_STATE', 'VENDOR_POSTAL_CODE',
]
df = df.drop(columns=cols_to_drop)


In [None]:
df.nunique()

In [None]:
#bar chart of how often each RISK_SEGMENTATION value occurs
# NOTE(review): the column appears to hold category labels (a commented-out cell compares
# it to 'H'), so a 40-bin numeric histogram is the wrong tool; plot the counts per value.
df["RISK_SEGMENTATION"].value_counts().plot(kind="bar", figsize=(5,5))
plt.show()

In [None]:
# Correlate numeric columns only: df still contains string columns, and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0 (older versions silently skipped them).
df.select_dtypes(include="number").corr()

In [None]:
#correlation heatmap (numeric columns only; corr() raises on string columns in pandas >= 2.0)
plt.figure(figsize=(15,10))
sns.heatmap(df.select_dtypes(include="number").corr(),annot=True)
plt.show()

In [None]:
#checking for missing values

def missing(df):
    """Summarise missing values per column of ``df``.

    Returns a DataFrame indexed by column name with two columns:
    ``Missing_Number`` (count of nulls) and ``Missing_Percent`` (null count
    divided by the number of rows, i.e. a 0-1 fraction rather than a 0-100
    percentage). Each series is sorted in descending order before the two
    are combined.
    """
    null_mask = df.isnull()
    per_column = {
        'Missing_Number': null_mask.sum(),
        'Missing_Percent': null_mask.sum() / null_mask.count(),
    }
    ranked = [series.sort_values(ascending=False) for series in per_column.values()]
    return pd.concat(ranked, axis=1, keys=list(per_column))

missing(df)

In [None]:
#Suspicious_df = df[(df['CLAIM_STATUS'] == 'A') & (df['RISK_SEGMENTATION'] == 'H') & (df['INCIDENT_SEVERITY'] == 'Major Loss')]
#Suspicious_df.describe()

In [None]:
df.nunique()

In [None]:
#drop the rows with null values (disabled: the call below is commented out)
#df.dropna(axis=0,inplace=True)

In [None]:
df.info()

In [None]:
df.columns

In [None]:
# Columns kept for modelling. CLAIM_STATUS stays last because a later cell takes the
# last column as the target. reindex() creates an all-NaN column for any name that
# is not present in df instead of raising.
new_cols = [
    'INSURANCE_TYPE', 'PREMIUM_AMOUNT', 'CLAIM_AMOUNT', 'MARITAL_STATUS', 'AGE', 'TENURE',
    'EMPLOYMENT_STATUS', 'NO_OF_FAMILY_MEMBERS', 'RISK_SEGMENTATION', 'HOUSE_TYPE',
    'SOCIAL_CLASS', 'CUSTOMER_EDUCATION_LEVEL', 'INCIDENT_SEVERITY', 'AUTHORITY_CONTACTED',
    'ANY_INJURY', 'POLICE_REPORT_AVAILABLE', 'INCIDENT_HOUR_OF_THE_DAY', 'CLAIM_STATUS',
]
df = df.reindex(new_cols, axis='columns')
df.info()

In [None]:
df.groupby(["RISK_SEGMENTATION", "CLAIM_STATUS"]).size()

In [None]:
df

In [None]:
df.nunique()

In [None]:
#df=pd.get_dummies(df)
#df.info()
  #<class 'pandas.core.frame.DataFrame'>
  #Int64Index: 10000 entries, 0 to 9999
  #Columns: 58374 entries, EMP_ROUTING_NUMBER to CLAIM_STATUS_D
  #dtypes: float64(1), int64(9), uint8(58364)
  #memory usage: 557.4 MB

In [None]:
# x: independent features (every column except the last)
# y: dependent variable (the last column, as a NumPy array)
last_col = df.shape[1] - 1
x = df.iloc[:, :last_col]
y = df.iloc[:, last_col].values

In [None]:
x

In [None]:
x=pd.get_dummies(x)


In [None]:
x

In [None]:
x.shape

In [None]:
x.info()

In [None]:
#x=x.drop(['MARITAL_STATUS_N','EMPLOYMENT_STATUS_Y'],axis=1)


In [None]:
x=x.values
x

In [None]:
y

In [None]:
plt.hist(y)
plt.show()

In [None]:
# Encode the class labels in y as integers; LabelEncoder numbers the sorted
# unique labels from 0, and `le` is kept so the mapping can be inverted later.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(y)
y = le.transform(y)

In [None]:
y

In [None]:
from imblearn.over_sampling import SMOTE
oversample = SMOTE()


In [None]:
#x,y=oversampling.fit_resample(x,y)
df['CLAIM_STATUS'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split
#x_train,x_test,y_train,y_test=train_test_split(x,y,stratify=y,train_size=0.8,random_state=0)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.3, random_state=0, shuffle=True, stratify=y)

In [None]:
from imblearn.over_sampling import SMOTE
# Resample the training split only; x_test / y_test are not passed in, so the
# held-out rows stay as they came out of train_test_split.
# NOTE(review): SMOTE builds synthetic rows from nearest neighbours, and the features
# are not scaled until a later cell — consider scaling before resampling.
oversample = SMOTE(random_state=0)
x_train_smote, y_train_smote = oversample.fit_resample(x_train, y_train)

In [None]:
x_train_smote

In [None]:
y_train=y_train_smote

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the SMOTE-resampled training features and apply the same
# mean/variance to the test features.
sc = StandardScaler()
x_train = sc.fit_transform(x_train_smote)
# NOTE(review): x_test is overwritten with its scaled version, so re-running this
# cell without re-running the train/test split scales x_test a second time.
x_test = sc.transform(x_test) 
print(x_train) 
print(x_test) 


In [None]:
x_train.shape

In [None]:
y_train.shape

In [None]:
# Class balance of the training labels after SMOTE. The previous call passed the two
# variable NAMES as string literals, so it plotted those two strings, not the data.
plt.hist(y_train_smote)
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score, KFold
# Shared 5-fold splitter (shuffled, fixed seed) passed as `cv` to the cross_val_score calls below.
# NOTE(review): the folds are cut from the SMOTE-resampled training set, so synthetic
# rows can land in validation folds; the CV scores are likely optimistic compared with
# the untouched test split.
cv=KFold(n_splits=5,random_state=0,shuffle=True)

In [None]:
# LogisticRegression
from sklearn.linear_model import LogisticRegression
# The explicit fit leaves lrModel trained; cross_val_score itself refits clones per fold.
lrModel = LogisticRegression().fit(x_train, y_train_smote)
scores = cross_val_score(lrModel, X=x_train, y=y_train, scoring='accuracy', cv=cv)
print("Logistics Regression Average CV Score: ",scores.mean())
#y_pred = lrModel.predict(x_test)


In [None]:
# KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
KnModel = KNeighborsClassifier()
scores = cross_val_score(KnModel, x_train, y_train, cv = cv,scoring='accuracy')
print("KNeighbors Average CV Score: ",scores.mean())

In [None]:
# RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
RfModel = RandomForestClassifier()
scores = cross_val_score(RfModel, x_train, y_train, cv = cv,scoring='accuracy')
print("Random Forest Average CV Score: ",scores.mean())

In [None]:
# DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
DtModel = DecisionTreeClassifier()
scores = cross_val_score(DtModel, x_train, y_train, cv = cv,scoring='accuracy')
print("Decision Tree Average CV Score: ",scores.mean())

In [None]:
# SVC
from sklearn.svm import SVC
SvmModel = SVC()
scores = cross_val_score(SvmModel, x_train, y_train, cv = cv,scoring='accuracy')
print("Support Vector Machine Average CV Score: ",scores.mean())

In [None]:
# xgb
import xgboost as xgb
XgbModel = xgb.XGBClassifier()
scores = cross_val_score(XgbModel, x_train, y_train, cv = cv,scoring='accuracy')
print("XGBoost Average CV Score: ",scores.mean())

In [None]:
# BaggingClassifier
from sklearn.ensemble import BaggingClassifier
BcModel = BaggingClassifier()
scores = cross_val_score(BcModel, x_train, y_train, cv = cv,scoring='accuracy')
print("Bagging Classifier Average CV Score: ",scores.mean())

In [None]:
# AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
AdbModel = AdaBoostClassifier()
scores = cross_val_score(AdbModel, x_train, y_train, cv = cv,scoring='accuracy')
print("AdaBoost Tree Average CV Score: ",scores.mean())

In [None]:
# Create an Isolation Forest classifier
from sklearn.ensemble import IsolationForest
IslModel= IsolationForest()

In [None]:
# Create an Isolation Forest classifier
IslModel= IsolationForest()
# IsolationForest.predict returns +1 (inlier) / -1 (outlier), while y_train holds the
# 0/1 codes produced by LabelEncoder, so scoring='accuracy' compared two different
# label sets. Map outlier -> 1 and inlier -> 0 before comparing with the labels.
# NOTE(review): this assumes class 1 is the "anomalous" class — confirm. Also note the
# model is unsupervised (y is ignored when fitting) and x_train is class-balanced by SMOTE.
scores = cross_val_score(
    IslModel, x_train, y_train, cv = cv,
    scoring=lambda est, X, y_true: np.mean(np.where(est.predict(X) == -1, 1, 0) == y_true),
)
print("Isolation Forest Average CV Score: ",scores.mean())

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
# Select the top 15 features using ANOVA
# The selector is fitted on the training split only, so test rows do not influence
# which columns are kept, and the same column mask is applied to x_train AND x_test.
# Previously only `x` was reduced, and none of the cells after this one read `x`, so the
# "after ANOVA" models were trained on exactly the same features as before.
# k is capped at the available column count so the cell also works on narrow data
# (and when it is re-run after x_train has already been reduced).
selector = SelectKBest(f_classif, k=min(15, x_train.shape[1]))
x_train = selector.fit_transform(x_train, y_train)
x_test = selector.transform(x_test)
# Keep `X_new` / `x` in step with the selected columns, as the original cell did.
X_new = selector.transform(x)
x=X_new
# Print the scores and p-values for each feature
scores = selector.scores_
pvalues = selector.pvalues_
for i, (score, pvalue) in enumerate(zip(scores, pvalues)):
    print(f"Feature {i+1}: score = {score:.2f}, p-value = {pvalue:.2f}")
# Print the indices of the selected features (0-based, unlike the 1-based numbers printed above)
selected_indices = selector.get_support(indices=True)
print(f"Selected feature indices: {selected_indices}")

**AFTER ANOVA FEATURE SELECTION**

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, precision_score, recall_score

In [None]:
# Train on the resampled training data and report the cross-validated accuracy.
lrModel = LogisticRegression().fit(x_train, y_train_smote)
scores = cross_val_score(lrModel, X=x_train, y=y_train, scoring='accuracy', cv=cv)
print("Logistics Regression Average CV Score: ",scores.mean())

# Held-out evaluation: confusion matrix (rows = true class, columns = predicted class).
y_pred = lrModel.predict(x_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
plt.figure(figsize=(2,2))
sns.heatmap(data=cm, annot=True, linewidths=1, square=True, cmap='Blues_r')
plt.show()
for caption, metric in (("\nAccuracy after smote : ", accuracy_score),
                        ("\nF1 score after smote : ", f1_score)):
    print(caption, metric(y_test, y_pred))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
KnModel = KNeighborsClassifier()
scores = cross_val_score(KnModel, x_train, y_train, cv = cv,scoring='accuracy')
print("KNeighbors Average CV Score: ",scores.mean())

In [None]:
from sklearn.ensemble import RandomForestClassifier
RfModel = RandomForestClassifier()
scores = cross_val_score(RfModel, x_train, y_train, cv = cv,scoring='accuracy')
print("Random Forest Average CV Score: ",scores.mean())

In [None]:
from sklearn.tree import DecisionTreeClassifier
DtModel = DecisionTreeClassifier()
scores = cross_val_score(DtModel, x_train, y_train, cv = cv,scoring='accuracy')
print("Decision Tree Average CV Score: ",scores.mean())

In [None]:
from sklearn.svm import SVC
SvmModel = SVC()
scores = cross_val_score(SvmModel, x_train, y_train, cv = cv,scoring='accuracy')
print("Support Vector Machine Average CV Score: ",scores.mean())

In [None]:
import xgboost as xgb
XgbModel = xgb.XGBClassifier()
scores = cross_val_score(XgbModel, x_train, y_train, cv = cv,scoring='accuracy')
print("XGBoost Average CV Score: ",scores.mean())

In [None]:
from sklearn.ensemble import BaggingClassifier
BcModel = BaggingClassifier()
scores = cross_val_score(BcModel, x_train, y_train, cv = cv,scoring='accuracy')
print("Bagging Classifier Average CV Score: ",scores.mean())

In [None]:
from sklearn.ensemble import AdaBoostClassifier
AdbModel = AdaBoostClassifier()
scores = cross_val_score(AdbModel, x_train, y_train, cv = cv,scoring='accuracy')
print("AdaBoost Tree Average CV Score: ",scores.mean())

In [None]:
# Create an Isolation Forest classifier
IslModel= IsolationForest()
# IsolationForest.predict returns +1 (inlier) / -1 (outlier), while y_train holds the
# 0/1 codes produced by LabelEncoder, so scoring='accuracy' compared two different
# label sets. Map outlier -> 1 and inlier -> 0 before comparing with the labels.
# NOTE(review): this assumes class 1 is the "anomalous" class — confirm.
scores = cross_val_score(
    IslModel, x_train, y_train, cv = cv,
    scoring=lambda est, X, y_true: np.mean(np.where(est.predict(X) == -1, 1, 0) == y_true),
)
print("Isolation Forest Average CV Score: ",scores.mean())

In [None]:
KnModel.fit(x_train,y_train)
prediction = KnModel.predict(x_test)
print("The Accuracy Score on test data:", accuracy_score(prediction,y_test))
print("The precision Score :", precision_score(prediction,y_test))
print("The recall Score :", recall_score(prediction,y_test))
print("The f1 score :", f1_score(prediction,y_test))
print("The Classification report : \n",classification_report(prediction,y_test))
sns.heatmap(confusion_matrix(prediction,y_test), annot = True)
plt.show()

In [None]:
RfModel.fit(x_train,y_train)
prediction = RfModel.predict(x_test)
print("The Accuracy Score on test data:", accuracy_score(prediction,y_test))
print("The precision Score :", precision_score(prediction,y_test))
print("The recall Score :", recall_score(prediction,y_test))
print("The f1 score :", f1_score(prediction,y_test))
print("The Classification report : \n",classification_report(prediction,y_test))
sns.heatmap(confusion_matrix(prediction,y_test), annot = True)
plt.show()

In [None]:
DtModel.fit(x_train,y_train)
prediction = DtModel.predict(x_test)
print("The Accuracy Score on test data:", accuracy_score(prediction,y_test))
print("The precision Score :", precision_score(prediction,y_test))
print("The recall Score :", recall_score(prediction,y_test))
print("The f1 score :", f1_score(prediction,y_test))
print("The Classification report : \n",classification_report(prediction,y_test))
sns.heatmap(confusion_matrix(prediction,y_test), annot = True)
plt.show()

In [None]:
lrModel.fit(x_train,y_train)
prediction = lrModel.predict(x_test)
print("The Accuracy Score on test data:", accuracy_score(prediction,y_test))
print("The precision Score :", precision_score(prediction,y_test))
print("The recall Score :", recall_score(prediction,y_test))
print("The f1 score :", f1_score(prediction,y_test))
print("The Classification report : \n",classification_report(prediction,y_test))
sns.heatmap(confusion_matrix(prediction,y_test), annot = True)
plt.show()

In [None]:
SvmModel.fit(x_train,y_train)
prediction = SvmModel.predict(x_test)
print("The Accuracy Score on test data:", accuracy_score(prediction,y_test))
print("The precision Score :", precision_score(prediction,y_test))
print("The recall Score :", recall_score(prediction,y_test))
print("The f1 score :", f1_score(prediction,y_test))
print("The Classification report : \n",classification_report(prediction,y_test))
sns.heatmap(confusion_matrix(prediction,y_test), annot = True)
plt.show()

In [None]:
XgbModel.fit(x_train,y_train)
prediction = XgbModel.predict(x_test)
print("The Accuracy Score on test data:", accuracy_score(prediction,y_test))
print("The precision Score :", precision_score(prediction,y_test))
print("The recall Score :", recall_score(prediction,y_test))
print("The f1 score :", f1_score(prediction,y_test))
print("The Classification report : \n",classification_report(prediction,y_test))
sns.heatmap(confusion_matrix(prediction,y_test), annot = True)
plt.show()

In [None]:
BcModel.fit(x_train,y_train)
prediction = BcModel.predict(x_test)
print("The Accuracy Score on test data:", accuracy_score(prediction,y_test))
print("The precision Score :", precision_score(prediction,y_test))
print("The recall Score :", recall_score(prediction,y_test))
print("The f1 score :", f1_score(prediction,y_test))
print("The Classification report : \n",classification_report(prediction,y_test))
sns.heatmap(confusion_matrix(prediction,y_test), annot = True)
plt.show()

In [None]:
AdbModel.fit(x_train,y_train)
prediction = AdbModel.predict(x_test)
print("The Accuracy Score on test data:", accuracy_score(prediction,y_test))
print("The precision Score :", precision_score(prediction,y_test))
print("The recall Score :", recall_score(prediction,y_test))
print("The f1 score :", f1_score(prediction,y_test))
print("The Classification report : \n",classification_report(prediction,y_test))
sns.heatmap(confusion_matrix(prediction,y_test), annot = True)
plt.show()

In [None]:
IslModel.fit(x_train,y_train)
# IsolationForest.predict returns +1 (inlier) / -1 (outlier) while y_test holds 0/1
# class codes; map outlier -> 1 and inlier -> 0 so both sides use the same labels.
# NOTE(review): this assumes class 1 is the "anomalous" class — confirm.
prediction = np.where(IslModel.predict(x_test) == -1, 1, 0)
# Metrics take (y_true, y_pred). With matching binary labels the default binary
# averaging applies; average='micro' only reproduced the accuracy three times.
print("The Accuracy Score on test data:", accuracy_score(y_test, prediction))
print("The Precision Score : ",precision_score(y_test, prediction))
print("The Recall Score : ",recall_score(y_test, prediction))
print("The f1 score :", f1_score(y_test, prediction))
print("The Classification report : \n",classification_report(y_test, prediction))
# Rows = true class, columns = predicted class.
sns.heatmap(confusion_matrix(y_test, prediction), annot = True)
plt.show()

In [None]:
from sklearn import metrics 
# RBF-kernel SVC with hand-picked C / gamma, evaluated on the held-out split.
svc1=SVC(kernel="rbf",C=100,gamma=0.001,random_state=0)
svc1.fit(x_train,y_train)
y_pred_svc1 = svc1.predict(x_test)
# confusion_matrix expects (y_true, y_pred): rows = true class, columns = predicted
# class. The arguments were previously swapped, which transposed the matrix.
cm_svc1=metrics.confusion_matrix(y_test,y_pred_svc1)
plt.figure(figsize=(2,2))
sns.heatmap(cm_svc1, annot=True, linewidths=1, square = True, cmap = 'Blues_r')
plt.show()
print("\nAccuracy after smote : ",accuracy_score(y_test,y_pred_svc1))
print("\nF1 score after smote : ",f1_score(y_test,y_pred_svc1))

In [None]:
from sklearn.model_selection import GridSearchCV

# Estimators to tune, each paired with the hyperparameter grid to search.
adaboost = AdaBoostClassifier()
bagging = BaggingClassifier()
search_space = [
    (adaboost, {
        'n_estimators': [50, 100, 150, 200],
        'learning_rate': [0.1, 0.5, 1.0, 2.0],
    }),
    (bagging, {
        'n_estimators': [50, 100, 150, 200],
        'max_samples': [0.5, 0.7, 0.9, 1.0],
        # 'max_features': [0.5, 0.7, 0.9, 1.0]
    }),
]

# Run an exhaustive 5-fold grid search per estimator, AdaBoost first, then Bagging.
# `param_grid` and `grid_search` end up holding the Bagging search, as before.
for estimator, param_grid in search_space:
    grid_search = GridSearchCV(estimator, param_grid, cv=5)
    grid_search.fit(x_train, y_train)

    # print the best hyperparameters and the corresponding score
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)
