In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
from scipy.stats import zscore
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
import joblib
import warnings
warnings.filterwarnings("ignore")

In [None]:
df=pd.read_csv("Insurance_fraud.csv")

In [None]:
df.head(5)

In [None]:
pd.set_option("display.max_rows",None)

In [None]:
#Maximum values will be displayed

In [None]:
df.shape

In [None]:
#There are 1000 rows and 40 columns in the dataset

In [None]:
df["months_as_customer"].value_counts()

In [None]:
df["age"].value_counts()

In [None]:
df["policy_number"].value_counts()

In [None]:
df["policy_deductable"].value_counts()

In [None]:
df["umbrella_limit"].value_counts()

In [None]:
df.drop("umbrella_limit",axis=1,inplace=True)

In [None]:
# umbrella_limit is 0 in 798 of the rows, i.e. the value is mostly not provided, so the column is dropped

In [None]:
df["insured_zip"].value_counts()

In [None]:
df["capital-gains"].value_counts()

In [None]:
df["capital-loss"].value_counts()

In [None]:
#There are 508 "0.00" values in capital-gains and "475" "0.00" columns in the capital-loss column.

In [None]:
# Distribution of capital-gains. sns.distplot is deprecated; histplot with a
# density-normalised histogram and a KDE overlay draws the equivalent figure.
sns.histplot(df["capital-gains"],kde=True,stat="density")

In [None]:
# Distribution of capital-loss. sns.distplot is deprecated; histplot with a
# density-normalised histogram and a KDE overlay draws the equivalent figure.
sns.histplot(df["capital-loss"],kde=True,stat="density")

In [None]:
#capital-loss column is left skewed and capital-gains column is right skewed, so the 0.00 entries are replaced by a central value (the median for capital-loss, the mean for capital-gains)

In [None]:
df["capital-gains"].mean()

In [None]:
# Replace the 0.00 entries of capital-gains with the column mean. The mean is
# computed from the data instead of being pasted in as a constant, so the cell
# stays correct if the CSV changes. Re-running the cell is a no-op because no
# zeros remain after the first run.
# NOTE(review): the mean is taken over all rows, including the zeros being replaced.
df["capital-gains"]=df["capital-gains"].replace(0.00,df["capital-gains"].mean())

In [None]:
df["capital-loss"].median()

In [None]:
# Replace the 0.00 entries of capital-loss with the column median. The median is
# computed from the data instead of being pasted in as a constant, so the cell
# stays correct if the CSV changes. Re-running the cell is a no-op because no
# zeros remain after the first run.
# NOTE(review): the median is taken over all rows, including the zeros being replaced.
df["capital-loss"]=df["capital-loss"].replace(0.00,df["capital-loss"].median())

In [None]:
#The 0.00 entries were replaced with the mean (capital-gains) and the median (capital-loss); the mean was used for capital-gains because its median came out as zero

In [None]:
df["incident_hour_of_the_day"].value_counts()

In [None]:
df["number_of_vehicles_involved"].value_counts()

In [None]:
df["bodily_injuries"].value_counts()

In [None]:
df["witnesses"].value_counts()

In [None]:
df["total_claim_amount"].value_counts()

In [None]:
df["injury_claim"].value_counts()

In [None]:
df["property_claim"].value_counts()

In [None]:
df["vehicle_claim"].value_counts()

In [None]:
df["auto_year"].value_counts()

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.info()

In [None]:
df["_c39"].value_counts()

In [None]:
#dropping the "_c39" column

In [None]:
df.drop("_c39",axis=1,inplace=True)

In [None]:
#bivariate analysis
# Claims per incident state, split by the fraud_reported flag.
# groupby(...).fraud_reported.count() counts every claim in a state whether or
# not fraud was reported, so a crosstab is used to separate the two classes.
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(8,5))
pd.crosstab(df['incident_state'], df['fraud_reported']).plot.bar(ax=ax, ylim=0)
ax.set_ylabel('Number of claims')
plt.show()

In [None]:
#NY has the highest number of claims among the incident states

In [None]:
# Claims per police_report_available value, split by the fraud_reported flag.
# groupby(...).fraud_reported.count() counts every claim in a group whether or
# not fraud was reported, so a crosstab is used to separate the two classes.
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(8,5))
pd.crosstab(df['police_report_available'], df['fraud_reported']).plot.bar(ax=ax, ylim=0)
ax.set_ylabel('Number of claims')
plt.show()

In [None]:
#"?" value are present in the police report available column

In [None]:
# Total claim amount per incident date. The y-axis is labelled in dollars, so
# the amounts are summed; .count() would plot the number of claims instead.
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(18,6))
df.groupby('incident_date').total_claim_amount.sum().plot.bar(ax=ax, ylim=0)
ax.set_ylabel('Claim amount ($)')
plt.show()

In [None]:
# Claims per policy state, split by the fraud_reported flag.
# groupby(...).fraud_reported.count() counts every claim in a state whether or
# not fraud was reported, so a crosstab is used to separate the two classes.
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(8,5))
pd.crosstab(df['policy_state'], df['fraud_reported']).plot.bar(ax=ax, ylim=0)
ax.set_ylabel('Number of claims')
plt.show()

In [None]:
#OH has the highest number of claims among the policy states

In [None]:
# Claims per incident type, split by the fraud_reported flag.
# groupby(...).fraud_reported.count() counts every claim of a type whether or
# not fraud was reported, so a crosstab is used to separate the two classes.
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(8,5))
pd.crosstab(df['incident_type'], df['fraud_reported']).plot.bar(ax=ax, ylim=0)
# Tilt the long category names so they do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=20, ha="right")
ax.set_ylabel('Number of claims')
plt.show()

In [None]:
plt.style.use('fivethirtyeight')
fig = plt.figure(figsize=(8,5))
ax = sns.countplot(x='incident_state', data=df)

In [None]:
plt.rcParams['figure.figsize'] = [7, 4]
table=pd.crosstab(df.insured_education_level, df.fraud_reported)
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of insured education vs Fraud reported', fontsize=12)
plt.xlabel('Insured_education_level')
plt.ylabel('Fraud reported')

In [None]:
# Percentage share of each insured_sex category.
# The slice labels are taken from the value_counts() index. value_counts()
# orders categories by frequency, so a hard-coded label list can attach the
# wrong name to a slice.
gender_share = df['insured_sex'].value_counts()*100.0 /len(df)
ax = gender_share.plot.pie(autopct='%.1f%%', fontsize=12)
ax.set_title('% Gender')
plt.show()

In [None]:
#The insured_sex column splits 53.7% / 46.3% between the two genders; check df["insured_sex"].value_counts() to confirm which category is the larger one

In [None]:
table=pd.crosstab(df.insured_sex, df.fraud_reported)
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of insured_sex vs Fraud', fontsize=12)
plt.xlabel('Insured_sex')
plt.ylabel('Fraud reported')
plt.show()

In [None]:
# Percentage share of each insured_relationship category.
# The slice labels are taken from the value_counts() index. value_counts()
# orders categories by frequency, so a hard-coded label list can attach the
# wrong name to a slice.
relationship_share = df['insured_relationship'].value_counts()*100.0 /len(df)
ax = relationship_share.plot.pie(autopct='%.1f%%', fontsize=12)
ax.set_title('% Relationship')
plt.show()

In [None]:
table=pd.crosstab(df.insured_relationship, df.fraud_reported)
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of insured_relationship vs Fraud', fontsize=12)
plt.xlabel('insured_relationship')
plt.ylabel('Fraud reported')
plt.show()

In [None]:
# Percentage share of each incident_type category.
# The slice labels are taken from the value_counts() index. value_counts()
# orders categories by frequency, so a hard-coded label list can attach the
# wrong name to a slice.
fig = plt.figure(figsize=(8,4))
ax = (df['incident_type'].value_counts()*100.0 /len(df))\
.plot.pie(autopct='%.1f%%', fontsize=12)

In [None]:
# Percentage share of each authorities_contacted category.
# The slice labels are taken from the value_counts() index. value_counts()
# orders categories by frequency, so a hard-coded label list can attach the
# wrong name to a slice.
fig = plt.figure(figsize=(8,4))
ax = (df['authorities_contacted'].value_counts()*100.0 /len(df))\
.plot.pie(autopct='%.1f%%', fontsize=12)

In [None]:
fig = plt.figure(figsize=(8,4))
ax = sns.countplot(x='auto_make', data=df)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.show()

In [None]:
# Percentage share of each incident_severity category.
# The slice labels are taken from the value_counts() index. value_counts()
# orders categories by frequency, so a hard-coded label list can attach the
# wrong name to a slice.
fig = plt.figure(figsize=(8,4))
ax = (df['incident_severity'].value_counts()*100.0 /len(df))\
.plot.pie(autopct='%.1f%%', fontsize=12)

In [None]:
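#Severity of the damage. The largest category accounts for 35.4% of the claims; check df["incident_severity"].value_counts() to confirm which category that is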

In [None]:
fig = plt.figure(figsize=(8,4))
ax = sns.countplot(x='insured_hobbies', data=df)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.show()

In [None]:
#Reading is the most common hobby among the insured

In [None]:
df.info()

In [None]:
df["months_as_customer"].value_counts()

In [None]:
df["policy_bind_date"].value_counts()

In [None]:
df.policy_bind_date = pd.to_datetime(df.policy_bind_date)

In [None]:
df["months"]=df["policy_bind_date"].dt.month

In [None]:
df["days"]=df["policy_bind_date"].dt.day

In [None]:
df["years"]=df["policy_bind_date"].dt.year

In [None]:
df.drop("policy_bind_date",axis=1,inplace=True)

The policy_bind_date column has been split into days, months and years columns, and the original date column has been dropped.

In [None]:
df.head(5)

In [None]:
df["policy_state"].value_counts()

In [None]:
LE = LabelEncoder()
df["policy_state"]=LE.fit_transform(df["policy_state"])

In [None]:
df["policy_state"].value_counts()

In [None]:
df["policy_csl"].value_counts()

In [None]:
# "?" is replaced by NaN so those entries can be treated as missing values later.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df = df.replace('?',np.nan)
# Per-column flag: True where the column now contains at least one NaN.
df.isnull().any()

In [None]:
df.info()

In [None]:
#There are nan values in collision_type column, property_damage column, police_report_available column

In [None]:
df["police_report_available"].value_counts()

In [None]:
df["property_damage"].value_counts()

In [None]:
df["collision_type"].value_counts()

In [None]:
#missing value treatment using fillna
# The filled Series is assigned back to the column. Calling
# fillna(inplace=True) on df[col] mutates an intermediate object: it raises a
# FutureWarning on recent pandas and leaves df unchanged under Copy-on-Write.

# The missing collision types are replaced by the most common collision type, as the real type is unknown.
df['collision_type'] = df['collision_type'].fillna(df['collision_type'].mode()[0])

# A missing property_damage response is taken to mean no property damage.
df['property_damage'] = df['property_damage'].fillna('NO')

# Likewise, a missing police_report_available response is taken to mean no report is available.
df['police_report_available'] = df['police_report_available'].fillna('NO')

# Final check: False when no NaN is left anywhere in the frame.
df.isnull().any().any()

In [None]:
df.info()

In [None]:
df["policy_csl"].value_counts()

In [None]:
LE = LabelEncoder()
df["policy_csl"]=LE.fit_transform(df["policy_csl"])

In [None]:
df["insured_sex"].value_counts()

In [None]:
df["insured_sex"]=LE.fit_transform(df["insured_sex"])

In [None]:
df["insured_sex"].value_counts()

In [None]:
df["insured_education_level"].value_counts()

In [None]:
df["insured_education_level"]=LE.fit_transform(df["insured_education_level"])

In [None]:
#Label Encoding has been done to convert the object datatype columns to int datatype

In [None]:
df["insured_occupation"].value_counts()

In [None]:
df["insured_hobbies"].value_counts()

In [None]:
df["insured_relationship"].value_counts()

In [None]:
#df["incident_date"].value_counts()

In [None]:
df = df.drop(['incident_date','incident_location','auto_model'], axis = 1)

In [None]:
#dropping this unneccessary columns

In [None]:
df["incident_type"].value_counts()

In [None]:
df["collision_type"].value_counts()

In [None]:
columnss=["incident_severity","fraud_reported","authorities_contacted","incident_state","incident_city","property_damage","police_report_available","auto_make","auto_year"]

for i in columnss:
    print(df[i].value_counts())

Applied for loop to check all the object dtype columns

In [None]:
columnss=["collision_type","incident_type","fraud_reported","insured_occupation","insured_hobbies","insured_relationship","incident_severity","authorities_contacted","incident_state","incident_city","property_damage","police_report_available","auto_make","auto_year"]

# Fit one encoder per column and keep it in `encoders`, so the integer codes can
# be mapped back to the original categories (encoders[col].inverse_transform).
# This includes the fraud_reported target. Re-fitting a single shared
# LabelEncoder overwrites its classes_ on every column and loses the earlier
# mappings. The encoded values are the same as with a shared encoder.
encoders={}
for i in columnss:
    encoders[i]=LabelEncoder()
    df[i]=encoders[i].fit_transform(df[i])

In [None]:
#Applied label encoding to the categorical columns

In [None]:
df.info()

Label encoding has been applied to all the categorical columns, and the dtype of every column is now either int or float.

In [None]:
sns.heatmap(df.isnull())

In [None]:
df.describe()

Description of the numerical columns

In [None]:
plt.style.use('fivethirtyeight')
ax = sns.countplot(x='fraud_reported', data=df, hue='fraud_reported')

In [None]:
#The target classes are imbalanced, but the imbalance is not treated in this notebook

In [None]:
fig=plt.figure(figsize=(100,40))
hc=df.corr(method="pearson")
sns.heatmap(hc,annot=True,cmap="Purples")

In [None]:
#Correlation with values

In [None]:
df_corrr=df.corr()
df_corrr

In [None]:
#correlation of column with each other

In [None]:
columns=df.columns

# One histogram per column. sns.displot is a figure-level function that opens
# its own figure, so pairing it with plt.figure() leaves an empty figure behind
# for every column. The axes-level histplot draws on the figure created here.
for i in columns:
    fig, ax = plt.subplots()
    sns.histplot(df[i], ax=ax)
    plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)

In [None]:
#distribution plot for all the column

In [None]:
columns=["months_as_customer","age","policy_number","policy_deductable","policy_annual_premium","insured_zip","capital-gains","capital-loss","incident_hour_of_the_day","number_of_vehicles_involved","bodily_injuries","witnesses","total_claim_amount","injury_claim","property_claim","vehicle_claim","auto_year"]

# Open a fresh figure for each listed column and draw that column's box plot in it.
for col in columns:
    plt.figure()
    df[col].plot(kind="box")

In [None]:
#boxplot to check the outliers present in the numerical columns
#there are outliers present in age, policy_annual_premium, total_claim_amount and property_claim (umbrella_limit was dropped earlier and is not plotted)
#the outliers of policy_annual_premium, capital-gains and property_claim will be treated

In [None]:
columns=["policy_annual_premium","capital-gains","property_claim"]

for i in df[columns]:
    plt.figure()
    sns.kdeplot(df[i])

In [None]:
df.skew()

In [None]:
# Pairwise relationship plots for every column of df.
# NOTE(review): pairplot draws one panel per pair of columns, so it is slow on a
# frame this wide — consider restricting it to a few columns via vars=.
sns.pairplot(df)

In [None]:
df["insured_zip"]=np.log(df["insured_zip"])
df["capital-gains"]=np.log(df["capital-gains"])
df["capital-loss"]=np.cbrt(df["capital-loss"])

In [None]:
#skewness has been reduced with log transformations (insured_zip, capital-gains) and a cube-root transformation (capital-loss)

In [None]:
df.skew()


In [None]:
from scipy.stats import zscore

In [None]:
# Outlier removal with z-scores on the three columns selected for treatment.
df1=df[["policy_annual_premium","capital-gains","property_claim"]]
# Absolute z-score of every value in the three selected columns.
z=np.abs(zscore(df1))
# Keep only the rows whose z-scores are below 3 in all three columns.
df_new=df[(z<3).all(axis=1)]

In [None]:
# Report how many rows the z-score filter removed.
print("shape before and after")
print("shape before".ljust(20),":",df.shape)
print("shape after".ljust(20),":",df_new.shape)
# Multiply by 100 so the value printed next to the label really is a percentage.
print("percentage loss".ljust(20),":",(df.shape[0]-df_new.shape[0])/df.shape[0]*100)

In [None]:
#OUTLIERS HAVE BEEN REMOVED WITH THE Z-SCORE METHOD AND THERE IS 0.5% DATA LOSS

In [None]:
q1=df1.quantile(0.25)
q3=df1.quantile(0.75)
IQR=q3-q1

In [None]:
df_new1=df[~((df1<(q1-1.5*IQR)) |(df1>(q3+1.5*IQR))).any(axis=1)]

In [None]:
# Report how many rows the IQR filter removed.
print("shape before and after")
print("shape before".ljust(20),":",df.shape)
print("shape after".ljust(20),":",df_new1.shape)
# Multiply by 100 so the value printed next to the label really is a percentage.
print("percentage loss".ljust(20),":",(df.shape[0]-df_new1.shape[0])/df.shape[0]*100)

# OUTLIERS HAVE BEEN REMOVED THROUGH IQR WITH 1.6% DATA LOSS, SO WE WILL GO WITH THE Z-SCORE METHOD, WHICH LOSES LESS DATA

In [None]:
# Density plots of the treated columns after outlier removal. The plots read
# from df_new (the z-score-filtered frame); plotting df would show the data
# from before the rows were removed.
for i in columns:
    plt.figure()
    sns.kdeplot(df_new[i])

In [None]:
x=df_new.drop("fraud_reported",axis=1)
y=df_new["fraud_reported"]

In [None]:

# Standardise the feature matrix x.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split performed in later cells, so statistics of the test rows influence the
# training features — consider fitting it on the training split only.
sc=StandardScaler()
scaledx=sc.fit_transform(x)

In [None]:
#Scaled the data

In [None]:
from sklearn.decomposition import PCA
testpca=PCA()
Y=testpca.fit(scaledx)

In [None]:
var_cumu=np.cumsum(Y.explained_variance_ratio_*100)
var_cumu

In [None]:
# np.argmax returns the 0-based index of the first component whose cumulative
# explained variance exceeds 90%, so the number of components needed is that
# index plus one.
k=np.argmax(var_cumu>90)+1
print("number of component:",k)

In [None]:
# Keep the smallest number of principal components whose cumulative explained
# variance exceeds 90%. A float n_components with svd_solver="full" lets PCA
# pick that number itself, so no hand-copied component count can go stale.
# NOTE(review): this may select a different count than the previous constant 27.
FinalPCA=PCA(n_components=0.90,svd_solver="full")
FinalData=FinalPCA.fit_transform(scaledx)

In [None]:
FinalData2=pd.DataFrame(FinalData)
FinalData2

In [None]:
x=FinalData2

In [None]:
#successfully applied PCA; the feature matrix now holds only the retained principal components
x.shape

In [None]:
from sklearn.linear_model import LogisticRegression
# Try train/test splits with random_state 1..199 and remember the split on
# which logistic regression reaches the highest test accuracy.
# NOTE(review): choosing the split by its test accuracy gives an optimistic
# estimate; the cross-validation scores are the more reliable figure.
maxAccu=0
maxRS=0
best_y_test=None
best_pred=None
for i in range(1,200):
    x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.30,random_state=i)
    LR=LogisticRegression()
    LR.fit(x_train,y_train)
    predrf=LR.predict(x_test)
    acc=accuracy_score(y_test,predrf)
    if acc>maxAccu:
        maxAccu=acc
        maxRS=i
        # Keep the labels and predictions of the best split for the report below.
        best_y_test=y_test
        best_pred=predrf
print("best accuracy is ",maxAccu," on Random sate ",maxRS)

# Report the metrics of the best split. After the loop, y_test and predrf
# belong to the last split tried (random_state=199), not to the best one.
print(accuracy_score(best_y_test,best_pred))
print(confusion_matrix(best_y_test,best_pred))
print(classification_report(best_y_test,best_pred))

In [None]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.30,random_state=133)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Gaussian Naive Bayes: fit on the training split, then print the accuracy,
# confusion matrix and classification report for the test split.
gnb=GaussianNB()
predg=gnb.fit(x_train,y_train).predict(x_test)
print("accuracy score:")
for metric in (accuracy_score,confusion_matrix,classification_report):
    print(metric(y_test,predg))

In [None]:
# Support vector classifier: fit on the training split, then print the
# accuracy, confusion matrix and classification report for the test split.
sv=SVC()
pred2=sv.fit(x_train,y_train).predict(x_test)
print("accuracy score:")
for metric in (accuracy_score,confusion_matrix,classification_report):
    print(metric(y_test,pred2))

In [None]:
# Random forest: fit on the training split and report test-split metrics.
# random_state is fixed so the forest, and every score derived from it, is
# reproducible across kernel restarts.
rf=RandomForestClassifier(n_estimators=100,random_state=42)
rf.fit(x_train,y_train)
pred3=rf.predict(x_test)
print("accuracy score:",)
print(accuracy_score(y_test,pred3))
print(confusion_matrix(y_test,pred3))
print(classification_report(y_test,pred3))

In [None]:
# AdaBoost: fit on the training split and report test-split metrics.
# random_state is fixed so the ensemble, and every score derived from it, is
# reproducible across kernel restarts.
ad=AdaBoostClassifier(n_estimators=100,random_state=42)
ad.fit(x_train,y_train)
pred4=ad.predict(x_test)
print("accuracy score:",)
print(accuracy_score(y_test,pred4))
print(confusion_matrix(y_test,pred4))
print(classification_report(y_test,pred4))

In [None]:
#APPLYING CROSS VALIDATION
# 5-fold cross-validation of the logistic regression model on the PCA features.
score=cross_val_score(LR,x,y,cv=5)
print(score)
# Blank separator line; the previous "/n" printed those two characters literally.
print()
print(score.mean())

In [None]:
score=cross_val_score(gnb,x,y,cv=5)
print(score)
print(score.mean())

In [None]:
score=cross_val_score(sv,x,y,cv=5)
print(score)
print(score.mean())

In [None]:
score=cross_val_score(rf,x,y,cv=5)
print(score)
print(score.mean())

In [None]:
score=cross_val_score(ad,x,y,cv=5)
print(score)
print(score.mean())

In [None]:
#after comparing each model's test accuracy with its cross-validation score, the best performing model is the random forest classifier (78%)

In [None]:
print(confusion_matrix(y_test,pred3))

In [None]:
print(classification_report(y_test,pred3))

In [None]:
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# random_state is fixed so the grid search gives the same result on every run.
dtc=DecisionTreeClassifier(random_state=42)

# Compare the two split criteria with 5-fold cross-validated accuracy.
grid_param={"criterion":["gini","entropy"] }
gd_sr=GridSearchCV(estimator=dtc,param_grid=grid_param,scoring="accuracy",cv=5)

gd_sr.fit(x_train,y_train)

# Best criterion and its mean cross-validated accuracy.
bestparam=gd_sr.best_params_
print(bestparam)
bestresult=gd_sr.best_score_
print(bestresult)

In [None]:
#tried with decision tree classifier but less accuracy

In [None]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score

In [None]:
y_pred_prob=rf.predict_proba(x_test)[:,1]

In [None]:
y_pred_prob

In [None]:
fpr,tpr,thresholds=roc_curve(y_test,y_pred_prob)

In [None]:
fpr

In [None]:
tpr

In [None]:
thresholds

In [None]:
# ROC curve of the random forest against the chance diagonal.
plt.plot([0,1],[0,1],"k--")
plt.plot(fpr,tpr,label="Random Forest")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Random Forest")
# Without a legend the label given to the curve above is never displayed.
plt.legend()
plt.show()

In [None]:
import joblib
# Persist the random forest (rf): it is the model the notebook selects as best
# and evaluates with the ROC curve. Dumping `ad` would save the AdaBoost model.
# NOTE(review): new data must pass through the fitted scaler (sc) and PCA
# (FinalPCA) before prediction; neither is saved here.
joblib.dump(rf,"Insurance_Fraud.obj")
print("object of the dataset has been created")