In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import plotly as py
import plotly.express as px
import plotly.graph_objs as go
import ipywidgets as widgets
from scipy import special

# Initialise Plotly's offline notebook mode (connected=True) so the px figures
# below can render inline.
py.offline.init_notebook_mode(connected=True) 

In [None]:
# Load the raw claims data (path is relative to the notebook's working directory)
# and preview the first 10 rows.
df = pd.read_csv('insurance_claims.csv')
df.head(10)

In [None]:
# Summary statistics of the frame.
df.describe()

In [None]:
# Data type of every column.
df.dtypes

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Number of distinct values per column.
df.nunique()

In [None]:
# Class balance of the target as a bar chart. Series.value_counts() is used
# because the top-level pd.value_counts() helper is deprecated in recent pandas.
ax = df['fraud_reported'].value_counts().plot.bar(color=['blue', 'red'], figsize=(10,5))
ax.set_xlabel('Fraud reported')
ax.set_ylabel('Number of claims')
plt.show()

In [None]:
# Exact number of claims per fraud label.
df['fraud_reported'].value_counts() 

In [None]:
# In which states did the incidents occur, and how many claims came from each?
df['incident_state'].value_counts()

In [None]:
# total_claim_amount plotted against incident_state, one colour per state.
fig = px.bar(
    data_frame=df,
    x='incident_state',
    y='total_claim_amount',
    color='incident_state',
)
fig.show()

In [None]:
# Breakdown of the incident states as a function of incident dates.
# NOTE(review): y is 'fraud_reported', which the modelling section maps from the
# labels "N"/"Y" - so the bar axis is a categorical label rather than a numeric
# quantity. Confirm this is the intended chart (a count per date may have been meant).
fig = px.bar(df, x='incident_date', y='fraud_reported', color='incident_state' )
fig.show()

In [None]:
# Age distribution per fraud label, normalised to a probability density.
fig = px.histogram(
    data_frame=df,
    x='age',
    color='fraud_reported',
    histnorm='probability density',
)
fig.show()

In [None]:
# Distribution of the total claim amount per fraud label with a marginal box plot
# (the marginal can also be 'rug' or 'violin'); all columns are passed as hover data.
fig = px.histogram(
    data_frame=df,
    x="total_claim_amount",
    color="fraud_reported",
    marginal="box",
    hover_data=df.columns,
)
fig.show()

In [None]:
# Gender split of the insured.
# The slice labels are derived from the value_counts() index so that each label
# is always paired with its own count. value_counts() orders by frequency, so a
# hard-coded ['Male', 'Female'] list silently mislabels the slices whenever the
# second category is the more frequent one.
sex_counts = df['insured_sex'].value_counts()
labels = [str(label).title() for label in sex_counts.index]
fig = px.pie(values=sex_counts.values, names=labels, title='% Gender')
fig.show()

In [None]:
# Share of claims per incident type (counts computed once and reused for
# both the slice sizes and the slice names).
incident_type_counts = df['incident_type'].value_counts()
fig = px.pie(
    df,
    values=incident_type_counts,
    names=incident_type_counts.keys(),
    title='Incident Type',
)
fig.show()

In [None]:
# Fraud label counts for claims where the car make is Saab.
df.loc[df['auto_make'] == 'Saab', 'fraud_reported'].value_counts()

In [None]:
# Fraud label counts for claims where the car make is BMW.
df.loc[df['auto_make'] == 'BMW', 'fraud_reported'].value_counts()

In [None]:
# Claim counts per car make, coloured by fraud label.
fig = px.histogram(data_frame=df, x='auto_make', color='fraud_reported')
fig.show()

In [None]:
# Claim counts per education level, coloured by fraud label.
fig = px.histogram(data_frame=df, x='insured_education_level', color='fraud_reported')
fig.show()

In [None]:
# Claim counts per hobby of the insured, coloured by fraud label.
fig = px.histogram(data_frame=df, x='insured_hobbies', color='fraud_reported')
fig.show()

In [None]:
# Claim counts per incident severity, coloured by fraud label.
fig = px.histogram(data_frame=df, x='incident_severity', color='fraud_reported')
fig.show()

In [None]:
# Share of claims per incident severity (counts computed once and reused).
severity_counts = df['incident_severity'].value_counts()
fig = px.pie(
    df,
    values=severity_counts,
    names=severity_counts.keys(),
    title='Incident Severity',
)
fig.show()

In [None]:
# Claim counts per incident type, coloured by fraud label.
fig = px.histogram(data_frame=df, x='incident_type', color='fraud_reported')
fig.show()

In [None]:
# Education level per fraud label, normalised to a probability density.
fig = px.histogram(
    data_frame=df,
    x='insured_education_level',
    color='fraud_reported',
    histnorm='probability density',
)
fig.show()

In [None]:
# Collision type per fraud label, normalised to a probability density.
fig = px.histogram(
    data_frame=df,
    x='collision_type',
    color='fraud_reported',
    histnorm='probability density',
)
fig.show()

In [None]:
# Occupation of the insured per fraud label, normalised to a probability density.
fig = px.histogram(
    data_frame=df,
    x='insured_occupation',
    color='fraud_reported',
    histnorm='probability density',
)
fig.show()

In [None]:
#Data preparation and Modeling

import pandas as pd
import numpy as np
import itertools
from matplotlib import pyplot as plt

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
#from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, precision_score, recall_score, f1_score, precision_recall_curve


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis



import sklearn.metrics
from pylab import rcParams
# matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8*" and later
# releases dropped the bare 'seaborn' name, so use whichever one this
# installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
%matplotlib inline

# Turn warnings off through the standard filter mechanism. Overwriting
# warnings.warn with a no-op monkey-patches the standard library for every
# module in the process and cannot be undone with warnings.resetwarnings().
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the data again from disk, so `df` is the unmodified raw frame for the
# modelling section, and preview the first 10 rows.
df = pd.read_csv('insurance_claims.csv')
df.head(10)

In [None]:
# Number of unique entries per column - useful for spotting the categorical features.
df.nunique()

In [None]:
# Total number of missing (NaN) values across the whole frame.
df.isna().sum().sum()

In [None]:
# Columns that contain at least one missing value.
df.columns[df.isna().any()] 

In [None]:
# Columns in which at least one cell is the literal placeholder '?'.
df.columns[df.eq('?').any()]

In [None]:
# Number of distinct values in each column that contains a '?' placeholder.
df[df.columns[df.eq('?').any()]].nunique()

In [None]:
# Treat the '?' placeholder as 'NO' in both columns.
# The result is assigned back to the frame: replace(..., inplace=True) on a
# column selected with df[...] is chained assignment, which recent pandas warns
# about and which leaves df unchanged once copy-on-write is active.
placeholder_cols = ['property_damage', 'police_report_available']
df[placeholder_cols] = df[placeholder_cols].replace(to_replace='?', value='NO')

In [None]:
# Count fully duplicated rows (all columns compared, first occurrence not counted -
# these are the defaults of DataFrame.duplicated).
df.duplicated().sum()

In [None]:
# Drop the columns regarded as uninformative for modelling.
colsToDelete = [
    "policy_number",
    "policy_bind_date",
    "insured_zip",
    "incident_location",
    "incident_date",
    "_c39",
]
df = df.drop(columns=colsToDelete)
df.head()

In [None]:
threshold = 0.97
# Absolute pairwise correlations of the numeric columns. The numeric columns are
# selected explicitly because DataFrame.corr() in pandas >= 2.0 no longer skips
# object columns by default and raises on them instead.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr().abs()
# Keep only the strict upper triangle so that every pair is looked at once.
# The builtin bool is used for the mask: the np.bool alias was removed from NumPy.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Columns whose correlation with an earlier column reaches the threshold
redundent = [column for column in upper.columns if (upper[column] >= threshold).any()]
print(f'Columns to drop with correlation >= {threshold}: {redundent}')
df = df.drop(columns=redundent)

In [None]:
# Numeric feature names, obtained through the public select_dtypes API instead
# of the private DataFrame._get_numeric_data().
num_features = df.select_dtypes(include=['number', 'bool']).columns
# Every remaining column except the target is categorical. The list is built in
# column order rather than via a set difference: set iteration order for strings
# changes between interpreter runs, which would reshuffle the one-hot columns
# and make model results differ from run to run.
cat_features = [col for col in df.columns
                if col not in num_features and col != 'fraud_reported']

In [None]:
# Names of the categorical feature columns.
cat_features

In [None]:
# Preview of the numeric features.
df[num_features].head()

In [None]:
# Preview of the categorical features.
df[cat_features].head()

In [None]:
# Target: fraud flag encoded as N -> 0, Y -> 1. Features: every other column.
y = df["fraud_reported"].map({"N": 0, "Y": 1})
X = df.drop(columns=["fraud_reported"])

In [None]:
# Numeric columns pass through unchanged; categorical columns are one-hot encoded
# (categories unseen during fit are encoded as all zeros).
# sparse_threshold=0 makes the ColumnTransformer always return a dense array.
# It replaces OneHotEncoder(sparse=False): that keyword was renamed to
# sparse_output in scikit-learn 1.2 and has since been removed, whereas
# sparse_threshold works on old and new releases alike.
preprocessor = ColumnTransformer(
    [("numerical", "passthrough", num_features),
     ("categorical", OneHotEncoder(handle_unknown="ignore"), cat_features)],
    sparse_threshold=0,
)

In [None]:
# Every model is a two-step pipeline: the shared preprocessing definition
# followed by the estimator. All stochastic estimators get a fixed random_state
# so that a re-run of the notebook reproduces the same scores.

# Logistic Regression
lr_model = Pipeline([("preprocessor", preprocessor), 
                     ("model", LogisticRegression(class_weight="balanced", solver="liblinear", random_state=42))])

# Decision Tree
dt_model = Pipeline([("preprocessor", preprocessor), 
                     ("model", DecisionTreeClassifier(class_weight="balanced", random_state=42))])
# LDA
lda_model = Pipeline([("preprocessor", preprocessor), 
                     ("model", LinearDiscriminantAnalysis())])

# Random Forest
rf_model = Pipeline([("preprocessor", preprocessor), 
                     ("model", RandomForestClassifier(class_weight="balanced", n_estimators=100,
                                                      n_jobs=-1, random_state=42))])

# XGBoost
# scale_pos_weight multiplies the weight of the positive (fraud) class. The
# balancing value is #negative / #positive = (1 - p) / p with p = y.mean().
# Using (1 - p) on its own is below 1 and would down-weight the minority class.
xgb_model = Pipeline([("preprocessor", preprocessor), 
                      ("model", XGBClassifier(scale_pos_weight=(1 - y.mean()) / y.mean(),
                                              n_jobs=-1, random_state=42))])

In [None]:
# Plot helper: ROC curve followed by the precision-recall curve
def plot_eval(testY, predY, auc):
    """Draw the ROC curve and then the precision-recall curve.

    Arguments:
    ---------
    testY:  the true labels

    predY:  2-D array of per-class scores; only column 1 is used

    auc:    ROC AUC value, shown in the ROC legend (it is not recomputed here)
    """
    positive_scores = predY[:, 1]

    # Figure 1: ROC curve against the diagonal of a random classifier
    false_pos_rate, true_pos_rate, _ = sklearn.metrics.roc_curve(testY, positive_scores)
    plt.plot(false_pos_rate, true_pos_rate, label='ROC curve (area = %.2f)' % auc)
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Random guess')
    plt.title('ROC curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.grid()
    plt.legend()
    plt.show()

    # Figure 2: precision as a function of recall
    precisions, recalls, _ = sklearn.metrics.precision_recall_curve(testY, positive_scores)
    plt.plot(recalls, precisions, linewidth=5, label='Precision-Recall curve')
    plt.title('Recall vs Precision')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.show()

In [None]:
def model_evaluate(model, X, y, grid_params, plot_eval_curves = False):
    """Tune, refit and score a model on a stratified 80/20 hold-out split.

    The hyperparameters are chosen by a 5-fold grid search (scored by ROC AUC)
    on the training part, applied to `model` itself, and `model` is then refit
    on the training part and scored on the held-out part.

    Arguments:
    ---------
    model:        a defined ML model; it is modified in place (set_params + fit)

    X:            the feature matrix

    y:            the labels

    grid_params:  hyperparameters to perform grid search on (dict)

    plot_eval_curves: If False, prints the metrics, including a 5-fold
                      cross-validated ROC AUC computed on the full X, y
                      If True, plots ROC and precision vs. recall curves

    Returns:
    -------
    (model, cnf_matrix): the fitted model and the hold-out confusion matrix
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.2, stratify=y, random_state=555)

    # Hyperparameter search restricted to the training part
    search = GridSearchCV(model, grid_params, scoring="roc_auc", cv=5, n_jobs=-1)
    search.fit(X_train, y_train)

    # Push the winning parameters into the caller's model object and refit it
    model.set_params(**search.best_params_)
    model.fit(X_train, y_train)

    # Hold-out predictions: class probabilities and hard labels
    test_probs = model.predict_proba(X_test)
    test_preds = model.predict(X_test)

    auc = sklearn.metrics.roc_auc_score(y_test, test_probs[:, 1])
    cnf_matrix = sklearn.metrics.confusion_matrix(y_test, test_preds)

    if plot_eval_curves:
        plot_eval(y_test, test_probs, auc)
        return model, cnf_matrix

    print('Best Parameters:', search.best_params_)
    print('Best Score:', search.best_score_)
    print(f'ROC AUC: {round(auc, 4)}')
    print(f'Confusion Matrix:\n {cnf_matrix}')

    # Label-based metrics on the hold-out part
    label_metrics = (sklearn.metrics.precision_score,
                     sklearn.metrics.recall_score,
                     sklearn.metrics.f1_score)
    for metric in label_metrics:
        print(f'{metric.__name__}: {round(metric(y_test, test_preds), 4)}')

    # Average performance using 5 x cross-validation
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
    print('Cross-validation AUC score: ', cv_scores.mean())
    return model, cnf_matrix

In [None]:
# Grid-search hyperparameters per model key. The "model__" prefix addresses
# the "model" step of each pipeline; an empty dict means nothing is tuned.
model_grid_params = {
    'lr_model': {
        "model__C": [1, 1.3, 1.5],
    },
    'dt_model': {
        "model__max_depth": [3, 5, 7],
        "model__min_samples_split": [2, 5],
    },
    'rf_model': {
        "model__max_depth": [20, 10, 15],
        "model__min_samples_split": [5, 10],
    },
    'lda_model': {},
    'xgb_model': {
        "model__max_depth": [5, 10],
        "model__min_child_weight": [5, 10],
    },
}

In [None]:
# Pipelines to evaluate, and - in the same order - their (dictionary key, display name)
# pairs. The two lists are matched by position, so they must be kept in sync.
models = [lr_model, dt_model, rf_model, lda_model, xgb_model]
model_keys = [('lr_model', 'Logistic Regression'),
              ('dt_model', 'Decision Tree'),
              ('rf_model', 'Random Forest'),
              ('lda_model', 'Linear Discriminant Analysis'),
              ('xgb_model', 'Gradient Boosting')]

In [None]:
# Evaluate every pipeline and collect the fitted model and its hold-out
# confusion matrix under the model's key.
final_model = {}
cnf_matrix = {}
for idx, model in enumerate(models):
    key, display_name = model_keys[idx]
    print(45*'_', '\n{}'.format(display_name))
    model, cnf = model_evaluate(model, X, y, model_grid_params[key])
    final_model[key] = model
    cnf_matrix[key] = cnf

In [None]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    given a sklearn confusion matrix (cm), make a nice plot

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix
                  (any 2-D array-like of counts is accepted)

    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low']
                  (None leaves the default numeric ticks)

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions (each row divided by its sum)

    Citation
    ---------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    # Accept plain nested lists as well as ndarrays.
    cm = np.asarray(cm)

    # Normalise BEFORE drawing, so the colours, the printed numbers and the
    # text-colour threshold all refer to the same values.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class with no true samples has a zero row total; show zeros for it
        # instead of dividing by zero.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells darker than the threshold get white text so the number stays readable.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    # Rows are the true classes, columns the predicted classes.
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.show()

In [None]:
# Raw-count confusion matrix of the LDA model on the hold-out set
plot_confusion_matrix(
    cm=cnf_matrix['lda_model'],
    target_names=['legitimate', 'fraud'],
    normalize=False,
)

In [None]:
# Raw-count confusion matrix of the decision-tree model on the hold-out set
plot_confusion_matrix(
    cm=cnf_matrix['dt_model'],
    target_names=['legitimate', 'fraud'],
    normalize=False,
)

In [None]:
# ROC and precision-recall curves of the LDA model.
# With plot_eval_curves=True, model_evaluate repeats the grid search and refit and
# then draws both curves instead of printing metrics. Rebinds the globals
# `model` and `cnf`.
model, cnf = model_evaluate(lda_model, X, y, model_grid_params['lda_model'], plot_eval_curves=True)

In [None]:
# ROC and Precision vs. Recall curves of the LDA model 
# NOTE(review): this call is identical to the one in the previous cell, so it
# repeats the same search, refit and plots - confirm whether both cells are needed.
model, cnf = model_evaluate(lda_model, X, y, model_grid_params['lda_model'], plot_eval_curves=True)