In [2]:
import numpy as np
import pandas as pd
import networkx as nx
import pathlib
import sklearn
import time

from networkx.algorithms import bipartite
from networkx.algorithms.centrality import degree_centrality, subgraph_centrality
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
pd.__version__

'1.0.5'

In [3]:
import bokeh
from bokeh.io import output_file, show
from bokeh.models import (BoxZoomTool, Circle, HoverTool, Scatter, 
                          MultiLine, Plot, Range1d, ResetTool, 
                          ColumnDataSource, CustomJSTransform, LabelSet)
from bokeh.palettes import Spectral4, RdYlGn4
from bokeh.plotting import figure
from bokeh.transform import linear_cmap, factor_mark
from bokeh.models.graphs import from_networkx

bokeh.__version__

'2.1.1'

In [4]:
from pathlib import Path
Path.cwd()

PosixPath('/Users/dawnstaana/Documents/NUS/Year 4/NUS Fintech/Insurance')

In [5]:
from utils import (Bipartite, calculate_birank, 
preprocess, preprocess_new_claim, 
calculate_new_claim, get_subgraph, 
get_subgraph_regular, get_claim_features, get_fraud_features, make_graph)

In [6]:
# Load the cleaned claims table; only `incident_date` is parsed as a datetime.
data = pd.read_csv(
    "insurance_cleaned_addcols.csv",
    header=0,
    parse_dates=["incident_date"],
    infer_datetime_format=True,
)
# A notebook cell renders only its final expression, so list the columns here
# and preview rows in a separate cell.
data.columns

Index(['months_as_customer', 'age', 'policy_number', 'policy_bind_date',
       'policy_state', 'policy_csl', 'policy_deductable',
       'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'insured_sex',
       'insured_education_level', 'insured_occupation', 'insured_hobbies',
       'insured_relationship', 'capital-gains', 'capital-loss',
       'incident_date', 'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_state', 'incident_city',
       'incident_location', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
       'witnesses', 'police_report_available', 'total_claim_amount',
       'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make',
       'auto_model', 'auto_year', 'fraud_reported', 'age_car_incident',
       'age_policy_incident', 'workshop', 'adjuster'],
      dtype='object')

In [16]:
# NOTE(review): the saved output of this cell already contains `claim_no`, a
# column that is only created by the reset_index / rename cells further down
# (In [13], In [15]). The cells were executed out of order; on a fresh
# top-to-bottom run this preview will not show that column.
data.head()

Unnamed: 0,claim_no,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,...,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported,age_car_incident,age_policy_incident,workshop,adjuster
0,0,328,48,521585,2014-10-17,OH,250/500,1000,1406.91,0,...,13020,52080,Saab,92x,2004,1,11,1,A,Frank
1,1,228,42,342868,2006-06-27,IN,250/500,2000,1197.22,5000000,...,780,3510,Mercedes,E400,2007,1,8,9,A,Frank
2,2,134,29,687698,2000-06-09,OH,100/300,2000,1413.14,5000000,...,3850,23100,Dodge,RAM,2007,0,8,15,A,Frank
3,3,256,41,227811,1990-05-25,IL,250/500,2000,1415.74,6000000,...,6340,50720,Chevrolet,Tahoe,2014,1,1,25,C,Harry
4,4,228,44,367455,2014-06-06,IL,500/1000,1000,1583.91,6000000,...,650,4550,Accura,RSX,2009,0,6,1,C,Harry


In [13]:
# Expose the row position as an explicit `index` column (renamed to `claim_no`
# by the following cell) so every claim carries its own identifier.
# Guarded so that re-running this cell does not insert a second index column,
# which the rename step would otherwise turn into a duplicate `claim_no`.
if not {'index', 'claim_no'} & set(data.columns):
    data = data.reset_index()

In [15]:
# Give the column produced by reset_index a meaningful name.
data = data.rename({'index': 'claim_no'}, axis='columns')

In [21]:
# Keep only the claim id, the three party identifiers and the fraud label.
relevant_cols = [
    'claim_no',
    'policy_number',
    'adjuster',
    'workshop',
    'fraud_reported',
]
df = data.loc[:, relevant_cols]
df.head()

Unnamed: 0,claim_no,policy_number,adjuster,workshop,fraud_reported
0,0,521585,Frank,A,1
1,1,342868,Frank,A,1
2,2,687698,Frank,A,0
3,3,227811,Harry,C,1
4,4,367455,Harry,C,0


In [26]:
# Independent copy of the subset, so later renames/additions on claims_df
# leave `df` untouched.
claims_df = df.copy(deep=True)

In [22]:
# Build one claim-to-party edge table per party type (policyholder, adjuster,
# workshop). Rows with a missing value are dropped before stacking.
policyholder_edgelist, expert_edgelist, garage_edgelist = (
    df[['claim_no', party_col]]
    .rename(columns={'claim_no': 'Claims', party_col: 'Parties'})
    .dropna()
    for party_col in ('policy_number', 'adjuster', 'workshop')
)

# Stack the three tables, order the edges by claim and renumber the rows.
edgelist_df = (
    pd.concat([policyholder_edgelist, expert_edgelist, garage_edgelist], axis=0)
    .sort_values(by='Claims')
    .reset_index(drop=True)
)
# Column order handed to calculate_birank: party first, claim second.
edgelist_df = edgelist_df[['Parties', 'Claims']]
print(edgelist_df.shape)

(3000, 2)


In [40]:
# Claim numbers whose fraud label is 1; these are passed to calculate_birank
# as the known prior.
mask = df['fraud_reported'].eq(1)
prior = df.loc[mask, 'claim_no'].to_numpy()
prior

array([  0,   1,   3,   5,  14,  15,  22,  23,  25,  27,  31,  35,  36,
        39,  41,  47,  60,  63,  64,  65,  66,  70,  71,  79,  84,  89,
        91,  96,  97, 106, 108, 109, 111, 115, 117, 121, 122, 128, 129,
       135, 143, 145, 146, 148, 149, 152, 154, 155, 163, 166, 171, 183,
       185, 188, 196, 206, 213, 214, 215, 218, 220, 227, 234, 237, 241,
       245, 247, 250, 251, 253, 254, 257, 259, 261, 262, 266, 272, 277,
       278, 281, 283, 288, 292, 294, 305, 306, 307, 310, 311, 319, 324,
       328, 329, 331, 340, 342, 344, 349, 351, 358, 360, 361, 363, 364,
       365, 368, 373, 379, 382, 383, 394, 402, 404, 424, 425, 428, 432,
       437, 442, 445, 457, 460, 462, 470, 474, 476, 477, 478, 479, 482,
       489, 494, 498, 504, 513, 517, 529, 535, 538, 544, 547, 552, 555,
       558, 561, 567, 573, 574, 577, 579, 587, 591, 593, 597, 600, 602,
       616, 620, 623, 628, 629, 633, 635, 638, 643, 649, 650, 652, 657,
       665, 666, 683, 684, 691, 699, 700, 701, 703, 704, 705, 70

In [41]:
# Run calculate_birank (imported from utils) on the party/claim edge list,
# passing the claim numbers already labelled as fraud.
# NOTE(review): the helper's source is not visible here. The saved output
# shows it printing the prior-flag count and returning a frame with columns
# node / birank_score / birank_scaled -- confirm against utils before relying
# on that schema.
birank_df = calculate_birank(edgelist_df, prior)
print(birank_df.shape)
birank_df.tail()

No. of known prior fraudulent flags: 247.0 Length of prior_vector: 1000
(2010, 3)


Unnamed: 0,node,birank_score,birank_scaled
2005,995,0.149816,0.016896
2006,996,0.149816,0.016896
2007,997,0.207558,0.032304
2008,998,0.207558,0.032304
2009,999,0.207558,0.032304


In [38]:
# Rename the dataset's columns to the upper-case / flag names used from here on
# (presumably the schema utils.preprocess expects -- TODO confirm).
claims_df = claims_df.rename(
    columns={
        'claim_no': 'CLAIM_NO',
        'policy_number': 'CUST_CODE',
        'adjuster': 'CLAIM_HANDLER',
        'workshop': 'WORKSHOP_ID',
        'fraud_reported': 'fraud_flag',
    }
)

In [39]:
# NOTE(review): the saved output lists `investigation_flag`, which is only
# added by the cell below (In [36]). The cells were executed out of order, so
# a fresh top-to-bottom run will not show that column at this point.
claims_df.columns

Index(['CLAIM_NO', 'CUST_CODE', 'CLAIM_HANDLER', 'WORKSHOP_ID', 'fraud_flag',
       'investigation_flag'],
      dtype='object')

In [36]:
# Mirror the fraud label into a separate investigation_flag column.
claims_df['investigation_flag'] = claims_df['fraud_flag']

In [42]:
# NOTE(review): the saved run of this cell ended in
# "AttributeError: Can only use .str accessor with string values!".
# Presumably preprocess applies `.str` to a column that is numeric in this
# dataset (claim_no and policy_number are integers in the previews above).
# Confirm against utils.preprocess and cast the offending column(s) to str
# before this call; a committed notebook should not end on a raised exception.
G, claims_attr, parties_attr = preprocess(claims_df, birank_df)

AttributeError: Can only use .str accessor with string values!