In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import pylab
from datetime import datetime
from pandas.plotting import scatter_matrix
import seaborn as sns
import re
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's "whitegrid" style for subsequent plots.
sns.set_style("whitegrid")

# Import Data

In [2]:
# Load the claims table and preview the first rows.
# NOTE(review): the file has an .xls extension but is parsed with read_csv, so
# it is presumably delimited text rather than a real Excel workbook — confirm.
claimJ12 = pd.read_csv('claimData-12-J10.xls')
claimJ12.head()

Unnamed: 0,CLAIM,CLAIMDATE,CUSTOMER,OCCURRENCE,CLAIM_STATUS,HOSPITALID,AREA,CLAIMAMOUNT,CAUSE,PRODUCT_CD,ANNUAL_PAYMENT_AMT,ISSUE_DT,GENDER_TYPE,AGE
0,35392034,27-NOV-15,64570415,1,12,84571001,1,18.0,J12,P10017,2000.0,30-SEP-15,2,29
1,34692346,10-MAY-16,60870157,1,10,84571003,1,17854.64,J12,P10017,2188.0,08-SEP-13,1,24
2,34525166,10-MAR-16,63171535,1,10,84571003,1,4325.8,J12,P10017,2188.0,17-AUG-14,1,30
3,34382946,07-JAN-16,64597346,1,10,84571003,1,,J12,24261B,81238.4,14-OCT-15,1,42
4,32971670,30-MAY-14,60272293,1,10,84571003,1,2953.98,J12,P10017,2188.0,09-MAY-13,2,40


In [3]:
# Summary statistics for the numeric columns. In the recorded output the
# CLAIMAMOUNT count (64,686) is lower than the row count (85,756), i.e. the
# column contains missing values.
claimJ12.describe()

Unnamed: 0,CLAIM,CUSTOMER,OCCURRENCE,CLAIM_STATUS,HOSPITALID,AREA,CLAIMAMOUNT,ANNUAL_PAYMENT_AMT,GENDER_TYPE,AGE
count,85756.0,85756.0,85756.0,85756.0,85756.0,85756.0,64686.0,85756.0,85756.0,85756.0
mean,28184540.0,56505250.0,1.205035,9.877035,84574490.0,3.044079,15674.35,14496.64,1.465985,47.685701
std,3997075.0,4194084.0,0.712204,0.930369,3997.166,2.204023,61229.5,46272.49,0.498845,13.301568
min,21180660.0,50285430.0,1.0,1.0,84571000.0,1.0,0.0,0.0,1.0,1.0
25%,24816990.0,52641890.0,1.0,10.0,84571060.0,1.0,2280.0,1650.0,1.0,40.0
50%,28287920.0,55783970.0,1.0,10.0,84572600.0,2.0,4792.0,3945.6,1.0,49.0
75%,31649890.0,60925260.0,1.0,10.0,84575800.0,5.0,11928.0,14744.7,2.0,57.0
max,35430360.0,65367900.0,31.0,14.0,84583340.0,10.0,2724834.0,2490800.0,2.0,117.0


In [4]:
# List the column names of the raw claims table.
claimJ12.columns

Index(['CLAIM', 'CLAIMDATE', 'CUSTOMER', 'OCCURRENCE', 'CLAIM_STATUS',
       'HOSPITALID', 'AREA', 'CLAIMAMOUNT', 'CAUSE', 'PRODUCT_CD',
       'ANNUAL_PAYMENT_AMT', 'ISSUE_DT', 'GENDER_TYPE', 'AGE'],
      dtype='object')

In [5]:
# Number of claims per GENDER_TYPE code.
claimJ12.groupby('GENDER_TYPE').size()

GENDER_TYPE
1    45795
2    39961
dtype: int64

In [6]:
# Number of claims per OCCURRENCE value.
claimJ12.groupby('OCCURRENCE').size()

OCCURRENCE
1     73823
2      8996
3      1822
4       524
5       248
6       126
7        86
8        67
9        26
10       12
11       10
12        6
13        1
31        9
dtype: int64

In [7]:
# Number of claims per AGE value; the recorded output includes a single claim
# at AGE 117, which looks like an outlier worth checking.
claimJ12.groupby('AGE').size()

AGE
1         1
2         5
3        40
4        50
5       135
6        89
7       163
8       117
9       206
10      243
11      132
12      216
13      236
14      162
15      199
16      206
17      206
18      219
19      244
20      325
21      511
22      439
23      422
24      476
25      548
26      711
27      719
28      837
29      776
30      786
       ... 
53     2535
54     2925
55     2816
56     2617
57     2667
58     2485
59     2266
60     2199
61     1987
62     2147
63     1657
64     1490
65     1314
66     1233
67      869
68      827
69      715
70      487
71      410
72      274
73      224
74      208
75      130
76       60
77       24
78       19
79        2
80        8
81        1
117       1
Length: 82, dtype: int64

In [8]:
# Column dtypes and non-null counts of the raw table.
claimJ12.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85756 entries, 0 to 85755
Data columns (total 14 columns):
CLAIM                 85756 non-null int64
CLAIMDATE             85756 non-null object
CUSTOMER              85756 non-null int64
OCCURRENCE            85756 non-null int64
CLAIM_STATUS          85756 non-null int64
HOSPITALID            85756 non-null int64
AREA                  85756 non-null int64
CLAIMAMOUNT           64686 non-null float64
CAUSE                 85756 non-null object
PRODUCT_CD            85756 non-null object
ANNUAL_PAYMENT_AMT    85756 non-null float64
ISSUE_DT              85756 non-null object
GENDER_TYPE           85756 non-null int64
AGE                   85756 non-null int64
dtypes: float64(2), int64(8), object(4)
memory usage: 7.9+ MB


### Convert types
GENDER_TYPE (M=1, F=2) --> GENDER_TYPEB (M=1, F=0)  
ANNUAL_PAYMENT_AMT --> int64  
CLAIMAMOUNT --> int64  

### Age Range  

# Data wrangling

In [9]:
# Keep only claims whose claim amount and annual payment are both non-negative.
# A NaN fails the `>= 0` comparison, so rows with a missing CLAIMAMOUNT are
# dropped here as well (85,756 -> 64,686 rows in the recorded run).
valid_amounts = (claimJ12.CLAIMAMOUNT >= 0) & (claimJ12.ANNUAL_PAYMENT_AMT >= 0)
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not raise SettingWithCopyWarning.
claimJ12 = claimJ12[valid_amounts].copy()

In [10]:
# Convert types
# astype() casts each column in one vectorised step (dropping the fractional
# part, exactly as the element-wise np.int64 call did). assign() returns a new
# frame instead of writing into a filtered slice of the original.
claimJ12 = claimJ12.assign(
    ANNUAL_PAYMENT_AMT=claimJ12['ANNUAL_PAYMENT_AMT'].astype(np.int64),
    CLAIMAMOUNT=claimJ12['CLAIMAMOUNT'].astype(np.int64),
    # Binary gender flag: source code 2 (F) -> 0, any other code -> 1.
    GENDER_TYPEB=(claimJ12['GENDER_TYPE'] != 2).astype(np.int64),
)
claimJ12.describe()

Unnamed: 0,CLAIM,CUSTOMER,OCCURRENCE,CLAIM_STATUS,HOSPITALID,AREA,CLAIMAMOUNT,ANNUAL_PAYMENT_AMT,GENDER_TYPE,AGE,GENDER_TYPEB
count,64686.0,64686.0,64686.0,64686.0,64686.0,64686.0,64686.0,64686.0,64686.0,64686.0,64686.0
mean,28319900.0,56860520.0,1.200383,9.885694,84574420.0,2.992719,15674.29,13610.07,1.455384,46.859398,0.544616
std,4059610.0,4235685.0,0.673929,0.926509,3959.84,2.173105,61229.49,46684.92,0.498009,14.040192,0.498009
min,21180660.0,50285430.0,1.0,1.0,84571000.0,1.0,0.0,0.0,1.0,1.0,0.0
25%,25020690.0,52881500.0,1.0,10.0,84571060.0,1.0,2280.0,1688.0,1.0,38.0,0.0
50%,28558130.0,56323590.0,1.0,10.0,84572590.0,2.0,4792.0,3290.0,1.0,49.0,1.0
75%,31826820.0,61138380.0,1.0,10.0,84575620.0,4.0,11928.0,13400.0,2.0,57.0,1.0
max,35430360.0,65367900.0,13.0,14.0,84583340.0,10.0,2724833.0,2490800.0,2.0,117.0,1.0


In [11]:
def ageRange(x):
    """Map an age to its 10-year age class.

    Negative ages map to 0, ages in [0, 10) to 1, [10, 20) to 2, and so on up
    to [80, 90) -> 9. Every other value (90 and above, or a value such as NaN
    that fails both comparisons) maps to 10.
    """
    if x < 0:
        return 0
    if x < 90:
        # One class per decade; the decade [0, 10) is class 1.
        return int(x // 10) + 1
    return 10

# Bucket every claimant's AGE into its 10-year class via ageRange.
claimJ12['AGE_CLASS'] = claimJ12['AGE'].map(ageRange)

In [12]:
# Re-check the summary statistics now that AGE_CLASS has been added.
claimJ12.describe()

Unnamed: 0,CLAIM,CUSTOMER,OCCURRENCE,CLAIM_STATUS,HOSPITALID,AREA,CLAIMAMOUNT,ANNUAL_PAYMENT_AMT,GENDER_TYPE,AGE,GENDER_TYPEB,AGE_CLASS
count,64686.0,64686.0,64686.0,64686.0,64686.0,64686.0,64686.0,64686.0,64686.0,64686.0,64686.0,64686.0
mean,28319900.0,56860520.0,1.200383,9.885694,84574420.0,2.992719,15674.29,13610.07,1.455384,46.859398,0.544616,5.234394
std,4059610.0,4235685.0,0.673929,0.926509,3959.84,2.173105,61229.49,46684.92,0.498009,14.040192,0.498009,1.432872
min,21180660.0,50285430.0,1.0,1.0,84571000.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
25%,25020690.0,52881500.0,1.0,10.0,84571060.0,1.0,2280.0,1688.0,1.0,38.0,0.0,4.0
50%,28558130.0,56323590.0,1.0,10.0,84572590.0,2.0,4792.0,3290.0,1.0,49.0,1.0,5.0
75%,31826820.0,61138380.0,1.0,10.0,84575620.0,4.0,11928.0,13400.0,2.0,57.0,1.0,6.0
max,35430360.0,65367900.0,13.0,14.0,84583340.0,10.0,2724833.0,2490800.0,2.0,117.0,1.0,10.0


In [13]:
# Confirm the dtypes and non-null counts after wrangling.
claimJ12.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64686 entries, 0 to 85755
Data columns (total 16 columns):
CLAIM                 64686 non-null int64
CLAIMDATE             64686 non-null object
CUSTOMER              64686 non-null int64
OCCURRENCE            64686 non-null int64
CLAIM_STATUS          64686 non-null int64
HOSPITALID            64686 non-null int64
AREA                  64686 non-null int64
CLAIMAMOUNT           64686 non-null int64
CAUSE                 64686 non-null object
PRODUCT_CD            64686 non-null object
ANNUAL_PAYMENT_AMT    64686 non-null int64
ISSUE_DT              64686 non-null object
GENDER_TYPE           64686 non-null int64
AGE                   64686 non-null int64
GENDER_TYPEB          64686 non-null int64
AGE_CLASS             64686 non-null int64
dtypes: int64(12), object(4)
memory usage: 7.4+ MB


In [14]:
# Preview the wrangled rows, including the derived GENDER_TYPEB and AGE_CLASS columns.
claimJ12.head()

Unnamed: 0,CLAIM,CLAIMDATE,CUSTOMER,OCCURRENCE,CLAIM_STATUS,HOSPITALID,AREA,CLAIMAMOUNT,CAUSE,PRODUCT_CD,ANNUAL_PAYMENT_AMT,ISSUE_DT,GENDER_TYPE,AGE,GENDER_TYPEB,AGE_CLASS
0,35392034,27-NOV-15,64570415,1,12,84571001,1,18,J12,P10017,2000,30-SEP-15,2,29,0,3
1,34692346,10-MAY-16,60870157,1,10,84571003,1,17854,J12,P10017,2188,08-SEP-13,1,24,1,3
2,34525166,10-MAR-16,63171535,1,10,84571003,1,4325,J12,P10017,2188,17-AUG-14,1,30,1,4
4,32971670,30-MAY-14,60272293,1,10,84571003,1,2953,J12,P10017,2188,09-MAY-13,2,40,0,5
8,33228721,17-MAR-16,62247348,1,10,84571003,1,3349,J12,P10016,2700,26-NOV-98,1,46,1,5


In [15]:
# Per-hospital count of non-null entries in each column; HOSPITALID is kept as
# a regular column rather than becoming the index.
dcf = claimJ12.groupby('HOSPITALID', as_index=False).count()

In [16]:
# Rank hospitals by their number of claims and keep the 100 with the most.
claimJ12_sort = dcf.sort_values(by='CLAIM', ascending=False).iloc[:100]
# IDs of those hospitals, used to filter the claims table.
claimJ12_re = claimJ12_sort.HOSPITALID

In [17]:
# Restrict the claims to those filed at one of the selected hospitals.
in_selected_hospitals = claimJ12.HOSPITALID.isin(claimJ12_re)
claimJ12_data = claimJ12[in_selected_hospitals]
claimJ12_data.describe()

Unnamed: 0,CLAIM,CUSTOMER,OCCURRENCE,CLAIM_STATUS,HOSPITALID,AREA,CLAIMAMOUNT,ANNUAL_PAYMENT_AMT,GENDER_TYPE,AGE,GENDER_TYPEB,AGE_CLASS
count,47860.0,47860.0,47860.0,47860.0,47860.0,47860.0,47860.0,47860.0,47860.0,47860.0,47860.0,47860.0
mean,28322890.0,56821450.0,1.234622,9.875407,84573710.0,2.675198,17507.21,13895.22,1.476285,46.934246,0.523715,5.242666
std,4069230.0,4248713.0,0.739617,0.953417,3624.837,2.140385,68599.81,47976.42,0.499442,14.233649,0.499442,1.450131
min,21185710.0,50285430.0,1.0,1.0,84571010.0,1.0,0.0,0.0,1.0,2.0,0.0,1.0
25%,25002390.0,52827430.0,1.0,10.0,84571050.0,1.0,2515.75,1688.0,1.0,38.0,0.0,4.0
50%,28555310.0,56229550.0,1.0,10.0,84571620.0,1.0,5192.0,3288.0,1.0,49.0,1.0,5.0
75%,31845810.0,61133960.0,1.0,10.0,84574640.0,4.0,13020.0,13501.25,2.0,57.0,1.0,6.0
max,35430360.0,65340280.0,13.0,14.0,84583290.0,7.0,2724833.0,2490800.0,2.0,117.0,1.0,10.0


In [18]:
# Columns that make up the feature vector for the neighbour search, in this order.
feature_list = ['AGE_CLASS', 'ANNUAL_PAYMENT_AMT', 'GENDER_TYPEB']

In [19]:
# Feature matrix for the neighbour search: one row per claim in claimJ12_data,
# columns in feature_list order.
X = claimJ12_data.loc[:, feature_list]
# Preview only the first rows instead of echoing the whole frame.
X.head(10)

Unnamed: 0,AGE_CLASS,ANNUAL_PAYMENT_AMT,GENDER_TYPEB
66,6,2550,1
67,6,20287,1
68,7,32113,0
69,4,1680,1
70,6,25000,0
71,7,5600,0
72,6,1500,0
73,4,1670,1
74,4,7208,0
75,3,7286,0


In [20]:
# Set up a nearest-neighbour search that returns the 10 closest rows per query.
# NOTE(review): the features are used unscaled; with a Euclidean-type distance,
# ANNUAL_PAYMENT_AMT (values up to the millions) will presumably dominate
# AGE_CLASS and GENDER_TYPEB — consider standardising the columns first.
nnb = NearestNeighbors(n_neighbors=10)

In [21]:
# Index the feature matrix. fit() returns the estimator itself, so nbrs and nnb
# refer to the same fitted object.
nbrs = nnb.fit(X)

In [22]:
# Example query in feature_list order: AGE_CLASS=6, ANNUAL_PAYMENT_AMT=2550, GENDER_TYPEB=1.
t = [6, 2550, 1]

In [23]:
# Find the nearest neighbours of t. kneighbors() returns (distances, indices);
# the indices are row positions in the fitted matrix X, not index labels.
print(nbrs.kneighbors([t]))

(array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]]), array([[ 3727, 20906, 22250,  3409, 29146, 15146, 12805,  5358, 46184,
        41261]], dtype=int32))


In [24]:
# Look up the first returned neighbour by position (iloc): after filtering, the
# index labels of claimJ12_data no longer match row positions.
claimJ12_data.iloc[3727]

CLAIM                  34429493
CLAIMDATE             18-JAN-16
CUSTOMER               64855237
OCCURRENCE                    1
CLAIM_STATUS                 10
HOSPITALID             84574110
AREA                          4
CLAIMAMOUNT               14136
CAUSE                       J12
PRODUCT_CD               P10016
ANNUAL_PAYMENT_AMT         2550
ISSUE_DT              16-JAN-16
GENDER_TYPE                   1
AGE                          57
GENDER_TYPEB                  1
AGE_CLASS                     6
Name: 5903, dtype: object

### Save model to file using pickle

In [26]:
import pickle

In [28]:
# Serialise the fitted neighbour model. Only the estimator is stored; turning
# returned row positions back into claims still needs claimJ12_data.
with open('hospital_recom.pickle','wb') as f:
    pickle.dump(nnb,f)

### Load model and test

In [29]:
# Reload the model from disk. pickle.load can execute arbitrary code, so only
# load files that come from a trusted source.
with open('hospital_recom.pickle','rb') as f:
    loadhospital_recom = pickle.load(f)

In [30]:
# Test recommender: the reloaded model should return the same distances and
# row positions for t as the in-memory model did.
print(loadhospital_recom.kneighbors([t]))

(array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]]), array([[ 3727, 20906, 22250,  3409, 29146, 15146, 12805,  5358, 46184,
        41261]], dtype=int32))
