In [1]:
### import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sns

from sklearn.model_selection import train_test_split, ShuffleSplit, learning_curve, GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.metrics import roc_curve, accuracy_score, confusion_matrix, classification_report, roc_auc_score, make_scorer, precision_recall_curve, average_precision_score 
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier, IsolationForest, VotingClassifier
from sklearn.neural_network import MLPClassifier

%matplotlib inline
plt.style.use('ggplot')

In [2]:
def read_data(tp = "Train", N = 1542865627584):
    target = pd.read_csv("{}-{}.csv".format(tp.title(), N))
    pt = pd.read_csv("{}_Beneficiarydata-{}.csv".format(tp.title(), N))
    in_pt = pd.read_csv("{}_Inpatientdata-{}.csv".format(tp.title(), N))
    out_pt = pd.read_csv("{}_Outpatientdata-{}.csv".format(tp.title(), N))
    return (in_pt, out_pt, pt, target)

In [3]:
in_pt, out_pt, asl, target = read_data()

In [4]:
asl = asl.replace({'ChronicCond_Alzheimer': 2, 'ChronicCond_Heartfailure': 2, 'ChronicCond_KidneyDisease': 2,
                           'ChronicCond_Cancer': 2, 'ChronicCond_ObstrPulmonary': 2, 'ChronicCond_Depression': 2, 
                           'ChronicCond_Diabetes': 2, 'ChronicCond_IschemicHeart': 2, 'ChronicCond_Osteoporasis': 2, 
                           'ChronicCond_rheumatoidarthritis': 2, 'ChronicCond_stroke': 2, 'Gender': 2 }, 0)
asl = asl.replace({'RenalDiseaseIndicator': 'Y'}, 1).astype({'RenalDiseaseIndicator': 'int64'})

In [5]:
asl['WhetherDead']= 0
asl.loc[asl.DOD.notna(),'WhetherDead'] = 1

In [6]:
target["target"] = np.where(target.PotentialFraud == "Yes", 1, 0) 

In [7]:
MediCare = pd.merge(in_pt, out_pt, left_on = [ x for x in out_pt.columns if x in in_pt.columns], right_on = [ x for x in out_pt.columns if x in in_pt.columns], how = 'outer')
MediCare.shape

(558211, 30)

In [8]:
data = pd.merge(MediCare, asl,left_on='BeneID',right_on='BeneID',how='inner')
data.shape

(558211, 55)

In [11]:
### Check Physicians columns for stange records and value length.
def len_check(data , l):
    S = dict()
    for i in data.columns:
         S[i] = [x for x in data.loc[ np.any(data[[i]].notnull().to_numpy(), axis = 1)][i].unique() if (len(str(x)) < l | len(str(x)) > l ) ]
    
    print(S)

len_check(data[['AttendingPhysician', 'OperatingPhysician', 'OtherPhysician']], len('PHY388358'))  

{'AttendingPhysician': [], 'OperatingPhysician': [], 'OtherPhysician': []}


In [12]:
def uniq(a):
    return np.array([len(set([i for i in x[~pd.isnull(x)]])) for x in a.values])

In [13]:
### Create new variable and drop 'AttendingPhysician', 'OperatingPhysician', 'OtherPhysician'
data['NumPhysicians'] = uniq(data[['AttendingPhysician', 'OperatingPhysician', 'OtherPhysician']]) 
data = data.drop(['AttendingPhysician', 'OperatingPhysician', 'OtherPhysician'], axis = 1)

In [14]:
ClmProcedure_vars = ['ClmProcedureCode_{}'.format(x) for x in range(1,7)]
### Create new variable 
data['NumProc'] = data[ClmProcedure_vars].notnull().to_numpy().sum(axis = 1)

In [15]:
keep = ['BeneID', 'ClaimID', 'ClmAdmitDiagnosisCode', 'NumProc' ] + ClmProcedure_vars
### Checking if procedures is unique
print(data[keep].loc[data['NumProc'] != uniq( data[ClmProcedure_vars])])

data = data.drop(ClmProcedure_vars, axis = 1)

Empty DataFrame
Columns: [BeneID, ClaimID, ClmAdmitDiagnosisCode, NumProc, ClmProcedureCode_1, ClmProcedureCode_2, ClmProcedureCode_3, ClmProcedureCode_4, ClmProcedureCode_5, ClmProcedureCode_6]
Index: []


In [16]:
ClmDiagnosisCode_vars =['ClmAdmitDiagnosisCode'] + ['ClmDiagnosisCode_{}'.format(x) for x in range(1, 11)]

### Create new variable 
data['NumClaims'] = data[ClmDiagnosisCode_vars].notnull().to_numpy().sum(axis = 1)

In [17]:
keep = ['BeneID', 'ClaimID', 'ClmAdmitDiagnosisCode', 'NumClaims'] + ClmDiagnosisCode_vars

### Create new variable 
data['NumClaims'] = data[ClmDiagnosisCode_vars].notnull().to_numpy().sum(axis = 1)

print(data[keep].loc[data['NumClaims'] != uniq( data[ClmDiagnosisCode_vars])].head())
### if checking result of unique claims is not missing, we are going to add number of unique claims.

       BeneID    ClaimID ClmAdmitDiagnosisCode  NumClaims   
1   BENE11001   CLM66048                  6186          4  \
35  BENE11037  CLM283913                 42731          2   
38  BENE11041   CLM54944                 49121          7   
39  BENE11041   CLM78682                 51881         10   
70  BENE11066   CLM75192                 59971         10   

   ClmAdmitDiagnosisCode ClmDiagnosisCode_1 ClmDiagnosisCode_2   
1                   6186               6186               2948  \
35                 42731              42731                NaN   
38                 49121              49121               2752   
39                 51881              51881               2859   
70                 59971               5990              78829   

   ClmDiagnosisCode_3 ClmDiagnosisCode_4 ClmDiagnosisCode_5   
1               56400                NaN                NaN  \
35                NaN                NaN                NaN   
38               7812              71690       

In [18]:
data['NumUniqueClaims'] = uniq(data[ClmDiagnosisCode_vars])

data['ExtraClm'] = data['NumClaims'] - data['NumUniqueClaims']

data = data.drop(ClmDiagnosisCode_vars, axis = 1)
data = data.drop(['NumClaims'], axis = 1)

In [19]:
### 
data['AdmissionDt'] = pd.to_datetime(data['AdmissionDt'] , format = '%Y-%m-%d')
data['DischargeDt'] = pd.to_datetime(data['DischargeDt'],format = '%Y-%m-%d')

data['ClaimStartDt'] = pd.to_datetime(data['ClaimStartDt'] , format = '%Y-%m-%d')
data['ClaimEndDt'] = pd.to_datetime(data['ClaimEndDt'],format = '%Y-%m-%d')

data['DOB'] = pd.to_datetime(data['DOB'] , format = '%Y-%m-%d')
data['DOD'] = pd.to_datetime(data['DOD'],format = '%Y-%m-%d')

### Number of hospitalization days
data['AdmissionDays'] = ((data['DischargeDt'] - data['AdmissionDt']).dt.days) + 1
### Number of claim days 
data['ClaimDays'] = ((data['ClaimEndDt'] - data['ClaimStartDt']).dt.days) + 1

data['Age'] = round(((data['ClaimStartDt'] - data['DOB']).dt.days + 1)/365.25)

In [20]:
data['Hospt'] = np.where(data.DiagnosisGroupCode.notnull(), 1, 0)
data = data.drop(['DiagnosisGroupCode'], axis = 1)

In [21]:
### Check if there were any actions after death. 
data['DeadActions'] = np.where(np.any(np.array([ data[x] > data['DOD'] for x in ['AdmissionDt', 'DischargeDt', 'ClaimStartDt', 'ClaimEndDt']]), axis = 0), 1, 0)

print(data.loc[data['DeadActions'] > 0])

### If there is no actions after death date, we will drop this variable. 
data = data.drop(['AdmissionDt', 'DeadActions', 'DischargeDt', 'ClaimStartDt', 'ClaimEndDt', 'DOD', 'DOB'], axis = 1)

Empty DataFrame
Columns: [BeneID, ClaimID, ClaimStartDt, ClaimEndDt, Provider, InscClaimAmtReimbursed, AdmissionDt, DeductibleAmtPaid, DischargeDt, DOB, DOD, Gender, Race, RenalDiseaseIndicator, State, County, NoOfMonths_PartACov, NoOfMonths_PartBCov, ChronicCond_Alzheimer, ChronicCond_Heartfailure, ChronicCond_KidneyDisease, ChronicCond_Cancer, ChronicCond_ObstrPulmonary, ChronicCond_Depression, ChronicCond_Diabetes, ChronicCond_IschemicHeart, ChronicCond_Osteoporasis, ChronicCond_rheumatoidarthritis, ChronicCond_stroke, IPAnnualReimbursementAmt, IPAnnualDeductibleAmt, OPAnnualReimbursementAmt, OPAnnualDeductibleAmt, WhetherDead, NumPhysicians, NumProc, NumUniqueClaims, ExtraClm, AdmissionDays, ClaimDays, Age, Hospt, DeadActions]
Index: []

[0 rows x 43 columns]


In [22]:
data.describe(exclude = ['object'])

Unnamed: 0,InscClaimAmtReimbursed,DeductibleAmtPaid,Gender,Race,RenalDiseaseIndicator,State,County,NoOfMonths_PartACov,NoOfMonths_PartBCov,ChronicCond_Alzheimer,...,OPAnnualDeductibleAmt,WhetherDead,NumPhysicians,NumProc,NumUniqueClaims,ExtraClm,AdmissionDays,ClaimDays,Age,Hospt
count,558211.0,557312.0,558211.0,558211.0,558211.0,558211.0,558211.0,558211.0,558211.0,558211.0,...,558211.0,558211.0,558211.0,558211.0,558211.0,558211.0,40474.0,558211.0,558211.0,558211.0
mean,997.012133,78.421085,0.421162,1.255011,0.196786,25.446969,378.588195,11.931472,11.93877,0.401868,...,649.698745,0.0074,1.292981,0.053557,3.23915,0.033117,6.665168,2.72794,73.301277,0.072507
std,3821.534891,274.016812,0.493746,0.717437,0.397569,15.192784,265.215531,0.889712,0.7859,0.490276,...,1002.020811,0.085707,0.505266,0.280534,2.570114,0.178991,5.638538,4.904984,13.021602,0.259325
min,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,25.0,0.0
25%,40.0,0.0,0.0,1.0,0.0,11.0,150.0,12.0,12.0,0.0,...,120.0,0.0,1.0,0.0,1.0,0.0,3.0,1.0,68.0,0.0
50%,80.0,0.0,0.0,1.0,0.0,24.0,350.0,12.0,12.0,0.0,...,340.0,0.0,1.0,0.0,2.0,0.0,5.0,1.0,74.0,0.0
75%,300.0,0.0,1.0,1.0,0.0,38.0,570.0,12.0,12.0,1.0,...,790.0,0.0,2.0,0.0,4.0,0.0,8.0,1.0,82.0,0.0
max,125000.0,1068.0,1.0,5.0,1.0,54.0,999.0,12.0,12.0,1.0,...,13840.0,1.0,3.0,5.0,11.0,2.0,36.0,37.0,101.0,1.0


In [23]:
### Sum all results
df1 = data.groupby(['Provider'], as_index = False)[['InscClaimAmtReimbursed', 'DeductibleAmtPaid', 'RenalDiseaseIndicator', 
                                                  'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
                                                  'ChronicCond_KidneyDisease', 'ChronicCond_Cancer', 
                                                  'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression', 
                                                  'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart', 
                                                  'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
                                                  'ChronicCond_stroke', 'WhetherDead', 'NumPhysicians', 
                                                  'NumProc','NumUniqueClaims', 'ExtraClm', 'AdmissionDays',
                                                  'ClaimDays', 'Hospt']].sum()
### Count number of records
df2 = data[['BeneID', 'ClaimID']].groupby(data['Provider']).nunique().reset_index()
### Calculate mean
df3 = data.groupby(['Provider'], as_index = False)[['NoOfMonths_PartACov', 'NoOfMonths_PartBCov',
                                                    'IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt',
                                                    'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt', 'Age']].mean()
### Combine all together
df = df2.merge(df1, on='Provider', how='left').merge(df3, on='Provider', how='left')
print(df.shape, target.shape)

(5410, 32) (5410, 3)


In [24]:
df1 = df.merge(target, on='Provider', how='left').drop(['Provider', 'target'], axis = 1)
df2 = df.merge(target, on='Provider', how='left').drop(['Provider', 'PotentialFraud'], axis = 1)
print(df.shape, target.shape)

(5410, 32) (5410, 3)


In [25]:
X_train, X_val, y_train, y_val = train_test_split(df.drop(['Provider'], axis = 1), target.target.to_numpy(), test_size=0.25, random_state=1)


In [26]:
X_train

Unnamed: 0,BeneID,ClaimID,InscClaimAmtReimbursed,DeductibleAmtPaid,RenalDiseaseIndicator,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,...,AdmissionDays,ClaimDays,Hospt,NoOfMonths_PartACov,NoOfMonths_PartBCov,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,Age
4796,16,17,3760,190.0,3,7,9,9,3,6,...,0.0,17,0,12.000000,12.000000,3431.764706,859.529412,1763.529412,456.470588,76.176471
1986,45,50,14170,150.0,14,20,29,19,9,17,...,0.0,138,0,12.000000,12.000000,5400.000000,576.720000,2020.400000,694.800000,67.640000
1398,49,62,17660,50.0,22,16,36,32,12,18,...,0.0,147,0,12.000000,12.000000,3698.064516,461.806452,2215.322581,599.838710,72.677419
770,77,79,210190,24634.0,14,35,44,29,18,24,...,141.0,286,24,12.000000,12.000000,7674.936709,1049.316456,1866.708861,566.455696,72.329114
4128,13,19,12810,50.0,7,5,9,5,0,6,...,0.0,70,0,12.000000,12.000000,2421.578947,281.052632,2853.157895,1021.578947,68.789474
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
905,270,305,105080,3296.0,51,125,177,123,47,111,...,5.0,732,2,11.960656,11.957377,4809.934426,487.134426,2142.262295,627.639344,73.363934
5192,36,38,7750,100.0,5,15,22,15,7,6,...,0.0,59,0,11.684211,11.842105,2100.526316,281.052632,2638.421053,761.052632,73.815789
3980,17,17,8700,0.0,5,1,7,9,1,2,...,0.0,60,0,12.000000,12.000000,2218.823529,251.294118,3920.000000,1147.647059,70.235294
235,32,52,107540,7636.0,11,23,35,21,7,29,...,32.0,167,7,11.769231,11.230769,8448.846154,1598.384615,1935.384615,493.653846,71.153846


In [27]:
y_pred=[[ 117, 132, 605670, 66286.0, 29, 56, 80, 64, 10, 41, 54,
       100, 112, 33, 38, 12, 1, 156, 48, 761, 10, 382.0, 617, 62,
       11.818181818181818, 11.871212121212121, 7568.181818181818,
       931.4242424242424, 2678.181818181818, 737.1212121212121,
       69.5530303030303]]

In [28]:
cols = X_train.columns

X_train = StandardScaler().fit_transform(X_train)
X_val = StandardScaler().fit_transform(X_val)

print("Train obs: {}; Features Number: {}".format(X_train.shape[0], X_train.shape[1]))
print("Validation obs: {};".format(X_val.shape[0]))

Train obs: 4057; Features Number: 31
Validation obs: 1353;


In [31]:
from sklearn.svm import SVC

# Create an instance of the SVC classifier and train it on the training data
clf = SVC()
clf.fit(X_train, y_train)

# Use the trained classifier to predict the target values for X_train
y_train_pred = clf.predict(y_pred)

# Print the predicted target values
print("Predicted target values for X_train:")
print(y_train_pred)

Predicted target values for X_train:
[0]
