In [61]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split

In [62]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn import  preprocessing # used for label encoding and imputing NaNs

In [63]:
claimsdata = pd.read_csv("processedData.csv")
claimsdata.head()

Unnamed: 0,Ticket Number,DateOfService,CarrierCode,Charges,Payments,InsurancePayment,PatientPayment,Writeoffs,InsuranceBal,PatientBal,...,CPT,DateOfEntry,PaymentPostDate,NPI,OrderingClinic,DeniedCode,FirstBilledDate,ProviderProfile,CaseCount,accepted
0,104452,1/2/2019,CFHP-E2,3900,0.0,0.0,0.0,0.0,0.0,3900.0,...,81507,1/7/2019,1/22/2019,1316428238,ALAMO WOMEN'S OBGYN,197,1/8/2019,NTINC,1,False
1,104453,1/2/2019,CFHP-E2,3900,0.0,0.0,0.0,0.0,0.0,3900.0,...,81507,1/7/2019,1/24/2019,1316428238,ALAMO WOMEN'S OBGYN,197,1/8/2019,NTINC,1,False
2,104454,1/2/2019,NCS,795,249.0,0.0,249.0,546.0,0.0,0.0,...,99999,1/7/2019,2/10/2019,1699760116,COMPREHENSIVE WOMEN'S HEALTHCARE,-,,NTINC,1,True
3,104496,1/2/2019,AETLIF-E,3900,0.0,0.0,0.0,0.0,3900.0,0.0,...,81420,1/7/2019,1/28/2019,1609216597,PEACHTREE WOMEN'S CLINIC - NORTHSIDE,-,1/8/2019,NTINC,1,True
4,104497,1/2/2019,AETLIF-E,3900,0.0,0.0,0.0,0.0,3900.0,0.0,...,81420,1/7/2019,1/30/2019,1609216597,PEACHTREE WOMEN'S CLINIC - NORTHSIDE,-,1/8/2019,NTINC,1,True


In [64]:
claimsdata.shape

(22696, 22)

In [65]:
claimsdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22696 entries, 0 to 22695
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Ticket Number     22696 non-null  int64  
 1   DateOfService     22696 non-null  object 
 2   CarrierCode       22696 non-null  object 
 3   Charges           22696 non-null  int64  
 4   Payments          22696 non-null  float64
 5   InsurancePayment  22696 non-null  float64
 6   PatientPayment    22696 non-null  float64
 7   Writeoffs         22696 non-null  float64
 8   InsuranceBal      22696 non-null  float64
 9   PatientBal        22696 non-null  float64
 10  AllowedAmount     22696 non-null  float64
 11  FinancialClass    22696 non-null  object 
 12  CPT               22696 non-null  object 
 13  DateOfEntry       22696 non-null  object 
 14  PaymentPostDate   22696 non-null  object 
 15  NPI               22696 non-null  int64  
 16  OrderingClinic    22690 non-null  object

In [66]:
claimsdata['NPI'] = claimsdata['NPI'].astype('object')

In [181]:
X = claimsdata.drop(['Ticket Number', 'CaseCount', 'PatientPayment', 'InsurancePayment', 'DateOfService', 'FirstBilledDate', 'DeniedCode', 'PaymentPostDate', 'DateOfEntry', 'PatientBal', 'InsuranceBal', 'accepted'], axis=1)
y = claimsdata['accepted']

In [182]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22696 entries, 0 to 22695
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CarrierCode      22696 non-null  object 
 1   Charges          22696 non-null  int64  
 2   Payments         22696 non-null  float64
 3   Writeoffs        22696 non-null  float64
 4   AllowedAmount    22696 non-null  float64
 5   FinancialClass   22696 non-null  object 
 6   CPT              22696 non-null  object 
 7   NPI              22696 non-null  object 
 8   OrderingClinic   22690 non-null  object 
 9   ProviderProfile  22696 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 1.7+ MB


In [183]:
for c in X.columns:
    if X[c].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(X[c].values)) 
        X[c] = lbl.transform(list(X[c].values))

In [184]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22696 entries, 0 to 22695
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CarrierCode      22696 non-null  int64  
 1   Charges          22696 non-null  int64  
 2   Payments         22696 non-null  float64
 3   Writeoffs        22696 non-null  float64
 4   AllowedAmount    22696 non-null  float64
 5   FinancialClass   22696 non-null  int64  
 6   CPT              22696 non-null  int64  
 7   NPI              22696 non-null  int64  
 8   OrderingClinic   22696 non-null  int64  
 9   ProviderProfile  22696 non-null  int64  
dtypes: float64(3), int64(7)
memory usage: 1.7 MB


In [185]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

Check the records for Training and testing data sets

In [186]:
X_train.shape

(18156, 10)

In [187]:
X_test.shape

(4540, 10)

In [188]:
###Feature selection by Random Forest Classifier

In [189]:
randomForestselect = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1))
randomForestselect.fit(X_train, y_train)
randomForestselect.get_support() #selected vs not selected features

array([ True, False,  True,  True,  True, False, False,  True,  True,
       False])

In [190]:
features = X_train.columns[randomForestselect.get_support()]

In [191]:
features

Index(['CarrierCode', 'Payments', 'Writeoffs', 'AllowedAmount', 'NPI',
       'OrderingClinic'],
      dtype='object')

In [192]:
len(features)

6

In [193]:
np.mean(randomForestselect.estimator_.feature_importances_)

0.1

In [194]:
randomForestselect.estimator_.feature_importances_

array([0.14393163, 0.04745443, 0.22323687, 0.12524939, 0.15921636,
       0.03344602, 0.05648723, 0.10092677, 0.10825635, 0.00179495])

In [195]:
X_train_rfc = randomForestselect.transform(X_train)
X_test_rfc = randomForestselect.transform(X_test)

In [196]:
def run_randomForest(X_train, X_test, y_train, y_test):
    clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print("Accuracy: ", accuracy_score(y_test,y_pred))

In [197]:
run_randomForest(X_train_rfc, X_test_rfc, y_train, y_test)

Accuracy:  0.9511013215859031


In [198]:
run_randomForest(X_train, X_test, y_train, y_test)

Accuracy:  0.9618942731277533


In [199]:
####Recursive Feature Elimination

In [200]:
from sklearn.feature_selection import RFE
randomForestselectRFE = RFE(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1),n_features_to_select = 8)
randomForestselectRFE.fit(X_train,y_train)

RFE(estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                     class_weight=None, criterion='gini',
                                     max_depth=None, max_features='auto',
                                     max_leaf_nodes=None, max_samples=None,
                                     min_impurity_decrease=0.0,
                                     min_impurity_split=None,
                                     min_samples_leaf=1, min_samples_split=2,
                                     min_weight_fraction_leaf=0.0,
                                     n_estimators=100, n_jobs=-1,
                                     oob_score=False, random_state=0, verbose=0,
                                     warm_start=False),
    n_features_to_select=8, step=1, verbose=0)

In [201]:
randomForestselectRFE.get_support()

array([ True,  True,  True,  True,  True, False,  True,  True,  True,
       False])

In [202]:
X_train_RFE = randomForestselectRFE.transform(X_train)
X_test_RFE = randomForestselectRFE.transform(X_test)

In [203]:
run_randomForest(X_train_RFE, X_test_RFE, y_train, y_test)
run_randomForest(X_train, X_test, y_train, y_test)

Accuracy:  0.9592511013215859
Accuracy:  0.9618942731277533


In [204]:
### Feature Selection By Gradient Boost Tree

In [205]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import RFE

In [208]:
gradientBoostRFE = RFE(GradientBoostingClassifier(n_estimators=100, random_state=0),n_features_to_select = 8)
gradientBoostRFE.fit(X_train,y_train)

RFE(estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                         criterion='friedman_mse', init=None,
                                         learning_rate=0.1, loss='deviance',
                                         max_depth=3, max_features=None,
                                         max_leaf_nodes=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
                                         min_samples_leaf=1,
                                         min_samples_split=2,
                                         min_weight_fraction_leaf=0.0,
                                         n_estimators=100,
                                         n_iter_no_change=None,
                                         presort='deprecated', random_state=0,
                                         subsample=1.0, tol=0.0001,
                                         validation_frac

In [209]:
gradientBoostRFE.get_support()

array([ True,  True,  True,  True,  True,  True,  True, False,  True,
       False])

In [210]:
X_train_Grad_RFE = gradientBoostRFE.transform(X_train)
X_test_Grad_RFE = gradientBoostRFE.transform(X_test)

In [211]:
run_randomForest(X_train_Grad_RFE, X_test_Grad_RFE, y_train, y_test)
run_randomForest(X_train, X_test, y_train, y_test)

Accuracy:  0.944273127753304
Accuracy:  0.9618942731277533


In [212]:
for index in range(1, 11):
    gradientBoostRFELoop = RFE(GradientBoostingClassifier(n_estimators=100, random_state=0),n_features_to_select = index)
    gradientBoostRFELoop.fit(X_train,y_train)
    X_train_Grad_RFELoop = gradientBoostRFELoop.transform(X_train)
    X_test_Grad_RFELoop = gradientBoostRFELoop.transform(X_test)
    print("selected features:", index)
    run_randomForest(X_train_Grad_RFELoop, X_test_Grad_RFELoop, y_train, y_test)
    print()

selected features: 1
Accuracy:  0.7927312775330396

selected features: 2
Accuracy:  0.8519823788546256

selected features: 3
Accuracy:  0.8870044052863436

selected features: 4
Accuracy:  0.9279735682819383

selected features: 5
Accuracy:  0.926431718061674

selected features: 6
Accuracy:  0.9552863436123348

selected features: 7
Accuracy:  0.9458149779735683

selected features: 8
Accuracy:  0.944273127753304

selected features: 9
Accuracy:  0.9634361233480176

selected features: 10
Accuracy:  0.9618942731277533



In [None]:
#max value
submitTrainSensorData = pd.read_csv("data/train/train_submit.csv")
X_train_submit = submitTrainSensorData.drop('Class', axis=1)
y_train_submit = submitTrainSensorData['Class']

gradientBoostRFELoop = RFE(GradientBoostingClassifier(n_estimators=100, random_state=0),n_features_to_select = index)
gradientBoostRFELoop.fit(X_train_submit,y_train_submit)
X_train_Grad_RFELoop = gradientBoostRFELoop.transform(X_train_submit)
# X_test_Grad_RFELoop = gradientBoostRFELoop.transform(X_test)
print("selected features:", index)
clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
clf.fit(X_train_submit_Grad_RFE,y_train_submit)
y_pred_submit = clf.predict(x_submit_gradient_rfe)

prediction = pd.DataFrame(y_pred_submit, columns=['Class']).to_csv('submission.csv')
print()


In [213]:
for index in range(1, 11):
    gradientBoostRFELoop = RFE(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1),n_features_to_select = index)
    gradientBoostRFELoop.fit(X_train,y_train)
    X_train_Grad_RFELoop = gradientBoostRFE.transform(X_train)
    X_test_Grad_RFELoop = gradientBoostRFE.transform(X_test)
    print("selected features:", index)
    run_randomForest(X_train_Grad_RFELoop, X_test_Grad_RFELoop, y_train, y_test)
    print()

selected features: 1
Accuracy:  0.944273127753304

selected features: 2
Accuracy:  0.944273127753304

selected features: 3
Accuracy:  0.944273127753304

selected features: 4
Accuracy:  0.944273127753304

selected features: 5
Accuracy:  0.944273127753304

selected features: 6
Accuracy:  0.944273127753304

selected features: 7
Accuracy:  0.944273127753304

selected features: 8
Accuracy:  0.944273127753304

selected features: 9
Accuracy:  0.944273127753304

selected features: 10
Accuracy:  0.944273127753304

