In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_score,recall_score,roc_curve,auc,roc_auc_score

  from numpy.core.umath_tests import inner1d


In [2]:
# Loading the csv file into padas data frame
data=pd.read_csv('highUtilizationPredictionV2wco.csv')

In [3]:
# Shape of data frame (number of rows, number of columns)
data.shape

(113024, 68)

In [4]:
# Checking for Missing values in the data
s=pd.DataFrame(data.isnull().sum())
s[(s>0).all(axis=1)]

Unnamed: 0,0


As shown in above result, there are no missing values in the data frame. So, we can go ahead and split the data into training and test sets.

In [5]:
data.head()

Unnamed: 0,race,age,patient_id,ELIX1,ELIX2,ELIX3,ELIX4,ELIX5,ELIX6,ELIX7,...,drugs_m4-5,drugs_m5-6,drugs_m6-7,drugs_m7-8,drugs_m8-9,drugs_m9-10,drugs_m10-11,drugs_m11-12,HighUtilizationY2,claimCount
0,B,71,PAT136597,0,0,0,0,0,1,0,...,0,1,4,2,1,3,1,1,1,160
1,A,86,PAT119838,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,24
2,W,70,PAT11289,1,0,0,0,0,0,0,...,4,2,2,0,6,2,1,0,0,52
3,W,75,PAT178745,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,15
4,W,77,PAT50922,0,0,0,0,1,0,0,...,4,3,4,4,4,3,4,4,0,66


In [6]:
# Splitting the variable race into binary dummy variables
x=pd.get_dummies(data['race'])
newdf=pd.concat([data,x],axis=1)
del newdf['race']
del newdf['patient_id']
Y=newdf['HighUtilizationY2']
del newdf['HighUtilizationY2']
del newdf['claimCount']

In [7]:
# Splitting the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(newdf, Y, test_size=0.2)

In [8]:
# training the model on training data
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
# predicting using the trained model
Predictions = logmodel.predict(X_test)

In [10]:
#Classification Report of test set
print("Classification Report:")
print(classification_report(y_test,Predictions))

Classification Report:
             precision    recall  f1-score   support

          0       0.94      1.00      0.97     21158
          1       0.61      0.09      0.15      1447

avg / total       0.92      0.94      0.92     22605



In [11]:
# Evaluation metrices for test set
print("Confusion Matrix:")
print(confusion_matrix(y_test, Predictions))
print("Accuracy:",accuracy_score(y_test, Predictions))
print("Precision:",precision_score(y_test, Predictions))
print("Recall:",recall_score(y_test, Predictions))

Confusion Matrix:
[[21077    81]
 [ 1320   127]]
Accuracy: 0.9380225613802257
Precision: 0.6105769230769231
Recall: 0.08776779543883897


In [12]:
#AUC comparison for training and test sets
print("AUC score using logistic Regression:")
#training set
train_pred=logmodel.predict_proba(X_train)[:, 1]
print("AUC for train Set:",roc_auc_score(y_train, train_pred))

#test set
Predictions = logmodel.predict_proba(X_test)[:, 1]
print("AUC for test Set:",roc_auc_score(y_test, Predictions))

AUC score using logistic Regression:
AUC for train Set: 0.8213823018923839
AUC for test Set: 0.828505776755961


In [13]:
print("AUC score using Random Forest:")
rf=RandomForestClassifier()
rf.fit(X_train, y_train)
#training set
rf_probs = rf.predict_proba(X_train)[:, 1]
print("AUC for train Set:",roc_auc_score(y_train, rf_probs))
#test set
rf_probs = rf.predict_proba(X_test)[:, 1]
print("AUC for test Set:",roc_auc_score(y_test, rf_probs))

AUC score using Random Forest:
AUC for train Set: 0.9985552467067205
AUC for test Set: 0.7491143574852919


In [14]:
print("AUC score using Stochastic gradient descent:")
sgd=SGDClassifier(loss='modified_huber')
sgd.fit(X_train, y_train)
#training set
sgd_probs = sgd.predict_proba(X_train)[:, 1]
print("AUC for train Set:",roc_auc_score(y_train, sgd_probs))
#test set
sgd_probs = sgd.predict_proba(X_test)[:, 1]
print("AUC for test Set:",roc_auc_score(y_test, sgd_probs))

AUC score using Stochastic gradient descent:




AUC for train Set: 0.5115522538388467
AUC for test Set: 0.5122047479937205


In [15]:
print("AUC score using Naive Bayes:")
nb=GaussianNB()
nb.fit(X_train, y_train)
#training set
nb_probs = nb.predict_proba(X_train)[:, 1]
print("AUC for train Set:",roc_auc_score(y_train, nb_probs))
#test set
nb_probs = nb.predict_proba(X_test)[:, 1]
print("AUC for test Set:",roc_auc_score(y_test, nb_probs))

AUC score using Naive Bayes:
AUC for train Set: 0.7920435962245227
AUC for test Set: 0.7953138701132552


Results:

AUC score using logistic Regression:
AUC for train Set: 0.8213823018923839
AUC for test Set: 0.828505776755961

AUC score using Random Forest:
AUC for train Set: 0.9985552467067205
AUC for test Set: 0.7491143574852919


AUC score using Stochastic gradient descent:
AUC for train Set: 0.5115522538388467
AUC for test Set: 0.5122047479937205

AUC score using Naive Bayes:
AUC for train Set: 0.7920435962245227
AUC for test Set: 0.7953138701132552

From the results above, we can see that all the models excluding random forest have almost same auc scores for training and test set. Hence, we can say that logistic regression, Stochastic gradient descent and Naive bayes are not over fitting. But there is significant difference in the AUC scores of random forest models which says that random forest might be over fitting on the data. Since random forest model might be overfitting, it cannot be a good classifier model. So, excluding random forest, it is evident that logistic regression model has highest auc score among other 3 models and the model is a good classifier if the auc score is 1 or closer to 1. Hence, Logistic Regression is a better classifier as compared to other classifier since its auc score is highest of all and closest to 1, and there is no significant difference in the auc scores of training and test sets.