In [41]:
from sklearn.linear_model import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import numpy as np
import pandas as pd

In [2]:
# read the processed data 

data_file = pd.read_csv('../TAMU_FINAL_DATASET_2018.csv',sep=',')

In [5]:
print(data_file.columns)

print(data_file.dtypes.unique())

data_file.head()

Index(['ID', 'AGE', 'SEX_CD', 'AMI_FLAG', 'ESRD_IND', 'HOSPICE_IND',
       'ORIG_REAS_ENTITLE_CD', 'RECON_MA_RISK_SCORE_NBR',
       'RECON_RX_RISK_SCORE_NBR', 'PCP_ASSIGNMENT',
       ...
       'COL', 'COL_GAP', 'AMM', 'AMM_GAP', 'DIAB_PASS', 'ACE_PASS',
       'STATIN_PASS', 'ACE_ELIG', 'DIAB_ELIG', 'STATIN_ELIG'],
      dtype='object', length=448)
[dtype('int64') dtype('O') dtype('float64')]


Unnamed: 0,ID,AGE,SEX_CD,AMI_FLAG,ESRD_IND,HOSPICE_IND,ORIG_REAS_ENTITLE_CD,RECON_MA_RISK_SCORE_NBR,RECON_RX_RISK_SCORE_NBR,PCP_ASSIGNMENT,...,COL,COL_GAP,AMM,AMM_GAP,DIAB_PASS,ACE_PASS,STATIN_PASS,ACE_ELIG,DIAB_ELIG,STATIN_ELIG
0,1,77,F,0,N,N,0.0,0.424,0.402,MEMBER SELECTED,...,0,0,0,0,0,0,0,0,0,0
1,2,49,F,0,N,N,1.0,2.879,1.159,ATTRIBUTED,...,0,0,0,0,0,0,1,0,0,1
2,3,75,F,0,N,N,0.0,0.638,0.568,MEMBER SELECTED,...,1,0,0,0,0,1,1,1,0,1
3,4,68,M,0,N,N,0.0,0.584,0.886,MEMBER SELECTED,...,1,1,0,0,1,1,1,1,1,1
4,5,81,F,0,N,N,1.0,1.242,1.212,MEMBER SELECTED,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# only select the numeric data

df = data_file.select_dtypes(include=['float64','int64'])

In [10]:
print(df.columns)
print(df.dtypes.unique())

df.head()

Index(['ID', 'AGE', 'AMI_FLAG', 'ORIG_REAS_ENTITLE_CD',
       'RECON_MA_RISK_SCORE_NBR', 'RECON_RX_RISK_SCORE_NBR',
       'CON_VISIT_04_Q01', 'CON_VISIT_04_Q02', 'CON_VISIT_04_Q03',
       'CON_VISIT_04_Q04',
       ...
       'COL', 'COL_GAP', 'AMM', 'AMM_GAP', 'DIAB_PASS', 'ACE_PASS',
       'STATIN_PASS', 'ACE_ELIG', 'DIAB_ELIG', 'STATIN_ELIG'],
      dtype='object', length=437)
[dtype('int64') dtype('float64')]


Unnamed: 0,ID,AGE,AMI_FLAG,ORIG_REAS_ENTITLE_CD,RECON_MA_RISK_SCORE_NBR,RECON_RX_RISK_SCORE_NBR,CON_VISIT_04_Q01,CON_VISIT_04_Q02,CON_VISIT_04_Q03,CON_VISIT_04_Q04,...,COL,COL_GAP,AMM,AMM_GAP,DIAB_PASS,ACE_PASS,STATIN_PASS,ACE_ELIG,DIAB_ELIG,STATIN_ELIG
0,1,77,0,0.0,0.424,0.402,3,1,2,1,...,0,0,0,0,0,0,0,0,0,0
1,2,49,0,1.0,2.879,1.159,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
2,3,75,0,0.0,0.638,0.568,0,0,0,0,...,1,0,0,0,0,1,1,1,0,1
3,4,68,0,0.0,0.584,0.886,0,1,0,5,...,1,1,0,0,1,1,1,1,1,1
4,5,81,0,1.0,1.242,1.212,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# drop the null value 

print(df.shape)
df = df.dropna()
print(df.shape)

(100000, 437)
(74762, 437)


In [15]:
# define the target we would like to predict 
y = df['AMI_FLAG']

X = df.loc[:, df.columns != 'AMI_FLAG']

In [18]:
# sanity check on X, y dimensions 
print(X.shape)
print(y.shape)

(74762, 436)
(74762,)


In [21]:
# split training and testing data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=42)

In [22]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(50090, 436)
(24672, 436)
(50090,)
(24672,)


In [36]:
# construct linear model using sklearn 
lm = LogisticRegression(penalty='l1')
model = lm.fit(X_train, y_train)

In [37]:
pred_train = lm.predict(X_train)
pred_test = lm.predict(X_test)

In [38]:
print(pred_train[:10])
print(y_train[:10])

print(pred_test[:10])
print(y_test[:10])

[0 0 0 0 0 0 0 0 0 0]
5491     0
34892    0
41970    1
67758    0
56583    0
67417    0
48263    0
55124    0
20509    0
97574    0
Name: AMI_FLAG, dtype: int64
[0 0 0 0 0 0 0 0 0 0]
23429    0
91757    0
64834    0
63359    0
38531    0
41278    0
7992     0
726      0
27399    0
34888    0
Name: AMI_FLAG, dtype: int64


In [42]:
# logistic regression result 

print(np.count_nonzero(pred_train))
print(np.count_nonzero(y_train))


print(np.count_nonzero(pred_test))
print(np.count_nonzero(y_test))


print(accuracy_score(y_train, pred_train))
print(accuracy_score(y_test, pred_test))


26
1302
27
690
0.9741265721700938
0.971181906614786
