In [38]:
# Imports
import pandas as pd
import os
import numpy as np

In [39]:
# Load the model input table, declaring a dtype for each known column.
# Columns are grouped by the dtype they should be read as.
string_cols = ['HCP_ID', 'PATIENT_ID']
nullable_int_cols = [
    'PATIENT_AGE', 'HCP_AGE',
    'LOWCONT_COUNT', 'MEDCONT_COUNT', 'HIGHCONT_COUNT',
    'NUM_CONDITIONS', 'NUM_CONTRAINDICATIONS', 'NUM_SYMPTOMS',
    'DRUGA_COUNT', 'TARGET',
]
categorical_cols = [
    'PATIENT_GENDER', 'STATE', 'HCP_SPECIALTY', 'HCP_GENDER',
    'STATE_NAME', 'TXN_LOCATION_TYPE', 'INSURANCE_TYPE',
    'TXN_TYPE', 'TXN_DESC',
]
dtype_dict = {
    **dict.fromkeys(string_cols, 'str'),
    **dict.fromkeys(nullable_int_cols, 'Int64'),
    **dict.fromkeys(categorical_cols, 'category'),
}

model_table = pd.read_csv('../../data/model_input/model_table.csv', dtype=dtype_dict)

# Convert the date columns to datetime; values that cannot be parsed become NaT
date_columns = ['TXN_DT', 'BIRTH_DT']
model_table = model_table.assign(
    **{name: pd.to_datetime(model_table[name], errors='coerce') for name in date_columns}
)

model_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4020 entries, 0 to 4019
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   PATIENT_ID             4020 non-null   object        
 1   BIRTH_DT               4020 non-null   datetime64[ns]
 2   PATIENT_AGE            4020 non-null   Int64         
 3   PATIENT_GENDER         4020 non-null   category      
 4   NUM_CONDITIONS         4020 non-null   Int64         
 5   NUM_CONTRAINDICATIONS  4020 non-null   Int64         
 6   TXN_DT                 4020 non-null   datetime64[ns]
 7   HCP_ID                 3407 non-null   object        
 8   TXN_LOCATION_TYPE      4020 non-null   category      
 9   INSURANCE_TYPE         4020 non-null   category      
 10  TXN_TYPE               4020 non-null   category      
 11  TXN_DESC               4020 non-null   category      
 12  RANK                   4020 non-null   int64         
 13  PAT

In [40]:
# Remove the columns that are not kept for the downstream dataset.
# One non-inplace drop either removes every listed column or raises KeyError
# before model_table is rebound, so a missing column can never leave the
# frame half-modified in the kernel.
columns_to_drop = [
    'BIRTH_DT',
    'STATE_NAME',
    'PATIENT_AGE',
    'HCP_AGE',
    'DRUGA_COUNT',
    'TXN_TYPE',
    'TXN_DESC',
    'RANK',
]
model_table = model_table.drop(columns=columns_to_drop)

model_table

Unnamed: 0,PATIENT_ID,PATIENT_GENDER,NUM_CONDITIONS,NUM_CONTRAINDICATIONS,TXN_DT,HCP_ID,TXN_LOCATION_TYPE,INSURANCE_TYPE,PATIENT_AGE_DIAGNOSED,STATE,HCP_SPECIALTY,HCP_GENDER,TARGET
0,1,M-Male,1,0,2022-06-11,24633,EMERGENCY ROOM - HOSPITAL,COMMERCIAL,34,TX,FAMILY MEDICINE,M-Male,0
1,2,M-Male,1,0,2022-06-22,7777,EMERGENCY ROOM - HOSPITAL,COMMERCIAL,2,PA,EMERGENCY MEDICINE,M-Male,0
2,3,M-Male,1,0,2022-06-20,17051,OFFICE,COMMERCIAL,49,MS,EMERGENCY MEDICINE,F-Female,0
3,4,M-Male,1,0,2022-06-30,19478,EMERGENCY ROOM - HOSPITAL,COMMERCIAL,0,PA,PEDIATRICS,F-Female,0
4,5,M-Male,1,0,2022-06-02,,INDEPENDENT LABORATORY,COMMERCIAL,34,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4015,4016,F-Female,71,6,2022-06-03,15294,OFFICE,MEDICARE,75,NJ,INTERNAL MEDICINE,F-Female,0
4016,4017,F-Female,64,54,2022-06-21,11575,OFF CAMPUS-OUTPATIENT HOSPITAL,MEDICARE,85,IL,FAMILY MEDICINE,F-Female,1
4017,4018,F-Female,68,37,2022-06-24,,OTHER PLACE OF SERVICE,COMMERCIAL,85,,,,0
4018,4019,M-Male,15,39,2022-06-07,5402,EMERGENCY ROOM - HOSPITAL,COMMERCIAL,64,TX,EMERGENCY MEDICINE,F-Female,1


In [41]:
# Keep only rows that have an HCP_ID (per the original note, rows with a
# missing HCP_ID are not in the target list)
has_hcp_id = model_table['HCP_ID'].notna()
model_table = model_table.loc[has_hcp_id]
model_table.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3407 entries, 0 to 4019
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   PATIENT_ID             3407 non-null   object        
 1   PATIENT_GENDER         3407 non-null   category      
 2   NUM_CONDITIONS         3407 non-null   Int64         
 3   NUM_CONTRAINDICATIONS  3407 non-null   Int64         
 4   TXN_DT                 3407 non-null   datetime64[ns]
 5   HCP_ID                 3407 non-null   object        
 6   TXN_LOCATION_TYPE      3407 non-null   category      
 7   INSURANCE_TYPE         3407 non-null   category      
 8   PATIENT_AGE_DIAGNOSED  3407 non-null   int64         
 9   STATE                  3407 non-null   category      
 10  HCP_SPECIALTY          3407 non-null   category      
 11  HCP_GENDER             3407 non-null   category      
 12  TARGET                 3407 non-null   Int64         
dtypes: Int64

In [42]:
# Write the analytics-ready dataset to CSV, leaving the row index out of the file
ads_output_path = '../../data/model_input/ads_model_table.csv'
model_table.to_csv(ads_output_path, index=False)

In [43]:
# Print the column/dtype summary of model_table again after the save above;
# to_csv only writes the file and does not modify the frame.
model_table.info()


<class 'pandas.core.frame.DataFrame'>
Index: 3407 entries, 0 to 4019
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   PATIENT_ID             3407 non-null   object        
 1   PATIENT_GENDER         3407 non-null   category      
 2   NUM_CONDITIONS         3407 non-null   Int64         
 3   NUM_CONTRAINDICATIONS  3407 non-null   Int64         
 4   TXN_DT                 3407 non-null   datetime64[ns]
 5   HCP_ID                 3407 non-null   object        
 6   TXN_LOCATION_TYPE      3407 non-null   category      
 7   INSURANCE_TYPE         3407 non-null   category      
 8   PATIENT_AGE_DIAGNOSED  3407 non-null   int64         
 9   STATE                  3407 non-null   category      
 10  HCP_SPECIALTY          3407 non-null   category      
 11  HCP_GENDER             3407 non-null   category      
 12  TARGET                 3407 non-null   Int64         
dtypes: Int64

In [44]:
# Remove the TARGET column from the working table
model_table = model_table.drop('TARGET', axis=1)

# Draw 5 rows at random with a fixed seed so the selection is repeatable
sampled_model_table = model_table.sample(5, random_state=1)

# Write the sampled rows out as the file to predict on
predict_output_path = '../../data/model_input/file_to_predict.csv'
sampled_model_table.to_csv(predict_output_path, index=False)