In [121]:
# Imports
import pandas as pd
import os
import numpy as np

In [122]:
# Define the directory path
directory_path = '../../data/features'

# List all files in the directory
files = [f for f in os.listdir(directory_path) if f.endswith('.csv')]

# Define the correct datatypes for pandas
dtypes = {
    'HCP_ID': 'str',
    'PATIENT_AGE': 'Int64',
    'PATIENT_GENDER': 'category',
    'PATIENT_ID': 'str',
    'STATE': 'category',
    'HCP_SPECIALTY': 'category',
    'HCP_AGE': 'Int64',
    'HCP_GENDER': 'category',
    'STATE_NAME': 'category',
    'TXN_LOCATION_TYPE': 'category',
    'INSURANCE_TYPE': 'category',
    'TXN_TYPE': 'category',
    'TXN_DESC': 'category',
    'LOWCONT_COUNT': 'Int64',
    'MEDCONT_COUNT': 'Int64',
    'HIGHCONT_COUNT': 'Int64',
    'NUM_CONDITIONS': 'Int64',
    'NUM_CONTRAINDICATIONS': 'Int64',
    'NUM_SYMPTOMS': 'Int64',
    'DRUGA_COUNT': 'Int64',
    'TARGET': 'Int64'
}

# Specify the datetime columns
parse_dates = ['TXN_DT',  'BIRTH_DT']

# Load the CSV files into a dictionary of dataframes
dataframes = {}
for file in files:
    file_path = os.path.join(directory_path, file)
    
    # Read the CSV file without specifying parse_dates
    df = pd.read_csv(file_path, dtype=dtypes)
    
    # Check if 'TXN_DT' column exists and parse it as datetime if it does
    if 'TXN_DT' in df.columns:
        df['TXN_DT'] = pd.to_datetime(df['TXN_DT'])

    # Check if 'BIRTH_DT' column exists and parse it as datetime if it does
    if 'BIRTH_DT' in df.columns:
        df['BIRTH_DT'] = pd.to_datetime(df['BIRTH_DT'])
    
    dataframes[file] = df

# Print the loaded dataframes
for file, df in dataframes.items():
    print(f"Data from {file}:")
    print(df.head())

Data from dim_physician.csv:
  HCP_ID STATE        HCP_SPECIALTY  HCP_AGE HCP_GENDER  STATE_NAME
0      1    TX     PHYSICAL THERAPY     <NA>  U-Unknown       Texas
1      2    IN  PHYSICIAN ASSISTANT     <NA>  U-Unknown     Indiana
2      3    CA   EMERGENCY MEDICINE     <NA>  U-Unknown  California
3      4    TX   NURSE PRACTITIONER     <NA>  U-Unknown       Texas
4      5    WA   EMERGENCY MEDICINE     <NA>  U-Unknown  Washington
Data from grp_txn_treatment.csv:
  PATIENT_ID  DRUGA_COUNT  TARGET
0          1            0       0
1         10            0       0
2        100            0       0
3       1000            0       0
4       1001            0       0
Data from grp_txn_medcont.csv:
  PATIENT_ID  MEDCONT_COUNT
0          1              0
1         10              0
2        100              0
3       1000              0
4       1001              0
Data from grp_txn_num_contraindications.csv:
  PATIENT_ID  NUM_CONTRAINDICATIONS
0          1                      0
1         

In [123]:
# Load dfs into variables
grp_txn_treatment = dataframes['grp_txn_treatment.csv']
grp_txn_medcont = dataframes['grp_txn_medcont.csv']
grp_txn_num_contraindications = dataframes['grp_txn_num_contraindications.csv']
grp_txn_highcont = dataframes['grp_txn_highcont.csv']
grp_txn_num_symptoms = dataframes['grp_txn_num_symptoms.csv']
dim_patient = dataframes['dim_patient.csv']
grp_txn_lowcont = dataframes['grp_txn_lowcont.csv']
grp_txn_num_conditions = dataframes['grp_txn_num_conditions.csv']
grp_txn_disease_x_features = dataframes['grp_txn_disease_x_features.csv']
dim_physician = dataframes['dim_physician.csv']

In [124]:
#join model_table (dim_patient) with fact_txn_num_conditions to add NUM_CONDITIONS
#left join, left= model_table
model_table = pd.merge(left=dim_patient, right=grp_txn_num_conditions, how='left', left_on='PATIENT_ID', right_on='PATIENT_ID')
model_table

Unnamed: 0,PATIENT_ID,BIRTH_DT,PATIENT_AGE,PATIENT_GENDER,NUM_CONDITIONS
0,1,1988-01-01,36,M-Male,1
1,2,2020-01-01,4,M-Male,1
2,3,1973-01-01,51,M-Male,1
3,4,2022-01-01,2,M-Male,1
4,5,1988-01-01,36,M-Male,1
...,...,...,...,...,...
4015,4016,1947-01-01,77,F-Female,71
4016,4017,1937-01-01,87,F-Female,64
4017,4018,1937-01-01,87,F-Female,68
4018,4019,1958-01-01,66,M-Male,15


In [125]:
# Join contraindications
model_table = pd.merge(left=model_table, right=grp_txn_num_contraindications, how='left', left_on='PATIENT_ID', right_on='PATIENT_ID')
model_table

Unnamed: 0,PATIENT_ID,BIRTH_DT,PATIENT_AGE,PATIENT_GENDER,NUM_CONDITIONS,NUM_CONTRAINDICATIONS
0,1,1988-01-01,36,M-Male,1,0
1,2,2020-01-01,4,M-Male,1,0
2,3,1973-01-01,51,M-Male,1,0
3,4,2022-01-01,2,M-Male,1,0
4,5,1988-01-01,36,M-Male,1,0
...,...,...,...,...,...,...
4015,4016,1947-01-01,77,F-Female,71,6
4016,4017,1937-01-01,87,F-Female,64,54
4017,4018,1937-01-01,87,F-Female,68,37
4018,4019,1958-01-01,66,M-Male,15,39


In [126]:
#join model_table with fact_txn_disease_x_features to add DISEASEX_DT, LOCATION_TYPE and INSURANCE_TYPE (as extra feature)
#left join, left= model_table
model_table = pd.merge(left=model_table, right=grp_txn_disease_x_features, how='left', left_on='PATIENT_ID', right_on='PATIENT_ID')
model_table

Unnamed: 0,PATIENT_ID,BIRTH_DT,PATIENT_AGE,PATIENT_GENDER,NUM_CONDITIONS,NUM_CONTRAINDICATIONS,TXN_DT,HCP_ID,TXN_LOCATION_TYPE,INSURANCE_TYPE,TXN_TYPE,TXN_DESC,RANK
0,1,1988-01-01,36,M-Male,1,0,2022-06-11,24633,EMERGENCY ROOM - HOSPITAL,COMMERCIAL,CONDITIONS,DISEASE_X,1
1,2,2020-01-01,4,M-Male,1,0,2022-06-22,7777,EMERGENCY ROOM - HOSPITAL,COMMERCIAL,CONDITIONS,DISEASE_X,1
2,3,1973-01-01,51,M-Male,1,0,2022-06-20,17051,OFFICE,COMMERCIAL,CONDITIONS,DISEASE_X,1
3,4,2022-01-01,2,M-Male,1,0,2022-06-30,19478,EMERGENCY ROOM - HOSPITAL,COMMERCIAL,CONDITIONS,DISEASE_X,1
4,5,1988-01-01,36,M-Male,1,0,2022-06-02,,INDEPENDENT LABORATORY,COMMERCIAL,CONDITIONS,DISEASE_X,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4015,4016,1947-01-01,77,F-Female,71,6,2022-06-03,15294,OFFICE,MEDICARE,CONDITIONS,DISEASE_X,1
4016,4017,1937-01-01,87,F-Female,64,54,2022-06-21,11575,OFF CAMPUS-OUTPATIENT HOSPITAL,MEDICARE,CONDITIONS,DISEASE_X,1
4017,4018,1937-01-01,87,F-Female,68,37,2022-06-24,,OTHER PLACE OF SERVICE,COMMERCIAL,CONDITIONS,DISEASE_X,1
4018,4019,1958-01-01,66,M-Male,15,39,2022-06-07,5402,EMERGENCY ROOM - HOSPITAL,COMMERCIAL,CONDITIONS,DISEASE_X,1


In [127]:
#calculate age of patient when diagnosed with Disease X

#from datetime import date
def calculate_age_diagnosed(birth, txn_date):
    return txn_date.year - birth.year - ((txn_date.month, txn_date.day) < (birth.month, birth.day))

In [128]:
model_table['PATIENT_AGE_DIAGNOSED'] = model_table.apply(lambda x: calculate_age_diagnosed(x['BIRTH_DT'], x['TXN_DT']), axis=1)

In [129]:
#join model_table with dim_physician to add PHYSICIAN_TYPE, PHYSICIAN_STATE, PHYSICIAN_STATE_NAME (as extra feature for data viz)
model_table = pd.merge(left=model_table, right=dim_physician, how='left', left_on='HCP_ID', right_on='HCP_ID')
model_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4020 entries, 0 to 4019
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   PATIENT_ID             4020 non-null   object        
 1   BIRTH_DT               4020 non-null   datetime64[ns]
 2   PATIENT_AGE            4020 non-null   Int64         
 3   PATIENT_GENDER         4020 non-null   category      
 4   NUM_CONDITIONS         4020 non-null   Int64         
 5   NUM_CONTRAINDICATIONS  4020 non-null   Int64         
 6   TXN_DT                 4020 non-null   datetime64[ns]
 7   HCP_ID                 3407 non-null   object        
 8   TXN_LOCATION_TYPE      4020 non-null   category      
 9   INSURANCE_TYPE         4020 non-null   category      
 10  TXN_TYPE               4020 non-null   category      
 11  TXN_DESC               4020 non-null   category      
 12  RANK                   4020 non-null   int64         
 13  PAT

In [130]:
# Join target variable to model_table
model_table = pd.merge(left=model_table, right=grp_txn_treatment, how='left', left_on='PATIENT_ID', right_on='PATIENT_ID')


In [131]:
# Define the output directory and file name
output_directory = '../../data/model_input'
output_file = 'model_table.csv'

# Ensure the output directory exists
os.makedirs(output_directory, exist_ok=True)

# Save the model_table dataframe to a CSV file
model_table.to_csv(os.path.join(output_directory, output_file), index=False)