In [57]:
# Imports
import joblib
import pandas as pd

In [58]:
# Load the model
model = joblib.load('../../models/saved_models/random_forest_model.joblib')

In [59]:
# Define the correct datatypes for pandas
dtype_dict = {
    'HCP_ID': 'str',
    'PATIENT_AGE': 'Int64',
    'PATIENT_GENDER': 'category',
    'PATIENT_ID': 'str',
    'STATE': 'category',
    'HCP_SPECIALTY': 'category',
    'HCP_AGE': 'Int64',
    'HCP_GENDER': 'category',
    'STATE_NAME': 'category',
    'TXN_LOCATION_TYPE': 'category',
    'INSURANCE_TYPE': 'category',
    'TXN_TYPE': 'category',
    'TXN_DESC': 'category',
    'LOWCONT_COUNT': 'Int64',
    'MEDCONT_COUNT': 'Int64',
    'HIGHCONT_COUNT': 'Int64',
    'NUM_CONDITIONS': 'Int64',
    'NUM_CONTRAINDICATIONS': 'Int64',
    'NUM_SYMPTOMS': 'Int64',
    'DRUGA_COUNT': 'Int64',
    'TARGET': 'Int64'
}

input_data = pd.read_csv('../../data/model_input/file_to_predict.csv')


# Convert columns to the specified data types
for col, dtype in dtype_dict.items():
    if col in input_data.columns:
        if dtype == 'category':
            input_data[col] = input_data[col].astype('category')
        elif dtype == 'Int64':
            input_data[col] = pd.to_numeric(input_data[col], errors='coerce').astype('Int64')
        else:
            input_data[col] = input_data[col].astype(dtype)

# Parse dates to datetime
date_columns = ['TXN_DT']
for col in date_columns:
    if col in input_data.columns:
        input_data[col] = pd.to_datetime(input_data[col], errors='coerce')

# Display the DataFrame info
input_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   PATIENT_ID             5 non-null      object        
 1   PATIENT_GENDER         5 non-null      category      
 2   NUM_CONDITIONS         5 non-null      Int64         
 3   TXN_DT                 5 non-null      datetime64[ns]
 4   HCP_ID                 5 non-null      object        
 5   TXN_LOCATION_TYPE      5 non-null      category      
 6   INSURANCE_TYPE         5 non-null      category      
 7   TXN_TYPE               5 non-null      category      
 8   TXN_DESC               5 non-null      category      
 9   RANK                   5 non-null      int64         
 10  PATIENT_AGE_DIAGNOSED  5 non-null      int64         
 11  STATE                  5 non-null      category      
 12  HCP_SPECIALTY          5 non-null      category      
 13  HCP_GENDE

In [60]:
input_data.head()

Unnamed: 0,PATIENT_ID,PATIENT_GENDER,NUM_CONDITIONS,TXN_DT,HCP_ID,TXN_LOCATION_TYPE,INSURANCE_TYPE,TXN_TYPE,TXN_DESC,RANK,PATIENT_AGE_DIAGNOSED,STATE,HCP_SPECIALTY,HCP_GENDER
0,1871,F-Female,82,2022-06-02,12642,OTHER PLACE OF SERVICE,COMMERCIAL,CONDITIONS,DISEASE_X,1,66,NY,FAMILY MEDICINE,M-Male
1,1977,F-Female,2,2022-06-16,24708,TELEHEALTH PROVIDED IN PATIENT'S HOME,COMMERCIAL,CONDITIONS,DISEASE_X,1,55,FL,FAMILY MEDICINE,F-Female
2,3261,F-Female,10,2022-06-06,4627,TELEHEALTH PROVIDED IN PATIENT'S HOME,COMMERCIAL,CONDITIONS,DISEASE_X,1,45,GA,FAMILY MEDICINE,F-Female
3,639,F-Female,1,2022-06-27,16860,EMERGENCY ROOM - HOSPITAL,COMMERCIAL,CONDITIONS,DISEASE_X,1,7,TX,DIAGNOSTIC RADIOLOGY,M-Male
4,2851,F-Female,3,2022-06-07,10657,HOSPITAL OUTPATIENT,COMMERCIAL,CONDITIONS,DISEASE_X,1,66,KS,INTERNAL MEDICINE,F-Female


In [61]:
# Preprocess categorical features
categorical_columns = input_data.select_dtypes(include=['category']).columns
input_data = pd.get_dummies(input_data, columns=categorical_columns, drop_first=True)

# Drop unnecessary columns
input_data = input_data.drop(columns=['PATIENT_ID', 'TXN_DT', 'HCP_ID'])

In [62]:
input_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 13 columns):
 #   Column                                                   Non-Null Count  Dtype
---  ------                                                   --------------  -----
 0   NUM_CONDITIONS                                           5 non-null      Int64
 1   RANK                                                     5 non-null      int64
 2   PATIENT_AGE_DIAGNOSED                                    5 non-null      int64
 3   TXN_LOCATION_TYPE_HOSPITAL OUTPATIENT                    5 non-null      bool 
 4   TXN_LOCATION_TYPE_OTHER PLACE OF SERVICE                 5 non-null      bool 
 5   TXN_LOCATION_TYPE_TELEHEALTH PROVIDED IN PATIENT'S HOME  5 non-null      bool 
 6   STATE_GA                                                 5 non-null      bool 
 7   STATE_KS                                                 5 non-null      bool 
 8   STATE_NY                                              

In [63]:
input_data.head()

Unnamed: 0,NUM_CONDITIONS,RANK,PATIENT_AGE_DIAGNOSED,TXN_LOCATION_TYPE_HOSPITAL OUTPATIENT,TXN_LOCATION_TYPE_OTHER PLACE OF SERVICE,TXN_LOCATION_TYPE_TELEHEALTH PROVIDED IN PATIENT'S HOME,STATE_GA,STATE_KS,STATE_NY,STATE_TX,HCP_SPECIALTY_FAMILY MEDICINE,HCP_SPECIALTY_INTERNAL MEDICINE,HCP_GENDER_M-Male
0,82,1,66,False,True,False,False,False,True,False,True,False,True
1,2,1,55,False,False,True,False,False,False,False,True,False,False
2,10,1,45,False,False,True,True,False,False,False,True,False,False
3,1,1,7,False,False,False,False,False,False,True,False,False,True
4,3,1,66,True,False,False,False,True,False,False,False,True,False


In [64]:
# Ensure all expected columns are present
expected_columns = model.feature_names_in_
for col in expected_columns:
    if col not in input_data.columns:
        input_data[col] = False
        
#Reorder columns to match the training data
input_data = input_data[expected_columns]

  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data[col] = False
  input_data

In [65]:
input_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Columns: 162 entries, NUM_CONDITIONS to HCP_GENDER_U-Unknown
dtypes: Int64(1), bool(159), int64(2)
memory usage: 1.0 KB


In [66]:
# Make prediction
prediction = model.predict(input_data)
#return {"prediction": prediction[0]}

In [72]:
prediction

array([0., 0., 0., 0., 0.])

In [67]:
df = pd.read_csv('../../data/model_input/file_to_predict.csv')
        
# Convert DataFrame to dictionary
data_dict = df.to_dict(orient='records')

In [68]:
import pandas as pd
from pydantic import BaseModel, create_model

# Generate the Pydantic model dynamically
fields = {col: (str, ...) if df[col].dtype == 'object' else (int, ...) for col in df.columns}
PatientData = create_model('PatientData', **fields)

# Print the dynamically created model schema
print(PatientData.schema_json(indent=2))

{
  "properties": {
    "PATIENT_ID": {
      "title": "Patient Id",
      "type": "integer"
    },
    "PATIENT_GENDER": {
      "title": "Patient Gender",
      "type": "string"
    },
    "NUM_CONDITIONS": {
      "title": "Num Conditions",
      "type": "integer"
    },
    "TXN_DT": {
      "title": "Txn Dt",
      "type": "string"
    },
    "HCP_ID": {
      "title": "Hcp Id",
      "type": "integer"
    },
    "TXN_LOCATION_TYPE": {
      "title": "Txn Location Type",
      "type": "string"
    },
    "INSURANCE_TYPE": {
      "title": "Insurance Type",
      "type": "string"
    },
    "TXN_TYPE": {
      "title": "Txn Type",
      "type": "string"
    },
    "TXN_DESC": {
      "title": "Txn Desc",
      "type": "string"
    },
    "RANK": {
      "title": "Rank",
      "type": "integer"
    },
    "PATIENT_AGE_DIAGNOSED": {
      "title": "Patient Age Diagnosed",
      "type": "integer"
    },
    "STATE": {
      "title": "State",
      "type": "string"
    },
    "HCP_SPEC

/var/folders/06/v9kx_rns7ln9cbrjmlcdb52r0000gn/T/ipykernel_80201/1887344650.py:9: PydanticDeprecatedSince20: The `schema_json` method is deprecated; use `model_json_schema` and json.dumps instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  print(PatientData.schema_json(indent=2))


In [73]:
data_dict

[{'PATIENT_ID': 1871,
  'PATIENT_GENDER': 'F-Female',
  'NUM_CONDITIONS': 82,
  'TXN_DT': '2022-06-02',
  'HCP_ID': 12642,
  'TXN_LOCATION_TYPE': 'OTHER PLACE OF SERVICE',
  'INSURANCE_TYPE': 'COMMERCIAL',
  'TXN_TYPE': 'CONDITIONS',
  'TXN_DESC': 'DISEASE_X',
  'RANK': 1,
  'PATIENT_AGE_DIAGNOSED': 66,
  'STATE': 'NY',
  'HCP_SPECIALTY': 'FAMILY MEDICINE',
  'HCP_GENDER': 'M-Male'},
 {'PATIENT_ID': 1977,
  'PATIENT_GENDER': 'F-Female',
  'NUM_CONDITIONS': 2,
  'TXN_DT': '2022-06-16',
  'HCP_ID': 24708,
  'TXN_LOCATION_TYPE': "TELEHEALTH PROVIDED IN PATIENT'S HOME",
  'INSURANCE_TYPE': 'COMMERCIAL',
  'TXN_TYPE': 'CONDITIONS',
  'TXN_DESC': 'DISEASE_X',
  'RANK': 1,
  'PATIENT_AGE_DIAGNOSED': 55,
  'STATE': 'FL',
  'HCP_SPECIALTY': 'FAMILY MEDICINE',
  'HCP_GENDER': 'F-Female'},
 {'PATIENT_ID': 3261,
  'PATIENT_GENDER': 'F-Female',
  'NUM_CONDITIONS': 10,
  'TXN_DT': '2022-06-06',
  'HCP_ID': 4627,
  'TXN_LOCATION_TYPE': "TELEHEALTH PROVIDED IN PATIENT'S HOME",
  'INSURANCE_TYPE': 'COM