In [1]:
import os
import base64
import pandas as pd
import numpy as np
import category_encoders
import json
import joblib
import pickle
import requests
import seaborn as sns
from uuid import uuid4
from copy import deepcopy

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
# NOTE(review): this OneHotEncoder is shadowed by the sklearn.preprocessing
# import two lines below, so once these imports have run the bare name
# `OneHotEncoder` refers to scikit-learn's encoder, not category_encoders'.
from category_encoders import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
# Default figure size as (width, height); matplotlib measures these in inches.
plt.rcParams['figure.figsize']=(4.8,3.6)

In [2]:
def load_data(path=os.path.join("data/train", "train.csv")):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file to read. Defaults to the training set at
        ``data/train/train.csv`` (relative to the working directory), so
        calling ``load_data()`` with no arguments behaves as before.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file, exactly as returned by
        ``pandas.read_csv`` with its default options.
    """
    return pd.read_csv(path)

# Load the training set once; later cells read from `df`.
df = load_data()
# Preview the first rows as a quick sanity check of the parsed columns.
df.head()

Unnamed: 0,observation_id,Health Service Area,Hospital County,Operating Certificate Number,Facility Id,Facility Name,Age Group,Zip Code - 3 digits,Gender,Race,...,Payment Typology 2,Payment Typology 3,Attending Provider License Number,Operating Provider License Number,Other Provider License Number,Birth Weight,Abortion Edit Indicator,Emergency Department Indicator,Total Charges,Total Costs
0,cdb7f86a-68c8-4a11-a54a-d6ab74fdb7fe,Western NY,Erie,1401014.0,208.0,Women And Children's Hospital Of Buffalo,0 to 17,143,F,White,...,Self-Pay,,270457.0,,,0,N,Y,$4623.66,$2633.42
1,78b71d32-2c27-4781-89b3-6bc0ed48b22a,Western NY,Erie,1401014.0,208.0,Women And Children's Hospital Of Buffalo,0 to 17,141,M,White,...,Self-Pay,,244296.0,,,0,N,Y,$2238.33,$993.57
2,bde8094e-d9bd-48f9-ba8e-7b318a0c0b76,Western NY,Erie,1401005.0,210.0,Erie County Medical Center,0 to 17,142,F,Other Race,...,Medicaid,,239618.0,,,0,N,Y,$13947.78,$7199.01
3,b68bee74-48e7-4382-ba99-a3795e52380e,Western NY,Erie,1401008.0,213.0,Mercy Hospital of Buffalo,18 to 29,142,F,White,...,,,210044.0,,,0,N,Y,$7700.16,$3875.62
4,c532d911-85ac-4689-945c-35e0b8dbff6a,Western NY,Erie,1401008.0,213.0,Mercy Hospital of Buffalo,18 to 29,142,F,Other Race,...,Medicaid,,229449.0,191190.0,,0,N,N,$8814.77,$5210.92


In [16]:
# List each column's dtype; the pipeline cell splits features into
# categorical (object dtype) and numeric (everything else) based on these.
df.dtypes


observation_id                          object
Health Service Area                     object
Hospital County                         object
Operating Certificate Number           float64
Facility Id                            float64
Facility Name                           object
Age Group                               object
Zip Code - 3 digits                     object
Gender                                  object
Race                                    object
Ethnicity                               object
Length of Stay                           int64
Type of Admission                       object
Patient Disposition                     object
Discharge Year                           int64
CCS Diagnosis Code                       int64
CCS Diagnosis Description               object
CCS Procedure Code                       int64
CCS Procedure Description               object
APR DRG Code                             int64
APR DRG Description                     object
APR MDC Code 

In [17]:
#1) DUMMY Pipeline
# Baseline model: impute numeric columns, one-hot encode categorical columns,
# fit a shallow random forest, then persist the fitted pipeline together with
# the column names and dtypes of the feature frame.

# Drop the target and the columns that won't be sent in the request.
columns_to_drop = ['Length of Stay', 'Patient Disposition', 'Discharge Year', 'Total Charges', 'Total Costs']

# DataFrame.drop returns a new frame, so `df` is left untouched and this cell
# can be re-run safely; only columns that actually exist are dropped.
X = df.drop(columns=[col for col in columns_to_drop if col in df.columns])
y = df['Length of Stay']

# observation_id is the request identifier (it is split off as `_id` when a
# request is restructured), so it must not be one-hot encoded as a feature.
# It stays in X so that columns.json / dtypes.pickle keep the same contents;
# it is only left out of the transformer lists, and ColumnTransformer's
# default remainder='drop' discards every column that is not listed.
identifier_columns = ['observation_id']

categorical_features = [
    col for col in X.select_dtypes(include=['object']).columns
    if col not in identifier_columns
]
# Taken from the frame rather than from a set difference so the feature order
# is the same on every run (set iteration order over strings can change
# between interpreter sessions, which would change the fitted model).
numeric_features = X.select_dtypes(exclude=['object']).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy='mean'), numeric_features),
        # OneHotEncoder here is scikit-learn's: the sklearn.preprocessing
        # import shadows the earlier category_encoders import of the same name.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)
# NOTE(review): a classifier treats every distinct 'Length of Stay' value as
# its own class -- acceptable for a dummy baseline, confirm before reuse.
pipeline = make_pipeline(
    preprocessor,
    RandomForestClassifier(max_depth=3, min_samples_leaf=.03, class_weight="balanced", random_state=42, n_jobs=-1)
)
pipeline.fit(X, y)

# Make sure the output directory exists before writing the artifacts.
os.makedirs("pickles", exist_ok=True)
with open(os.path.join("pickles", "columns.json"), 'w') as fh:
    json.dump(X.columns.tolist(), fh)
with open(os.path.join("pickles", "dtypes.pickle"), 'wb') as fh:
    pickle.dump(X.dtypes, fh)
joblib.dump(pipeline,os.path.join("pickles", "pipeline.pickle"))

['pickles/pipeline.pickle']

In [10]:
# Dummy request payload used to check how an incoming observation can be
# split into its identifier and the remaining fields.
obs_test = {
    "observation_id": "fd3f9217-9f93-47f5-9375-dd2386e92e5f",
    "Health Service Area": "Western NY",
    "Hospital County": "Erie",
    "Operating Certificate Number": "1401014.0",
    "Facility Id": "207.0",
}

# The identifier is pulled out on its own ...
_id = obs_test['observation_id']

# ... and the remaining fields are collected, in this fixed order, into a
# separate dict.
observation = dict(
    (key, obs_test[key])
    for key in (
        'Health Service Area',
        'Hospital County',
        'Operating Certificate Number',
        'Facility Id',
    )
)

In [13]:
#2) Restructuring the request



In [14]:
# Show the request identifier that was split off from the dummy payload.
_id

'fd3f9217-9f93-47f5-9375-dd2386e92e5f'

In [15]:
# Show the payload fields kept after removing the identifier.
observation

{'Health Service Area': 'Western NY',
 'Hospital County': 'Erie',
 'Operating Certificate Number': '1401014.0',
 'Facility Id': '207.0'}