In [None]:
import numpy as np
import pandas as pd
import matplotlib.plt as plt
%matplotlib inline
import seaborn as sns
plt.style.use('fivethirtyeight')

msk = pd.read_excel('...\MSK Analysis Exercise - PE Ambulatory Care (002).xslx', sheet_name='Data')

In [None]:
msk.head()

In [None]:
msk.shape()

In [None]:
msk.dtypes

In [None]:
msk.columns

In [None]:
sns.scatterplot(x='AGE', y='APPT_TIME', data=msk)

In [None]:
df = msk[['AGE','SEX','PT_SCHEDULED_APPT','ON_TIME_ARRIVAL','PT_LATE','PT_ARRIVE_TIME','PT_WAIT','DELAY','PT_START_TIME',
         'ON_TIME_START','PT_END_TIME','APPT_TIME']]

In [None]:
sns.pairplot(df)

In [None]:
# The AGE column needs cleaning: an unusually large number of ages are recorded as 41/42.

# 60-bin histogram of AGE. `bins` is an argument of plt.hist, not part of the column lookup.
plt.hist(x=msk['AGE'], bins=60)

In [None]:
# Pairwise correlations over the numeric/boolean columns only. Restricting the
# frame first keeps this working on pandas 2.x, where corr() no longer drops
# non-numeric columns silently and raises on them instead.
msk_corr = msk.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(12,8))
# annot=True writes each coefficient into its cell of the heatmap.
sns.heatmap(msk_corr, annot=True, cmap='coolwarm')

In [None]:
# TOTAL_DELAY is the row-wise sum of the PT_WAIT and DELAY columns.
msk['TOTAL_DELAY'] = msk['PT_WAIT'] + msk['DELAY']
# LATE is 1 where PT_LATE is positive, otherwise 0. It is read from msk itself
# rather than from the separate `df` subset, so this cell only depends on msk.
msk['LATE'] = np.where(msk['PT_LATE']>0, 1, 0)
# Drop these five columns from msk in place. Because PT_WAIT, DELAY and PT_LATE
# are removed here, re-running this cell on the same kernel fails until the
# data is reloaded.
msk.drop(labels=(['PT_WAIT','DELAY','ONTIME+5','PT_LATE','PT_SCHEDULED_APPT']), axis=1, inplace=True)

In [None]:
msk['PT_CONDITION'].value_counts()

In [None]:
msk['PROVIDER_NAME'].value_counts()

In [None]:
#The patient condition is correlated with average appointment time
#The provider is not correlated with average appointment time
msk.groupby(['PATIENT_CONDITION','PROVIDER_NAME'])['APPT_TIME'].mean()

In [None]:
#Insurance type is not correlated with appointment time
msk.groupby('INSURANCE_TYPE')['APPT_TIME'].mean()

In [None]:
msk.groupby(['AGE RANGE','PATIENT_CONDITION'])['APPT_TIME'].count()

In [None]:
#Appointment time is positively correlated with patient age
msk.groupby('AGE RANGE')['APPT_TIME'].mean()

In [None]:
sns.scatterplot(x='AGE', y='APPT_TIME', hue='SEX', data=msk)

In [None]:
msk['HOUR'].hist()

In [None]:
# Model inputs: AGE, TOTAL_DELAY and three categorical columns. The provider
# column is selected as 'PROVIDER_NAME' so that it matches the label the
# get_dummies call below encodes (selecting 'PROVIDER' made that call fail).
df_msk = msk[['AGE','PATIENT_CONDITION','PROVIDER_NAME','SEX','TOTAL_DELAY']]
# One-hot encode the three categorical columns; AGE and TOTAL_DELAY pass through unchanged.
df_msk = pd.get_dummies(data=df_msk, columns=['PATIENT_CONDITION','PROVIDER_NAME','SEX'])

# Regression target.
y = msk['APPT_TIME']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [None]:
X = df_msk
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [None]:
rfr = RandomForestRegressor(n_estimators=10)

In [None]:
rfr.fit(X_train, y_train)

In [None]:
y_pred = rfr.predict(X_test)

In [None]:
pred_comp = zip(y_pred, y_test)
                
pred_arr = [[x,y] for x, y in zip(y_pred, y_test)]

pred_arr

In [None]:
rfr.score(X_test, y_test)

In [None]:
sns.scatterplot(x=[i[1] for i in pred_arr], y=[i[0] for i in pre_arr])

In [None]:
# Importance of each input column in the fitted baseline forest, labelled with
# the column names of X.
feats = pd.Series(rfr.feature_importances_, index=X.columns)
feats

In [None]:
from sklearn.dummy import DummyRegressor

In [None]:
dummy = DummyRegressor()

dummy.fit(X_train, y_train)
y_dummy_pred = dummy.predict(X_test)

In [None]:
pred_dummy_comp = zip(y_dummy_pred, y_test)
dummy_pred_arr = [[x,y] for x,y in zip(y_dummy_pred, y_test)]

print(dummy.score(X_test, y_test))

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

# Hyper-parameter grid for the random-forest search.
# max_features=1.0 (consider every feature at each split) replaces the string
# 'auto', which meant the same thing for regressors but is no longer accepted
# by current scikit-learn releases.
param_grid = {'n_estimators':[1,5,10,20,50,100],
             'max_features':[1.0,'sqrt',0.2,0.5,0.75],
             'min_samples_leaf':[5,25,50,100,250]}

In [None]:
# Exhaustive search over param_grid using GridSearchCV's default cross-validation
# and scoring. The estimator is seeded so repeated searches pick the same winner.
gsc = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                  param_grid=param_grid)

# fit() returns the fitted search object, so the best parameters are read from
# grid_result (the original referenced an undefined `gsc_result`).
grid_result = gsc.fit(X_train, y_train)
best_params = grid_result.best_params_

best_params

In [None]:
# Refit a forest with fixed hyper-parameters.
# NOTE(review): these values are hard-coded, presumably copied from an earlier
# grid-search run -- confirm they still match best_params.
# max_features=1.0 is the regressor equivalent of the old 'auto' setting, which
# current scikit-learn releases reject; random_state makes the fit reproducible.
best_rfr = RandomForestRegressor(n_estimators=10, max_features=1.0, min_samples_leaf=250, random_state=42)

best_rfr.fit(X_train, y_train)

In [None]:
best_pred = best_rfr.predict(X_test)

best_pred_comp = [[x,y] for x,y in zip(best_pred, y_test)]

In [None]:
best_rfr.score(X_test, y_test)

In [None]:
sns.scatterplot(x=[i[1] for i in best_pred_comp], y=[i[0] for i in best_pred_comp])

In [None]:
best_pred