In [13]:
import mlflow
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from xgboost import XGBClassifier

In [2]:
# Source CSV for the appointment dataset, fetched over HTTPS on every run.
DATA_URL = 'https://full-stack-assets.s3.eu-west-3.amazonaws.com/Deployment/doctolib_simplified_dataset_01.csv'
df = pd.read_csv(DATA_URL)

In [3]:
# Remove the leftover index column and the two identifier columns.
df = df.drop(columns=['Unnamed: 0', 'PatientId', 'AppointmentID'])

# Parse both timestamp columns into pandas datetimes, one column at a time.
for date_col in ('ScheduledDay', 'AppointmentDay'):
    df[date_col] = pd.to_datetime(df[date_col])

# Print dtypes and non-null counts as a sanity check after the conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype              
---  ------          --------------   -----              
 0   Gender          110527 non-null  object             
 1   ScheduledDay    110527 non-null  datetime64[ns, UTC]
 2   AppointmentDay  110527 non-null  datetime64[ns, UTC]
 3   Age             110527 non-null  int64              
 4   Neighbourhood   110527 non-null  object             
 5   Scholarship     110527 non-null  int64              
 6   Hypertension    110527 non-null  int64              
 7   Diabetes        110527 non-null  int64              
 8   Alcoholism      110527 non-null  int64              
 9   Handcap         110527 non-null  int64              
 10  SMS_received    110527 non-null  int64              
 11  No-show         110527 non-null  object             
dtypes: datetime64[ns, UTC](2), int64(7), object(3)
memory usage: 10.1+ MB


In [4]:
# Encode the two-valued string columns as integers.
# NOTE: Series.map turns any value missing from the mapping into NaN, so
# running this cell a second time on already-encoded data wipes the columns.
binary_encodings = {
    'No-show': {'Yes': 1, 'No': 0},
    'Gender': {'F': 1, 'M': 0},
}
for column, encoding in binary_encodings.items():
    df[column] = df[column].map(encoding)

In [5]:
# Derive a day-of-week feature from each timestamp column.
# The target column names (including the 'Apppointment' spelling) are kept
# exactly as they were, since other code may refer to them.
weekday_features = (
    ('ApppointmentWeekday', 'AppointmentDay'),
    ('ScheduledDayWeekday', 'ScheduledDay'),
)
for weekday_col, source_col in weekday_features:
    df[weekday_col] = df[source_col].dt.dayofweek
df

Unnamed: 0,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,ApppointmentWeekday,ScheduledDayWeekday
0,1,2016-04-29 18:38:08+00:00,2016-04-29 00:00:00+00:00,62,Quartier Saint-Germain-l'Auxerrois,0,1,0,0,0,0,0,4,4
1,0,2016-04-29 16:08:27+00:00,2016-04-29 00:00:00+00:00,56,Quartier Saint-Germain-l'Auxerrois,0,0,0,0,0,0,0,4,4
2,1,2016-04-29 16:19:04+00:00,2016-04-29 00:00:00+00:00,62,Quartier des Halles,0,0,0,0,0,0,0,4,4
3,1,2016-04-29 17:29:31+00:00,2016-04-29 00:00:00+00:00,8,Quartier du Palais-Royal,0,0,0,0,0,0,0,4,4
4,1,2016-04-29 16:07:23+00:00,2016-04-29 00:00:00+00:00,56,Quartier Saint-Germain-l'Auxerrois,0,1,1,0,0,0,0,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110522,1,2016-05-03 09:15:35+00:00,2016-06-07 00:00:00+00:00,56,Quartier Notre-Dame-des-Champs,0,0,0,0,0,1,0,1,1
110523,1,2016-05-03 07:27:33+00:00,2016-06-07 00:00:00+00:00,51,Quartier Notre-Dame-des-Champs,0,0,0,0,0,1,0,1,1
110524,1,2016-04-27 16:03:52+00:00,2016-06-07 00:00:00+00:00,21,Quartier Notre-Dame-des-Champs,0,0,0,0,0,1,0,1,2
110525,1,2016-04-27 15:09:23+00:00,2016-06-07 00:00:00+00:00,38,Quartier Notre-Dame-des-Champs,0,0,0,0,0,1,0,1,2


In [6]:
# One-hot encode the neighbourhood; drop_first removes one indicator level
# so the remaining indicator columns are not perfectly collinear.
categorical_cols = ['Neighbourhood']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
df

Unnamed: 0,Gender,ScheduledDay,AppointmentDay,Age,Scholarship,Hypertension,Diabetes,Alcoholism,Handcap,SMS_received,...,Neighbourhood_Quartier du Jardin-des-Plantes,Neighbourhood_Quartier du Mail,Neighbourhood_Quartier du Montparnasse,Neighbourhood_Quartier du Palais-Royal,Neighbourhood_Quartier du Parc-de-Montsouris,Neighbourhood_Quartier du Petit-Montrouge,Neighbourhood_Quartier du Pont-de-Flandre,Neighbourhood_Quartier du Père-Lachaise,Neighbourhood_Quartier du Val-de-Grâce,Neighbourhood_Unknown
0,1,2016-04-29 18:38:08+00:00,2016-04-29 00:00:00+00:00,62,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,2016-04-29 16:08:27+00:00,2016-04-29 00:00:00+00:00,56,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,2016-04-29 16:19:04+00:00,2016-04-29 00:00:00+00:00,62,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,2016-04-29 17:29:31+00:00,2016-04-29 00:00:00+00:00,8,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,1,2016-04-29 16:07:23+00:00,2016-04-29 00:00:00+00:00,56,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110522,1,2016-05-03 09:15:35+00:00,2016-06-07 00:00:00+00:00,56,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
110523,1,2016-05-03 07:27:33+00:00,2016-06-07 00:00:00+00:00,51,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
110524,1,2016-04-27 16:03:52+00:00,2016-06-07 00:00:00+00:00,21,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
110525,1,2016-04-27 15:09:23+00:00,2016-06-07 00:00:00+00:00,38,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Feature columns: every column of df except the target and the two raw
# timestamp columns. The columns are excluded by name rather than by list
# position, so the selection stays correct if the column order or count
# changes upstream. Column order from df is preserved.
non_feature_cols = {'No-show', 'ScheduledDay', 'AppointmentDay'}
cols = [col for col in df.columns if col not in non_feature_cols]
cols

['Gender',
 'Age',
 'Scholarship',
 'Hypertension',
 'Diabetes',
 'Alcoholism',
 'Handcap',
 'SMS_received',
 'ApppointmentWeekday',
 'ScheduledDayWeekday',
 'Neighbourhood_Quartier Necker',
 'Neighbourhood_Quartier Notre-Dame (Paris)',
 'Neighbourhood_Quartier Notre-Dame-des-Champs',
 'Neighbourhood_Quartier Saint-Ambroise',
 'Neighbourhood_Quartier Saint-Fargeau',
 'Neighbourhood_Quartier Saint-Georges (Paris)',
 'Neighbourhood_Quartier Saint-Germain-des-Prés',
 "Neighbourhood_Quartier Saint-Germain-l'Auxerrois",
 'Neighbourhood_Quartier Saint-Gervais',
 'Neighbourhood_Quartier Saint-Lambert',
 'Neighbourhood_Quartier Saint-Merri',
 "Neighbourhood_Quartier Saint-Thomas-d'Aquin",
 'Neighbourhood_Quartier Saint-Victor',
 'Neighbourhood_Quartier Saint-Vincent-de-Paul',
 'Neighbourhood_Quartier Sainte-Avoye',
 'Neighbourhood_Quartier Sainte-Marguerite',
 'Neighbourhood_Quartier Vivienne',
 "Neighbourhood_Quartier d'Amérique",
 "Neighbourhood_Quartier d'Auteuil",
 'Neighbourhood_Quartier 

In [8]:
# Feature matrix (the selected feature columns) and target vector ('No-show').
X = df.loc[:, cols]
y = df.loc[:, 'No-show']

In [9]:
# Hold out 25% of the rows for evaluation. The split is stratified on the
# target and seeded so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [15]:
# Two tree-ensemble classifiers fitted on the same training split so their
# scores can be compared. Both are seeded explicitly (matching the seed used
# for the train/test split) so a fresh-kernel re-run does not depend on
# library default seeds.
modelRFC = RandomForestClassifier(random_state=42)
xgb = XGBClassifier(max_depth=8, min_child_weight=10, n_estimators=100, random_state=42)

xgb.fit(X_train, y_train)
# Kept as the last expression so the cell displays the fitted random forest.
modelRFC.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [16]:
# Score each fitted model on the training split and on the held-out split,
# then report the four numbers.
xgb_train_acc = xgb.score(X_train, y_train)
xgb_test_acc = xgb.score(X_test, y_test)
rfc_train_acc = modelRFC.score(X_train, y_train)
rfc_test_acc = modelRFC.score(X_test, y_test)

print(f"XGBoost Accuracy score on the train dataset: {xgb_train_acc}")
print(f"XGBoost Accuracy score on the test dataset: {xgb_test_acc}\n")
print(f"Random Forest Accuracy score on the train dataset: {rfc_train_acc}")
print(f"Random Forest Accuracy score on the test dataset: {rfc_test_acc}\n")

XGBoost Accuracy score on the train dataset: 0.8073345798902226
XGBoost Accuracy score on the test dataset: 0.79585263462652

Random Forest Accuracy score on the train dataset: 0.96411122504373
Random Forest Accuracy score on the test dataset: 0.7641864504921829



In [17]:
# Peek at the training labels as a plain NumPy array.
y_train.to_numpy()

array([0, 0, 0, ..., 0, 1, 0])