# Evaluating the embeddings
Loading the data, adding the features to *original_features*, and training the model all follow this article: https://towardsdatascience.com/machine-learning-with-datetime-feature-engineering-predicting-healthcare-appointment-no-shows-5e4ca3a85f96

In [11]:
# import libraries
import pandas as pd
import os

from Timestamp2Vec_Model.helper_functions import *
from Timestamp2Vec_Model.Timestamp2Vec import *
from sklearn.ensemble import RandomForestClassifier

# Seed value, presumably meant to make the random steps of the experiment
# reproducible -- NOTE(review): verify where it is actually consumed.
SEED = 123

## Data preparation

### Loading the data
The Medical Appointment No Shows data set is loaded from Kaggle: https://www.kaggle.com/datasets/joniarroba/noshowappointments

### Creating features for timestamps vectorized by Timestamp2Vec

In [2]:
# Folder holding the thesis data: <home>/Desktop/data_thesis.
# os.path.expanduser("~") resolves to USERPROFILE on Windows (when it is set)
# and to the home directory elsewhere, so this no longer raises KeyError on
# machines that do not define USERPROFILE.
data_location = os.path.join(os.path.expanduser("~"), "Desktop", "data_thesis")

# load the data
df = pd.read_csv(os.path.join(data_location, "KaggleV2-May-2016.csv"))

In [3]:
# Instantiate the Timestamp2Vec model with its default arguments; the instance
# is called on the timestamp columns in the following cell.
timestamp2vec = Timestamp2Vec()



In [4]:
# Vectorize both timestamp columns with Timestamp2Vec. The columns are passed
# as loaded from the CSV, i.e. before the pd.to_datetime conversion done later.
scheduled_day_vector = timestamp2vec(df["ScheduledDay"])
# NOTE: "apppointment" (triple p) is the spelling other cells rely on.
apppointment_day_vector = timestamp2vec(df["AppointmentDay"])
# Element-wise difference: appointment vector minus scheduling vector.
diff_day = apppointment_day_vector - scheduled_day_vector

In [5]:
# Store every latent dimension as its own DataFrame column (numbered from 1):
# one for the scheduling vector, one for the appointment vector and one for
# their difference.
latent_dims = scheduled_day_vector.shape[-1]
for dim in range(latent_dims):
    column_prefix = "latent_var" + str(dim + 1)
    df[column_prefix + "_scheduled"] = scheduled_day_vector[:, dim]
    df[column_prefix + "_appointment"] = apppointment_day_vector[:, dim]
    df[column_prefix + "_delta"] = diff_day[:, dim]

### Extracting the hand-crafted datetime features described in the linked article

In [6]:
# Binary target: 1 where the "No-show" column equals "Yes", 0 otherwise.
df["OUTPUT_LABEL"] = df["No-show"].eq("Yes").astype(int)

# Parse both timestamp columns; values that do not match the format become NaT
# because of errors="coerce".
timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
for column in ("ScheduledDay", "AppointmentDay"):
    df[column] = pd.to_datetime(df[column], format=timestamp_format, errors="coerce")

# Shift every appointment timestamp forward by one day minus one second.
end_of_day_offset = pd.Timedelta("1d") - pd.Timedelta("1s")
df["AppointmentDay"] = df["AppointmentDay"] + end_of_day_offset

In [7]:
# Break both timestamps down into calendar components, one column each.
# The week number comes from dt.isocalendar().week because Series.dt.week is
# deprecated (and removed in pandas 2.0). Note that isocalendar() yields a
# nullable UInt32 column rather than int64.
for column in ("ScheduledDay", "AppointmentDay"):
    stamps = df[column].dt
    df[column + "_year"] = stamps.year
    df[column + "_month"] = stamps.month
    df[column + "_week"] = stamps.isocalendar().week
    df[column + "_day"] = stamps.day
    df[column + "_hour"] = stamps.hour
    df[column + "_minute"] = stamps.minute
    df[column + "_dayofweek"] = stamps.dayofweek


In [8]:
# Time between scheduling and appointment, expressed in (fractional) days.
seconds_per_day = 60 * 60 * 24
waiting_time = df["AppointmentDay"] - df["ScheduledDay"]
df["delta_days"] = waiting_time.dt.total_seconds() / seconds_per_day

In [9]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
def calc_specificity(y_actual, y_pred, thresh=0.201):
    """Return the specificity (true-negative rate) of thresholded scores.

    A score counts as a negative prediction when it is not strictly greater
    than ``thresh``; this mirrors ``print_report``, which treats
    ``y_pred > thresh`` as positive, so a score equal to the threshold is no
    longer left out of both classes.

    Args:
        y_actual: Sequence of true binary labels (0 = negative).
        y_pred: Sequence of predicted scores, aligned with ``y_actual``.
        thresh: Decision threshold.

    Returns:
        Fraction of actual negatives predicted as negative, or ``nan`` when
        ``y_actual`` contains no negatives (the rate is undefined then).
    """
    # Pair positionally so plain lists work as well as arrays.
    negative_scores = [
        score for score, label in zip(y_pred, y_actual) if label == 0
    ]
    if not negative_scores:
        return float("nan")
    true_negatives = sum(score <= thresh for score in negative_scores)
    return true_negatives / len(negative_scores)

def calc_prevalence(y):
    """Return the share of positive labels: the sum of ``y`` over its length."""
    positives = sum(y)
    total = len(y)
    return positives / total

def print_report(y_actual, y_pred, thresh=0.201):
    """Score thresholded predictions against the true labels.

    Despite the name, nothing is printed; the metrics are returned.

    Args:
        y_actual: True binary labels.
        y_pred: Array of predicted scores; a score strictly greater than
            ``thresh`` is treated as a positive prediction.
        thresh: Decision threshold.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``. ``f1`` is 0.0 when
        precision and recall are both zero, instead of dividing by zero.
    """
    # Threshold once and reuse the labels for every metric.
    y_label = y_pred > thresh
    accuracy = accuracy_score(y_actual, y_label)
    recall = recall_score(y_actual, y_label)
    precision = precision_score(y_actual, y_label)

    denominator = precision + recall
    f1 = 2 * (precision * recall) / denominator if denominator else 0.0
    return accuracy, precision, recall, f1

# Evaluating

In [13]:
# Per-run metric collectors. Suffixes name the feature set: "normal" uses the
# hand-crafted datetime features, "embed_only" the Timestamp2Vec columns and
# "normal_embed" both together.
acc_normal = []
acc_embed_only = []
acc_normal_embed = []

prec_normal = []
prec_embed_only = []
prec_normal_embed = []

rec_normal = []
rec_embed_only = []
rec_normal_embed = []

f1_normal = []
f1_embed_only = []
f1_normal_embed = []

# Hand-crafted datetime features.
col2use_normal = [
    "ScheduledDay_day",
    "ScheduledDay_hour",
    "ScheduledDay_minute",
    "ScheduledDay_dayofweek",
    "AppointmentDay_day",
    "AppointmentDay_dayofweek",
    "delta_days",
]

# Names of the latent columns, ordered per dimension as scheduled, appointment,
# delta (dimensions are numbered from 1).
latent_columns = [
    "latent_var" + str(dim + 1) + suffix
    for dim in range(apppointment_day_vector.shape[-1])
    for suffix in ("_scheduled", "_appointment", "_delta")
]

# Embedding-only set: just the latent columns (an earlier variant also seeded
# this list with "delta_days").
col2use_only_embed = list(latent_columns)
# Combined set: the hand-crafted features followed by the latent columns.
col2use_normal_embed = col2use_normal + latent_columns

# Repeat the experiment on 25 random train/validation splits and collect the
# metrics of the three feature sets for every split. The splits are seeded from
# SEED so that re-running the cell reproduces the same numbers.
for run in range(25):
    # distinct, deterministic seeds for the shuffle and the split of this run
    shuffle_seed = SEED + 2 * run
    split_seed = shuffle_seed + 1

    # shuffle the samples
    df_test = df.sample(n = len(df), random_state = shuffle_seed)
    df_test = df_test.reset_index(drop = True)
    # 30% of the rows for validation, the remaining 70% for training
    df_valid = df_test.sample(frac = 0.3, random_state = split_seed)
    df_train = df_test.drop(df_valid.index)

    X_train_normal = df_train[col2use_normal].values
    X_valid_normal = df_valid[col2use_normal].values

    X_train_only_embed = df_train[col2use_only_embed].values
    X_valid_only_embed = df_valid[col2use_only_embed].values

    X_train_normal_embed = df_train[col2use_normal_embed].values
    X_valid_normal_embed = df_valid[col2use_normal_embed].values

    y_train = df_train["OUTPUT_LABEL"].values
    y_valid = df_valid["OUTPUT_LABEL"].values

    # creation: identical hyper-parameters so only the features differ
    rf_normal = RandomForestClassifier(max_depth = 5, n_estimators=100, random_state = 42)
    rf_only_embed = RandomForestClassifier(max_depth = 5, n_estimators=100, random_state = 42)
    rf_normal_embed = RandomForestClassifier(max_depth = 5, n_estimators=100, random_state = 42)

    # training
    rf_normal.fit(X_train_normal, y_train)
    rf_only_embed.fit(X_train_only_embed, y_train)
    rf_normal_embed.fit(X_train_normal_embed, y_train)

    # second column of predict_proba (index 1), used as the score for label 1
    y_normal_valid_preds = rf_normal.predict_proba(X_valid_normal)[:,1]
    y_only_embed_valid_preds = rf_only_embed.predict_proba(X_valid_only_embed)[:,1]
    y_normal_embed_valid_preds = rf_normal_embed.predict_proba(X_valid_normal_embed)[:,1]

    # metrics at the default threshold of print_report
    accuracy_norm, precision_norm, recall_norm, f1_norm = print_report(y_valid, y_normal_valid_preds)
    accuracy_embed, precision_embed, recall_embed, f1_embed = print_report(y_valid, y_only_embed_valid_preds)
    accuracy_norm_embed, precision_norm_embed, recall_norm_embed, f1_norm_embed = print_report(y_valid, y_normal_embed_valid_preds)

    acc_normal.append(accuracy_norm)
    acc_embed_only.append(accuracy_embed)
    acc_normal_embed.append(accuracy_norm_embed)

    prec_normal.append(precision_norm)
    prec_embed_only.append(precision_embed)
    prec_normal_embed.append(precision_norm_embed)

    rec_normal.append(recall_norm)
    rec_embed_only.append(recall_embed)
    rec_normal_embed.append(recall_norm_embed)

    f1_normal.append(f1_norm)
    f1_embed_only.append(f1_embed)
    f1_normal_embed.append(f1_norm_embed)