### Make predictions on the hospital data

In [None]:
#### Install necessary dependencies if needed

In [None]:
# Run this if you encounter dependency issues
# !pip uninstall scikit-learn --yes
# !pip uninstall imblearn --yes
# !pip install scikit-learn==1.2.2
# !pip install imblearn

#### Import libraries

In [4]:
# Import libraries
import pandas as pd
import json
from pickle import load
from imblearn.ensemble import BalancedRandomForestClassifier

##### Preprocessing function

In [None]:
# preprocessing function
def age(dob, appointment_date):
    """Return the age in whole years on ``appointment_date``.

    Both arguments only need ``year``, ``month`` and ``day`` attributes.
    One year is taken off while the birthday has not yet been reached in the
    appointment year.
    """
    year_gap = appointment_date.year - dob.year
    # Tuple comparison: True (counted as 1) while this year's birthday is still ahead.
    birthday_still_ahead = (appointment_date.month, appointment_date.day) < (dob.month, dob.day)
    return year_gap - birthday_still_ahead

def target_encoding(col, target, df):
    """Replace every category in ``df[col]`` with the mean of ``target`` for that category.

    The column is overwritten inside ``df`` itself, and the encoded column is
    also returned.
    """
    per_category = df[[col, target]].groupby([col], as_index=False).mean()
    per_category = per_category.sort_values(by=col, ascending=False)

    # Parallel tuples: categories[k] is replaced by category_means[k].
    categories = tuple(per_category[col].values)
    category_means = tuple(per_category[target].values)

    df[col] = df[col].replace(categories, category_means)
    return df[col]

def preprocess(df, encoder):
    """Turn raw appointment rows into the encoded feature frame.

    All work happens on a copy of ``df``, so the caller's frame is left
    untouched and the function can be called again on the same input.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows. Must contain every column that is read or dropped below.
    encoder : sequence of dict
        ``encoder[i][col]`` must be the category -> value mapping for the
        i-th name in ``columns_to_encode``.
        NOTE(review): the positional layout is assumed to match the encoding
        JSON file -- confirm against that file.

    Returns
    -------
    pandas.DataFrame
        Encoded rows with no missing values, re-indexed from 0, without the
        raw/helper columns listed in the final ``drop``.

    Raises
    ------
    KeyError
        If a category is absent from its encoder mapping, or a column that is
        read or dropped here does not exist.
    """
    df = df.copy()

    # Empty strings keep these two optional columns from triggering dropna() below.
    df['next_appointment_time'] = df['next_appointment_time'].fillna('')
    df['cancelled_at'] = df['cancelled_at'].fillna('')

    df['date_of_birth'] = pd.to_datetime(df['date_of_birth'], errors='coerce')
    df['appointment_start_time'] = pd.to_datetime(df['appointment_start_time'], errors='coerce')

    # Iterate over the Series (not ``.values``): Series iteration yields
    # Timestamp objects, whereas ``.values`` on two same-dtype datetime
    # columns yields numpy.datetime64 scalars that have no ``.year``.
    df['age'] = [
        age(born, start)
        for born, start in zip(df['date_of_birth'], df['appointment_start_time'])
    ]

    # Normalise spelling variants first, then convert to integer codes.
    # NOTE(review): these codes (Female=2, Male=1, Other=0) differ from the
    # ones used by ``preprocess_data`` later in the notebook -- confirm which
    # set the model expects.
    df['sex'] = df['sex'].replace(
        ('female', 'male', 'other', '9', 'AMAB'),
        ('Female', 'Male', 'Other', 'Other', 'Other'),
    )
    df['sex'] = df['sex'].replace(('Female', 'Male', 'Other'), (2, 1, 0))

    # Calendar features derived from the appointment start time.
    start_time = df['appointment_start_time']
    df['appointment_start_time_day'] = start_time.dt.day
    # NOTE(review): the column is called "week" but holds the month number;
    # left as is because the model's expected feature is unknown here.
    df['appointment_start_time_week'] = start_time.dt.month
    df['appointment_start_time_year'] = start_time.dt.year
    df['appointment_start_time_hour'] = start_time.dt.hour

    # Target encoding: replace each category by its value from the encoder.
    columns_to_encode = ['business_name', 'patient_status', 'patient_type',
                         'state', 'occupation', 'referred', 'category', 'billable_item',
                         'case_linked', 'case_type', 'appointment_type']
    for i, col in enumerate(columns_to_encode):
        mapping = encoder[i][col]
        df[col] = df[col].apply(lambda v, mapping=mapping: mapping[v])

    # Drop incomplete rows and renumber the remaining ones from 0.
    df = df.dropna().reset_index(drop=True)

    # Remove columns that are not used as features.
    df = df.drop(columns=['hospital_id', 'title', 'date_of_birth', 'city', 'appointment_status',
                          'next_appointment_time', 'time_of_day', 'month_period', 'day_of_week',
                          'month_of_year', 'day_of_month', 'week_of_year', 'cancelled_at',
                          'customer_type', 'missed', 'appointment_start_time'])
    return df

#### Make prediction

In [None]:

# Load the rows to score; the path is relative to the current working directory.
data = pd.read_csv('test.csv')

def make_prediction(x):
    """Preprocess raw appointment rows and return the resulting feature frame.

    The model-scoring step is not wired up yet (see the commented block
    below), so the function returns the preprocessed features rather than a
    prediction.

    Parameters
    ----------
    x : pandas.DataFrame
        Raw rows, as accepted by ``preprocess``.

    Returns
    -------
    pandas.DataFrame
        The output of ``preprocess(x, encoder)``.

    Raises
    ------
    FileNotFoundError
        If the encoding file cannot be opened.
    """
    # NOTE(review): another cell opens 'Models/Encoding.json' (capital E); on a
    # case-sensitive filesystem only one spelling can exist -- confirm which.
    with open('Models/encoding.json', 'r') as encoding_file:
        encoder = json.load(encoding_file)

    features = preprocess(x, encoder)

    # Scoring step, kept for reference until the model file is available.
    # NOTE(review): unpickling executes arbitrary code -- only load a trusted file.
    # with open("Models/model.pkl", "rb") as f:
    #     clf = load(f)
    # classes_dict = {1: 'Yes', 0: 'No'}
    # prediction = clf.predict(features)
    # return {classes_dict[prediction[0]]: prediction[0]}

    return features

# Run the pipeline on the loaded rows and keep whatever make_prediction returns.
pred = make_prediction(data)

In [None]:
# Show the value returned by make_prediction.
pred

In [None]:
# Reference notes: integer codes used for the categorical columns.
# NOTE(review): several names are assigned more than once (for example `No`
# is set to 1 and later to 0), so only the last assignment of each name
# survives when this cell runs.

# sex column
female = 0
male = 1
other = 2


# cancelled
# NOTE(review): "yes" is 0 and "No" is 1 here, the reverse of every other
# yes/no pair in this cell -- confirm this is intended.
yes = 0
No = 1

# referred
No = 0
Yes = 1

# case_linked
Yes = 1
No = 0

# appointment_status
Not_reebooked = 0
Rebooked = 1

# missed
Yes = 1
No = 0



In [None]:
def target_encoding(col, target, df):
    """Mean-encode ``df[col]`` against ``target``; writes into ``df`` and returns the column.

    NOTE(review): this repeats the ``target_encoding`` definition from the
    preprocessing cell earlier in the notebook; running this cell simply
    rebinds the name.
    """
    grouped_means = (
        df[[col, target]]
        .groupby([col], as_index=False)
        .mean()
        .sort_values(by=col, ascending=False)
    )

    # Category k is replaced by the k-th mean.
    labels = tuple(grouped_means[col].values)
    label_means = tuple(grouped_means[target].values)

    df[col] = df[col].replace(labels, label_means)
    return df[col]

In [40]:
def map_column_values(column_name, column_value, json_data):
    """Look up the encoded value of one category, ignoring letter case.

    Parameters
    ----------
    column_name : str
        Column whose mapping should be used.
    column_value : object
        Raw cell value. Only strings can match a category.
    json_data : iterable of dict
        Each entry maps column names to ``{category: encoded value}`` dicts;
        the first entry containing ``column_name`` is used.

    Returns
    -------
    The encoded value for ``column_value`` when the category is known.
    For an unknown category or a non-string value (e.g. NaN), the mean of all
    encoded values of that column.
    ``None`` when no entry contains ``column_name`` or the mapping is empty.
    """
    for entry in json_data:
        if column_name not in entry:
            continue

        # Lower-case the categories so the match is case-insensitive.
        lowered = {category.lower(): encoded for category, encoded in entry[column_name].items()}

        if isinstance(column_value, str):
            key = column_value.lower()
            if key in lowered:
                return lowered[key]

        if not lowered:
            # Nothing to average; avoid dividing by zero.
            return None
        # Unseen category or missing value: fall back to the column average.
        return sum(lowered.values()) / len(lowered)

    return None

def apply_mappings(df, columns_to_encode, json_data):
    """Encode the listed columns of ``df`` through ``map_column_values``.

    Names that are not columns of ``df`` are ignored. The columns are
    overwritten in ``df`` itself, and that same object is returned.
    """
    def encoder_for(name):
        # Bind the column name so each cell of that column is looked up under it.
        return lambda cell: map_column_values(name, cell, json_data)

    for name in columns_to_encode:
        if name not in df.columns:
            continue
        df[name] = df[name].apply(encoder_for(name))
    return df

In [41]:
def preprocess_data(df, encoder):
    """Convert raw appointment rows into the encoded feature frame.

    All work happens on a copy of ``df``; the caller's frame is not modified,
    so the function can be called repeatedly on the same input.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows containing the columns read below.
    encoder : iterable of dict
        Category mappings, passed on unchanged to ``apply_mappings``.

    Returns
    -------
    pandas.DataFrame
        The encoded rows, with the calendar features and ``age`` appended and
        the two raw date columns removed.
    """
    df = df.copy()

    # Yes/no flags -> 1/0 (matching is case-insensitive).
    for flag in ('case_linked', 'referred', 'missed'):
        df[flag] = df[flag].str.lower().replace(('yes', 'no'), (1, 0))
    # NOTE(review): other spellings (the earlier `preprocess` also handles
    # '9' and 'AMAB') are left unchanged here -- confirm that is acceptable.
    df['sex'] = df['sex'].str.lower().replace(("female", "male", "other"), (0, 1, 2))
    df['appointment_status'] = df['appointment_status'].str.lower().replace(('not rebooked', 'rebooked'), (0, 1))

    # Unparseable dates become NaT instead of raising.
    start = pd.to_datetime(df['appointment_start_time'], errors='coerce')
    born = pd.to_datetime(df['date_of_birth'], errors='coerce')

    df['appointment_start_time_day'] = start.dt.day
    # NOTE(review): the column is called "week" but holds the month number;
    # left as is because the model's expected feature is unknown here.
    df['appointment_start_time_week'] = start.dt.month
    df['appointment_start_time_year'] = start.dt.year
    df['appointment_start_time_hour'] = start.dt.hour

    # Age in whole years: take one year off while the birthday has not yet
    # occurred in the appointment year, as the `age` helper earlier in this
    # notebook does.
    birthday_pending = (start.dt.month < born.dt.month) | (
        (start.dt.month == born.dt.month) & (start.dt.day < born.dt.day)
    )
    df['age'] = start.dt.year - born.dt.year - birthday_pending.astype(int)

    df = df.drop(columns=['appointment_start_time', 'date_of_birth'])

    # Target encoding of the remaining categorical columns.
    columns_to_encode = ['business_name', 'patient_status', 'patient_type',
                         'state', 'occupation', 'category', 'billable_item',
                         'case_type', 'appointment_type']
    df = apply_mappings(df, columns_to_encode, encoder)

    return df

In [42]:
def MakePrediction(x):
    """Load the encoder mappings from disk and return ``x`` after ``preprocess_data``."""
    with open('Models/Encoding.json', 'r') as encoding_file:
        mappings = json.load(encoding_file)
    return preprocess_data(x, mappings)
    

# Read the sample file and remove the columns that are not passed to preprocessing.
row = pd.read_csv('test.csv').drop(
    columns=[
        'time_of_day', 'month_period', 'day_of_week', 'month_of_year',
        'day_of_month', 'title', 'city', 'hospital_id',
        'next_appointment_time', 'cancelled_at',
    ]
)

# Encode the remaining columns.
prediction = MakePrediction(row)


In [43]:
# Show the frame returned by MakePrediction.
prediction

Unnamed: 0,case_linked,case_type,total_open_invoices_before_appointemnt,total_good_appointments_before_appointment,total_cxl_appointments_before_appointment,patient_status,patient_type,week_of_year,appointment_status,notice,appointment_type,billable_item,category,missed,business_name,customer_type,state,sex,occupation,referred,appointment_start_time_day,appointment_start_time_week,appointment_start_time_year,appointment_start_time_hour,age
0,0,0.133095,0,0,0,0.159151,0.456233,29,0,12.73,0.021053,0.021053,0.031373,0,0.087652,Staff,0.164891,1,0.631579,1,16,7,2020,21,36


In [44]:
# Lift pandas' column display limit so every column is rendered, then show the frame again.
pd.set_option('display.max_columns', None)
prediction

Unnamed: 0,case_linked,case_type,total_open_invoices_before_appointemnt,total_good_appointments_before_appointment,total_cxl_appointments_before_appointment,patient_status,patient_type,week_of_year,appointment_status,notice,appointment_type,billable_item,category,missed,business_name,customer_type,state,sex,occupation,referred,appointment_start_time_day,appointment_start_time_week,appointment_start_time_year,appointment_start_time_hour,age
0,0,0.133095,0,0,0,0.159151,0.456233,29,0,12.73,0.021053,0.021053,0.031373,0,0.087652,Staff,0.164891,1,0.631579,1,16,7,2020,21,36


In [18]:
# (rows, columns) of the frame returned by MakePrediction.
prediction.shape

(1, 25)

In [19]:
# Column names of the frame returned by MakePrediction, in order.
prediction.columns

Index(['case_linked', 'case_type', 'total_open_invoices_before_appointemnt',
       'total_good_appointments_before_appointment',
       'total_cxl_appointments_before_appointment', 'patient_status',
       'patient_type', 'week_of_year', 'appointment_status', 'notice',
       'appointment_type', 'billable_item', 'category', 'missed',
       'business_name', 'customer_type', 'state', 'sex', 'occupation',
       'referred', 'appointment_start_time_day', 'appointment_start_time_week',
       'appointment_start_time_year', 'appointment_start_time_hour', 'age'],
      dtype='object')

In [1]:
# NOTE(review): this cell re-imports pandas and carries execution count 1
# although it is the last cell of the notebook, so it was run out of order.
import pandas as pd
pd.set_option('display.max_columns', None)
# Raw, unprocessed contents of test.csv, displayed in full.
df = pd.read_csv("test.csv")
df

Unnamed: 0,hospital_id,case_linked,case_type,total_open_invoices_before_appointemnt,total_good_appointments_before_appointment,total_cxl_appointments_before_appointment,patient_status,patient_type,time_of_day,month_period,day_of_week,month_of_year,day_of_month,week_of_year,appointment_status,next_appointment_time,notice,cancelled_at,appointment_type,billable_item,category,missed,appointment_start_time,business_name,customer_type,title,state,date_of_birth,sex,city,occupation,referred
0,11,No,Not Availablee,0,0,0,Not Yet Actioned,New To Clinic,Early Morning,Mid month,Friday,July,17,29,Not Rebooked,,12.73,2020-07-16 08:36:13+00,✅20 Minute Follow Up Consult,PhysioCall - 20 Minute Follow Up Consult (Pre-...,A | Pre Pay Online & Save ✅,No,2020-07-16 21:20:00+00,PhysioCall Gladstone,Staff,Mr,QLD,03/02/1984,Male,South Gladstone,Physio,Yes
