# Imports and Reading the CSV

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# csv being read into pandas's dataframe
# if csv file name differs, please change it accordingly
df = pd.read_csv('Kaggle_MissedAppointments.csv')
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


# Data Preprocessing 

In [2]:
# general info to clean up data 
# there are no null values in data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PatientId       110527 non-null  float64
 1   AppointmentID   110527 non-null  int64  
 2   Gender          110527 non-null  object 
 3   ScheduledDay    110527 non-null  object 
 4   AppointmentDay  110527 non-null  object 
 5   Age             110527 non-null  int64  
 6   Neighbourhood   110527 non-null  object 
 7   Scholarship     110527 non-null  int64  
 8   Hipertension    110527 non-null  int64  
 9   Diabetes        110527 non-null  int64  
 10  Alcoholism      110527 non-null  int64  
 11  Handcap         110527 non-null  int64  
 12  SMS_received    110527 non-null  int64  
 13  No-show         110527 non-null  object 
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB


In [3]:
# some more general info
df.describe()

Unnamed: 0,PatientId,AppointmentID,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received
count,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0
mean,147496300000000.0,5675305.0,37.088874,0.098266,0.197246,0.071865,0.0304,0.022248,0.321026
std,256094900000000.0,71295.75,23.110205,0.297675,0.397921,0.258265,0.171686,0.161543,0.466873
min,39217.84,5030230.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4172614000000.0,5640286.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,31731840000000.0,5680573.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,94391720000000.0,5725524.0,55.0,0.0,0.0,0.0,0.0,0.0,1.0
max,999981600000000.0,5790484.0,115.0,1.0,1.0,1.0,1.0,4.0,1.0


In [4]:
# dropping PatientId and Appointment Id since they do NOT help predict appointment no shows 
# also dropping Neighbourhood as it proves later to be an unhelpful feature
df = df.drop(['PatientId', 'AppointmentID', 'Neighbourhood'], axis = 1)

# fixing column name typos for consistency and convenience
df = df.rename(columns={'Hipertension' : 'Hypertension',
                        'Handcap' : 'Handicap',
                        'SMS_received' : 'SMSReceived',
                        'No-show' : 'NoShow'})

# removing impossible ages and age outliers over 100 years old as noted from df.describe
df = df[(df['Age'] >= 0) & (df['Age'] <= 100)]

# the Handicap column has values from 0 to 4 instead of just 0 and 1, so dropping values greater than 1
df = df[df['Handicap'] <= 1]

# changing NoShow values to numeric values for the model
# 'Yes' value is the one we're interested in
# 'No' to 0 and 'Yes' to 1
df.loc[df['NoShow'] == 'No', 'NoShow'] = 0
df.loc[df['NoShow'] == 'Yes', 'NoShow'] = 1
df['NoShow'] = df['NoShow'].astype('int64')

# checking if the changes were successful
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110320 entries, 0 to 110526
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Gender          110320 non-null  object
 1   ScheduledDay    110320 non-null  object
 2   AppointmentDay  110320 non-null  object
 3   Age             110320 non-null  int64 
 4   Scholarship     110320 non-null  int64 
 5   Hypertension    110320 non-null  int64 
 6   Diabetes        110320 non-null  int64 
 7   Alcoholism      110320 non-null  int64 
 8   Handicap        110320 non-null  int64 
 9   SMSReceived     110320 non-null  int64 
 10  NoShow          110320 non-null  int64 
dtypes: int64(8), object(3)
memory usage: 10.1+ MB


In [5]:
# converting ScheduledDay and AppointmentDay objects to pandas to_datetime
# this gives us calendar dates
df['ScheduledDay'] = pd.to_datetime(df['ScheduledDay']).dt.date
df['AppointmentDay'] = pd.to_datetime(df['AppointmentDay']).dt.date

# substracted the days from 'AppointmentDay' to 'ScheduledDay' to create a DaysWaited column
# a single day value is easier to work with instead of dates for the logistic regression model
df['DaysWaited'] = (df['AppointmentDay'] - df['ScheduledDay']).dt.days
df = df.drop(['ScheduledDay', 'AppointmentDay'], axis = 1)

# checking to see if changes were successful
df.head()

Unnamed: 0,Gender,Age,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMSReceived,NoShow,DaysWaited
0,F,62,0,1,0,0,0,0,0,0
1,M,56,0,0,0,0,0,0,0,0
2,F,62,0,0,0,0,0,0,0,0
3,F,8,0,0,0,0,0,0,0,0
4,F,56,0,1,1,0,0,0,0,0


### Changes to the dataframe to improve accuracy after running model once

In [6]:
'''
These are more changes to model after running it once.

The model roughly had a 78% accuracy on both training and test set.
Normalizing age improved the accuracy to roughly 79% on both sets.

Attempted to one-hot encoding for features: Gender, Neighbourhood.
Neighbourhood did not improve accuracy of the model so the feature is dropped.

Attempted to drop each feature one by one to see if the model performance would improved.
The model did not made any noticeable gains for all cases.
'''

# normalizing Age through min-max normalization to help improve model accuracy
# since age is from 0 to 100 in this case, just divide by the max to normalize it
df['Age'] = df['Age'] / df['Age'].max()
df.head()

Unnamed: 0,Gender,Age,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMSReceived,NoShow,DaysWaited
0,F,0.62,0,1,0,0,0,0,0,0
1,M,0.56,0,0,0,0,0,0,0,0
2,F,0.62,0,0,0,0,0,0,0,0
3,F,0.08,0,0,0,0,0,0,0,0
4,F,0.56,0,1,1,0,0,0,0,0


In [7]:
# one-hot encoding function to first create encoded features
# then concatenate the encoded features to the dataframe
# then dropping the original features
def encode(og_df, feature_to_encode):
    dummies = pd.get_dummies(og_df[[feature_to_encode]])
    df = pd.concat([og_df, dummies], axis=1)
    df = df.drop([feature_to_encode], axis=1)
    return(df)

# code to run the one-hot encoding function
# it can take in one or more features
features_to_encode = ['Gender']
for feature in features_to_encode:
    df = encode(df, feature)


# rearranging columns with our class labels as the last column
# visually makes more sense this way
col = list(df.columns.values)
df = df[col[0:1] + col[-3:-1] + [col[-1]] + col[1:8]]
df.head()

Unnamed: 0,Age,DaysWaited,Gender_F,Gender_M,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMSReceived,NoShow
0,0.62,0,1,0,0,1,0,0,0,0,0
1,0.56,0,0,1,0,0,0,0,0,0,0
2,0.62,0,1,0,0,0,0,0,0,0,0
3,0.08,0,1,0,0,0,0,0,0,0,0
4,0.56,0,1,0,0,1,1,0,0,0,0


# Creating a Randomized Training and Test Set

In [8]:
from sklearn.model_selection import train_test_split

# running this line will generate a new training and test set from the pandas dataframe
# splitting 80% of the samples into the training set and the rest (20%) of the sample into the test set
df_trainingset, df_testset = train_test_split(df, test_size = 0.2)

# checking if the sets were created successfully
df_trainingset.info()
df_testset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 88256 entries, 8422 to 64391
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Age           88256 non-null  float64
 1   DaysWaited    88256 non-null  int64  
 2   Gender_F      88256 non-null  uint8  
 3   Gender_M      88256 non-null  uint8  
 4   Scholarship   88256 non-null  int64  
 5   Hypertension  88256 non-null  int64  
 6   Diabetes      88256 non-null  int64  
 7   Alcoholism    88256 non-null  int64  
 8   Handicap      88256 non-null  int64  
 9   SMSReceived   88256 non-null  int64  
 10  NoShow        88256 non-null  int64  
dtypes: float64(1), int64(8), uint8(2)
memory usage: 6.9 MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 22064 entries, 14367 to 105889
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Age           22064 non-null  float64
 1   DaysWaited    22064 n

In [9]:
# converting 'NoShow' values into 'labels' variable for both training and test sets
training_labels = df_trainingset['NoShow']
test_labels = df_testset['NoShow']

# checking to see if changes were successful
training_labels.head()

8422      0
45968     1
44259     0
108940    0
67200     0
Name: NoShow, dtype: int64

In [10]:
# dropping 'NoShow' will give us the 'features' values for both training and test sets
training_features = df_trainingset.drop(['NoShow'], axis = 1)
test_features = df_testset.drop(['NoShow'], axis = 1)

# checking to see if changes were successful
training_features.head()

Unnamed: 0,Age,DaysWaited,Gender_F,Gender_M,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMSReceived
8422,0.49,2,0,1,0,0,0,1,0,0
45968,0.43,0,1,0,0,0,0,0,0,0
44259,0.27,61,0,1,0,0,0,0,0,0
108940,0.03,0,0,1,0,0,0,0,0,0
67200,0.39,64,0,1,0,0,0,0,0,1


# Logistic Regression Model

In [32]:
# initializing variables
# num_iter is the number of iterations in gradient descent
# if learning rate is >= 0.01, then the model will return log(0)s
num_iter = 100000
learning_rate = 0.001
weights = np.zeros(10)

# standard sigmoid function to get probabilities from 0 to 1
def sigmoid(s):
    return 1 / (1 + np.exp(-s))


# calculating the hypothesis
# this gets run through the sigmoid function
def predict(features, weights):
    return sigmoid(np.dot(features, weights))


# implemented the cross-entropy loss function and gradient formula
# the formula is from Andrew Ng's derivation
def calculations(features, weights, labels):
    
    # calling the hypothesis
    prediction = predict(features, weights)
    
    
    # n is the number of training examples
    n = len(labels)
    
    # cross-entropy formula for the cost function from Andrew Ng's notes
    cost = (-1/n) * np.sum(labels*np.log(prediction) + (1-labels)*np.log(1-prediction))
    
    # partial derviative of the cost function from Andrew Ng's notes
    gradient = (1/n) * np.dot(features.T, (prediction-labels))
    
    
    return cost, gradient


# training the logistic regression algorithm
# gradient descent function
def descent(features, weights, labels, num_iter, learning_rate):
    
    # in case if I want to create a matplot graph later to see if continuously goes down
    # no need to create graph since I'm printing the iterations
    cost_history = []
    
    # training for loop
    # range is 'num_iter + 1' in order to print the last iteration for cost
    for iter in range(num_iter + 1): 
        
        # for each iteration, re-calculate the weight aka gradient descent
        cost, gradient = calculations(features, weights, labels)
        weights -= learning_rate*gradient
        
        # every 2500 iterations, add cost to list and print the cost of that iteration
        if iter % 2500 == 0:
            cost_history.append(cost)
            print("Cost after", iter, "iterations:", cost)
    
    return weights, cost_history


# classifying the end probabilities to labels: class 0 if < 0.5 and 1 if > 0.5
def classifier(features, weights):
    classifier_labels = np.round_(predict(features,weights))
    return classifier_labels


# accuracy of the model
def accuracy(features, weights, labels):
    classifier_labels = classifier(features, weights)
    accuracy = np.mean(classifier_labels == labels) * 100
    return accuracy

In [33]:
# running logistic regression model over the training set
# printed training accuracy, test accuracy, and final weights
# if the model is good, final weights are used to evaluate each feature to see what is attributing to the classes
final_weights, cost_hist = descent(training_features, weights, training_labels, num_iter, learning_rate)

print("Model training accuracy:", accuracy(training_features, final_weights, training_labels), "%")
print("Model test accuracy:", accuracy(test_features, final_weights, test_labels), "%")
print("Model's final weights: \n", final_weights)

Cost after 0 iterations: 0.6931471805599453
Cost after 2500 iterations: 0.5901342917030302
Cost after 5000 iterations: 0.5492750576064033
Cost after 7500 iterations: 0.5267642881117767
Cost after 10000 iterations: 0.5135084834968564
Cost after 12500 iterations: 0.5052249360935944
Cost after 15000 iterations: 0.49977314321037813
Cost after 17500 iterations: 0.4960186152649026
Cost after 20000 iterations: 0.4933276335942284
Cost after 22500 iterations: 0.49132975264036327
Cost after 25000 iterations: 0.4897997730660941
Cost after 27500 iterations: 0.48859602372362765
Cost after 30000 iterations: 0.4876266139340205
Cost after 32500 iterations: 0.48683025439307787
Cost after 35000 iterations: 0.48616498908912986
Cost after 37500 iterations: 0.48560138130694536
Cost after 40000 iterations: 0.48511828731809165
Cost after 42500 iterations: 0.484700175093165
Cost after 45000 iterations: 0.4843353879157644
Cost after 47500 iterations: 0.48401499834473743
Cost after 50000 iterations: 0.483732038

# Evaluating the Model

In [39]:
# importing imblearn's SMOTE
# looking to see if it would improve the confusion matrix
## IMPORTANT - MAKE SURE THE IMBLEARN LIBRARY IS INSTALLED BY USING THE FOLLOWING ONE OF THE TWO INSTALLS
## pip install -U imbalanced-learn
## conda install -c conda-forge imbalanced-learn
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state = 10, sampling_strategy = 1.0)
training_features_sm, training_labels_sm = sm.fit_sample(training_features, training_labels)

In [40]:
# importing scikit-learn's logistic regression
from sklearn.linear_model import LogisticRegression

# logistic regression on normal samples
sklearn_logreg = LogisticRegression(max_iter=1000)
sklearn_logreg.fit(training_features, training_labels)
sklearn_prediction = sklearn_logreg.predict(test_features)

# logistic regression on SMOTE samples
sklearn_logreg_sm = LogisticRegression(max_iter=1000)
sklearn_logreg_sm.fit(training_features_sm, training_labels_sm)
sklearn_prediction_sm = sklearn_logreg_sm.predict(test_features)

# scikit-learn's logistic regression score for both normal and SMOTE samples
logreg_score = sklearn_logreg.score(test_features, test_labels)
print("Scikit's logistic regression score for normal samples:", logreg_score)

logreg_score_sm = sklearn_logreg_sm.score(test_features, test_labels)
print("Scikit's logistic regression score for SMOTE samples:", logreg_score_sm)

Scikit's logistic regression score for normal samples: 0.7876178390137781
Scikit's logistic regression score for SMOTE samples: 0.6715917331399565


In [42]:
# importing scikit-learn's cross validation
from sklearn.model_selection import cross_val_score

# cross validation scores over 10-folds for both normal and SMOTE samples
cv_scores = cross_val_score(sklearn_logreg, training_features, training_labels, cv=10)
cv_scores_sm = cross_val_score(sklearn_logreg_sm, training_features_sm, training_labels_sm, cv=10)

# printing cross validation scores for both normal and SMOTE samples
print("Cross Validation Scores for normal samples: \n", cv_scores)
cv_scores = pd.Series(cv_scores)
print("Cross Validation Minimum:", cv_scores.min())
print("Cross Validation Mean:", cv_scores.mean()) 
print("Cross Validation Max:", cv_scores.max())

print("\n")

print("Cross Validation Scores for SMOTE samples: \n", cv_scores_sm)
cv_scores_sm = pd.Series(cv_scores_sm)
print("Cross Validation Minimum:", cv_scores_sm.min())
print("Cross Validation Mean:", cv_scores_sm.mean()) 
print("Cross Validation Max:", cv_scores_sm.max())

Cross Validation Scores for normal samples: 
 [0.79786993 0.7960571  0.79741672 0.79639701 0.79809653 0.79458418
 0.79614731 0.79830028 0.79728045 0.79490085]
Cross Validation Minimum: 0.7945841830953999
Cross Validation Mean: 0.796705037716918
Cross Validation Max: 0.798300283286119


Cross Validation Scores for SMOTE samples: 
 [0.62035003 0.63090767 0.62445972 0.62665627 0.62566428 0.63600935
 0.63019911 0.63579678 0.63019911 0.63629278]
Cross Validation Minimum: 0.6203500318854956
Cross Validation Mean: 0.6296535109473536
Cross Validation Max: 0.6362927797066534


In [44]:
# importing scikit-learn's confusion matrix
from sklearn.metrics import confusion_matrix

# my model's confusion matrix
classifier_labels = classifier(test_features, final_weights)
my_cm = confusion_matrix(test_labels, classifier_labels)
print("My model's confusion matrix: \n", my_cm)

# scikit-learn's confusion matrix for normal samples
sk_cm = confusion_matrix(test_labels, sklearn_prediction)
print("Scikit's confusion matrix for normal samples: \n", sk_cm)

# scikit-learn's confusion matrix for SMOTE samples
sk_cm_sm = confusion_matrix(test_labels, sklearn_prediction_sm)
print("Scikit's confusion matrix for SMOTE samples: \n", sk_cm_sm)

My model's confusion matrix: 
 [[17337   143]
 [ 4528    56]]
Scikit's confusion matrix for normal samples: 
 [[17325   155]
 [ 4531    53]]
Scikit's confusion matrix for SMOTE samples: 
 [[12231  5249]
 [ 1997  2587]]


In [45]:
# importing scikit-learn's classification report
from sklearn.metrics import classification_report

# my model's classification report
my_report = classification_report(test_labels, classifier_labels)
print("My model's report: \n", my_report)

# scikit-learn's classification report for normal samples
sklearn_report = classification_report(test_labels, sklearn_prediction)
print("Scikit's report for normal samples: \n", sklearn_report)

# scikit-learn's classification report for normal samples
sklearn_report_sm = classification_report(test_labels, sklearn_prediction_sm)
print("Scikit's report for SMOTE samples: \n", sklearn_report_sm)

My model's report: 
               precision    recall  f1-score   support

           0       0.79      0.99      0.88     17480
           1       0.28      0.01      0.02      4584

    accuracy                           0.79     22064
   macro avg       0.54      0.50      0.45     22064
weighted avg       0.69      0.79      0.70     22064

Scikit's report for normal samples: 
               precision    recall  f1-score   support

           0       0.79      0.99      0.88     17480
           1       0.25      0.01      0.02      4584

    accuracy                           0.79     22064
   macro avg       0.52      0.50      0.45     22064
weighted avg       0.68      0.79      0.70     22064

Scikit's report for SMOTE samples: 
               precision    recall  f1-score   support

           0       0.86      0.70      0.77     17480
           1       0.33      0.56      0.42      4584

    accuracy                           0.67     22064
   macro avg       0.59      0.6

In [48]:
# importing ROC/AUC to get the area under the curve score
from sklearn.metrics import roc_auc_score

auc_score = roc_auc_score(test_labels, sklearn_prediction)
print("AUC score for normal samples:", auc_score)

auc_score_sm = roc_auc_score(test_labels, sklearn_prediction_sm)
print("AUC score for SMOTE samples:", auc_score_sm)

AUC score for normal samples: 0.501347338868455
AUC score for SMOTE samples: 0.6320341172758895
