# Medication Compliance Forecasting 

### Objective 
Predict a patient’s likelihood of adhering to a prescribed regimen. To do so, information is provided about both the patient and the prescription. Patient information includes details such as age, gender, medical history, cultural background etc. Prescription details include whether diet control was advised, whether exercise was advised etc.

The goal is to build a predictive model using the provided data set, which contains details about the patient and the prescription.


### Evaluation 
For each patient id in test set, you must predict if a patient is going to adhere to the prescribed regimen. Your model will be evaluated on precision and recall for both the outcomes. So, your code must include generation of confusion matrix for your predictions.

# Setup
Setting up project directory and output paths, for data and model.

### Step 1 : Standard Imports

In [None]:
#Common imports
import numpy as np
import pandas as pd
import os
import sys

#To make outputs more consistent
np.random.seed(42)

#To Save & Load Models
import pickle

#To plot figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import seaborn as sns
# Seaborn theme applied to the plots drawn in this notebook
sns.set(style="whitegrid", color_codes=True)

# Shared font sizes for axis labels and tick labels
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Preprocessing imports

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedShuffleSplit

# Models Imports
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier


from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score,f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve,roc_curve , roc_auc_score




# Folder directory structure.
# Inputs are read from <data dir>/<project folder>; outputs are written to
# <root>/model/<project id>.
PROJECT_ROOT_DIR = '.'
PROJECT_DATA_DIR = './data'
PROJECT_FOLDER = 'medical_compliance_forecasting'
PROJECT_ID = PROJECT_FOLDER

# Output directory used by save_fig / save_model / load_model
PROJECT_OUTPUT_PATH = os.path.join(PROJECT_ROOT_DIR, 'model', PROJECT_ID)

# Input CSV files
TRAINING_DATA = os.path.join(PROJECT_DATA_DIR, PROJECT_FOLDER, 'Training Data.csv')
TEST_DATA = os.path.join(PROJECT_DATA_DIR, PROJECT_FOLDER, 'Test Data.csv')

### Step 2a : Utility Functions

In [None]:
def save_fig(fig_id, tight_layout=True, fig_extension='png', resolution=300):
    """Save the current matplotlib figure into PROJECT_OUTPUT_PATH.

    Parameters
    ----------
    fig_id : str
        File name without extension; also echoed in the progress message.
    tight_layout : bool
        When true, plt.tight_layout() is called before saving.
    fig_extension : str
        File extension, also passed to savefig as the output format.
    resolution : int
        DPI passed to savefig.
    """
    # The guard used to test an undefined IMAGES_PATH (NameError on first call)
    # while creating PROJECT_OUTPUT_PATH. Create the directory that is actually
    # written to; exist_ok avoids the check-then-create race.
    os.makedirs(PROJECT_OUTPUT_PATH, exist_ok=True)
    path = os.path.join(PROJECT_OUTPUT_PATH, fig_id + '.' + fig_extension)
    print("Saving Figure : {}".format(fig_id))
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

def save_model(model, model_name):
    """Pickle `model` to PROJECT_OUTPUT_PATH/<model_name>.pkl.

    Parameters
    ----------
    model : object
        Any picklable object (typically a fitted estimator).
    model_name : str
        File name without the '.pkl' extension.
    """
    # Create the output directory on demand, as save_fig does, so saving a
    # model works even before any figure has been saved.
    os.makedirs(PROJECT_OUTPUT_PATH, exist_ok=True)
    model_file = os.path.join(PROJECT_OUTPUT_PATH, model_name + '.pkl')
    with open(model_file, 'wb') as f:
        pickle.dump(model, f)
    
def load_model(model, model_name):
    """Return the object unpickled from PROJECT_OUTPUT_PATH/<model_name>.pkl.

    The `model` argument is never read; it is kept only so existing callers
    keep working.
    """
    pickle_path = os.path.join(PROJECT_OUTPUT_PATH, model_name + '.pkl')
    with open(pickle_path, 'rb') as handle:
        return pickle.load(handle)

def load_data(path=TRAINING_DATA):
    """Read the CSV file at `path` (default: TRAINING_DATA) into a DataFrame."""
    return pd.read_csv(os.path.join(path))


### Step 2b : Visualization Functions

In [None]:

# Rank all features by their correlation with one selected feature
def get_correlation(corr_matrix, data, feature):
    """Return the `feature` column of `corr_matrix`, sorted in descending order.

    `data` is accepted but not used.
    """
    feature_correlations = corr_matrix[feature]
    return feature_correlations.sort_values(ascending=False)

# Compute & display the classification report, confusion matrix,
# precision/recall curve and ROC curve for a set of predictions

def compute_scores(y_label, y_predicted):
    """Print and plot the evaluation metrics for `y_predicted` against `y_label`.

    Parameters
    ----------
    y_label : array-like
        True target values.
    y_predicted : array-like
        Predicted values for the same samples, in the same order.

    Prints the classification report, then draws the confusion matrix, the
    precision/recall curve and the ROC curve (annotated with its AUC).
    """
    print("Classsification Report - ")
    print(classification_report(y_label, y_predicted))
    cnf_matrix = confusion_matrix(y_label, y_predicted)
    plot_confusion_matrix(cnf_matrix)

    # Use the function's own arguments: the curves used to be computed from the
    # notebook globals processed_training_label / predicted_label, so they
    # ignored whatever the caller passed in.
    precisions, recalls, _ = precision_recall_curve(y_label, y_predicted)
    fpr, tpr, _ = roc_curve(y_label, y_predicted)
    logit_roc_auc = roc_auc_score(y_label, y_predicted)

    print("Plots for Precision Recall & ROC can be visualised below")
    plot_precision_vs_recall(precisions, recalls)
    plot_roc_curve(fpr, tpr, logit_roc_auc, label=None)
    
    
def plot_confusion_matrix(cnf_matrix):
    """Draw a 2x2 confusion matrix as a heat map with TN/FP/FN/TP annotations.

    `cnf_matrix` is indexed as cnf_matrix[true][predicted]; only the first two
    rows and columns are annotated.
    """
    class_names = ['Negative', 'Positive']
    cell_names = [['TN', 'FP'], ['FN', 'TP']]

    # Start from a cleared current figure
    plt.clf()
    plt.imshow(cnf_matrix, interpolation='nearest', cmap=plt.cm.Wistia)
    plt.title('Confusion Matrix for Medical Compliance')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

    tick_positions = np.arange(len(class_names))
    plt.xticks(tick_positions, class_names, rotation=45)
    plt.yticks(tick_positions, class_names)

    # Text x-coordinate is the predicted class (column), y the true class (row)
    for row, row_names in enumerate(cell_names):
        for col, cell_name in enumerate(row_names):
            plt.text(col, row, str(cell_name) + " = " + str(cnf_matrix[row][col]))
    plt.show()
    
def plot_precision_vs_recall(precisions, recalls):
    """Plot precision (y-axis) against recall (x-axis) on an 8x6 inch figure.

    Parameters
    ----------
    precisions, recalls : array-like
        Matching sequences, e.g. as returned by precision_recall_curve.
    """
    # Create the figure BEFORE plotting. It used to be created just before
    # plt.show(), which left the curve on a default-sized figure and opened a
    # second, empty 8x6 figure.
    plt.figure(figsize=(8, 6))
    plt.plot(recalls, precisions, "b-", linewidth=2)
    plt.xlabel("Recall", fontsize=16)
    plt.ylabel("Precision", fontsize=16)
    plt.axis([0, 1, 0, 1])
    plt.show()


def plot_roc_curve(fpr, tpr, logit_roc_auc, label=None):
    """Plot a ROC curve with the chance diagonal on an 8x6 inch figure.

    Parameters
    ----------
    fpr, tpr : array-like
        False/true positive rates, e.g. as returned by roc_curve.
    logit_roc_auc : float
        Area under the curve; written onto the plot with two decimals.
    label : str, optional
        Curve label; when given, a legend is drawn so it is visible.
    """
    # Create the figure BEFORE plotting. It used to be created just before
    # plt.show(), which left the curve on a default-sized figure and opened a
    # second, empty 8x6 figure.
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)
    plt.title('Receiver operating characteristic', fontsize=14)
    plt.text(0.5, 0.3, 'Area under the Graph - %0.2f' % logit_roc_auc)
    if label is not None:
        # Without a legend the label passed to plt.plot is never displayed
        plt.legend(loc='lower right')
    plt.show()
    

### Step 3 
Since we already have separate Training & Test sets, we don't need to create any splits. Hence, we load the data directly.


In [None]:
# Training and test CSVs are separate files, so each is read into its own frame
dataset, test_set = (
    load_data(path=csv_path) for csv_path in (TRAINING_DATA, TEST_DATA)
)

### Step 4 : Exploratory Data Analysis

In [None]:
# Preview the first two rows of the training data
dataset.head(2)

In [None]:
# Preview the first two rows of the test data
test_set.head(2)

In [None]:
# Check every column's data type, count and null values
dataset.info()

In [None]:
# Summary statistics for Age and Prescription_period, the two columns that span a range of values
dataset[['Age','Prescription_period']].describe()

In [None]:
# Gender and Adherence are object-typed (categorical) columns: show how often each value occurs
print(
    "Gender Distribution - ",
    dataset['Gender'].value_counts(),
    "",
    "Adherence Distribution - ",
    dataset['Adherence'].value_counts(),
    sep="\n",
)

#### Inference 1 -

1. We can see the data has no null/missing values.
2. We see that Gender & Adherence are categorical values
3. We can also see that there are more records for "F"emale patients than for "M"ale patients, though the data is not imbalanced.
4. We can also see that Adherence is dominated by "N"o but is not imbalanced.
5. Since we have categorical values, we need to convert them.
6. Since our data does not have any missing value, we don't require any imputer for filling nulls.

In [None]:
# Split the column names into categorical and numerical groups:
# every column that is not listed as categorical is treated as numerical
cat_attribs = ['Gender', 'Adherence']
num_attribs = dataset.columns.drop(cat_attribs).tolist()

print("Numerical Categories in Source Data : {}".format(num_attribs))
print("Categorical Categories in Source Data : {}".format(cat_attribs))

In [None]:
# Encode the two categorical columns as numbers.
# The same encoder instance is re-fitted for each column in turn, so after this
# cell it holds the fit for 'Adherence' only.
ordinal_encoder = OrdinalEncoder()
gender, adherence = (
    ordinal_encoder.fit_transform(dataset[[column_name]])
    for column_name in ('Gender', 'Adherence')
)

# Work on a copy so the original dataset keeps its raw category values
dataset_numerical = dataset.copy()
dataset_numerical['Gender'] = gender
dataset_numerical['Adherence'] = adherence


In [None]:
# Preview the encoded copy to confirm Gender and Adherence are now numeric
dataset_numerical.head()

In [None]:
# Scale the numerical columns and one-hot encode the categorical ones in one transformer
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ],
)
data_prepared = full_pipeline.fit_transform(dataset)

In [None]:
# Histogram of every column of the encoded dataset:
#   horizontal axis - values of the attribute
#   vertical axis   - number of instances
dataset_numerical.hist(figsize=(20, 15), bins=50)
plt.show()

In [None]:
# Pairwise correlation between all encoded features, drawn as an annotated heat map
corr = dataset_numerical.corr()
plt.figure(figsize=(15, 10))

sns.heatmap(
    corr,
    annot=True,
    square=True,
    cmap='YlGnBu',
    vmax=.9,
    linewidths=0.01,
    linecolor="white",
)
plt.title('Correlation between features');

In [None]:
# Correlation of every feature with the target 'Adherence', strongest positive first
get_correlation(dataset_numerical.corr(),dataset_numerical,'Adherence')

#### Inference 2

1. From the histogram distribution we can infer that all the fields have binary values.
2. The Ordinal Encoder is able to encode the Categorical Categories
3. The Correlation Heat-map shows some features being related to each other.
4. The correlation with the target field, 'Adherence' shows that Prescription_period & Age highly affect the compliance to the prescription.

### Step 5 : Data Pre-processing

Since we can see that "Prescription_period" affects the target inversely, it is important for us to have an equal distribution of it in the training & test datasets. Hence, we go for a StratifiedShuffleSplit.

Steps to be followed - 

1. Pipeline to process input data for training.
2. Create Training-Test Split

In [None]:
# Distribution of Prescription_period, the column used below to stratify the split
dataset["Prescription_period"].hist()

In [None]:
# 80/20 split, stratified on Prescription_period so both parts keep its distribution.
# split() yields POSITIONAL row indices, so rows are selected with .iloc;
# .loc would only pick the same rows while the frame keeps its default 0..n-1 index.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(dataset, dataset["Prescription_period"]):
    strat_train_set = dataset.iloc[train_index]
    strat_test_set = dataset.iloc[test_index]

In [None]:
# Share of the test split taken by each Prescription_period value (first 10 rows of the counts)
print((strat_test_set["Prescription_period"].value_counts() / len(strat_test_set)).iloc[:10])

In [None]:
# Share of the full dataset taken by each Prescription_period value (first 10 rows of the counts),
# for comparison with the test split above
print((dataset["Prescription_period"].value_counts() / len(dataset)).iloc[:10])

In [None]:
# Split the training partition into the target column and the model inputs
# (the target and the patient identifier are both removed from the inputs)

training_label = strat_train_set[['Adherence']].copy()
training_data = strat_train_set.drop(columns=['Adherence', 'patient_id'])

training_data.info()

In [None]:
# Split the held-out test partition into the target column and the model inputs
# (the target and the patient identifier are both removed from the inputs)

test_label = strat_test_set[['Adherence']].copy()
test_data = strat_test_set.drop(columns=['Adherence', 'patient_id'])

test_data.info()

In [None]:
# Column groups consumed by the preprocessing transformer
ordinal_encoder = OrdinalEncoder()
imputer_attribs = ['Diabetes','Alcoholism','HyperTension','Smokes','Tuberculosis','Sms_Reminder']
num_attribs = ['Age','Prescription_period']
cat_attribs = ['Gender',]

full_pipeline = ColumnTransformer([
        ('imputer', SimpleImputer(strategy="median"),imputer_attribs),
        ("num", StandardScaler(), num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

# Learn the imputation values, scaling statistics and category mappings from
# the TRAINING split only.
processed_training_data = full_pipeline.fit_transform(training_data)
processed_training_label = ordinal_encoder.fit_transform(training_label).reshape(-1,)

# Apply the already-fitted transformers to the held-out split. Calling
# fit_transform here (as before) re-fitted them on the test data, so the test
# features were scaled/encoded with different statistics than the training
# features and information from the test set leaked into preprocessing.
processed_test_data = full_pipeline.transform(test_data)
processed_test_label = ordinal_encoder.transform(test_label).reshape(-1,)



### Algo 1 : Logistic Regression 
F1 Score :  88

In [None]:
# Logistic regression, scored on 3-fold cross-validated predictions over the training split
log_reg = LogisticRegression(
    solver='lbfgs',
    random_state=42,
    n_jobs=-1,
)
log_reg.fit(processed_training_data, processed_training_label)
predicted_label = cross_val_predict(
    log_reg,
    processed_training_data,
    processed_training_label,
    cv=3,
)
compute_scores(processed_training_label, predicted_label)

In [None]:
### Algo 2 : Decision Tree 
F1 Score :  90

In [None]:
# Shallow decision tree (depth 2), scored on 3-fold cross-validated predictions over the training split

tree_clf = DecisionTreeClassifier(
    max_depth=2,
    random_state=42,
)

tree_clf.fit(processed_training_data, processed_training_label)
predicted_label = cross_val_predict(
    tree_clf,
    processed_training_data,
    processed_training_label,
    cv=3,
)
compute_scores(processed_training_label, predicted_label)

### Algo 3 : Random Forest
F1 Score :  9

In [None]:
# Random forest of 500 depth-2 trees, scored on 3-fold cross-validated predictions over the training split

forest_clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=2,
    random_state=42,
    n_jobs=-1,
)

predicted_label = cross_val_predict(
    forest_clf,
    processed_training_data,
    processed_training_label,
    cv=3,
)
compute_scores(processed_training_label, predicted_label)

In [None]:
# Multi-layer perceptron with hidden layers of 100, 50 and 10 units,
# scored on 3-fold cross-validated predictions over the training split

mlp_clf = MLPClassifier(
    hidden_layer_sizes=(100, 50, 10),
    learning_rate='adaptive',
    max_iter=500,
    random_state=42,
)

predicted_label = cross_val_predict(
    mlp_clf,
    processed_training_data,
    processed_training_label,
    cv=3,
)
compute_scores(processed_training_label, predicted_label)

In [None]:
# Extra-trees ensemble (100 trees), scored on 3-fold cross-validated predictions over the training split
extra_trees_clf = ExtraTreesClassifier(
    n_estimators=100,
    random_state=42,
)
predicted_label = cross_val_predict(
    extra_trees_clf,
    processed_training_data,
    processed_training_label,
    cv=3,
)
compute_scores(processed_training_label, predicted_label)