# Import a SageMaker Predictive Model for Calculating Risk Index per Region

In [None]:
import time
# Record the wall-clock start time; the final cell subtracts it to report the total run time.
st = time.time()

### Install necessary libraries

In [None]:
# %pip installs into the environment of the running kernel; a bare `!pip` shell call
# can resolve to a different Python installation than the one executing this notebook.
%pip install -q pandas_profiling lightgbm shap

### Import the libraries into the notebook

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import ipaddress
import pandas_profiling as pp
%matplotlib inline
from sklearn import preprocessing
plt.rc("font", size=14)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import warnings
warnings.filterwarnings("ignore")
import time
import boto3
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import shap
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

### Update the AWS credentials in the below cell.
We will ingest the data files which were pre-processed in the earlier steps using the Data pre-processing Notebook

In [None]:
import os

# Connection settings for the S3 bucket that holds the pre-processed data.
# Values are read from environment variables so that secrets do not have to be
# saved inside the notebook file; an unset variable falls back to the empty string.
BUCKET_TYPE = "s3"
BUCKET_NAME = os.environ.get("BUCKET_NAME", "")
BUCKET_REGION = os.environ.get("AWS_DEFAULT_REGION", "")
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")

### Create an instance with S3 Credentials 

In [None]:
# Build the boto3 service resource from the settings defined in the configuration cell.
s3 = boto3.resource(
    BUCKET_TYPE,
    region_name=BUCKET_REGION,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)

### Read and display the available datasets in S3 Bucket

In [None]:
# Collect the key of every object stored in the configured bucket.
datasets = [bucket_object.key for bucket_object in s3.Bucket(BUCKET_NAME).objects.all()]
datasets

### Read the Risk Index per Region data as dataframe and display five records

In [None]:
# Fetch the Risk Index CSV from the bucket and parse the streamed response body with pandas.
obj = s3.Object(BUCKET_NAME, 'ts-data-region-RI.csv').get()
csv_stream = obj['Body']
data = pd.read_csv(csv_stream)
data.head()

### Identify the Data Types

In [None]:
# Show the dtype pandas assigned to each column of the loaded frame.
data.dtypes

### Drop the Date attribute as we will not be using it

In [None]:
# Drop the DATE column by reassignment instead of mutating the frame in place,
# and preview a few rows rather than rendering the whole frame.
data = data.drop(columns='DATE')
data.head()

### Encode the Region data into integers

Brussels is encoded as 0 ::: Flanders is encoded as 1 ::: Wallonia is encoded as 2

In [None]:
# Fit a LabelEncoder on the REGION column, then overwrite the column with its integer codes.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(data['REGION'])
data['REGION'] = label_encoder.transform(data['REGION'])

data

### Generate the detailed analysis report

In [None]:
# Render a pandas_profiling report for the current contents of `data`.
pp.ProfileReport(data)

### Drop the duplicate values

In [None]:
# keep='first' retains one copy of each repeated row; keep=False would discard every
# copy and so throw away observations that merely occur more than once.
data = data.drop_duplicates(keep='first')

In [None]:
# Re-run the profile report on `data` after the duplicate-dropping cell above.
pp.ProfileReport(data)

### Check for missing values

In [None]:
# Count missing values per column; a frame-sized boolean mask cannot be scanned for gaps by eye.
data.isna().sum()

### Set the Input & Target variables for the Predictive Model

Region & Total_cases are Input attributes, Risk_Index will be the Target attribute

In [None]:
# Positional split: the first two columns are the model inputs, everything after them is the target.
X = data.iloc[:, 0:2]
y = data.iloc[:, 2:]

### Print the Input attributes

In [None]:
# Display the input-feature frame.
X

### Print the Target attribute

In [None]:
# Display the target frame.
y

### Split the data using an 80:20 mix into Training & Testing samples

In [None]:
# test_size=0.2 holds out 20% of the rows for testing; random_state=0 makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Check for data shape of Training & Testing samples

In [None]:
# Report the (rows, columns) shape of each of the four splits.
split_captions = (
    ("Train_x Shape :: ", X_train),
    ("Train_y Shape :: ", y_train),
    ("Test_x Shape :: ", X_test),
    ("Test_y Shape :: ", y_test),
)
for caption, split in split_captions:
    print(caption, split.shape)

### Prepare the dataset for building the Predictive Model

In [None]:
# Wrap the training split in LightGBM's native Dataset container.
# NOTE(review): no later cell in this notebook reads d_train; the model below is fitted
# through LGBMClassifier directly. Confirm whether this cell is still needed.
d_train = lgb.Dataset(X_train, label=y_train)

### Build the Model using Light Gradient Boosting algorithm

In [None]:
def LGBM_classifier(features, target):
    """
    Fit a multiclass LightGBM classifier on the supplied training data.

    :param features: training inputs, passed unchanged to ``LGBMClassifier.fit``
    :param target: training labels, passed unchanged to ``LGBMClassifier.fit``
    :return: the fitted ``LGBMClassifier`` instance
    """
    # Hyperparameters are fixed here; callers only choose the data.
    hyperparameters = {
        "metric": 'multi_logloss',
        "objective": 'multiclass',
        "num_leaves": 60,
        "max_depth": 30,
        "min_data": 50,
    }
    classifier = LGBMClassifier(**hyperparameters)
    classifier.fit(features, target)
    return classifier

# Time the training call, report the fitted model, then predict on the held-out split.
start = time.time()
trained_model = LGBM_classifier(features=X_train, target=np.ravel(y_train.values))
print("> Completion Time : ", time.time() - start)
print("Trained LGBM model :: ", trained_model)
predictions = trained_model.predict(X_test)

In [None]:
# Display the held-out feature rows that `predictions` was computed from.
X_test

### Evaluate the accuracy of the Model

In [None]:
# Accuracy on the training split (predicted here) and on the held-out split (predicted earlier).
train_predictions = trained_model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, predictions)
print("Train Accuracy :: ", train_accuracy)
print("LGBM Model Test Accuracy is :: ", test_accuracy)

### Analyze the Confusion Matrix

In [None]:
# Tabulate true versus predicted classes for the held-out split.
test_confusion = confusion_matrix(y_test, predictions)
print(" Confusion matrix ", test_confusion)

### Analyze the model performance across different categories

In [None]:
# Print scikit-learn's per-class classification report for the held-out predictions.
print(metrics.classification_report(y_test, predictions))

### Identify the important feature/s

In [None]:
# Label the model's importance scores with the input column names and chart the largest ones.
feat_imp = pd.Series(data=trained_model.feature_importances_, index=X.columns)
top_features = feat_imp.nlargest(12)
top_features.plot.barh(figsize=(8, 10))

### Install the shap library for Model Evaluation & Feature Importance

In [None]:
# shap is already imported in the imports cell; this install only matters if the
# earlier install cell was skipped. %pip targets the running kernel's environment.
%pip install -q shap

In [None]:
# Load SHAP's JavaScript support into the notebook so its interactive plots can render.
shap.initjs()

### You may need higher computation resources to run the below cells. 

In [None]:
# Compute SHAP values for the training features with a tree explainer built on the fitted model.
# (Alternative: pass the underlying booster, trained_model.booster_, to TreeExplainer instead.)
shap_values = shap.TreeExplainer(trained_model).shap_values(X_train)

In [None]:
# Plot a summary of the SHAP values computed for the training features.
shap.summary_plot(shap_values, X_train)

#### As per the above summary plot, we can infer that Region is a significant attribute for predicting Risk Index classes 0 & 2 (Low & High risk), and Total_cases is a significant attribute for class 0 (Low risk). For class 1 (Moderate risk), the Region attribute is more significant than Total_cases. Note that the classes in the plot are the Risk Index levels the model predicts, not the regions.

### Let's churn out some predictions using random values for Region & Total_cases 

Risk-Index of 0 is Low ::: Risk-Index of 1 is Moderate ::: Risk-Index of 2 is High Risk

In [None]:
import numpy as np

# A single record shaped (1, 2): encoded region 1 with a total-cases value of 1000.
new_data = np.array([[1, 1000]])

In [None]:
# Predict the Risk Index class for the single record in new_data.
trained_model.predict(new_data)

For the input [1, 1000] which is Flanders with 1000 cases per day, the predicted Risk index is ([ 2 ]) which is High risk.

In [None]:
# A single record shaped (1, 2): encoded region 0 with a total-cases value of 10.
new_data1 = np.array([[0, 10]])

In [None]:
# Predict the Risk Index class for the single record in new_data1.
trained_model.predict(new_data1)

For the input [0, 10] which is Brussels with 10 cases per day, the predicted Risk index is ([ 0 ]) which is Low risk.

In [None]:
# A single record shaped (1, 2): encoded region 2 with a total-cases value of 100.
new_data2 = np.array([[2, 100]])

In [None]:
# Predict the Risk Index class for the single record in new_data2.
trained_model.predict(new_data2)

For the input [2, 100] which is Wallonia with 100 cases per day, the predicted Risk index is ([ 1 ]) which is Moderate risk.

### Generate predictions for multiple values in one go

In [None]:
# Three records written flat as (region, total cases) pairs, then shaped to one row per record.
validation_data = np.array([1, 500, 2, 100, 0, 600])
val_data1 = np.reshape(validation_data, (3, -1))

In [None]:
# Display the reshaped batch of records.
val_data1

In [None]:
# Predict the Risk Index class for every row of val_data1 in one call.
trained_model.predict(val_data1)

The predicted results are High risk, Moderate risk & Low risk for the three records.

## In this notebook, we have learnt how to import the SageMaker notebook into Watson Studio and generate predictions.

In [None]:
# Elapsed wall-clock time since `st` was recorded in the first cell, rounded to one decimal place.
time_taken = round(time.time() - st, 1)
runtime_message = "The Notebook ran in {} ".format(float(time_taken)) + "seconds"
print(runtime_message)