# Lab 3 Modeling 
1. Define a model 
2. Experiment and tune hyperparameters
3. Train model on training cluster
4. Save model back into project repo 

In [None]:
import numpy as np
import pandas as pd
import os
import json
import seaborn as sns
sns.set(font_scale=1.5)

%matplotlib inline 

# Loading student variable
%store -r STUDENT

In [None]:
def ProjectRepo(path):
    """Return the absolute location of ``path`` inside the shared project repo.

    ``path`` is a repo-relative path such as ``'data/foo.csv'``. It is appended
    to the project-repo mount point with a single ``/`` separator and the
    resulting string is returned.
    """
    repo_root = "/bd-fs-mnt/project_repo"
    return "/".join([repo_root, path])

In [None]:
# Load this student's cleaned train/test CSVs from the shared project repo.
final_train = pd.read_csv(ProjectRepo('data/' + STUDENT + '_UCI_Income/adult_train_cleaned.csv'))
final_test = pd.read_csv(ProjectRepo('data/' + STUDENT + '_UCI_Income/adult_test_cleaned.csv'))
# pop() removes the label column from each frame in place and returns it, so
# final_train / final_test keep only the remaining columns.
y_train = final_train.pop('wage_class')
y_test = final_test.pop('wage_class')

# Model Development

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

### First model

In [None]:
# Hyperparameters explored by the grid search (2 x 2 = 4 combinations).
cv_params = {'max_depth': [3,5], 'min_child_weight': [3,5]}
# Hyperparameters held fixed for every candidate in the grid.
ind_params = { 'n_estimators': 100, 'seed': 0, 'subsample' : 0.8, 'colsample_bytree': 0.8, 
              'objective': 'binary:logistic', "eval_metric" :"error"}

#optimizing for accuracy, GBM = gradient boost model
# cv = 2 requests 2-fold cross-validation per candidate; nothing is trained
# until .fit() is called on this object.
optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params, use_label_encoder=False), 
                             cv_params, 
                             scoring = 'accuracy', cv = 2, n_jobs = 1, verbose=3)

In [None]:
# The training of this model could take a few minutes or more depending on the infrastructure you are running on. Please be patient 
optimized_GBM.fit(final_train, y_train)

In [None]:
# Show the grid-search results as a table ranked by mean accuracy instead of
# dumping the raw cv_results_ dict of arrays.
pd.DataFrame(optimized_GBM.cv_results_)[
    ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
].sort_values('rank_test_score')

### Second model
Tuning other hyperparameters in an attempt to achieve higher mean accuracy

In [None]:
# Second search: vary learning rate and row subsampling (2 x 3 = 6 combinations).
cv_params = {'learning_rate': [0.1, 0.01], 'subsample': [0.7, 0.8, 0.9]}
# Tree-shape parameters are now held fixed.
# NOTE(review): min_child_weight=1 was not among the values searched in the
# first grid ([3, 5]) — confirm this value is intended.
ind_params = {'n_estimators': 100, 'seed': 0, 'colsample_bytree': 0.8, 'objective': 'binary:logistic', 
              'max_depth': 3, 'min_child_weight': 1, "eval_metric" :"error"}
                    
# Rebinds optimized_GBM, so the first model's search results are no longer
# reachable after this cell runs.
optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params, use_label_encoder=False), 
                             cv_params, 
                             scoring = 'accuracy', cv=2, n_jobs=1, verbose=3)
optimized_GBM.fit(final_train, y_train)

In [None]:
# Show the grid-search results as a table ranked by mean accuracy instead of
# dumping the raw cv_results_ dict of arrays.
pd.DataFrame(optimized_GBM.cv_results_)[
    ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
].sort_values('rank_test_score')

### Third model
Utilize XGBoost's built-in cv which allows early stopping to prevent overfitting

In [None]:
xgdmat = xgb.DMatrix(final_train, y_train)

In [None]:
# Booster parameters for the native xgboost API (xgb.cv / xgb.train).
our_params = {'eta': 0.1, 'seed': 0, 'subsample': 0.8, 'colsample_bytree': 0.8, 'objective': 'binary:logistic',
              'max_depth': 3, 'min_child_weight': 1}

# Cross-validate for at most 3000 boosting rounds using the 'error' metric,
# with early stopping configured at 100 rounds. The number of rows in the
# returned frame is what the next cell reports as the best iteration.
cv_xgb = xgb.cv(params=our_params, dtrain=xgdmat, num_boost_round=3000, metrics=['error'],
                early_stopping_rounds=100)

In [None]:
print('Best iteration:', len(cv_xgb))

In [None]:
cv_xgb.tail(5)

### Final Model

In [None]:
# Same booster parameters as used for the cross-validation run.
our_params = {'eta': 0.1, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'binary:logistic', 'max_depth':3, 'min_child_weight':1} 

# Train on the full training matrix for the number of rounds kept by the
# early-stopped CV run, rather than a hard-coded count copied from one
# particular execution (the remote training cell does the same).
final_gb = xgb.train(our_params, xgdmat, num_boost_round=len(cv_xgb))

# Plot feature importances

In [None]:
xgb.plot_importance(final_gb)

In [None]:
importances = final_gb.get_fscore()
importances

In [None]:
# Turn the booster's f-scores into a two-column table, order it by ascending
# importance, and draw it as a horizontal bar chart.
importance_frame = pd.DataFrame(
    {'Importance': list(importances.values()), 'Feature': list(importances.keys())}
).sort_values(by='Importance')
importance_frame.plot(kind='barh', x='Feature', figsize=(8,8), color='green')

## Train the model on the remote shared training cluster

In general, data scientists use their local Jupyter Notebook to **experiment** with several learning algorithms and a variety of parameters. They do so to determine which ML model works best for the business problem they are trying to address, and to develop the model that yields the best prediction results. Then, from within their notebooks, they submit their code to a large-scale training cluster environment to train and test their full ML models in a reasonable time, typically against larger training and test datasets. The output of this step is a trained model ready for deployment in production.

>**Note:** _This workshop is not intended to teach you about AI/ML model experimentation and development. It is intended to present a use case for an end-to-end data science ML workflow with HPE Ezmeral ML Ops. Therefore, we will assume that the experimentation step has already been done and that the data science team has shared the best-performing ML model in a notebook in the GitHub version control repository set up by the Operations team for the data science team. That notebook is this very notebook, pulled from the GitHub repository by the local Jupyter Notebook cluster. Here you will submit the ML model code to the tenant-shared training cluster environment to train and test your model against the train/test dataset._

In [None]:
%attachments

## Fill in your initials for the <b>STUDENT</b> variable and your training cluster name

In [None]:
%%<your training cluster>

# Fill in your initials. This value is used below to build the per-student
# data path ('data/<STUDENT>_UCI_Income/...') and model path
# ('models/<STUDENT>_UCI_Income/...'), so it must match the directories that
# already exist in the project repo.
STUDENT = ""

# Importing libraries 
print("Importing libraries")
import numpy as np
import pandas as pd
import os
import pickle
import xgboost as xgb
import datetime
from sklearn.model_selection import GridSearchCV

# Start time 
print("Start time: ", datetime.datetime.now())

# Project repo path function
def saveInProjectRepo(path):
    """Return the absolute location of ``path`` inside the shared project repo.

    ``path`` is a repo-relative path such as ``'data/foo.csv'`` or
    ``'models/foo/'``; a trailing slash on ``path`` is preserved.
    """
    # The mount point is written without a trailing '/', because the separator
    # is added below. Previously both were present, which produced
    # '/bd-fs-mnt/project_repo//...'.
    repo_root = "/bd-fs-mnt/project_repo"
    return repo_root + '/' + path

# Reading in data: this student's cleaned training CSV from the project repo
print("Reading in data")
train = pd.read_csv(saveInProjectRepo('data/' + STUDENT + '_UCI_Income/adult_train_cleaned.csv'))
print("Done reading in data")

# Extracting target values: pop() removes the label column from `train` in place
y_train = train.pop('wage_class')
# Also remove 'Unnamed: 0' so it is not used as a feature
# (presumably a leftover CSV index column — TODO confirm).
train.pop('Unnamed: 0')

# Model development / Training
print("Training...")
xgdmat = xgb.DMatrix(train, y_train)
our_params = {'eta': 0.1, 'seed': 0, 'subsample': 0.8, 'colsample_bytree': 0.8, 'objective': 'binary:logistic',
              'max_depth': 3, 'min_child_weight': 1, "eval_metric" :"error"}
# Cross-validate for at most 3000 rounds with early stopping configured at 100
# rounds, then reuse the number of rows in the CV history as the round count
# for the final fit on the full training matrix.
cv_xgb = xgb.cv(params=our_params, dtrain=xgdmat, num_boost_round=3000, metrics=['error'],
                early_stopping_rounds=100)
optimal_rounds = len(cv_xgb)
final_gb = xgb.train(our_params, xgdmat, num_boost_round = optimal_rounds)

# Save model into project repo
print("Saving model")
model_dir = saveInProjectRepo('models/' + STUDENT + '_UCI_Income/')
# Make sure the per-student model directory exists before writing into it.
os.makedirs(model_dir, exist_ok=True)
# Despite the ".pickle.dat" file name, the model is written with
# Booster.save_model (not pickle); the test cell reads it back with
# Booster.load_model.
final_gb.save_model(model_dir + "XGB.pickle.dat")

# Finish time
print("End time: ", datetime.datetime.now())

In [None]:
# Fill in the history url from the output of the previous cell
%logs --url http://training-loadbalancer-dcdpw-0.training76xcc.terry-mlops.svc.cluster.local:10001/history/2

# Testing with loading pickle model 

In [None]:
# Re-read the cleaned test CSV (label column still present) and show its last
# row, which the cells below use as a single scoring example.
cleaned = pd.read_csv(ProjectRepo('data/' + STUDENT + '_UCI_Income/adult_test_cleaned.csv'))
cleaned.tail(1)

Using scoring with pickle model yields proper results

In [None]:
# Score the last test row with the final_gb model trained in this notebook.
temp = cleaned.tail(1)
# Remove the label so that only feature columns reach the model.
y_test = temp.pop('wage_class')
# No other column is dropped here: before local training only 'wage_class'
# was popped from the training frame, so final_gb expects every remaining
# CSV column. (A former `temp.set_index('age')` call was removed; its result
# was discarded, so it had no effect.)
mat = xgb.DMatrix(temp)
y_pred = final_gb.predict(mat)
y_pred

In [None]:
# Load the model saved by the remote training cell and score the same row.
# NOTE(review): nthread=325 asks for 325 threads, which looks unintended
# (possibly confused with the boosting-round count) — confirm.
model = xgb.Booster({'nthread':325})
model.load_model(ProjectRepo('models/' + STUDENT + '_UCI_Income/XGB.pickle.dat'))
temp = cleaned.tail(1)
# Remove the label so that only feature columns reach the model.
y_test = temp.pop('wage_class')
# The remote training cell popped 'Unnamed: 0' before fitting, so drop it
# here too to keep the feature columns aligned with that model. (A former
# `temp.set_index('age')` call was removed; its result was discarded, so it
# had no effect.)
temp.pop('Unnamed: 0')
mat = xgb.DMatrix(temp)
y_pred = model.predict(mat)
y_pred

# Continue onto Lab 4 for model serving! 