# Overview
This is a simple end-to-end example of how you can use SAS Viya for analysis.
The example follows these steps:
1. Importing the needed Python packages
1. Starting a CAS session on an already running CAS server
1. Load the needed CAS Action Sets
1. Loading data from the local file system to the CAS server
1. Explore the data
1. Impute missing values
1. Partition the data into training and validation partitions
1. Build a gradient boosting model
1. Assess the model
1. Build ROC and lift charts

## Set up and initialize

Find documentation for all the CAS actions [here](http://go.documentation.sas.com/?cdcId=vdmmlcdc&cdcVersion=8.11&docsetId=caspg&docsetTarget=titlepage.htm).

### Documentation Links:
* [SAS® Viya™ 3.2: System Programming Guide](http://go.documentation.sas.com/?cdcId=vdmmlcdc&cdcVersion=8.11&docsetId=caspg&docsetTarget=titlepage.htm)
* [Getting Started with SAS® Viya™ 3.2 for Python](http://go.documentation.sas.com/?cdcId=vdmmlcdc&cdcVersion=8.11&docsetId=caspg3&docsetTarget=titlepage.htm&locale=en)

In this code we import the needed packages, and we assign variables for the modeling details that will be used later in the analysis.

In [None]:
import os
import pandas as pd
import swat
import sys
from matplotlib import pyplot as plt
%matplotlib inline

# Modeling roles for the columns of the input data. Later cells refer to
# these names, so they are defined once here.

# Column the model predicts.
target = "bad"

# Categorical predictors.
class_inputs = ["reason", "job"]

# Every column treated as nominal: the target plus the categorical predictors.
class_vars = [target, *class_inputs]

# Numeric predictors. Names starting with "im_" are expected to be produced
# by the imputation step later on, which writes its outputs with the
# prefix 'im'.
interval_inputs = [
    "im_clage",
    "clno",
    "im_debtinc",
    "loan",
    "mortdue",
    "value",
    "im_yoj",
    "im_ninq",
    "derog",
    "im_delinq",
]

# Full predictor list: numeric columns first, then the categorical ones.
all_inputs = [*interval_inputs, *class_inputs]

# Name of the input table.
indata = 'hmeq'

## Start CAS session

* Documentation to [Connect and Start a Session](http://go.documentation.sas.com/?cdcId=vdmmlcdc&cdcVersion=8.11&docsetId=caspg3&docsetTarget=home.htm&locale=en)

In this code we assign values for the cashost, casport, and casauth values. These are then used to establish a CAS session named `sess`.

In [None]:
# Connection settings for the CAS server.
cashost = 'localhost'
casport = 5570
casauth = '~/.authinfo'

# Open the session; every later cell goes through `sess`.
sess = swat.CAS(
    hostname=cashost,
    port=casport,
    authinfo=casauth,
    caslib="casuser",
    name="brad",
)

# Load the needed action sets for this example (same order as listed):
for actionset in (
    'datastep',
    'datapreprocess',
    'cardinality',
    'sampling',
    'decisiontree',
    'astore',
    'percentile',
):
    sess.loadactionset(actionset)

# show the session details
sess

## Load data into CAS



In [None]:
# Handle to the 'hmeq' table on the server.
indata = sess.CASTable('hmeq')

# Upload the CSV only when the server reports that the table is not there.
table_is_loaded = indata.tableexists().exists
if not table_is_loaded:
    indata = sess.upload_file(
        'http://support.sas.com/documentation/onlinedoc/viya/exampledatasets/hmeq.csv',
        casout=indata,
    )

## Explore and Impute missing values

In [None]:
# Run the summary action on the input table; as the last expression in the
# cell, its result is displayed.
indata.summary()

#### Explore data and plot missing values

In [None]:
# NOTE(review): CASTable is called here without a table name, unlike every
# other use in this notebook. This looks like a leftover scratch cell and
# may raise an error — confirm the intent or remove the cell.
sess.CASTable()

In [None]:
# Handle for the table that will receive the cardinality summary.
tbl_data_card = sess.CASTable('data_card', replace=True)

# Summarize the input table and write the result to 'data_card'.
indata.cardinality.summarize(cardinality=tbl_data_card)

# Keep only the variables that have at least one missing value, then preview.
tbl_data_card = tbl_data_card.query('_NMISS_ > 0')
tbl_data_card.head()

In [None]:
# Share of missing observations per variable, expressed as a percentage.
tbl_data_card['PERCENT_MISSING'] = (tbl_data_card['_NMISS_'] / tbl_data_card['_NOBS_']) * 100

# Convert the two columns of interest to a data frame indexed by variable
# name, then draw one bar per variable.
missing_pct = tbl_data_card[['_VARNAME_', 'PERCENT_MISSING']].to_frame()
missing_pct = missing_pct.set_index('_VARNAME_')
ax = missing_pct.plot.bar(title='Percentage of Missing Values', figsize=(15, 7))

ax.set_xlabel('Variable Names')
ax.set_ylabel('Percent Missing');

#### Impute missing values

In [None]:
# Handle for the table that receives the imputed data.
hmeq_prepped = sess.CASTable('hmeq_prepped', replace=True)

# One request per imputation rule: the method to apply and the columns it
# applies to.
# NOTE(review): the last request lists three values for two inputs —
# confirm how the action pairs values with inputs.
impute_requests = [
    {'impute': {'method': 'mean'}, 'inputs': ['clage']},
    {'impute': {'method': 'median'}, 'inputs': ['delinq']},
    {'impute': {'method': 'value', 'valuesNumeric': [2]}, 'inputs': ['ninq']},
    {'impute': {'method': 'value', 'valuesNumeric': [35.0, 7, 2]}, 'inputs': ['debtinc', 'yoj']},
]

# Run the transform on the input table. All original variables are copied
# to the output, and the new columns are named with the global prefix 'im'.
indata.datapreprocess.transform(
    casout=hmeq_prepped,
    copyallvars=True,
    outvarsnameglobalprefix='im',
    requestpackages=impute_requests,
)

## Partition data into Training and Validation

The `stratified` action in the `sampling` action set allows us to create two partitions and observe the response rate of the target variable `bad` in both the training and validation partitions.

In [None]:
# Handle for the table that receives the partitioned data.
hmeq_part = sess.CASTable('hmeq_part', replace=True)

# Group by the target so the sampling is stratified on it.
by_target = hmeq_prepped.groupby(target)

# Sample 70 percent, copy all variables to the output, and request a
# partition indicator column.
by_target.sampling.stratified(
    output={'casout': hmeq_part, 'copyvars': 'all'},
    samppct=70,
    partind=True,
)

## Gradient Boosting Machine


In this code block we do the following:
1. Train the gradient boosting model using the variables we defined in the setup phase. We save the trained model as `gb_model`. It is used in the subsequent step, but it could just as easily be used a day, week, or month from now.
1. Score data using the `gb_model` that was created in the previous step
1. Run data step code on the scored output to prepare it for further analysis 

In [None]:
# Rows used for training: those with _partind_ = 1 (presumably the sampled
# 70 percent from the partition step).
hmeq_part_1 = hmeq_part.query('_partind_ = 1')

# Handles for the tables this cell works with.
gb_model = sess.CASTable('gb_model', replace=True)
scored_gb = sess.CASTable('_scored_gb', replace=True)
# NOTE(review): this handle names gb_model_astore without a caslib, while
# the savestate target below uses caslib "public" — confirm both refer to
# the same table.
gb_model_astore = sess.CASTable('gb_model_astore', replace=True)

# Where the training action saves the model state as an astore.
astore_target = {
    "name": "gb_model_astore",
    "promote": True,
    "caslib": "public",
}

# Train the gradient boosting model on the training rows.
hmeq_part_1.decisiontree.gbtreetrain(
    inputs=all_inputs,
    nominals=class_vars,
    target=target,
    ntree=10,
    nbins=20,
    maxlevel=6,
    varimp=True,
    missing='useinsearch',
    casout=gb_model,
    savestate=astore_target,
)

# Score the whole partitioned table. The target and the partition flag are
# copied to the output so the scored rows can be filtered by partition later.
hmeq_part.decisionTree.gbtreeScore(
    modeltable=gb_model,
    casout=scored_gb,
    copyvars=[target, '_partind_'],
)

gb_model.head()
scored_gb.head()

In [None]:
# NOTE(review): json is imported here but not used in this cell.
import json
# Render the first training rows as a JSON string, one object per row.
hmeq_part_1.head().to_json(orient='records')

In [None]:
# write the model savestate out to the server filesystem
# The source table is given explicitly (gb_model_astore in caslib 'public'),
# and the file is saved as gb_model_astore.bin in caslib "public".
gb_model_astore.table.save(name="gb_model_astore.bin", table=dict(name='gb_model_astore',caslib='public'),caslib="public")

In [None]:
# download results of astore savestate
results = sess.aStore.download(rstore=dict(name='gb_model_astore',caslib='public'))
# Not the last expression in the cell, so this list of keys is not displayed.
list(results.keys())
# Show the element at index 2 of the downloaded blob
# (presumably a quick sanity check that it has content).
results['blob'][2]

In [None]:
# Write the model to the local filesystem.
# NOTE(review): codecs is imported but not used in this cell; kept in case
# other cells rely on it.
import codecs

# Download the astore again so this cell does not depend on `results` from
# the previous cell.
blob = sess.astore.download(rstore=dict(name='gb_model_astore',caslib='public'))['blob']

# The `with` block closes the file on exit, so no explicit close() is needed.
with open("Output.bin", "wb") as output_file:
    output_file.write(blob)

In [None]:
# Read the locally saved astore back into memory. The context manager closes
# the file even if the read fails, and a freshly opened file already starts
# at offset 0, so no seek is needed.
with open("Output.bin", "rb") as f:
    data = f.read()

# Show the element at index 2 of the bytes that were read.
data[2]
# results2 = sess.aStore.upload(rstore=dict(name='gb_model_astore_upload',caslib='public'),store=data)

In [None]:
# Call the describe action for the gb_model_astore table in caslib 'PUBLIC'
# and display the result.
m = sess.describe(rstore=dict(name='gb_model_astore',caslib='PUBLIC'))  
m

In [None]:
# Request file information for caslib "public" with the path pattern "%"
# (presumably a wildcard that matches every file — confirm).
sess.table.fileinfo(caslib="public",path="%")

## Assess Model

In [None]:
# Create p_bad0 and p_bad1 as _gbt_predp_ is the probability of event in _gbt_predname_.
# Each class probability is therefore _gbt_predp_ when that class is the
# predicted one, and 1 - _gbt_predp_ otherwise.
scored_gb['p_bad1'] = scored_gb.eval("ifn( strip(_gbt_predname_) = '1', _gbt_predp_, 1-_gbt_predp_ )")
# p_bad0 must be the complement of p_bad1. With the two branches the other
# way round, p_bad0 came out equal to p_bad1 on every row.
scored_gb['p_bad0'] = scored_gb.eval("ifn( strip(_gbt_predname_) = '0', _gbt_predp_, 1-_gbt_predp_ )")

# Assess on the rows with _partind_ = 0, i.e. the rows not used for training
# (training used _partind_ = 1).
gb_assess = sess.percentile.assess(
            table=scored_gb.query('_partind_ = 0'),
            inputs=['p_bad1'],
            response='bad',
            event='1',
            pvar=['p_bad0'],
            pevent=['0']
        )

# Result tables used by the following cells.
gb_fitstat  = gb_assess.FitStat
gb_rocinfo  = gb_assess.ROCInfo
gb_liftinfo = gb_assess.LIFTInfo

## Create ROC and Lift plots (using Validation data)

#### Prepare assessment results for plotting

In [None]:
# Add new variable to indicate type of model
# The plotting cell groups on this 'model' column to label each curve.
gb_liftinfo['model']   = 'GradientBoosting'
gb_rocinfo['model']    = 'GradientBoosting'

# Disabled: concatenation of results from several models. The rf_*, nn_* and
# tree_* tables it refers to are not created in this notebook.
# # Concatenate data
# all_liftinfo = pd.concat([rf_liftinfo, gb_liftinfo, nn_liftinfo, tree_liftinfo], ignore_index=True)
# all_rocinfo = pd.concat([rf_rocinfo, gb_rocinfo, nn_rocinfo, tree_rocinfo], ignore_index=True)

## Draw ROC and Lift plots

In [None]:
# Draw ROC charts
plt.figure(figsize=(15, 5))
# Group by the column name itself rather than a one-element list. With a
# list, recent pandas versions return each group key as a 1-tuple, which
# would show up in the legend as "('GradientBoosting',)".
for key, grp in gb_rocinfo.groupby('model'):
    plt.plot(grp['FPR'], grp['Sensitivity'], label=key)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)
plt.legend(loc='best')
plt.title('ROC Curve')
plt.show()

# Draw lift charts
plt.figure(figsize=(15, 5))
for key, grp in gb_liftinfo.groupby('model'):
    plt.plot(grp['Depth'], grp['CumLift'], label=key)
plt.xlabel('Depth')
plt.ylabel('Cumulative Lift')
plt.grid(True)
plt.legend(loc='best')
plt.title('Cumulative Lift Chart')
plt.show();

## End CAS session

This closes the CAS session, freeing resources for others to use.

In [None]:
# This is the same as sess.endsession(); sess.close();
# After this call `sess` can no longer be used to run actions; rerun the
# session cell to reconnect.
sess.terminate()