## Part 1: Load and examine the data (v2)

To run the code in the notebook cell, change the value of `data_bucket` from 'doughudgeon-mlforbusiness' to the name of the S3 bucket holding your data, then click into the cell and press Ctrl+Enter.

In [None]:
# S3 location of the input data: bucket, folder within the bucket, and file name.
data_bucket = 'doughudgeon-mlforbusiness'  # change this to the name of your bucket
subfolder, dataset = 'ch05', 'activities.csv'

In [None]:
import sys
import pandas as pd
import boto3
import s3fs
import sagemaker
from sklearn.model_selection import train_test_split
import json
import csv
import time

# correspond to Version 2.x of the SageMaker Python SDK
# Check the latest version of SageMaker
if int(sagemaker.__version__.split('.')[0]) == 2:
    print("Version is good")
else:
    !{sys.executable} -m pip install --upgrade sagemaker
    print("Installing latest SageMaker Version. Please restart the kernel")

role = sagemaker.get_execution_role()
s3 = s3fs.S3FileSystem(anon=False)

In [None]:
# Read the dataset CSV straight from S3, then preview the rows at positions 5, 6 and 7.
data_uri = f's3://{data_bucket}/{subfolder}/{dataset}'
df = pd.read_csv(data_uri)
display(df.iloc[5:8])

In [None]:
# Tally the 'Error' flag across all lines: False = no error, True = error.
error_flags = df['Error']
error_flags.value_counts()

In [None]:
# Row count first, then a frequency table for each categorical column.
print(f'Number of rows in dataset: {df.shape[0]}')

summary_columns = [
    ('Matter types:', 'Matter Type'),
    ('Resources:', 'Resource'),
    ('Activities:', 'Activity'),
]
for heading, column in summary_columns:
    print()  # blank line between sections
    print(heading)
    print(df[column].value_counts())

## Part 2: Get the data into the right shape

In [None]:
# One-hot encode the three categorical columns.
# The dtype is pinned to uint8 (the pandas 1.x default) because pandas >= 2.0
# emits bool dummy columns by default; bool columns mixed with numeric ones turn
# the later `.values` call into an object array rather than a numeric matrix.
encoded_df = pd.get_dummies(
    df,
    columns=['Matter Type', 'Resource', 'Activity'],
    dtype='uint8',
)
encoded_df.head(3)

## Part 3: Create training and validation datasets

In [None]:
# Hold out 20% of the rows for validation; random_state=0 makes the shuffle repeatable.
# Only the frame itself is split: the 'Error' column travels inside it, so splitting a
# separate label Series (and discarding it into `_`, which shadows IPython's
# last-output variable) is unnecessary. The row assignment is the same either way,
# because the split is driven by the number of rows and the random_state.
train_df, val_df = train_test_split(encoded_df, test_size=0.2, random_state=0)

# Feature-only views: drop the 'Error' label and the 'Firm Name' column.
columns_to_drop = ['Error', 'Firm Name']
train_df_no_result = train_df.drop(columns=columns_to_drop)
val_df_no_result = val_df.drop(columns=columns_to_drop)

print(f'{train_df.shape[0]} rows in training data')
print(f'{val_df.shape[0]} rows in validation data')

## Part 4: Train the model



In [None]:
from sagemaker import RandomCutForest

# SageMaker session, kept in a variable so later cells can reuse it.
session = sagemaker.Session()

# Random Cut Forest estimator: one training instance, 50 trees of 100 samples each.
# Training input is staged under data_location; model artifacts go to output_path.
rcf = RandomCutForest(
    role=role,
    instance_type='ml.m4.xlarge',
    instance_count=1,
    num_trees=50,
    num_samples_per_tree=100,
    data_location=f's3://{data_bucket}/{subfolder}/',
    output_path=f's3://{data_bucket}/{subfolder}/output',
)

# Training job name with a UTC timestamp suffix.
timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
job_name = 'ml4biz-{}'.format(timestamp)

In [None]:
# Training job names must be unique, so build a fresh UTC-timestamped name each time
# this cell runs. Without this, re-running only this cell would resubmit the name
# that was already used by the previous training job.
job_name = 'ml4biz-{}'.format(time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime()))

# record_set uploads the training matrix to S3 and returns a handle to it;
# fit then runs the training job on that data and waits for it, streaming the logs.
train_records = rcf.record_set(
    train_df_no_result.values,
    labels=None,
    channel='train',
    encrypt=False,
)
rcf.fit(
    train_records,
    mini_batch_size=None,
    wait=True,
    logs=True,
    job_name=job_name,
    experiment_config=None,
)

## Part 5: Host the model

In [None]:
endpoint_name = 'suspicious-lines'

# Remove any endpoint left over from an earlier run so the name is free for deploy().
# The session is asked to delete it by name: no Predictor object is needed, and the
# previous `Predictor(endpoint=...)` / `.endpoint` spelling belongs to SDK v1
# (v2 uses `endpoint_name`), a failure the old bare `except: pass` kept hidden.
try:
    session.delete_endpoint(endpoint_name)
    print('Warning: Existing endpoint deleted to make way for your new endpoint.')
except session.sagemaker_client.exceptions.ClientError as err:
    # Raised when the service rejects the request, e.g. because no endpoint with
    # this name exists (the normal case on a first run). Report it and continue.
    print(f'No existing endpoint was deleted: {err}')

In [None]:
# Host the trained model behind a single-instance endpoint with the chosen name.
deploy_settings = {
    'initial_instance_count': 1,
    'instance_type': 'ml.t2.medium',
    'endpoint_name': endpoint_name,
}
rcf_endpoint = rcf.deploy(**deploy_settings)

In [None]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

# Requests are sent to the endpoint as CSV; responses are parsed from JSON.
rcf_endpoint.serializer, rcf_endpoint.deserializer = CSVSerializer(), JSONDeserializer()

## Part 6: Test the model

In [None]:
# Renumber the validation rows 0..n-1 so they line up row-for-row with the scores.
val_df = val_df.reset_index(drop=True)

# Send the validation features to the endpoint and tabulate the returned scores.
results = rcf_endpoint.predict(val_df_no_result.values)
scores_df = pd.DataFrame(results['scores'])

# Place the scores next to the original validation rows, then count the known errors.
results_df = pd.concat([val_df, scores_df], axis='columns')
results_df['Error'].value_counts()

In [None]:
# Threshold = median score of the rows that are known errors.
is_known_error = results_df['Error'] == True
score_cutoff = results_df.loc[is_known_error, 'score'].median()
print(f'Score cutoff: {score_cutoff}')

# Of the rows scoring above the threshold, count real errors versus non-errors.
is_above_cutoff = results_df['score'] > score_cutoff
results_above_cutoff = results_df.loc[is_above_cutoff]
results_above_cutoff['Error'].value_counts()

In [None]:
# Mark a row as a predicted error when its score is strictly above the cutoff.
results_df['Prediction'] = results_df['score'].gt(score_cutoff)
results_df.head()

## Remove the Endpoint (optional)

Comment out this cell if you want the endpoint to keep running after "Run All".

In [None]:
# Tear down the hosted endpoint through the predictor itself. Unlike the bare
# Session.delete_endpoint call used before, this also removes the endpoint
# configuration (delete_endpoint_config defaults to True), so nothing is left
# behind under this name, and no throwaway Session has to be created.
rcf_endpoint.delete_endpoint()