**Unsupervised Learning: Random Cut Forest**

**Objective:** Anomaly Detection

In [2]:
# Importing the required libraries

import pandas as pd                               
import boto3
import sagemaker
import s3fs
from sklearn.model_selection import train_test_split
import json
import nltk
import csv
from time import sleep

Data Setup

In [1]:
# Where the input CSV lives in S3: s3://<data_bucket>/<subfolder>/<dataset>
data_bucket = "just-abdul-aws"  # bucket name
subfolder = "Random_CF"         # key prefix inside the bucket
dataset = "activities.csv"      # file name under the prefix

In [3]:
# SageMaker handles: a session object and the notebook's execution role.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# Low-level clients: the boto3 SageMaker client and an authenticated S3 filesystem.
sm = boto3.Session().client('sagemaker')
s3 = s3fs.S3FileSystem(anon=False)

In [4]:
# Read the activities CSV straight from S3 into a DataFrame.
df = pd.read_csv(
    f's3://{data_bucket}/{subfolder}/{dataset}'
)

Understanding the data

In [5]:
# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,Matter Number,Firm Name,Matter Type,Resource,Activity,Minutes,Fee,Total,Error
0,0,Cox Group,Antitrust,Paralegal,Prepare Opinion,85,70,99.17,False
1,0,Cox Group,Antitrust,Junior,Prepare Opinion,505,150,1262.5,False
2,0,Cox Group,Antitrust,Junior,Prepare Opinion,100,180,300.0,False
3,0,Cox Group,Antitrust,Senior,Prepare Opinion,65,400,433.33,False
4,0,Cox Group,Antitrust,Junior,Phone Call,20,180,60.0,False


In [6]:
# (rows, columns) of the raw data.
df.shape

(105965, 9)

In [7]:
# Tally the rows by their Error flag and show the counts as a table.
Error = df['Error'].value_counts()
pd.DataFrame(data=Error)

Unnamed: 0,Error
False,103935
True,2030


In [11]:
# Distinct values of the 'Firm Name' column, shown as a one-column table.
Firm_name = df.loc[:, 'Firm Name'].unique()
pd.DataFrame(data=Firm_name)

Unnamed: 0,0
0,Cox Group
1,"Day, Love and Beasley"
2,Taylor-Brown
3,"Peters, Nelson and Moyer"
4,"Russell, Bradley and Howell"
5,Knight-Flores
6,"Jackson, Pollard and Levy"
7,"Parks, Roberson and Short"
8,"Solis, Adams and Cooper"
9,Mejia-Flores


In [13]:
# Distinct values of the 'Resource' column, shown as a one-column table.
Resource = df.loc[:, 'Resource'].unique()
pd.DataFrame(data=Resource)

Unnamed: 0,0
0,Paralegal
1,Junior
2,Senior
3,Partner


In [14]:
# Distinct values of the 'Activity' column, shown as a one-column table.
Activity = df.loc[:, 'Activity'].unique()
pd.DataFrame(data=Activity)

Unnamed: 0,0
0,Prepare Opinion
1,Phone Call
2,Attend Court
3,Attend Meeting


In [15]:
# Remove the two identifier columns before encoding.
data = df.drop(columns=['Matter Number', 'Firm Name'])

In [16]:
# Handling Categorical data types

# One-hot encode the three categorical columns. dtype=int is pinned because
# pandas >= 2.0 produces boolean dummy columns by default; combined with the
# numeric columns, `.values` would then be an object array instead of a
# numeric matrix when the features are handed to the training step.
encoded_df = pd.get_dummies(
    data,
    columns=['Matter Type', 'Resource', 'Activity'],
    dtype=int,
)
encoded_df.head()

Unnamed: 0,Minutes,Fee,Total,Error,Matter Type_Antitrust,Matter Type_Asset recovery,Matter Type_Commercial arbitration,Matter Type_IPO,Matter Type_Insolvency,Matter Type_M&A,...,Matter Type_Structured finance,Matter Type_Tax planning,Resource_Junior,Resource_Paralegal,Resource_Partner,Resource_Senior,Activity_Attend Court,Activity_Attend Meeting,Activity_Phone Call,Activity_Prepare Opinion
0,85,70,99.17,False,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1,505,150,1262.5,False,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,100,180,300.0,False,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [17]:
# X deliberately still contains the 'Error' column so the label stays attached
# to each row through the split; it is dropped from the features afterwards.
X, Y = encoded_df, encoded_df['Error']

In [18]:
# Creating the training and the validation set

# 80/20 split with a fixed seed; the two label outputs are discarded because
# the frames themselves still carry the 'Error' column.
train_df, val_df, _, _ = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [30]:
# Feature-only copies of each split, with the 'Error' label removed.
train_df_no_result = train_df.drop(columns=['Error'])
val_df_no_result = val_df.drop(columns=['Error'])

In [31]:
# Training The Model

from sagemaker import RandomCutForest

# NOTE(review): `session` duplicates the earlier `sess` and is not referenced
# by any cell visible in this notebook.
session = sagemaker.Session()

RCF = RandomCutForest(
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    data_location=f's3://{data_bucket}/{subfolder}/',
    output_path=f's3://{data_bucket}/{subfolder}/output',
    num_samples_per_tree=100,
    num_trees=50,
)

# record_set uploads the training features to S3; fit then runs the training job on them.
train_records = RCF.record_set(train_df_no_result.values)
RCF.fit(train_records)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


2021-06-28 02:18:19 Starting - Starting the training job...
2021-06-28 02:18:42 Starting - Launching requested ML instancesProfilerReport-1624846698: InProgress
......
2021-06-28 02:19:42 Starting - Preparing the instances for training......
2021-06-28 02:20:50 Downloading - Downloading input data...
2021-06-28 02:21:03 Training - Downloading the training image......
2021-06-28 02:22:18 Uploading - Uploading generated training model[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[06/28/2021 02:22:11 INFO 139943321941824] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-conf.json: {'num_samples_per_tree': 256, 'num_trees': 100, 'force_dense': 'true', 'eval_metrics': ['accuracy', 'precision_recall_fscore'], 'epochs': 1, 'mini_batch_size': 1000, '_log_level': 'info', '_kvstore': 'dist_async', '_num_kv_servers': 'auto', '_num_gpus': 'auto', '_tuning_objective_metri

In [34]:
# Hosting the Model

# Deploy the trained model to a single-instance endpoint with a fixed name.
predictor = RCF.deploy(
    endpoint_name="Random-Cut-Forest",
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


-------------!

In [36]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

# Set the wire formats used by predictor.predict(): requests are sent through
# CSVSerializer and responses are read back through JSONDeserializer.
predictor.serializer = CSVSerializer()
predictor.deserializer = JSONDeserializer()

In [38]:
# Testing the Model

# Score every validation row (features only) against the endpoint.
results = predictor.predict(val_df_no_result.values)
scores_df = pd.DataFrame(results['scores'])

# Discard the shuffled index so the rows line up positionally with the scores.
val_df = val_df.reset_index(drop=True)
results_df = pd.concat([val_df, scores_df], axis='columns')
results_df.Error.value_counts()

False    20791
True       402
Name: Error, dtype: int64

In [39]:
# Validation rows with their labels and the score returned by the endpoint.
results_df

Unnamed: 0,Minutes,Fee,Total,Error,Matter Type_Antitrust,Matter Type_Asset recovery,Matter Type_Commercial arbitration,Matter Type_IPO,Matter Type_Insolvency,Matter Type_M&A,...,Matter Type_Tax planning,Resource_Junior,Resource_Paralegal,Resource_Partner,Resource_Senior,Activity_Attend Court,Activity_Attend Meeting,Activity_Phone Call,Activity_Prepare Opinion,score
0,100,50,83.33,False,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0.701596
1,20,80,26.67,False,0,0,0,0,0,0,...,1,0,1,0,0,0,0,1,0,0.666033
2,120,70,140.00,False,1,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0.661593
3,25,180,75.00,False,0,0,0,0,1,0,...,0,1,0,0,0,0,0,1,0,0.682108
4,25,600,250.00,False,0,1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0.890886
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21188,45,80,60.00,False,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,0.670974
21189,105,450,787.50,False,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0.727313
21190,60,70,70.00,False,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0.689184
21191,110,50,91.67,False,1,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0.702229


In [40]:
# The scores on their own, one per validation row.
scores_df

Unnamed: 0,score
0,0.701596
1,0.666033
2,0.661593
3,0.682108
4,0.890886
...,...
21188,0.670974
21189,0.727313
21190,0.689184
21191,0.702229


In [41]:
# Use the median score of the labelled errors as the anomaly threshold.
known_errors = results_df['Error'] == True
score_cutoff = results_df.loc[known_errors, 'score'].median()
print(f'Score cutoff: {score_cutoff}')

# Label breakdown of the rows scoring strictly above the threshold.
above_cutoff = results_df['score'] > score_cutoff
results_above_cutoff = results_df[above_cutoff]
results_above_cutoff['Error'].value_counts()

Score cutoff: 1.78126339745


True     201
False     74
Name: Error, dtype: int64

In [51]:
# Flag a row as a predicted anomaly when its score exceeds the cutoff.
is_anomalous = results_df['score'] > score_cutoff
results_df['Prediction'] = is_anomalous
results_df.head(5)

Unnamed: 0,Minutes,Fee,Total,Error,Matter Type_Antitrust,Matter Type_Asset recovery,Matter Type_Commercial arbitration,Matter Type_IPO,Matter Type_Insolvency,Matter Type_M&A,...,Resource_Junior,Resource_Paralegal,Resource_Partner,Resource_Senior,Activity_Attend Court,Activity_Attend Meeting,Activity_Phone Call,Activity_Prepare Opinion,score,Prediction
0,100,50,83.33,False,1,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0.701596,False
1,20,80,26.67,False,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0.666033,False
2,120,70,140.0,False,1,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0.661593,False
3,25,180,75.0,False,0,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0.682108,False
4,25,600,250.0,False,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,0.890886,False


In [54]:
# Show only the rows whose score is above the cutoff.
results_df.loc[results_df['score'] > score_cutoff]

Unnamed: 0,Minutes,Fee,Total,Error,Matter Type_Antitrust,Matter Type_Asset recovery,Matter Type_Commercial arbitration,Matter Type_IPO,Matter Type_Insolvency,Matter Type_M&A,...,Resource_Junior,Resource_Paralegal,Resource_Partner,Resource_Senior,Activity_Attend Court,Activity_Attend Meeting,Activity_Phone Call,Activity_Prepare Opinion,score,Prediction
103,495,250,2062.50,True,0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,2.444236,True
177,535,250,2229.17,True,1,0,0,0,0,0,...,0,0,0,1,0,0,0,1,2.590069,True
179,285,400,1900.00,True,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,2.041744,True
186,520,80,693.33,True,1,0,0,0,0,0,...,0,1,0,0,0,0,0,1,1.835965,True
304,510,80,680.00,True,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,1.813424,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20973,150,800,2000.00,False,0,0,1,0,0,0,...,0,0,1,0,1,0,0,0,2.150048,True
20994,135,800,1800.00,False,1,0,0,0,0,0,...,0,0,1,0,1,0,0,0,1.862283,True
21078,135,800,1800.00,False,0,0,0,1,0,0,...,0,0,1,0,1,0,0,0,1.862462,True
21158,495,600,4950.00,True,0,0,0,0,1,0,...,0,0,1,0,0,0,0,1,3.920501,True


# Clean Up

In [47]:
# Delete the endpoint together with its endpoint configuration.
# Session.delete_endpoint removes only the endpoint; the configuration left
# behind is presumably registered under the same fixed endpoint name, so
# deploying again under that name could then fail.
# NOTE(review): the model resource is not deleted here — confirm whether it should be.
predictor.delete_endpoint(delete_endpoint_config=True)