# Training

In [11]:
import boto3
import botocore
import sagemaker
import sys


bucket = (
    sagemaker.Session().default_bucket()
)  # Feel free to change to another bucket you have access to
prefix = "tpmlengineer"
execution_role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# S3 bucket where the original data is downloaded and stored.
downloaded_data_bucket = f"tpmlengineer"
downloaded_data_prefix = "test"


def check_bucket_permission(bucket):
    # check if the bucket exists
    permission = False
    try:
        boto3.Session().client("s3").head_bucket(Bucket=bucket)
    except botocore.exceptions.ParamValidationError as e:
        print(
            "Hey! You either forgot to specify your S3 bucket"
            " or you gave your bucket an invalid name!"
        )
    except botocore.exceptions.ClientError as e:
        if e.response["Error"]["Code"] == "403":
            print(f"Hey! You don't have permission to access the bucket, {bucket}.")
        elif e.response["Error"]["Code"] == "404":
            print(f"Hey! Your bucket, {bucket}, doesn't exist!")
        else:
            raise
    else:
        permission = True
    return permission


if check_bucket_permission(bucket):
    print(f"Training input/output will be stored in: s3://{bucket}/{prefix}")
if check_bucket_permission(downloaded_data_bucket):
    print(
        f"Downloaded training data will be read from s3://{downloaded_data_bucket}/{downloaded_data_prefix}"
    )


Training input/output will be stored in: s3://sagemaker-us-east-2-704413948526/tpmlengineer
Downloaded training data will be read from s3://tpmlengineer/test


In [12]:
import pandas as pd

data_filename = "X.csv"
s3 = boto3.client("s3")
s3.download_file(downloaded_data_bucket, f"{downloaded_data_prefix}/{data_filename}", data_filename)
df = pd.read_csv(data_filename, delimiter=",")

In [13]:
df.DEP_DELAY.to_numpy()

array([ -2.,  -1.,  -1., ...,   0.,  -5., -10.])

In [14]:
from sagemaker import RandomCutForest

session = sagemaker.Session()

# specify general training job information
rcf = RandomCutForest(
    role=execution_role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    data_location=f"s3://{bucket}/{prefix}/",
    output_path=f"s3://{bucket}/{prefix}/output",
    num_samples_per_tree=512,
    num_trees=50,
)

# automatically upload the training data to S3 and run the training job
rcf.fit(rcf.record_set(df.DEP_DELAY.to_numpy().reshape(-1, 1)))

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


2022-04-01 17:05:49 Starting - Starting the training job...
2022-04-01 17:06:14 Starting - Preparing the instances for trainingProfilerReport-1648832749: InProgress
.........
2022-04-01 17:07:35 Downloading - Downloading input data...
2022-04-01 17:08:13 Training - Downloading the training image.....[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[04/01/2022 17:09:03 INFO 140207764698944 integration.py:636] worker started[0m
[34m[04/01/2022 17:09:03 INFO 140207764698944] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-conf.json: {'num_samples_per_tree': 256, 'num_trees': 100, 'force_dense': 'true', 'eval_metrics': ['accuracy', 'precision_recall_fscore'], 'epochs': 1, 'mini_batch_size': 1000, '_log_level': 'info', '_kvstore': 'dist_async', '_num_kv_servers': 'auto', '_num_gpus': 'auto', '_tuning_objective_metric': '', '_ftp_port': 8999}[0m
[34m[04/01/2022 1

In [15]:
print(f"Training job name: {rcf.latest_training_job.job_name}")


Training job name: randomcutforest-2022-04-01-17-05-49-136


In [16]:
rcf_inference = rcf.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")


Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


------!

In [17]:
print(f"Endpoint name: {rcf_inference.endpoint}")


The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


Endpoint name: randomcutforest-2022-04-01-17-10-02-548


# Prediction

In [None]:
import boto3, sys
import numpy as np 
import json
import pandas as pd


downloaded_data_bucket = f"tpmlengineer"
downloaded_data_prefix = "test"

data_filename = "delays_test.csv"
s3 = boto3.client("s3")
s3.download_file(downloaded_data_bucket, f"{downloaded_data_prefix}/{data_filename}", data_filename)
df_test = pd.read_csv(data_filename, delimiter=",")

In [267]:
import boto3, sys


sm_rt = boto3.Session().client('runtime.sagemaker') 

In [175]:
response = sm_rt.invoke_endpoint(EndpointName=endpoint,
                                   ContentType='text/csv',
                                   Body=df_test.DEP_DELAY.to_string(index=False))


In [176]:
result = json.loads(response['Body'].read().decode())

In [177]:
df_test['score'] = [x.get('score') for x in result.get('scores')]

In [189]:
score_mean = df_test.score.mean()
score_std = df_test.score.std()

score_cutoff = score_mean + 3*score_std
anomalies = df_test[df_test['score'] > score_cutoff]


In [191]:
score_cutoff

4.658141312777081

In [192]:
anomalies.shape

(2764, 4)

In [197]:
df_test['anomalies'] = np.where(df_test['score'] > score_cutoff, 1, 0)

In [198]:
df_test

Unnamed: 0,ORIGIN,FL_DATE,DEP_DELAY,score,anomalies
0,ABE,2010-01-01,22.0,0.874883,0
1,ABE,2010-01-02,20.0,0.845079,0
2,ABE,2010-01-03,53.0,1.042316,0
3,ABE,2010-01-04,109.0,1.369159,0
4,ABE,2010-01-05,49.0,0.972269,0
...,...,...,...,...,...
103986,YUM,2010-12-27,89.0,1.210318,0
103987,YUM,2010-12-28,93.0,1.244144,0
103988,YUM,2010-12-29,24.0,0.846275,0
103989,YUM,2010-12-30,42.0,0.954496,0


In [200]:
df_graph

Unnamed: 0,ORIGIN,FL_DATE,DEP_DELAY,score,anomalies
0,ABE,2010-01-01,22.0,0.874883,0
1,ABE,2010-01-02,20.0,0.845079,0
2,ABE,2010-01-03,53.0,1.042316,0
3,ABE,2010-01-04,109.0,1.369159,0
4,ABE,2010-01-05,49.0,0.972269,0
...,...,...,...,...,...
360,ABE,2010-12-27,198.0,2.297799,0
361,ABE,2010-12-28,53.0,1.042316,0
362,ABE,2010-12-29,121.0,1.495570,0
363,ABE,2010-12-30,228.0,2.773681,0


# Functions

In [15]:
import boto3, sys
import numpy as np 
import json
import pandas as pd


In [16]:
def get_max_file_with_last_year():
    s3 = boto3.client("s3")
    objects=s3.list_objects(Bucket='tpmlengineer')
    files = list()
    for obj in objects['Contents']:
        files.append(obj['Key'])
    files_tp = [x for x in files if x.startswith('tp')]
    years = [int(x.split('tp/')[1].split('.csv')[0]) for x in files_tp]
    max_year = np.array(years).max()
    file_max_year = str(max_year)
    
    return file_max_year

In [17]:
def download_file(data_filename):
    downloaded_data_bucket = f"tpmlengineer"
    downloaded_data_prefix = "tp"
    col_list = ['ORIGIN', 'FL_DATE', 'DEP_DELAY']
    s3 = boto3.client("s3")
    s3.download_file(downloaded_data_bucket, f"{downloaded_data_prefix}/{data_filename}", data_filename)
    df = pd.read_csv(data_filename, delimiter=",", usecols=col_list)
    return df

In [18]:
def transform_file(df):
    df_transformed = df.groupby(['ORIGIN', 'FL_DATE']).aggregate({'DEP_DELAY': ['count', 'mean']}).reset_index()
    df_transformed.columns = ['origin', 'fl_date','flights_count', 'delay_mean'] 
    return df_transformed
    

In [19]:
def predict_anomalies(df):
    endpoint = "randomcutforest-2022-04-01-16-34-53-759"
    sm_rt = boto3.Session().client('runtime.sagemaker') 
    
    response = sm_rt.invoke_endpoint(EndpointName=endpoint,
                                   ContentType='text/csv',
                                   Body=df.delay_mean.to_string(index=False))
    
    result = json.loads(response['Body'].read().decode())
    
    df['score'] = [x.get('score') for x in result.get('scores')]
    score_mean = df.score.mean()
    score_std = df.score.std()

    score_cutoff = score_mean + 3*score_std
    df['anomalies'] = np.where(df['score'] > score_cutoff, 1, 0)
    return df
    
    

In [20]:
def graph_airport(airport, df_anomalies, year):
    import matplotlib.pyplot as plt
    df_graph = df_anomalies.loc[df_anomalies.origin==airport,['fl_date', 'flights_count', 'anomalies']]
    df_graph.set_index('fl_date',inplace=True)
    
    outlier_dates = df_graph[df_graph['anomalies'] == 1].index
    
    y_values = [df_graph.loc[i]['flights_count'] for i in outlier_dates]
    
    fig, ax1 = plt.subplots(figsize=(30,14))

    ax1.plot(pd.to_datetime(df_graph.index), df_graph.flights_count, label='Flights')
    ax1.scatter(pd.to_datetime(outlier_dates),y_values, c='red', label='Anomalies in Delay')
    ax1.set_ylabel('Mean Delay')
    ax1.tick_params('y', colors='C0')

    ax1.legend()
    fig.suptitle( airport + ' airport delays')
    image_name = airport+".png"         
    
    plt.savefig(image_name)
    
    # upload image to aws s3
    # warning, the ACL here is set to public-read
    s3 = boto3.client("s3")
    bucket='tpmlengineer'
    img_data = open(image_name, "rb")
    s3.put_object(Bucket=bucket, Key= 'output/' + year + '/' + image_name, Body=img_data, 
                                 ContentType="image/png")
    plt.close(fig)


In [21]:
def main(year='default'):
    if year == 'default':
        year = get_last_year()
    
    data_filename = year + '.csv'
    df = download_file(data_filename)
    df_transformed = transform_file(df)
    df_anomalies = predict_anomalies(df_transformed)
    for i in df_anomalies.origin.unique():
        graph_airport(i, df_anomalies, year)

In [None]:
main('2017')