# Get Set Up

In [None]:
import numpy as np
import pandas as pd
from sagemaker import get_execution_role
import sagemaker as sage

# Execution role for this notebook; handed to the estimator below as `role`.
role = get_execution_role()
# SageMaker session; used below to upload the prepared data to S3 and
# passed to the estimator as `sagemaker_session`.
sess = sage.Session()

# Prep the data

In [None]:

input_file = "ferretData.csv"

# Load the raw data; pandas splits on commas by default and the first row
# supplies the column names.
df = pd.read_csv(input_file, header=0)

# Categorical columns to expand into one-hot indicator columns.
categorical_columns = [
    'foodSituation',
    'groomingSituation',
    'livingSituation',
    'disposition',
]
df = pd.get_dummies(df, columns=categorical_columns)

# ferretID is not relevant for training, and disposition_nice is redundant:
# disposition_angry on its own serves as an effective label.
unneeded_columns = ['ferretID', 'disposition_nice']
df = df.drop(unneeded_columns, axis='columns')

df.head()


# Get the prepared data into S3

In [None]:
bucket_name = '<<s3 bucket name>>'
data_key = 'marketplace_logistic_regression'
prepped_data_file = 'ferretData_prepped.csv'

# Write the prepared frame to a local comma-separated file (no header row).
np.savetxt(prepped_data_file, df, delimiter=',')

# Keep the two S3 locations separate: training artifacts go under "output",
# while the training data lives under the data_key prefix. (A chained
# assignment here previously overwrote output_location with the data prefix.)
output_location = 's3://{}/{}'.format(bucket_name, 'output')
data_location = 's3://{}/{}'.format(bucket_name, data_key)
print ("Training artifacts will be uploaded at: " + output_location)
print ("And data_location will be a parameter for fit method (see training stage below).")

# Upload the prepared file to S3 under the data_key prefix.
sess.upload_data(prepped_data_file, bucket=bucket_name, key_prefix=data_key)

# Specify our hyperparameters, algorithm ARN and SageMaker Estimator

In [None]:
# The number of classes is the only hyperparameter this algorithm requires.
# Note that, per the algorithm's documentation, the labels should be the
# last column of the training data.
hyperparameters = dict(nClasses=2)

algo_subscription_arn = "<<insert subscribed algorithm ARN here>>"

# Collect the estimator configuration in one place, then build the estimator.
estimator_settings = dict(
    algorithm_arn=algo_subscription_arn,
    base_job_name="ferretDetector-LogisticRegression",
    role=role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    input_mode="File",
    output_path=output_location,
    sagemaker_session=sess,
    hyperparameters=hyperparameters,
)
ferretDetector_LogRegress = sage.algorithm.AlgorithmEstimator(**estimator_settings)

# Get to Training Already!

In [None]:
# Map the "training" channel to the S3 location of the prepared data, then
# launch the training job.
training_channels = {"training": data_location}
ferretDetector_LogRegress.fit(training_channels)
