In [38]:
# !pip install boto3
# !pip install sagemaker
import warnings, requests, zipfile, io
warnings.simplefilter('ignore')
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import train_test_split
import os
import boto3
from sagemaker.image_uris import retrieve
import sagemaker

  DEPRECATION: sagemaker is being installed using the legacy 'setup.py install' method, because it does not have a 'pyproject.toml' and the 'wheel' package is not installed. pip 23.1 will enforce this behaviour change. A possible replacement is to enable the '--use-pep517' option. Discussion can be found at https://github.com/pypa/pip/issues/8559

[notice] A new release of pip is available: 23.0.1 -> 23.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting sagemaker
  Downloading sagemaker-2.195.0.tar.gz (915 kB)
     ---------------------------------------- 0.0/915.6 kB ? eta -:--:--
     -------- ----------------------------- 204.8/915.6 kB 6.3 MB/s eta 0:00:01
     -------------------- ----------------- 491.5/915.6 kB 7.8 MB/s eta 0:00:01
     -------------------------------------- 915.6/915.6 kB 8.3 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting google-pasta
  Downloading google_pasta-0.2.0-py3-none-any.whl (57 kB)
     ---------------------------------------- 0.0/57.5 kB ? eta -:--:--
     ---------------------------------------- 57.5/57.5 kB ? eta 0:00:00
Collecting protobuf<5.0,>=3.12
  Downloading protobuf-4.24.4-cp310-abi3-win_amd64.whl (430 kB)
     ---------------------------------------- 0.0/430.5 kB ? eta -:--:--
     ------------------------------------- 430.5/430.5 kB 26.3 MB/s eta 0:00:00
Collecting smdebug_rulesconfig==1.0.1
  D

In [31]:
# Load the raw Titanic passenger data from the first sheet of the local workbook
df = pd.read_excel("../data/titanic3.xls", sheet_name=0)

### Clean the data
Unclear names are changed.  
Sex is converted into binary values.  
The parents_children and siblings_spouses columns are summarized in the binary alone column (passengers without siblings, spouses, parents, or children are considered to be alone).  
Columns with too many NaN values or meaningless info are dropped.  
The order of the columns is changed for clarity.
Note that class is a categorical column, while sex, survived, and alone are binary columns.

In [32]:
# change names
df = df.rename(columns={'pclass': 'class', 'sibsp': 'siblings_spouses', 'parch': 'parents_children', 'home.dest': 'home_destination'})
# create binary columns for sex and alone
# sex: 0 for 'male', 1 for every other value (same mapping as before, but vectorized)
df['sex'] = (df['sex'].astype(str) != 'male').astype(int)
# alone: 1 when the passenger has neither parents/children nor siblings/spouses aboard
df['alone'] = ((df['parents_children'] == 0) & (df['siblings_spouses'] == 0)).astype(int)
# fill null values for age and fare with the column mean.
# The result is assigned back instead of calling fillna(inplace=True) on a column
# selection: that chained in-place form emits a FutureWarning on pandas 2.x and
# leaves df unchanged once Copy-on-Write is enabled.
df['age'] = df['age'].fillna(df['age'].mean())
df['fare'] = df['fare'].fillna(df['fare'].mean())
# drop unused columns
df = df.drop(columns=['cabin', 'boat', 'body', 'ticket', 'name', 'home_destination', 'embarked'])
#change order of columns
new_order = ['survived', 'sex', 'age',  'alone', 'siblings_spouses', 'parents_children', 'fare', 'class']
df = df[new_order]

df.head(5)

Unnamed: 0,survived,sex,age,alone,siblings_spouses,parents_children,fare,class
0,1,1,29.0,1,0,0,211.3375,1
1,1,0,0.9167,0,1,2,151.55,1
2,0,1,2.0,0,1,2,151.55,1
3,0,0,30.0,0,1,2,151.55,1
4,0,1,25.0,0,1,2,151.55,1


## Building Models

#### Encode Categorical Data
Amazon SageMaker requires categorical data to be encoded manually. This DataFrame has three categorical columns that we treat as non-ordinal: class (3 values), sex (binary), and alone (binary). One-hot encoding is a suitable choice for these columns, so it is applied in the following cells.

In [33]:
# One-hot encode the categorical columns.
# dtype is pinned to an integer type so the dummy columns are written as 0/1 in the
# CSV files: pandas >= 2.0 would otherwise produce bool columns (True/False).
df = pd.get_dummies(df, columns=['sex', 'alone', 'class'], dtype='uint8')

# Clean up the column names
df = df.rename(columns={'sex_0': 'sex_male', 'sex_1': 'sex_female', 'alone_0': 'alone_no', 'alone_1': 'alone_yes'})

df.head()

Unnamed: 0,survived,age,siblings_spouses,parents_children,fare,sex_male,sex_female,alone_no,alone_yes,class_1,class_2,class_3
0,1,29.0,0,0,211.3375,0,1,0,1,1,0,0
1,1,0.9167,1,2,151.55,1,0,1,0,1,0,0
2,0,2.0,1,2,151.55,0,1,1,0,1,0,0
3,0,30.0,1,2,151.55,1,0,1,0,1,0,0
4,0,25.0,1,2,151.55,0,1,1,0,1,0,0


#### Split the Data
Amazon SageMaker requires us to manually split the data into a training set, a validation set, and a test set. These sets then need to be converted to CSV files. That will be done here.

In [34]:
# TODO adjust stratify and test_size
# First hold out 20% of the rows, then halve that hold-out into test and validation.
train, test_and_validate = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)
test, validate = train_test_split(
    test_and_validate,
    test_size=0.5,
    random_state=42,
)

#### Train the XGBoost model
The first step is to upload all data to AWS.

In [35]:
# I am using the bucket used in the tutorial as that one contains other required files
bucket = 'c93435a2086638l4942346t1w782520689469-labbucket-t0wjpy1mp3v2'

# Key prefix under which everything for this notebook is stored in the bucket
prefix = 'titanic'

# File names (relative to their channel folder) of the three data splits
train_file = 'training_data/train_ea.csv'
test_file = 'training_data/test_ea.csv'
validate_file = 'training_data/validate_ea.csv'

# Shared S3 resource handle used by the upload helper
s3_resource = boto3.Session().resource('s3')
def upload_s3_csv(filename, folder, dataframe):
    """Write ``dataframe`` as CSV and upload it to the notebook's S3 bucket.

    The CSV is written without a header row and without the index. The object
    key is ``<prefix>/<folder>/<filename>``, where ``prefix``, ``bucket`` and
    ``s3_resource`` are the module-level values defined in this notebook.

    Args:
        filename: Object name (may contain '/' sub-folders) below ``folder``.
        folder: Folder below ``prefix`` to upload into.
        dataframe: The pandas DataFrame to serialize.
    """
    import posixpath  # local import: S3 keys need '/' separators on every OS

    csv_buffer = io.StringIO()
    dataframe.to_csv(csv_buffer, header=False, index=False)
    # os.path.join would build the key with '\\' on Windows, creating objects
    # outside the expected "folders"; posixpath.join always joins with '/'.
    object_key = posixpath.join(prefix, folder, filename)
    s3_resource.Bucket(bucket).Object(object_key).put(Body=csv_buffer.getvalue())

In [36]:
# Upload each split to its own folder, in the order train -> test -> validate
for split_file, split_folder, split_frame in (
    (train_file, 'train', train),
    (test_file, 'test', test),
    (validate_file, 'validate', validate),
):
    upload_s3_csv(split_file, split_folder, split_frame)

NoCredentialsError: Unable to locate credentials

Now we can actually train the model.

In [None]:
# Look up the XGBoost 1.0-1 container image URI for the session's region
container = retrieve(
    framework='xgboost',
    region=boto3.Session().region_name,
    version='1.0-1',
)

In [None]:
# TODO experiment with num_round and maybe binary:hinge for the objective (uncommon choice)
# Hyperparameters handed to the XGBoost estimator; all values are given as strings
hyperparams = {
    "num_round": "42",
    "eval_metric": "auc",
    "objective": "binary:logistic",
}

In [39]:
# Set up the model
# TODO see how to experiment with this
# Model artifacts are written below <bucket>/<prefix>/output/
s3_output_location = "s3://{}/{}/output/".format(bucket, prefix)
xgb_model = sagemaker.estimator.Estimator(
    image_uri=container,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=s3_output_location,
    hyperparameters=hyperparams,
    sagemaker_session=sagemaker.Session(),
)

In [None]:
# Set up channels
# Each channel points at the S3 folder (prefix) that holds the uploaded CSV for
# that split. The format template has only two placeholders, so the file-name
# argument that used to be passed as a third value was silently ignored and has
# been removed.
train_channel = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/train/".format(bucket, prefix),
    content_type='text/csv')

validate_channel = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/validate/".format(bucket, prefix),
    content_type='text/csv')

# Channel names mapped to their inputs, as passed to the estimator's fit()
data_channels = {'train': train_channel, 'validation': validate_channel}

In [None]:
# Actually train the model on the train/validation channels defined above
xgb_model.fit(
    inputs=data_channels,
    logs=False,
)

## Predict

Here we will use our model to make predictions. We will perform a batch transform, i.e. apply the model to all of the test data at once.

In [42]:
# Save all test rows to a csv without the survived column
# The label is dropped by name rather than by position, so this stays correct
# even if the column order of the frame changes.
batch_X = test.drop(columns=['survived'])
batch_X_file = 'batch-in.csv'
upload_s3_csv(batch_X_file, 'batch-in', batch_X)


NoCredentialsError: Unable to locate credentials

In [None]:
# S3 locations of the batch-transform input file and the output folder
batch_input = "s3://{}/{}/batch-in/{}".format(bucket, prefix, batch_X_file)
batch_output = "s3://{}/{}/batch-out/".format(bucket, prefix)

# Configure the transformer
xgb_transformer = xgb_model.transformer(
    instance_count=1,
    instance_type='ml.m4.xlarge',
    strategy='MultiRecord',
    assemble_with='Line',
    output_path=batch_output,
)

# Perform a transform and block until the job has finished
xgb_transformer.transform(
    data=batch_input,
    data_type='S3Prefix',
    content_type='text/csv',
    split_type='Line',
)
xgb_transformer.wait()

In [None]:
# Fetch the batch-transform result object from Amazon S3
s3 = boto3.client('s3')
obj = s3.get_object(
    Bucket=bucket,
    Key="{}/batch-out/{}".format(prefix, 'batch-in.csv.out'),
)

# Parse the downloaded bytes into a one-column DataFrame named 'class'
target_predicted = pd.read_csv(
    io.BytesIO(obj['Body'].read()),
    sep=',',
    names=['class'],
)
target_predicted.head(5)

In [None]:
# TODO experiment with the threshold
def binary_convert(x, threshold=0.65):
    """Convert a predicted score into a binary label.

    Args:
        x: The predicted score to convert.
        threshold: Cut-off value; scores strictly greater than it map to 1.
            Defaults to 0.65, the value that was previously hard-coded.

    Returns:
        1 if ``x > threshold``, otherwise 0.
    """
    return 1 if x > threshold else 0

# Add the thresholded 0/1 prediction next to the raw model output
target_predicted['binary'] = target_predicted['class'].map(binary_convert)

# Show the first predictions, then the matching test rows for comparison
print(target_predicted.head(10))
test.head(10)

## Evaluation 

## Tuning the Model