In [8]:
!pip install xlrd
import warnings, requests, zipfile, io
warnings.simplefilter('ignore')
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import os
import boto3
from sagemaker.image_uris import retrieve
import sagemaker
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve, auc

Collecting xlrd
  Downloading xlrd-2.0.1-py2.py3-none-any.whl (96 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.5/96.5 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xlrd
Successfully installed xlrd-2.0.1


In [43]:
# Get the data from github
github_url = 'https://github.com/mjochen/CloudAI/raw/master/Exercises/files/titanic3.xls'
response = requests.get(github_url)

# Check if the request was successful
if response.status_code == 200:
    excel_data = io.BytesIO(response.content)
    df = pd.read_excel(excel_data, engine='xlrd')
else:
    print("Failed to retrieve the Excel file from GitHub.")

### Clean the data
Unclear names are changed.  
Sex is converted into binary values.  
The parents_children and siblings_spouses columns are summarized in the binary alone column (passengers without siblings, spouses, parents, or children are considered to be alone).  
Columns with too many NaN values or meaningless info are dropped.  
The order of the columns is changed for clarity.
It should be noted that class is a categorical column and both sex, survived, and alone are binary columns.

In [44]:
# change names
df = df.rename(columns={'pclass': 'class', 'sibsp': 'siblings_spouses', 'parch': 'parents_children', 'home.dest': 'home_destination'})
# create binary columns for sex and alone
df['sex'] = df['sex'].apply(lambda x: 0 if str(x) == 'male' else 1)
df['alone'] = df.apply(lambda row: 1 if row['parents_children'] == 0 and row['siblings_spouses'] == 0 else 0, axis=1)
# fill null values for age and fare
df['age'].fillna(value=df['age'].mean(), inplace=True)
df['fare'].fillna(value=df['fare'].mean(), inplace=True)
# drop unused columns
df.drop(['cabin', 'boat', 'body', 'ticket', 'name', 'home_destination', 'embarked'], axis=1, inplace=True)
#change order of columns
new_order = ['sex', 'age', 'survived', 'alone', 'siblings_spouses', 'parents_children', 'fare', 'class']
df = df[new_order]

df.head(5)

Unnamed: 0,sex,age,survived,alone,siblings_spouses,parents_children,fare,class
0,1,29.0,1,1,0,0,211.3375,1
1,0,0.9167,1,0,1,2,151.55,1
2,1,2.0,0,0,1,2,151.55,1
3,0,30.0,0,0,1,2,151.55,1
4,1,25.0,0,0,1,2,151.55,1


In [45]:
df = pd.get_dummies(df, columns=['sex', 'alone', 'class'])

# Clean up the column names
df = df.rename(columns={'sex_0': 'sex_male', 'sex_1': 'sex_female', 'alone_0': 'alone_no', 'alone_1': 'alone_yes'})
df.drop('fare', axis=1, inplace=True)
df.head()

Unnamed: 0,age,survived,siblings_spouses,parents_children,sex_male,sex_female,alone_no,alone_yes,class_1,class_2,class_3
0,29.0,1,0,0,False,True,False,True,True,False,False
1,0.9167,1,1,2,True,False,True,False,True,False,False
2,2.0,0,1,2,False,True,True,False,True,False,False
3,30.0,0,1,2,True,False,True,False,True,False,False
4,25.0,0,1,2,False,True,True,False,True,False,False


In [46]:
train, test_and_validate = train_test_split(df, test_size=0.2, random_state=42)
test, validate = train_test_split(test_and_validate, test_size=0.5, random_state=42)

In [47]:
bucket='c93435a2086654l5165132t1w1437246309-sandboxbucket-1iljc8bta520y'
prefix='titanic'

In [48]:
train_file='training_data/train_js.csv'
test_file='training_data/test_js.csv'
validate_file='training_data/validate_js.csv'

s3_resource = boto3.Session().resource('s3')
def upload_s3_csv(filename, folder, dataframe):
    csv_buffer = io.StringIO()
    dataframe.to_csv(csv_buffer, header=False, index=False)
    s3_resource.Bucket(bucket).Object(os.path.join(prefix, folder, filename)).put(Body=csv_buffer.getvalue())

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [49]:
upload_s3_csv(train_file, 'train', train)
upload_s3_csv(test_file, 'test', test)
upload_s3_csv(validate_file, 'validate', validate)

In [50]:
container = retrieve('xgboost',boto3.Session().region_name,'1.0-1')

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.


In [51]:
hyperparams={"num_round":"48",
             "eval_metric": "auc",
             "objective": "binary:hinge"}

In [52]:
s3_output_location="s3://{}/{}/output/".format(bucket,prefix)
xgb_model=sagemaker.estimator.Estimator(container,
                                       sagemaker.get_execution_role(),
                                       instance_count=1,
                                       instance_type='ml.m4.xlarge',
                                       output_path=s3_output_location,
                                        hyperparameters=hyperparams,
                                        sagemaker_session=sagemaker.Session())

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [53]:
train_channel = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/train/".format(bucket,prefix,train_file),
    content_type='text/csv')

validate_channel = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/validate/".format(bucket,prefix,validate_file),
    content_type='text/csv')

data_channels = {'train': train_channel, 'validation': validate_channel}

In [54]:
xgb_model.fit(inputs=data_channels, logs=False)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-11-02-15-25-21-996



2023-11-02 15:25:22 Starting - Starting the training job......
2023-11-02 15:25:58 Starting - Preparing the instances for training...............
2023-11-02 15:27:22 Downloading - Downloading input data.....
2023-11-02 15:27:52 Training - Downloading the training image........
2023-11-02 15:28:38 Training - Training image download completed. Training in progress.....
2023-11-02 15:29:03 Uploading - Uploading generated training model.
2023-11-02 15:29:10 Failed - Training job failed


UnexpectedStatusException: Error for Training job sagemaker-xgboost-2023-11-02-15-25-21-996: Failed. Reason: AlgorithmError: framework error: 
Traceback (most recent call last):
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_xgboost_container/algorithm_mode/train.py", line 226, in train_job
    verbose_eval=False)
  File "/miniconda3/lib/python3.7/site-packages/xgboost/training.py", line 209, in train
    xgb_model=xgb_model, callbacks=callbacks)
  File "/miniconda3/lib/python3.7/site-packages/xgboost/training.py", line 84, in _train_internal
    bst_eval_set = bst.eval_set(evals, i, feval)
  File "/miniconda3/lib/python3.7/site-packages/xgboost/core.py", line 1314, in eval_set
    ctypes.byref(msg)))
  File "/miniconda3/lib/python3.7/site-packages/xgboost/core.py", line 189, in _check_call
    raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: [15:28:55] /workspace/src/metric/rank_metric.cc:212: Check failed: dat[1] > 0.0f (0 vs. 0) : AUC: the dataset only contains pos or neg samples
Stack trace:
  [bt] (0) /miniconda3/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(dml

In [None]:
columns_to_convert = ['sex_male', 'sex_female', 'alone_no', 'alone_yes', 'class_1', 'class_2', 'class_3']
test[columns_to_convert] = test[columns_to_convert].applymap(lambda x: int(x))

# Test data is saved in a csv and uploaded to the bucket
batch_X = test.iloc[:,1:];
batch_X_file='batch-in.csv'
upload_s3_csv(batch_X_file, 'batch-in', batch_X)