In [1]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image
from IPython.display import display
from time import gmtime, strftime
from sagemaker.predictor import csv_serializer

# Define IAM role
role = get_execution_role()
my_region = boto3.session.Session().region_name # set the region of the instance


print("Success")

Matplotlib is building the font cache; this may take a moment.


Success


In [34]:
xgboost_container = sagemaker.image_uris.retrieve("xgboost", my_region, "latest")

In [4]:
bucket_name = 'insurancedatakmp' 
s3 = boto3.resource('s3')
try:
    if  my_region == 'us-east-1':
      s3.create_bucket(Bucket=bucket_name)
    else: 
      s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={ 'LocationConstraint': my_region })
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error: ',e)

S3 bucket created successfully


In [6]:
try:
  raw_data = pd.read_csv('./Dummy-Data.csv',index_col=0)
  print('Success: Data loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)
    

Success: Data loaded into dataframe.


In [12]:
#preprocess raw data
#convert height into meters   
raw_data["height"] = (((raw_data["Ht"]%100) + ((raw_data["Ht"]//100)*12))*0.0254)
raw_data["weight"] = (raw_data["Wt"]/2.2)
raw_data["bmi"] = raw_data["weight"]/(raw_data["height"]**2)

def insurance_quote(data):
    if  (data["Ins_Age"]>17 and data["Ins_Age"]<40) and (data["bmi"]<17.49 or data["bmi"]>38.5 ):
        quote = 750
    elif (data["Ins_Age"]>39 and data["Ins_Age"]<60) and (data["bmi"]<18.49 or data["bmi"]>38.5 ):
        quote = 1000
    elif (data["Ins_Age"]>60) and (data["bmi"]<18.49 or data["bmi"]>45.5 ):
        quote = 2000
    else:
        quote = 500
    if (data["Ins_Gender"] == "Female"):
        quote = 0.9*quote
    return quote 
    


raw_data["quote"] = raw_data.apply(insurance_quote, axis=1)

raw_data.head(5)

Unnamed: 0_level_0,Ins_Age,Ins_Gender,Ht,Wt,IssueDate,height,weight,bmi,quote
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
56372,31,Male,510,185,,1.778,84.090909,26.600239,500.0
34565,35,Male,510,205,,1.778,93.181818,29.47594,500.0
57732,45,Female,510,125,,1.778,56.818182,17.973134,900.0
87324,38,Male,503,175,,1.6002,79.545455,31.064677,500.0
12323,39,Female,600,252,,1.8288,114.545455,34.248806,450.0


In [23]:
#converting gender into number for training
raw_data['Ins_Gender'].replace({'Female':1,'Male':0},inplace=True)
#dropping unwanted collumns Ht Wt IssueDate
preprocessed_traintest_data = raw_data.drop(columns=['Ht', 'Wt', 'IssueDate'])
preprocessed_traintest_data.head(2)

Unnamed: 0_level_0,Ins_Age,Ins_Gender,height,weight,bmi,quote
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
56372,31,0,1.778,84.090909,26.600239,500.0
34565,35,0,1.778,93.181818,29.47594,500.0


Now training a XGB model for predicting BMI from age gender and quote

In [20]:
#dropping height and weight
quote_traintest = preprocessed_traintest_data.drop(preprocessed_traintest_data.columns[[2, 3]], axis=1)
quote_traintest.head(1)

Unnamed: 0_level_0,Ins_Age,Ins_Gender,bmi,quote
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
56372,31,0,26.600239,500.0


In [41]:
train_data, test_data = np.split(quote_traintest.sample(frac=1, random_state=1729), [int(0.7 * len(quote_traintest))])
print(train_data.shape, test_data.shape)
train_data.head

(70, 4) (30, 4)


<bound method NDFrame.head of        Ins_Age  Ins_Gender        bmi  quote
AppID                                       
45764       31           1  26.441992  450.0
24435       33           0  31.224404  500.0
64574       25           0  21.153593  500.0
34523       32           0  16.526245  750.0
99946       30           0  21.331436  500.0
...        ...         ...        ...    ...
99452       46           0  23.797792  500.0
99918       19           0  20.964497  500.0
53242       28           1  25.140621  450.0
12323       39           1  34.248806  450.0
99946       40           0  31.952239  500.0

[70 rows x 4 columns]>

In [42]:
pd.concat([train_data['quote'], train_data.drop(['quote'], axis=1)], axis=1).to_csv('train.csv',sep=',', index=False, header=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object('./train.csv').upload_file('train.csv')
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/./'.format(bucket_name), content_type='csv')

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [43]:
sess = sagemaker.Session()
xgb = sagemaker.estimator.Estimator(xgboost_container,role, instance_count=1, instance_type='ml.m4.xlarge',output_path='s3://{}/./'.format(bucket_name),sagemaker_session=sess)
xgb.set_hyperparameters(max_depth=5,eta=0.2,gamma=4,min_child_weight=6,subsample=0.8,silent=0,objective='multi:softmax',num_class=8, num_round=100)

In [44]:
xgb.fit({'train': s3_input_train})

INFO:sagemaker:Creating training-job with name: xgboost-2023-06-01-11-56-15-151


2023-06-01 11:56:15 Starting - Starting the training job...
2023-06-01 11:56:40 Starting - Preparing the instances for training......
2023-06-01 11:57:43 Downloading - Downloading input data......
2023-06-01 11:58:28 Training - Downloading the training image...
2023-06-01 11:59:16 Uploading - Uploading generated training model
2023-06-01 11:59:16 Failed - Training job failed
[34mArguments: train[0m
[34m[2023-06-01:11:59:02:INFO] Running standalone xgboost training.[0m
[34m[2023-06-01:11:59:02:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2023-06-01:11:59:02:INFO] File size need to be processed in the node: 0.75mb. Available memory size in the node: 8579.62mb[0m
[34m[2023-06-01:11:59:02:ERROR] Customer Error: Could not determine delimiter on line :[0m
[34mCould not determine delimiter[0m
[34mTraceback (most recent call last):
  File "/opt/amazon/lib/python3.7/site-packages/sage_xgboost/train_helper.py", line 134, in get_csv_dmatrix
    delimiter = csv.Sni

UnexpectedStatusException: Error for Training job xgboost-2023-06-01-11-56-15-151: Failed. Reason: ClientError: Could not determine delimiter on line :
Could not determine delimiter, exit code: 1