# Imports for model running

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import sagemaker
import boto3

import io
import sagemaker.amazon.common as smac

import os

### Import dataset

In [None]:
# Load the raw dataset into a DataFrame; the relative path is resolved
# against the current working directory
insurance_df = pd.read_csv('Data for HT.csv')

In [None]:
# Preview the first 5 rows of the data (head() returns 5 rows by default)
insurance_df.head()

In [None]:
# Count the missing (null / NaN) entries in every column of the dataset
insurance_df.isna().sum()

### Dataset Preprocessing and Data Augmentation

##### Since some columns contain non-numerical data, convert those columns to numerical codes that represent the original values

##### GENDER
[Male = 0 | Female = 1]

In [None]:
# Encode Gender as Male = 0 and any other recorded value = 1, while leaving
# missing values untouched. Previously a blank cell compared unequal to 'Male'
# and was silently encoded as 1, so the dropna() performed later in the
# notebook could never remove rows with no recorded gender.
insurance_df['Gender'] = insurance_df['Gender'].apply(
    lambda x: x if pd.isna(x) else (0 if x == 'Male' else 1)
)

##### Smoking Status
[never smoked = 0 | formerly smoked = 1 | smokes = 2 | unknown = 3]

In [None]:
def condition_s(j):
    """Encode a smoking-status label as an integer code.

    'never smoked' -> 0, 'formerly smoked' -> 1, 'smokes' -> 2.
    Every other value (any unrecognised label) -> 3.
    """
    known_labels = ('never smoked', 'formerly smoked', 'smokes')
    # The position of a label in the tuple is its code.
    for code, label in enumerate(known_labels):
        if j == label:
            return code
    return 3


# Replace each smoking-status label with the integer code returned by condition_s
insurance_df['Smoking Status'] = insurance_df['Smoking Status'].apply(condition_s)

##### Alcohol (Freq)
[No = 0 | Rare = 1 | Daily = 2]

In [None]:
def condition_al(al):
    """Encode an alcohol-frequency label as an integer code.

    'No' -> 0, 'Rare' -> 1, any other non-missing value -> 2 (Daily).

    Missing values (None / NaN) are returned unchanged instead of being
    encoded as 2, so rows without a recorded value stay missing and can be
    removed by ``dropna`` rather than being counted as daily drinkers.
    """
    if pd.isna(al):
        return al
    if al == 'No':
        return 0
    if al == 'Rare':
        return 1
    return 2


# Replace each alcohol-frequency label with the integer code returned by condition_al
insurance_df['Alcohol (Freq)'] = insurance_df['Alcohol (Freq)'].apply(condition_al)

##### Exercise Intensity 
[None = 0 | Moderate = 1 | Extreme = 2]

In [None]:
def condition_i(i):
    """Encode an exercise-intensity label as an integer code.

    'No' or 'None' -> 0, 'Moderate' -> 1, any other non-missing value -> 2
    (Extreme). Both spellings of the lowest level are accepted because the
    notebook documents it as "None" while the data was matched against 'No'.

    Missing values (None / NaN) are returned unchanged instead of being
    encoded as 2, so rows without a recorded value stay missing and can be
    removed by ``dropna`` rather than being counted as extreme exercisers.
    """
    # NOTE(review): recent pandas versions parse the literal text "None" in a
    # CSV as a missing value by default - if the file stores the lowest level
    # that way, confirm how it should be read (e.g. keep_default_na=False).
    if pd.isna(i):
        return i
    if i in ('No', 'None'):
        return 0
    if i == 'Moderate':
        return 1
    return 2


# Replace each exercise-intensity label with the integer code returned by condition_i
insurance_df['Exercise Intensity'] = insurance_df['Exercise Intensity'].apply(condition_i)

##### Drop the columns that are not relevant for fitting the model or have not been obtained

In [None]:
# Remove, in a single call, every column that is not used to fit the model
insurance_df = insurance_df.drop(
    columns=[
        'Exercise Type',
        'Av. Sleep per Day',
        'Av. Daily Kcal In',
        'Drugs Freq()',
        'Av. Daily Kcal Burn',
        'Assured Cost',
    ]
)

### Drop the rows that would cause issues when training the model (blank or NaN values)

In [None]:
# Keep only the rows that have no missing value in any remaining column
insurance_df = insurance_df.dropna()

### Create the testing and training data for the model at a 20-80 split respectively

In [None]:
# Feature matrix: every remaining column except the insurance cost, as float32
X = np.array(insurance_df.drop(columns=['Insurance Cost']), dtype='float32')

# Ground truth: the insurance cost belonging to each row of X, stored as a
# float32 column vector of shape (n, 1)
y = np.array(insurance_df['Insurance Cost'], dtype='float32').reshape(-1, 1)

# Hold out 20% of the rows for testing; the other 80% are used for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Start a SageMaker session and define where data and model output live in S3

sagemaker_session = sagemaker.Session()

################# MUST BE CHANGED FOR FINAL IMPLEMENTATION #####################
# S3 bucket name and key prefix used to build the training-data and output
# locations in the cells below
bucket = 'sagemaker-hacker-2'
prefix = 'linear_learner'

# Execution role of the current environment; it is passed to the estimator
role = sagemaker.get_execution_role()
print(role)

### Ensure y_train is in vector format

In [None]:
# Flatten y_train from an (n, 1) column into a 1-D vector of labels.
# reshape(-1) gives the same result as y_train[:, 0] for a single-column
# array, and it still works if this cell is re-run after y_train has already
# been flattened (indexing a 1-D array with [:, 0] raises IndexError).
y_train = y_train.reshape(-1)

#### Create a buffer for the training data to go to

In [None]:
# In-memory binary buffer that will hold the serialized training set
buf = io.BytesIO()
# Write the training features together with their labels into the buffer
smac.write_numpy_to_dense_tensor(buf, X_train, y_train)
# Rewind to the start so the upload in the next cell reads from the beginning
buf.seek(0)

##### Store the training data in a specified folder #PLACEHOLDER - hack-train-data# in a specified s3 bucket

In [None]:
# Object name of the training file stored under <prefix>/train/ in the bucket
key = 'hack-train-data'

# Build the S3 object key with explicit '/' separators. os.path.join uses the
# separator of the host OS (a backslash on Windows), which would store the
# object under a different key than the s3:// URI reported below.
boto3.resource('s3').Bucket(bucket).Object('{}/train/{}'.format(prefix, key)).upload_fileobj(buf)

# URI of the uploaded training data, handed to the estimator's 'train' channel
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
print('uploaded training data location: {}'.format(s3_train_data))

In [None]:
# S3 location the estimator is configured to write its output to
output_location = 's3://{}/{}/output'.format(bucket, prefix)
# The previous message read "Training uploaded will be uploaded to ...",
# which was garbled; the location is where the training output is written.
print('Training artifacts will be uploaded to {}'.format(output_location))

In [None]:
# Look up the container image of SageMaker's built-in linear-learner
# algorithm for the region of the current boto3 session

from sagemaker.amazon.amazon_estimator import get_image_uri

container = get_image_uri(boto3.Session().region_name, 'linear-learner')

In [None]:
# Configure the training job: a single ml.m5.4xlarge instance running the
# linear-learner container, writing its output to output_location.
# NOTE(review): this cell used to be commented as "use ml.t3.medium as it is
# part of the free tier", which does not match the configured instance type -
# confirm which instance type is actually intended before running.

linear = sagemaker.estimator.Estimator(container,
                                       role,
                                       train_instance_count = 1,
                                       train_instance_type = 'ml.m5.4xlarge',
                                       output_path = output_location,
                                       sagemaker_session = sagemaker_session)

# feature_dim must equal the number of feature columns, so it is read from the
# training matrix instead of being hard-coded (it was fixed at 11), which would
# break training as soon as a column is added or dropped.
# num_models asks the algorithm to train 32 candidate models so that the best
# one can be selected.
linear.set_hyperparameters(feature_dim = X_train.shape[1],
                          predictor_type = 'regressor',
                          mini_batch_size = 100,
                          epochs = 100,
                          num_models = 32,
                          loss = 'absolute_loss')

# Launch the training job on the data uploaded to the 'train' channel
linear.fit({'train': s3_train_data})

#### Showcase the prediction results of the model

In [None]:
# Deploy the trained model to a real-time endpoint and score the held-out test
# set. These steps were missing: `result` (and the `linear_regressor` predictor
# that is deleted at the end of the notebook) was never defined in the visible
# cells, so this cell raised NameError.
# NOTE(review): the hosting instance type below is an assumption - adjust it to
# whatever the account/budget allows.
from sagemaker.predictor import csv_serializer, json_deserializer

linear_regressor = linear.deploy(initial_instance_count = 1,
                                 instance_type = 'ml.m5.large')

# Send the features as CSV and parse the endpoint's JSON response
linear_regressor.content_type = 'text/csv'
linear_regressor.serializer = csv_serializer
linear_regressor.deserializer = json_deserializer

result = linear_regressor.predict(X_test)

# Each returned prediction is a dict; keep only its 'score' value
predictions = np.array([r['score'] for r in result['predictions']])

In [None]:
# Set up the variables used for prediction analysis.
# No scaler is fitted in the visible cells of this notebook (the target is
# passed to training in its raw units), so the calls to the undefined
# `scaler_y.inverse_transform` raised NameError and have been removed: the
# predictions and the ground truth are already on the original scale.
# Both are shaped as (n, 1) columns so that they line up element by element.
y_predict_orig = np.asarray(predictions).reshape(-1, 1)
y_test_orig = np.asarray(y_test).reshape(-1, 1)

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt

# Mean Squared Error (computed once and reused for the RMSE below)
MSE = mean_squared_error(y_test_orig, y_predict_orig)
# Root Mean Squared Error, rounded to 3 decimal places
RMSE = round(sqrt(MSE), 3)
# Mean Absolute Error
MAE = mean_absolute_error(y_test_orig, y_predict_orig)
# R-squared score
r2 = r2_score(y_test_orig, y_predict_orig)

# TERMINATE THE MODEL TO STOP RESOURCES BEING USED

In [None]:
# Delete the endpoint so that it stops consuming resources.
# NOTE(review): `linear_regressor` is not assigned in the visible cells -
# presumably it is the predictor returned when the model was deployed; confirm
# that the deployment cell exists and has been run before this one.
linear_regressor.delete_endpoint()