In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

<h2>Simple Regression Dataset - Straight Line</h2>

Input Feature: X

Target: 5*X + 8 + some noise

Objective: Train a model to predict target for a given X

# Straight Line Function
def straight_line(x):
    """Return the noise-free target value 5*x + 8 for input x.

    x may be any value that supports multiplication and addition with
    ints; in this notebook it is applied element-wise via Series.map.
    """
    slope, intercept = 5, 8
    return slope * x + intercept

straight_line(25)

straight_line(1.254)

np.random.seed(5)

samples = 150
x = pd.Series(np.arange(0,150))
y = x.map(straight_line) + np.random.randn(samples)*10

df = pd.DataFrame({'x':x,'y':y})

df.head()

# Correlation will indicate how strongly features are related to the output
df.corr()

plt.plot(df.x,df.y,label='Target')
plt.grid(True)
plt.xlabel('Input Feature')
plt.ylabel('Target')
plt.legend()
plt.show()

# Save all data
df.to_csv('linear_all.csv',index=False,
          columns=['x','y'])

<h2>SageMaker Convention for Training and Validation files</h2>

CSV File Column order: y (noisy target), x

Training, Validation files do not have a column header

# Training = 70% of the data
# Validation = 30% of the data
# Randomize the dataset
np.random.seed(5)
l = list(df.index)
np.random.shuffle(l)
df = df.iloc[l]

df.head()

rows = df.shape[0]
train = int(.7 * rows)
test = rows - train

print(rows, train, test)

# Write Training Set
df[:train].to_csv('linear_train.csv',index=False,header=False,columns=['y','x'])

# Write Validation Set
df[train:].to_csv('linear_validation.csv',index=False,header=False,columns=['y','x'])



<h2>Simple Regression Dataset - Linear Regression vs XGBoost</h2>

Model is trained with XGBoost installed in notebook instance

In the later examples, we will train using SageMaker's XGBoost algorithm.

Training on SageMaker takes several minutes (even for simple dataset).

If an algorithm is supported in Python, we will try it locally on the notebook instance

This allows us to quickly learn an algorithm, understand tuning options and then finally train on SageMaker Cloud

In this exercise, let's compare XGBoost and Linear Regression for simple regression dataset

# Install xgboost in notebook instance.
#### Command to install xgboost
!pip install xgboost==1.2

import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error


# XGBoost
import xgboost as xgb
# Linear Regression
from sklearn.linear_model import LinearRegression

# All data
df = pd.read_csv('linear_all.csv')

df.head()

plt.plot(df.x,df.y,label='Target')
plt.grid(True)
plt.xlabel('Input Feature')
plt.ylabel('Target')
plt.legend()
plt.title('Simple Regression Dataset')
plt.show()

# Let's load Training and Validation Datasets
train_file = 'linear_train.csv'
validation_file = 'linear_validation.csv'

# Specify the column names as the file does not have column header
df_train = pd.read_csv(train_file,names=['y','x'])
df_validation = pd.read_csv(validation_file,names=['y','x'])

df_train.head()

df_validation.head()

plt.scatter(df_train.x,df_train.y,label='Training',marker='.')
plt.scatter(df_validation.x,df_validation.y,label='Validation',marker='.')
plt.grid(True)
plt.xlabel('Input Feature')
plt.ylabel('Target')
plt.title('Simple Regression Dataset')
plt.legend()
plt.show()

# Split each frame into a feature matrix (all columns after the first)
# and a 1-D target array (the first column).
X_train = df_train.iloc[:, 1:]  # Features: 1st column onwards
# Series.ravel() is deprecated in recent pandas; to_numpy() returns the
# same 1-D numpy array.
y_train = df_train.iloc[:, 0].to_numpy()  # Target: 0th column

X_validation = df_validation.iloc[:, 1:]
y_validation = df_validation.iloc[:, 0].to_numpy()

# Create an instance of XGBoost Regressor
# XGBoost Training Parameter Reference:
#   https://github.com/dmlc/xgboost/blob/master/doc/parameter.md
regressor = xgb.XGBRegressor()

# Default Options
regressor

# Train the model
# Provide Training Dataset and Validation Dataset
# XGBoost reports training and validation error
regressor.fit(X_train,y_train, eval_set = [(X_train, y_train), (X_validation, y_validation)])

# Get the Training RMSE and Evaluation RMSE
eval_result = regressor.evals_result()

eval_result

training_rounds = range(len(eval_result['validation_0']['rmse']))

print(training_rounds)

plt.scatter(x=training_rounds,y=eval_result['validation_0']['rmse'],label='Training Error')
plt.scatter(x=training_rounds,y=eval_result['validation_1']['rmse'],label='Validation Error')
plt.grid(True)
plt.xlabel('Iterations')
plt.ylabel('RMSE')
plt.title('XGBoost Training Vs Validation Error')
plt.legend()
plt.show()

xgb.plot_importance(regressor)
plt.show()

## Validation Dataset Compare Actual and Predicted

result = regressor.predict(X_validation)

result[:5]

plt.title('XGBoost - Validation Dataset')
plt.scatter(df_validation.x,df_validation.y,label='actual',marker='.')
plt.scatter(df_validation.x,result,label='predicted',marker='.')
plt.grid(True)
plt.legend()
plt.show()

# RMSE Metrics
print('XGBoost Algorithm Metrics')
mse = mean_squared_error(df_validation.y,result)
print(" Mean Squared Error: {0:.2f}".format(mse))
print(" Root Mean Square Error: {0:.2f}".format(mse**.5))

# Residual
# Over prediction and Under Prediction needs to be balanced
# Validation Data Residuals
residuals = df_validation.y - result
plt.hist(residuals)
plt.grid(True)
plt.xlabel('Actual - Predicted')
plt.ylabel('Count')
plt.title('XGBoost Residual')
plt.axvline(color='r')
plt.show()

# Count number of values greater than zero and less than zero
value_counts = (residuals > 0).value_counts(sort=False)

print(' Under Estimation: {0}'.format(value_counts[True]))
print(' Over  Estimation: {0}'.format(value_counts[False]))

# Plot for entire dataset
plt.plot(df.x,df.y,label='Target')
plt.plot(df.x,regressor.predict(df[['x']]) ,label='Predicted')
plt.grid(True)
plt.xlabel('Input Feature')
plt.ylabel('Target')
plt.legend()
plt.title('XGBoost')
plt.show()

## Linear Regression Algorithm

lin_regressor = LinearRegression()

lin_regressor.fit(X_train,y_train)

Compare Weights assigned by Linear Regression.

Original Function: 5*x + 8 + some noise


lin_regressor.coef_

lin_regressor.intercept_

result = lin_regressor.predict(df_validation[['x']])

plt.title('LinearRegression - Validation Dataset')
plt.scatter(df_validation.x,df_validation.y,label='actual',marker='.')
plt.scatter(df_validation.x,result,label='predicted',marker='.')
plt.grid(True)
plt.legend()
plt.show()

# RMSE Metrics
print('Linear Regression Metrics')
mse = mean_squared_error(df_validation.y,result)
print(" Mean Squared Error: {0:.2f}".format(mse))
print(" Root Mean Square Error: {0:.2f}".format(mse**.5))

# Residual
# Over prediction and Under Prediction needs to be balanced
# Validation Data Residuals
residuals = df_validation.y - result
plt.hist(residuals)
plt.grid(True)
plt.xlabel('Actual - Predicted')
plt.ylabel('Count')
plt.title('Linear Regression Residual')
plt.axvline(color='r')
plt.show()

# Count number of values greater than zero and less than zero
value_counts = (residuals > 0).value_counts(sort=False)

print(' Under Estimation: {0}'.format(value_counts[True]))
print(' Over  Estimation: {0}'.format(value_counts[False]))

# Plot for entire dataset
plt.plot(df.x,df.y,label='Target')
plt.plot(df.x,lin_regressor.predict(df[['x']]) ,label='Predicted')
plt.grid(True)
plt.xlabel('Input Feature')
plt.ylabel('Target')
plt.legend()
plt.title('LinearRegression')
plt.show()

<h2>Input Features - Outside range used for training</h2>

XGBoost Prediction has an upper and lower bound (applies to tree based algorithms)

Linear Regression extrapolates

# True Function
def straight_line(x):
    """Return the true (noise-free) line 5*x + 8 evaluated at x.

    Accepts scalars as well as numpy arrays, in which case the result is
    computed element-wise.
    """
    scaled = 5 * x
    return scaled + 8

# X is outside range of training samples
X = np.array([-100,-5,160,1000,5000])
y = straight_line(X)

df_tmp = pd.DataFrame({'x':X,'y':y})
df_tmp['xgboost']=regressor.predict(df_tmp[['x']])
df_tmp['linear']=lin_regressor.predict(df_tmp[['x']])

df_tmp

# XGBoost Predictions have an upper bound and lower bound
# Linear Regression Extrapolates
plt.scatter(df_tmp.x,df_tmp.y,label='Actual',color='r')
plt.plot(df_tmp.x,df_tmp.linear,label='LinearRegression')
plt.plot(df_tmp.x,df_tmp.xgboost,label='XGBoost')
plt.legend()
plt.xlabel('X')
plt.ylabel('y')
plt.title('Input Outside Range')
plt.show()

# X is inside range of training samples
X = np.array([0,1,3,5,7,9,11,15,18,125])
y = straight_line(X)

df_tmp = pd.DataFrame({'x':X,'y':y})
df_tmp['xgboost']=regressor.predict(df_tmp[['x']])
df_tmp['linear']=lin_regressor.predict(df_tmp[['x']])

df_tmp

# XGBoost Predictions have an upper bound and lower bound
# Linear Regression Extrapolates
plt.scatter(df_tmp.x,df_tmp.y,label='Actual',color='r')
plt.plot(df_tmp.x,df_tmp.linear,label='LinearRegression')
plt.plot(df_tmp.x,df_tmp.xgboost,label='XGBoost')
plt.legend()
plt.xlabel('X')
plt.ylabel('y')
plt.title('Input within range')
plt.show()

<h2>Summary</h2>

1. Use sagemaker notebook as your own server on the cloud
2. Install python packages
3. Train directly on SageMaker Notebook (for small datasets, it takes a few seconds).
4. Once happy with algorithm and performance, you can train on sagemaker cloud (takes several minutes even for small datasets)
5. Not all algorithms are available for installation (for example: AWS algorithms like DeepAR are available only in SageMaker)
6. In this exercise, we installed XGBoost and compared performance of XGBoost model and Linear Regression


Quadratic Training
<h2>Quadratic Regression Dataset - Linear Regression vs XGBoost</h2>

Model is trained with XGBoost installed in notebook instance

In the later examples, we will train using SageMaker's XGBoost algorithm.

Training on SageMaker takes several minutes (even for simple dataset).

If an algorithm is supported in Python, we will try it locally on the notebook instance

This allows us to quickly learn an algorithm, understand tuning options and then finally train on SageMaker Cloud

In this exercise, let's compare XGBoost and Linear Regression for Quadratic regression dataset

# Install xgboost in notebook instance.
#### Command to install xgboost
!pip install xgboost==1.2

import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error


# XGBoost
import xgboost as xgb
# Linear Regression
from sklearn.linear_model import LinearRegression

df = pd.read_csv('quadratic_all.csv')

df.head()

plt.plot(df.x,df.y,label='Target')
plt.grid(True)
plt.xlabel('Input Feature')
plt.ylabel('Target')
plt.legend()
plt.title('Quadratic Regression Dataset')
plt.show()

train_file = 'quadratic_train.csv'
validation_file = 'quadratic_validation.csv'

# Specify the column names as the file does not have column header
df_train = pd.read_csv(train_file,names=['y','x'])
df_validation = pd.read_csv(validation_file,names=['y','x'])

df_train.head()

df_validation.head()

plt.scatter(df_train.x,df_train.y,label='Training',marker='.')
plt.scatter(df_validation.x,df_validation.y,label='Validation',marker='.')
plt.grid(True)
plt.xlabel('Input Feature')
plt.ylabel('Target')
plt.title('Quadratic Regression Dataset')
plt.legend()
plt.show()

# Split each frame into a feature matrix (all columns after the first)
# and a 1-D target array (the first column).
X_train = df_train.iloc[:, 1:]  # Features: 1st column onwards
# Series.ravel() is deprecated in recent pandas; to_numpy() returns the
# same 1-D numpy array.
y_train = df_train.iloc[:, 0].to_numpy()  # Target: 0th column

X_validation = df_validation.iloc[:, 1:]
y_validation = df_validation.iloc[:, 0].to_numpy()

# Create an instance of XGBoost Regressor
# XGBoost Training Parameter Reference:
#   https://github.com/dmlc/xgboost/blob/master/doc/parameter.md
regressor = xgb.XGBRegressor()

regressor

regressor.fit(X_train,y_train, eval_set = [(X_train, y_train), (X_validation, y_validation)])

eval_result = regressor.evals_result()

training_rounds = range(len(eval_result['validation_0']['rmse']))

plt.scatter(x=training_rounds,y=eval_result['validation_0']['rmse'],label='Training Error')
plt.scatter(x=training_rounds,y=eval_result['validation_1']['rmse'],label='Validation Error')
plt.grid(True)
plt.xlabel('Iteration')
plt.ylabel('RMSE')
plt.title('Training Vs Validation Error')
plt.legend()
plt.show()

xgb.plot_importance(regressor)
plt.show()

## Validation Dataset Compare Actual and Predicted

result = regressor.predict(X_validation)

result[:5]

plt.title('XGBoost - Validation Dataset')
plt.scatter(df_validation.x,df_validation.y,label='actual',marker='.')
plt.scatter(df_validation.x,result,label='predicted',marker='.')
plt.grid(True)
plt.legend()
plt.show()

# RMSE Metrics
print('XGBoost Algorithm Metrics')
mse = mean_squared_error(df_validation.y,result)
print(" Mean Squared Error: {0:.2f}".format(mse))
print(" Root Mean Square Error: {0:.2f}".format(mse**.5))

# Residual
# Over prediction and Under Prediction needs to be balanced
# Validation Data Residuals
residuals = df_validation.y - result
plt.hist(residuals)
plt.grid(True)
plt.xlabel('Actual - Predicted')
plt.ylabel('Count')
plt.title('XGBoost Residual')
plt.axvline(color='r')
plt.show()

# Count number of values greater than zero and less than zero
value_counts = (residuals > 0).value_counts(sort=False)

print(' Under Estimation: {0}'.format(value_counts[True]))
print(' Over  Estimation: {0}'.format(value_counts[False]))

# Plot for entire dataset
plt.plot(df.x,df.y,label='Target')
plt.plot(df.x,regressor.predict(df[['x']]) ,label='Predicted')
plt.grid(True)
plt.xlabel('Input Feature')
plt.ylabel('Target')
plt.legend()
plt.title('XGBoost')
plt.show()

## Linear Regression Algorithm

lin_regressor = LinearRegression()

lin_regressor.fit(X_train,y_train)

Compare Weights assigned by Linear Regression.

Original Function: 5*x**2 -23*x + 47 + some noise

Linear Regression Function: -15.08 * x + 709.86

Linear Regression Coefficients and Intercepts are not close to actual

lin_regressor.coef_

lin_regressor.intercept_

result = lin_regressor.predict(df_validation[['x']])

plt.title('LinearRegression - Validation Dataset')
plt.scatter(df_validation.x,df_validation.y,label='actual',marker='.')
plt.scatter(df_validation.x,result,label='predicted',marker='.')
plt.grid(True)
plt.legend()
plt.show()

# RMSE Metrics
print('Linear Regression Metrics')
mse = mean_squared_error(df_validation.y,result)
print(" Mean Squared Error: {0:.2f}".format(mse))
print(" Root Mean Square Error: {0:.2f}".format(mse**.5))

# Residual
# Over prediction and Under Prediction needs to be balanced
# Validation Data Residuals
residuals = df_validation.y - result
plt.hist(residuals)
plt.grid(True)
plt.xlabel('Actual - Predicted')
plt.ylabel('Count')
plt.title('Linear Regression Residual')
plt.axvline(color='r')
plt.show()

# Count number of values greater than zero and less than zero
value_counts = (residuals > 0).value_counts(sort=False)

print(' Under Estimation: {0}'.format(value_counts[True]))
print(' Over  Estimation: {0}'.format(value_counts[False]))

# Plot for entire dataset
plt.plot(df.x,df.y,label='Target')
plt.plot(df.x,lin_regressor.predict(df[['x']]) ,label='Predicted')
plt.grid(True)
plt.xlabel('Input Feature')
plt.ylabel('Target')
plt.legend()
plt.title('LinearRegression')
plt.show()

Linear Regression is showing clear symptoms of under-fitting

Input Features are not sufficient to capture complex relationship

<h2>Your Turn</h2>
You can correct this under-fitting issue by adding relevant features.

1. What feature will you add and why?
2. Complete the code and Test
3. What performance do you see now?

# Specify the column names as the file does not have column header
df_train = pd.read_csv(train_file,names=['y','x'])
df_validation = pd.read_csv(validation_file,names=['y','x'])
df = pd.read_csv('quadratic_all.csv')

# Add new features

# Place holder to add new features to df_train, df_validation and df
# if you need help, scroll down to see the answer
# Add your code

# Split each frame into a feature matrix (all columns after the first,
# including any features added above) and a 1-D target array.
X_train = df_train.iloc[:, 1:]  # Features: 1st column onwards
# Series.ravel() is deprecated in recent pandas; to_numpy() returns the
# same 1-D numpy array.
y_train = df_train.iloc[:, 0].to_numpy()  # Target: 0th column

X_validation = df_validation.iloc[:, 1:]
y_validation = df_validation.iloc[:, 0].to_numpy()

lin_regressor.fit(X_train,y_train)

Original Function: -23*x + 5*x**2 + 47 + some noise (rewritten with x term first)

lin_regressor.coef_

lin_regressor.intercept_

result = lin_regressor.predict(X_validation)

plt.title('LinearRegression - Validation Dataset')
plt.scatter(df_validation.x,df_validation.y,label='actual',marker='.')
plt.scatter(df_validation.x,result,label='predicted',marker='.')
plt.grid(True)
plt.legend()
plt.show()

# RMSE Metrics
print('Linear Regression Metrics')
mse = mean_squared_error(df_validation.y,result)
print(" Mean Squared Error: {0:.2f}".format(mse))
print(" Root Mean Square Error: {0:.2f}".format(mse**.5))

print("***You should see an RMSE score of 30.45 or less")

df.head()

# Plot for entire dataset
plt.plot(df.x,df.y,label='Target')
plt.plot(df.x,lin_regressor.predict(df[['x','x2']]) ,label='Predicted')
plt.grid(True)
plt.xlabel('Input Feature')
plt.ylabel('Target')
plt.legend()
plt.title('LinearRegression')
plt.show()

## Solution for under-fitting

add a new X**2 term to the dataframe

syntax:

df_train['x2'] = df_train['x']**2

df_validation['x2'] = df_validation['x']**2

df['x2'] = df['x']**2

### Tree Based Algorithms have a lower bound and upper bound for predicted values

# True Function
def quad_func(x):
    """Return the true (noise-free) quadratic 5*x**2 - 23*x + 47 at x.

    Accepts scalars as well as numpy arrays, in which case the result is
    computed element-wise.
    """
    squared_term = 5 * x**2
    linear_term = 23 * x
    return squared_term - linear_term + 47

# X is outside range of training samples
# New Feature: Adding X^2 term

X = np.array([-100,-25,25,1000,5000])
y = quad_func(X)
df_tmp = pd.DataFrame({'x':X,'y':y,'x2':X**2})
df_tmp['xgboost']=regressor.predict(df_tmp[['x']])
df_tmp['linear']=lin_regressor.predict(df_tmp[['x','x2']])

df_tmp

plt.scatter(df_tmp.x,df_tmp.y,label='Actual',color='r')
plt.plot(df_tmp.x,df_tmp.linear,label='LinearRegression')
plt.plot(df_tmp.x,df_tmp.xgboost,label='XGBoost')
plt.legend()
plt.xlabel('X')
plt.ylabel('y')
plt.title('Input Outside Range')
plt.show()

# X is inside range of training samples
X = np.array([-15,-12,-5,0,1,3,5,7,9,11,15,18])
y = quad_func(X)
df_tmp = pd.DataFrame({'x':X,'y':y,'x2':X**2})
df_tmp['xgboost']=regressor.predict(df_tmp[['x']])
df_tmp['linear']=lin_regressor.predict(df_tmp[['x','x2']])

df_tmp

# XGBoost Predictions have an upper bound and lower bound
# Linear Regression Extrapolates
plt.scatter(df_tmp.x,df_tmp.y,label='Actual',color='r')
plt.plot(df_tmp.x,df_tmp.linear,label='LinearRegression')
plt.plot(df_tmp.x,df_tmp.xgboost,label='XGBoost')
plt.legend()
plt.xlabel('X')
plt.ylabel('y')
plt.title('Input within range')
plt.show()

<h2>Summary</h2>

1. In this exercise, we compared performance of XGBoost model and Linear Regression on a quadratic dataset
2. The relationship between input feature and target was non-linear.
3. XGBoost handled it pretty well; whereas, linear regression was under-fitting
4. To correct the issue, we had to add additional features for linear regression
5. With this change, linear regression performed much better

XGBoost can detect patterns involving non-linear relationship; whereas, algorithms like linear regression may need complex feature engineering









**Bike** **Code**

In [None]:
#Name : bikerental_data_preparation_rev1

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.plotting.register_matplotlib_converters.html
# Register converters for handling timestamp values in plots

<h2>Kaggle Bike Sharing Demand Dataset</h2>
<h4>To download dataset, sign-in and download from this link: https://www.kaggle.com/c/bike-sharing-demand/data</h4>
<br>

Input Features:<br>
['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'year', 'month', 'day', 'dayofweek','hour']<br>

Target:<br>
['count']<br>

Objective:

You are provided hourly rental data spanning two years.

For this competition, the training set is comprised of the first 19 days of each month, while the test set is the 20th to the end of the month.

You must predict the total count of bikes rented during each hour covered by the test set, using only information available prior to the rental period

Reference: https://www.kaggle.com/c/bike-sharing-demand/data

columns = ['count', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'year', 'month', 'day', 'dayofweek','hour']

df = pd.read_csv('train.csv', parse_dates=['datetime'],index_col=0)
df_test = pd.read_csv('test.csv', parse_dates=['datetime'],index_col=0)

df.head()

# We need to convert datetime to numeric for training.
# Let's extract key features into separate numeric columns
def add_features(df):
    """Add calendar columns derived from the frame's index.

    Creates (or overwrites) the columns 'year', 'month', 'day',
    'dayofweek' and 'hour' from the same-named attributes of df.index,
    so the index must expose them (the frames here are read with
    parse_dates=['datetime'] and index_col=0).

    The frame is modified in place; nothing is returned.
    """
    for part in ('year', 'month', 'day', 'dayofweek', 'hour'):
        df[part] = getattr(df.index, part)

# Add New Features
add_features(df)
add_features(df_test)

df.head()

# Need to predict the missing data
plt.title('Rental Count - Gaps')
df['2011-01':'2011-02']['count'].plot()
plt.show()

# Rentals change hourly!
# Select the day's rows with .loc: selecting rows with df['<date string>']
# was deprecated in pandas 1.2 and removed in 2.0.
plt.plot(df.loc['2011-01-01', 'count'])
plt.xticks(fontsize=14, rotation=45)
plt.xlabel('Date')
plt.ylabel('Rental Count')
plt.title('Hourly Rentals for Jan 01, 2011')
plt.show()

# Seasonal
# Select the month's rows with .loc: selecting rows with df['<date string>']
# was deprecated in pandas 1.2 and removed in 2.0.
plt.plot(df.loc['2011-01', 'count'])
plt.xticks(fontsize=14, rotation=45)
plt.xlabel('Date')
plt.ylabel('Rental Count')
plt.title('Jan 2011 Rentals (1 month)')
plt.show()

group_hour = df.groupby(['hour'])
average_by_hour = group_hour['count'].mean()

plt.plot(average_by_hour.index,average_by_hour)
plt.xlabel('Hour')
plt.ylabel('Rental Count')
plt.xticks(np.arange(24))
plt.grid(True)
plt.title('Average Hourly Rental Count')

# Year to year trend
# Select each year's rows with .loc: selecting rows with df['<date string>']
# was deprecated in pandas 1.2 and removed in 2.0.
plt.plot(df.loc['2011', 'count'], label='2011')
plt.plot(df.loc['2012', 'count'], label='2012')
plt.xticks(fontsize=14, rotation=45)
plt.xlabel('Date')
plt.ylabel('Rental Count')
plt.title('2011 and 2012 Rentals (Year to Year)')
plt.legend()
plt.show()

group_year_month = df.groupby(['year','month'])

average_year_month = group_year_month['count'].mean()

average_year_month

for year in average_year_month.index.levels[0]:
    plt.plot(average_year_month[year].index,average_year_month[year],label=year)

plt.legend()
plt.xlabel('Month')
plt.ylabel('Count')
plt.grid(True)
plt.title('Average Monthly Rental Count for 2011, 2012')
plt.show()

group_year_hour = df.groupby(['year','hour'])
average_year_hour = group_year_hour['count'].mean()
for year in average_year_hour.index.levels[0]:
    #print (year)
    #print(average_year_month[year])
    plt.plot(average_year_hour[year].index,average_year_hour[year],label=year)

plt.legend()
plt.xlabel('Hour')
plt.ylabel('Count')
plt.xticks(np.arange(24))
plt.grid(True)
plt.title('Average Hourly Rental Count - 2011, 2012')

group_workingday_hour = df.groupby(['workingday','hour'])
average_workingday_hour = group_workingday_hour['count'].mean()

for workingday in average_workingday_hour.index.levels[0]:
    #print (year)
    #print(average_year_month[year])
    plt.plot(average_workingday_hour[workingday].index,average_workingday_hour[workingday],
             label=workingday)

plt.legend()
plt.xlabel('Hour')
plt.ylabel('Count')
plt.xticks(np.arange(24))
plt.grid(True)
plt.title('Average Hourly Rental Count by Working Day')
plt.show()

# Let's look at correlation between features and target
df.corr()['count']

# Any relation between temperature and rental count?
plt.scatter(x=df.temp,y=df["count"])
plt.grid(True)
plt.xlabel('Temperature')
plt.ylabel('Count')
plt.title('Temperature vs Count')
plt.show()

# Any relation between humidity and rental count?
plt.scatter(x=df.humidity,y=df["count"],label='Humidity')
plt.grid(True)
plt.xlabel('Humidity')
plt.ylabel('Count')
plt.title('Humidity vs Count')
plt.show()

# Save all data
df.to_csv('bike_all.csv',index=True,index_label='datetime',columns=columns)

## Training and Validation Set
### Target Variable as first column followed by input features
### Training, Validation files do not have a column header

# Training = 70% of the data
# Validation = 30% of the data
# Randomize the dataset
np.random.seed(5)
l = list(df.index)
np.random.shuffle(l)
df = df.loc[l]

rows = df.shape[0]
train = int(.7 * rows)
test = rows-train

rows, train, test

columns

# Write Training Set
df.iloc[:train].to_csv('bike_train.csv'
                          ,index=False,header=False
                          ,columns=columns)

# Write Validation Set
df.iloc[train:].to_csv('bike_validation.csv'
                          ,index=False,header=False
                          ,columns=columns)

# Test Data has only input features
df_test.to_csv('bike_test.csv',index=True,index_label='datetime')

print(','.join(columns))



# Write Column List
with open('bike_train_column_list.txt','w') as f:
    f.write(','.join(columns))

## Train a model with bike rental data using XGBoost algorithm
###  Model is trained with XGBoost installed in notebook instance
###  In the later examples, we will train using SageMaker's XGBoost algorithm





#NAME : bikerental_xgboost_localmode_rev1
# Install xgboost in notebook instance.
#### Command to install xgboost
!pip install xgboost==1.2

import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error

# XGBoost
import xgboost as xgb

column_list_file = 'bike_train_column_list.txt'
train_file = 'bike_train.csv'
validation_file = 'bike_validation.csv'
test_file = 'bike_test.csv'

columns = ''
with open(column_list_file,'r') as f:
    columns = f.read().split(',')

columns

# Specify the column names as the file does not have column header
df_train = pd.read_csv(train_file,names=columns)
df_validation = pd.read_csv(validation_file,names=columns)

df_train.head()

df_validation.head()

# Split each frame into a feature matrix (all columns after the first)
# and a 1-D target array (the first column).
X_train = df_train.iloc[:, 1:]  # Features: 1st column onwards
# Series.ravel() is deprecated in recent pandas; to_numpy() returns the
# same 1-D numpy array.
y_train = df_train.iloc[:, 0].to_numpy()  # Target: 0th column

X_validation = df_validation.iloc[:, 1:]
y_validation = df_validation.iloc[:, 0].to_numpy()

# XGBoost Training Parameter Reference:
#   https://github.com/dmlc/xgboost/blob/master/doc/parameter.md
#regressor = xgb.XGBRegressor(max_depth=5,eta=0.1,subsample=0.7,num_round=150)
regressor = xgb.XGBRegressor(max_depth=5,n_estimators=150)

regressor

regressor.fit(X_train,y_train, eval_set = [(X_train, y_train), (X_validation, y_validation)])

eval_result = regressor.evals_result()

training_rounds = range(len(eval_result['validation_0']['rmse']))

print(training_rounds)

plt.scatter(x=training_rounds,y=eval_result['validation_0']['rmse'],label='Training Error')
plt.scatter(x=training_rounds,y=eval_result['validation_1']['rmse'],label='Validation Error')
plt.grid(True)
plt.xlabel('Iteration')
plt.ylabel('RMSE')
plt.title('Training Vs Validation Error')
plt.legend()
plt.show()

xgb.plot_importance(regressor)
plt.show()

# Verify Quality using Validation dataset
# Compare actual vs predicted performance with dataset not seen by the model before
df = pd.read_csv(validation_file,names=columns)

df.head()

df.shape

X_test = df.iloc[:,1:]
print(X_test[:5])

result = regressor.predict(X_test)

result[:5]

df['count_predicted'] = result

df.head()

# Negative Values are predicted
df['count_predicted'].describe()

df[df['count_predicted'] < 0]

df['count_predicted'].hist()
plt.title('Predicted Count Histogram')
plt.show()

def adjust_count(x):
    """Clamp a predicted count at zero.

    Returns 0 when x is negative; otherwise returns x unchanged.
    """
    return 0 if x < 0 else x

df['count_predicted'] = df['count_predicted'].map(adjust_count)

df[df['count_predicted'] < 0]

# Actual Vs Predicted
plt.plot(df['count'], label='Actual')
plt.plot(df['count_predicted'],label='Predicted')
plt.xlabel('Sample')
plt.ylabel('Count')
plt.xlim([100,150])
plt.title('Validation Dataset - Predicted Vs. Actual')
plt.legend()
plt.show()

# Over prediction and Under Prediction needs to be balanced
# Validation Data Residuals
residuals = (df['count'] - df['count_predicted'])

plt.hist(residuals)
plt.grid(True)
plt.xlabel('Actual - Predicted')
plt.ylabel('Count')
plt.title('Residuals Distribution')
plt.axvline(color='r')
plt.show()

value_counts = (residuals > 0).value_counts(sort=False)
print(' Under Estimation: {0:0.2f}'.format(value_counts[True]/len(residuals)))
print(' Over  Estimation: {0:0.2f}'.format(value_counts[False]/len(residuals)))

print("RMSE: {0:0.2f}".format(mean_squared_error(df['count'],df['count_predicted'])**.5))

# RMSLE - Root Mean Squared Log Error
# RMSLE Metric is used by Kaggle for this competition

# RMSE Cost Function - Magnitude of difference matters

# RMSLE cost function - "Only Percentage difference matters"

# Reference:Katerina Malahova, Khor SoonHin
# https://www.slideshare.net/KhorSoonHin/rmsle-cost-function
def compute_rmsle(y_true, y_pred):
    """Return the Root Mean Squared Log Error between y_true and y_pred.

    Computed as sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2)).

    Both arguments may be scalars, sequences or numpy arrays; they are
    converted to numpy arrays before the computation.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)

    # Difference of log(1 + value), so the error reflects the ratio between
    # prediction and truth rather than the absolute gap.
    log_error = np.log1p(predicted) - np.log1p(actual)
    return np.mean(log_error ** 2) ** .5

print('RMSLE')
print(compute_rmsle(100,50),
      compute_rmsle(1000,500),
      compute_rmsle(10000,5000))

print('RMSLE')
print(compute_rmsle(100,25),
      compute_rmsle(1000,250),
      compute_rmsle(10000,2500))

print('RMSE')
print(mean_squared_error([100],[50])**.5,
      mean_squared_error([1000],[500])**.5,
      mean_squared_error([10000],[5000])**.5)

print('RMSE')
print(mean_squared_error([100],[25])**.5,
      mean_squared_error([1000],[250])**.5,
      mean_squared_error([10000],[2500])**.5)

print("RMSLE: {0}".format(compute_rmsle(df['count'],df['count_predicted'])))

# Prepare Data for Submission to Kaggle
df_test = pd.read_csv(test_file,parse_dates=['datetime'])

df_test.head()

X_test =  df_test.iloc[:,1:] # Exclude datetime for prediction

X_test.head()

result = regressor.predict(X_test)

result[:5]

df_test["count"] = result

df_test.head()

df_test[df_test["count"] < 0]

df_test["count"] = df_test["count"].map(adjust_count)

df_test[['datetime','count']].to_csv('predicted_count.csv',index=False)

# RMSLE (Kaggle) Score
# Test 1: 0.62


# Bike Train -- SageMaker
# XGBoost Built-in Algorithm - Bike Rental Regression Example

import numpy as np
import pandas as pd

import boto3
import re

import sagemaker
from sagemaker import get_execution_role
# SageMaker SDK Documentation: http://sagemaker.readthedocs.io/en/latest/estimators.html

## Upload Data to S3

# Specify your bucket name
bucket_name = 'kayode-ml-sagemaker'

training_folder = r'bikerental/training/'
validation_folder = r'bikerental/validation/'
test_folder = r'bikerental/test/'

s3_model_output_location = r's3://{0}/bikerental/model'.format(bucket_name)
s3_training_file_location = r's3://{0}/{1}'.format(bucket_name,training_folder)
s3_validation_file_location = r's3://{0}/{1}'.format(bucket_name,validation_folder)
s3_test_file_location = r's3://{0}/{1}'.format(bucket_name,test_folder)

print(s3_model_output_location)
print(s3_training_file_location)
print(s3_validation_file_location)
print(s3_test_file_location)

# Writing to and reading from S3 is just as easy
# files are referred as objects in S3.
# file name is referred as key name in S3

# File stored in S3 is automatically replicated across 3 different availability zones
# in the region where the bucket was created.

# http://boto3.readthedocs.io/en/latest/guide/s3.html
def write_to_s3(filename, bucket, key):
    """Upload the local file ``filename`` to S3 as object ``key`` in ``bucket``.

    The file is opened in binary mode and handed to ``upload_fileobj``;
    whatever that call returns is passed back to the caller.
    """
    with open(filename, 'rb') as source:  # binary mode: bytes are sent as-is
        s3 = boto3.Session().resource('s3')
        target = s3.Bucket(bucket).Object(key)
        return target.upload_fileobj(source)

write_to_s3('bike_train.csv',
            bucket_name,
            training_folder + 'bike_train.csv')

write_to_s3('bike_validation.csv',
            bucket_name,
            validation_folder + 'bike_validation.csv')

write_to_s3('bike_test.csv',
            bucket_name,
            test_folder + 'bike_test.csv')

## Training Algorithm Docker Image
### SageMaker maintains a separate image for algorithm and region
https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html

# Use Spot Instance - Save up to 90% of training cost by using spot instances when compared to on-demand instances
# Reference: https://github.com/aws-samples/amazon-sagemaker-managed-spot-training/blob/main/xgboost_built_in_managed_spot_training_checkpointing/xgboost_built_in_managed_spot_training_checkpointing.ipynb

# If you are still on the two-month free tier you can use an on-demand instance by setting:
#   use_spot_instances = False

# We will use spot for training
use_spot_instances = True
max_run = 3600 # in seconds

# max_wait is only set when spot instances are used
if use_spot_instances:
    max_wait = 7200 # in seconds
else:
    max_wait = None

job_name = 'xgboost-bikerental-v1'

# A checkpoint location is only configured for spot training
checkpoint_s3_uri = (
    f's3://{bucket_name}/bikerental/checkpoints/{job_name}'
    if use_spot_instances
    else None
)

print (f'Checkpoint uri: {checkpoint_s3_uri}')

# Establish a session with AWS
sess = sagemaker.Session()

role = get_execution_role()

# This role contains the permissions needed to train, deploy models
# SageMaker Service is trusted to assume this role
print(role)

# https://sagemaker.readthedocs.io/en/stable/api/utility/image_uris.html#sagemaker.image_uris.retrieve

# SDK 2 uses image_uris.retrieve to look up the container image location

# Use XGBoost 1.2 version
container = sagemaker.image_uris.retrieve("xgboost",sess.boto_region_name,version="1.2-2")

print (f'Using XGBoost Container {container}')

## Build Model

# Configure the training job
# Specify type and number of instances to use
# S3 location where final artifacts need to be stored

#   Reference: http://sagemaker.readthedocs.io/en/latest/estimators.html

# for managed spot training, specify the use_spot_instances flag, max_run, max_wait and checkpoint_s3_uri

# SDK 2.x version does not require train prefix for instance count and type
estimator = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=s3_model_output_location,
    sagemaker_session=sess,
    base_job_name = job_name,
    use_spot_instances=use_spot_instances,
    max_run=max_run,
    max_wait=max_wait,
    checkpoint_s3_uri=checkpoint_s3_uri)

# Specify hyperparameters that are appropriate for the training algorithm
# XGBoost Training Parameter Reference
#  https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst#learning-task-parameters
estimator.set_hyperparameters(max_depth=5,
                              objective="reg:squarederror",
                              eta=0.1,
                              num_round=150)

estimator.hyperparameters()

### Specify Training Data Location and Optionally, Validation Data Location

# content type can be libsvm or csv for XGBoost
training_input_config = sagemaker.session.TrainingInput(
    s3_data=s3_training_file_location,
    content_type='csv',
    s3_data_type='S3Prefix')

validation_input_config = sagemaker.session.TrainingInput(
    s3_data=s3_validation_file_location,
    content_type='csv',
    s3_data_type='S3Prefix'
)

data_channels = {'train': training_input_config, 'validation': validation_input_config}

print(training_input_config.config)
print(validation_input_config.config)

### Train the model

# XGBoost supports "train", "validation" channels
# Reference: Supported channels by algorithm
#   https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
estimator.fit(data_channels)

## Deploy Model

# Ref: http://sagemaker.readthedocs.io/en/latest/estimators.html
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type='ml.m5.xlarge',
                             endpoint_name = job_name)

## Run Predictions

# SDK 2.0 serializers
from sagemaker.serializers import CSVSerializer

predictor.serializer = CSVSerializer()

predictor.predict([[3,0,1,2,28.7,33.335,79,12.998,2011,7,7,3]])

## Summary

1. Ensure Training, Test and Validation data are in S3 Bucket
2. Select Algorithm Container Registry Path - Path varies by region
3. Configure Estimator for training - Specify Algorithm container, instance count, instance type, model output location
4. Specify algorithm specific hyper parameters
5. Train model
6. Deploy model - Specify instance count, instance type and endpoint name
7. Run Predictions

# Name: Invoke Endpoint
#<h1>XGBoost Cloud Prediction Invocation Template</h1>
<h4>Invoke SageMaker Prediction Service</h4>

import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os

import boto3
import re # python regex module
from sagemaker import get_execution_role
import sagemaker

# SDK 2 serializers and deserializers
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

# SDK 2
# RealTimePredictor renamed to Predictor
# https://sagemaker.readthedocs.io/en/stable/v2.html

# Create a predictor and point to an existing endpoint
endpoint_name = 'xgboost-bikerental-v1'
predictor = sagemaker.predictor.Predictor (endpoint_name=endpoint_name)

predictor.serializer = CSVSerializer()

df_all = pd.read_csv('bike_test.csv')

df_all.head()

df_all.columns[1:]

# Need to pass an array to the prediction
# can pass a numpy array or a list of values [[19,1],[20,1]]
arr_test = df_all[df_all.columns[1:]].values

type(arr_test)

arr_test.shape

arr_test[:5]

result = predictor.predict(arr_test[:2])

result

arr_test.shape

### Split the input data into chunks
There are thousands of rows in this data set that need inference.
When communicating over the internet, it is a good idea to split the data into chunks to prevent payload size and timeout errors

# For a large number of predictions, split the input data into chunks and
# query the prediction service once per chunk.
# array_split is convenient to specify how many splits are needed

# The xgboost 1.2-2 container returns predicted values with inconsistent
# delimiters (comma, newline or both), so extract the numbers with a regular
# expression instead of relying on a fixed separator.

# pattern matches one complete float literal: optional sign, digits with an
# optional decimal point, and an optional exponent.
# Splitting on "non-numeric characters" would drop the minus sign of a negative
# prediction and break a value such as 1.5e-05 into two separate numbers.
pattern = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'

predictions = []
for arr in np.array_split(arr_test,10):
    result = predictor.predict(arr)
    result = re.findall(pattern,result.decode("utf-8"))

    print (arr.shape)
    predictions += [float(r) for r in result] # Thanks, Ionut Barbu!

len(predictions)

np.expm1(predictions)

df_all['count'] = np.expm1(predictions)

df_all.head()

df_all[['datetime','count']].to_csv('predicted_count_cloud.csv',index=False)

# Delete Endpoint to prevent unnecessary charges
predictor.delete_endpoint()




# Multiclass Classification
# iris_data_preparation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing

<h2>Iris Classification Dataset</h2>

Input Features:<br>
sepal_length,sepal_width,petal_length,petal_width<br>

Target:<br>
Iris plant class<br>

Objective: Predict iris plant class for a given sepal_length,sepal_width,petal_length,petal_width<br>
<h4>Data source: https://archive.ics.uci.edu/ml/datasets/iris</h4>

columns = ['encoded_class','sepal_length','sepal_width','petal_length','petal_width']

# Encode Class Labels to integers
le = preprocessing.LabelEncoder()
le.fit(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])

le.classes_

df = pd.read_csv('iris_all.csv')

df['class'].value_counts()

df.head()

df.tail()

le.transform(df['class'])[-5:]

# Convert Classes to numeric value
df['encoded_class'] = le.transform(df['class'])

df.head()

df.tail()

# Visualize
setosa = df['class'] == 'Iris-setosa'
versicolor = df['class'] == 'Iris-versicolor'
virginica = df['class'] == 'Iris-virginica'

plt.scatter(df[setosa].sepal_length,y=df[setosa].sepal_width, label='setosa',color='g')
plt.scatter(df[versicolor].sepal_length,y=df[versicolor].sepal_width, label='versicolor',color='r')
plt.scatter(df[virginica].sepal_length,y=df[virginica].sepal_width, label='virginica',color='b')
plt.xlabel('length')
plt.ylabel('width')
plt.title('Sepal')
plt.grid(True)
plt.legend()
plt.show()

plt.scatter(df[setosa].petal_length,y=df[setosa].petal_width, label='setosa',color='g')
plt.scatter(df[versicolor].petal_length,y=df[versicolor].petal_width, label='versicolor',color='r')
plt.scatter(df[virginica].petal_length,y=df[virginica].petal_width, label='virginica',color='b')
plt.xlabel('length')
plt.ylabel('width')
plt.title('Petal')
plt.grid(True)
plt.legend()
plt.show()

plt.scatter(df[setosa].petal_length,y=df[setosa].sepal_length, label='setosa',color='g')
plt.scatter(df[versicolor].petal_length,y=df[versicolor].sepal_length, label='versicolor',color='r')
plt.scatter(df[virginica].petal_length,y=df[virginica].sepal_length, label='virginica',color='b')
plt.xlabel('petal length')
plt.ylabel('sepal length')
plt.title('Petal-Sepal')
plt.grid(True)
plt.legend()
plt.show()

## Training and Validation Set
### Target Variable as first column followed by input features:
class,sepal_length,sepal_width,petal_length,petal_width
### Training, Validation files do not have a column header

# Training = 70% of the data
# Validation = 30% of the data
# Randomize the dataset (the fixed seed keeps the shuffle reproducible)
np.random.seed(5)
l = list(df.index)
np.random.shuffle(l)
df = df.iloc[l]

rows = len(df)
train = int(.7 * rows)
test = rows - train

rows, train, test

# Write Training Set: first `train` shuffled rows, no header, columns in `columns` order
df.iloc[:train].to_csv('iris_train.csv', index=False, header=False, columns=columns)

# Write Validation Set: the remaining rows, same layout
df.iloc[train:].to_csv('iris_validation.csv', index=False, header=False, columns=columns)

# Write Column List
with open('iris_train_column_list.txt', 'w') as column_file:
    column_file.write(','.join(columns))


# Iris Multiclass Classification
## Train a model with Iris data using XGBoost algorithm
###  Model is trained with XGBoost installed in notebook instance
###  In the later examples, we will train using SageMaker's XGBoost algorithm

# Install xgboost in notebook instance.
#### Command to install xgboost
!pip install xgboost==1.2

import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import xgboost as xgb

from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix

column_list_file = 'iris_train_column_list.txt'
train_file = 'iris_train.csv'
validation_file = 'iris_validation.csv'

columns = ''
with open(column_list_file,'r') as f:
    columns = f.read().split(',')

columns

# Encode Class Labels to integers
# Labeled Classes
labels=[0,1,2]
classes = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
le = preprocessing.LabelEncoder()
le.fit(classes)

# Specify the column names as the file does not have column header
df_train = pd.read_csv(train_file,names=columns)
df_validation = pd.read_csv(validation_file,names=columns)

df_train.head()

df_validation.head()

# Series.ravel() is deprecated in recent pandas releases; to_numpy() returns
# the same 1-D numpy array of target values.
X_train = df_train.iloc[:,1:] # Features: 1st column onwards
y_train = df_train.iloc[:,0].to_numpy() # Target: 0th column

X_validation = df_validation.iloc[:,1:]
y_validation = df_validation.iloc[:,0].to_numpy()

# Launch a classifier
# XGBoost Training Parameter Reference:
#   https://xgboost.readthedocs.io/en/latest/parameter.html

classifier = xgb.XGBClassifier(objective="multi:softmax",
                               num_class=3,
                               n_estimators=100)

classifier

classifier.fit(X_train,
               y_train,
               eval_set = [(X_train, y_train), (X_validation, y_validation)],
               eval_metric=['mlogloss'],
               early_stopping_rounds=10)

# early_stopping_rounds - needs to be passed in as a hyperparameter in SageMaker XGBoost implementation
# "The model trains until the validation score stops improving.
# Validation error needs to decrease at least every early_stopping_rounds to continue training.
# Amazon SageMaker hosting uses the best model for inference."

eval_result = classifier.evals_result()

training_rounds = range(len(eval_result['validation_0']['mlogloss']))

print(training_rounds)

plt.scatter(x=training_rounds,y=eval_result['validation_0']['mlogloss'],label='Training Error')
plt.scatter(x=training_rounds,y=eval_result['validation_1']['mlogloss'],label='Validation Error')
plt.grid(True)
plt.xlabel('Iteration')
plt.ylabel('LogLoss')
plt.title('Training Vs Validation Error')
plt.legend()
plt.show()

xgb.plot_importance(classifier)
plt.show()

df = pd.read_csv(validation_file,names=columns)

df.head()

X_test = df.iloc[:,1:]
print(X_test[:5])

result = classifier.predict(X_test)

result[:5]

df['predicted_class'] = result #le.inverse_transform(result)

df.head()

# Compare performance of Actual and Model 1 Prediction
plt.figure()
plt.scatter(df.index,df['encoded_class'],label='Actual')
plt.scatter(df.index,df['predicted_class'],label='Predicted',marker='^')
plt.legend(loc=4)
plt.yticks([0,1,2])
plt.xlabel('Sample')
plt.ylabel('Class')
plt.show()

<h2>Confusion Matrix</h2>
Confusion Matrix is a table that summarizes performance of classification model.<br><br>

# Reference:
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw the confusion matrix `cm` as an annotated heat map on the current figure.

    cm        : 2-D array (e.g. the output of confusion_matrix)
    classes   : tick labels, one per row/column of cm
    normalize : when True, every row is divided by its own total before plotting
    title     : text placed above the plot
    cmap      : colormap handed to plt.imshow
    """
    if normalize:
        # Scale each row by its row total
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Two decimals for normalized values, plain integers for counts
    cell_format = '.2f' if normalize else 'd'
    # Cells whose value exceeds half of the maximum are annotated in white
    midpoint = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            text_color = "black"
            if cm[row, col] > midpoint:
                text_color = "white"
            plt.text(col, row, format(cm[row, col], cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

# Compute confusion matrix
cnf_matrix = confusion_matrix(df['encoded_class'],
                              df['predicted_class'],labels=labels)

cnf_matrix

# Plot confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=classes,
                      title='Confusion matrix - Count')

# Plot normalized confusion matrix (each row divided by its total)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=classes,
                      title='Confusion matrix - Normalized',normalize=True)

print(classification_report(
    df['encoded_class'],
    df['predicted_class'],
    labels=labels,
    target_names=classes))


## Invoke Endpoint
## Invoke SageMaker Endpoint from outside of AWS environment using SageMaker SDK

Model used: XGBoost Bike Rental Prediction Trained in the XGBoost Lectures

This example uses the IAM user: ml_user_predict. The user was set up in the housekeeping lecture of the course.

Refer to the lecture: Configure IAM Users, Setup Command Line Interface (CLI)

Ensure xgboost-biketrain-v1 Endpoint is deployed before running this example

To create an endpoint using SageMaker Console:
1. Select "Models" under "Inference" in navigation pane
2. Search for model using this prefix: xgboost-biketrain-v1
3. Select the latest model and choose create endpoint
4. Specify endpoint name as: xgboost-biketrain-v1
5. Create a new endpoint configuration
6. Create a new endpoint
7. After this lab is completed, delete the endpoint to avoid unnecessary charges

# Install SageMaker 2.x version.
!pip install --upgrade sagemaker

import boto3
import sagemaker
import math
import dateutil
import re

# SDK 2 serializers and deserializers
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

# Establish a session with AWS
# Specify credentials and region to be used for this session.
# We will use the ml_user_predict credentials, which have limited privileges
boto_session = boto3.Session(profile_name='ml_user_predict',region_name='us-east-1')

sess = sagemaker.Session(boto_session=boto_session)

# Create a predictor and point to an existing endpoint

# Get Predictor using SageMaker SDK
# Specify Your Endpoint Name
endpoint_name = 'xgboost-biketrain-v1'

predictor = sagemaker.predictor.Predictor(endpoint_name=endpoint_name,
                                                 sagemaker_session=sess)

# We are sending data for inference in CSV format
predictor.serializer = CSVSerializer()

#datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
# Actual=562
sample_one = '2012-12-19 17:00:00,4,0,1,1,16.4,20.455,50,26.0027'
# Actual=569
sample_two = '2012-12-19 18:00:00,4,0,1,1,15.58,19.695,50,23.9994'
# Actual=4
sample_three = '2012-12-10 01:00:00,4,0,1,2,14.76,18.94,100,0'

# Raw Data Structure:
# datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count

# Model expects data in this format (it was trained with these features):
# season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour

def transform_data(data):
    """Convert one raw CSV record into the feature string sent to the model.

    data: comma separated text whose first field is a datetime, e.g.
          'datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed'

    Returns the remaining fields followed by year, month, day, dayofweek and
    hour derived from the datetime, joined with commas. The datetime field
    itself is dropped.
    """
    # Import the submodule explicitly: a bare "import dateutil" does not
    # guarantee that dateutil.parser has been loaded.
    from dateutil import parser as date_parser

    raw_datetime, *features = data.split(',')

    # Extract year, month, day, dayofweek, hour
    dt = date_parser.parse(raw_datetime)
    derived = [dt.year, dt.month, dt.day, dt.weekday(), dt.hour]

    # Return the transformed data. skip datetime field
    return ','.join(features + [str(value) for value in derived])

print('Raw Data:\n',sample_one)
print('Transformed Data:\n',transform_data(sample_one))

# Let's invoke prediction now
predictor.predict(transform_data(sample_one))

# Actual Count is 562...but predicted is 6.3.

# Model was trained with log1p(count)
# So, we need to apply inverse transformation to get the actual count
# Predicted Count looks much better now
result = predictor.predict(transform_data(sample_one))
result = result.decode("utf-8")
print ('Predicted Count', math.expm1(float(result)))

# how to send multiple samples
result = predictor.predict([transform_data(sample_one), transform_data(sample_two)])

result.decode("utf-8")

# Batch Prediction
# Transform data and invoke prediction in specified batch sizes
def run_predictions(data, batch_size):
    """Transform raw records and query the endpoint in batches.

    data: list of raw CSV records (one string per observation)
    batch_size: maximum number of observations sent per request

    Returns a list with one predicted count per value returned by the
    endpoint. The model was trained with log1p(count), so expm1 is applied
    to every returned value.
    """
    predictions = []

    transformed_data = [transform_data(row.strip()) for row in data]

    # The xgboost 1.2-2 container returns predicted values with inconsistent
    # delimiters (comma, newline or both), so pull out each number instead of
    # splitting on a separator.
    # The pattern matches a full float literal: optional sign, digits with an
    # optional decimal point, and an optional exponent. Splitting on
    # non-numeric characters would lose the sign of a negative prediction and
    # cut a value such as 1.5e-05 into two numbers.
    pattern = re.compile(r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?')

    for i in range(0, len(data), batch_size):

        print(i,i+batch_size)

        result = predictor.predict(transformed_data[i : i + batch_size])

        result = result.decode("utf-8")

        predictions += [math.expm1(float(r)) for r in pattern.findall(result)]

    return predictions

run_predictions([sample_one,sample_two,sample_three],10)

# Run a batch prediction on Test.CSV File
# Read the file content
data = []
with open('test.csv','r') as f:
    # skip header
    f.readline()
    # Read remaining lines
    data = f.readlines()

len(data)

%%time
predictions = run_predictions(data,500)

len(predictions),len(data)

# Don't forget to delete the endpoint
# From SageMaker Console, Select "Endpoints" under Inference and Delete the Endpoint


#Invoke with BOTO3 SDK
# Boto3 SageMaker Invoke Endpoint
# This example shows how to invoke SageMaker Endpoint from outside of AWS environment using Boto3 SDK
# Boto is the Amazon Web Services (AWS) SDK for Python
# https://boto3.amazonaws.com/v1/documentation/api/latest/index.html

# Endpoint: XGBoost - Kaggle Bike Rental - Regressor Trained in XGBoost Lectures
# Make sure the Endpoint is deployed before running this example
#
# Reference:
#  https://github.com/awslabs/amazon-sagemaker-examples

# NOTE: SageMaker SDK now requires additional permissions DescribeEndpoint, DescribeEndpointConfig in-addition to InvokeEndpoint
#   boto3 SDK requires just InvokeEndpoint permission.
#   Please update SageMakerInvokeEndpoint permissions to reflect this policy document:
#   Logon with my_admin account and update permissions (IAM->Policies->SageMakerInvokeEndpoint->Edit Policy)
#
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Sid": "VisualEditor0",
            "Effect": "Allow",
            "Action": [
                "sagemaker:DescribeEndpointConfig",
                "sagemaker:DescribeEndpoint",
                "sagemaker:InvokeEndpoint"
            ],
            "Resource": "*"
        }
    ]
}

import boto3
import math
import dateutil
import re

# Establish a session with AWS
# Specify credentials and region to be used for this session.
# We will use a ml_user_predict credentials that has limited privileges
boto_session = boto3.Session(profile_name='ml_user_predict',region_name='us-east-1')

# List of low level clients that are available in boto3
print(boto_session.get_available_services())

# Acquire a SageMaker Runtime client for us-east-1 region
client = boto_session.client(service_name='sagemaker-runtime',region_name='us-east-1')

# Specify Your Endpoint Name
endpoint_name = 'xgboost-biketrain-v1'

# Raw Data
#datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
# Actual=562
sample_one = '2012-12-19 17:00:00,4,0,1,1,16.4,20.455,50,26.0027'
# Actual=569
sample_two = '2012-12-19 18:00:00,4,0,1,1,15.58,19.695,50,23.9994'
# Actual=4
sample_three = '2012-12-10 01:00:00,4,0,1,2,14.76,18.94,100,0'

# Raw Data Structure:
# datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count

# Model expects data in this format (it was trained with these features):
# season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour

def transform_data(data):
    """Convert one raw CSV record into the feature string sent to the model.

    data: comma separated text whose first field is a datetime, e.g.
          'datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed'

    Returns the remaining fields followed by year, month, day, dayofweek and
    hour derived from the datetime, joined with commas. The datetime field
    itself is dropped.
    """
    # Import the submodule explicitly: a bare "import dateutil" does not
    # guarantee that dateutil.parser has been loaded.
    from dateutil import parser as date_parser

    raw_datetime, *features = data.split(',')

    # Extract year, month, day, dayofweek, hour
    dt = date_parser.parse(raw_datetime)
    derived = [dt.year, dt.month, dt.day, dt.weekday(), dt.hour]

    # Return the transformed data. skip datetime field
    return ','.join(features + [str(value) for value in derived])

print('Raw Data:\n',sample_one)
print('Transformed Data:\n',transform_data(sample_one))

# Let's invoke prediction now
result = client.invoke_endpoint(EndpointName=endpoint_name,
                       Body=transform_data(sample_one).encode('utf-8'),
                       ContentType='text/csv')

result = result['Body'].read().decode('utf-8')

print(result)

# Actual Count is 562...but predicted is 6.36.

# Model was trained with log1p(count)
# So, we need to apply inverse transformation to get the actual count
# Predicted Count looks much better now
print ('Predicted Count', math.expm1(float(result)))

print('\n'.join([transform_data(sample_one),transform_data(sample_two)]))

# Prediction for multiple observations in the same call
result = client.invoke_endpoint(EndpointName=endpoint_name,
                       Body=('\n'.join([transform_data(sample_one),
                                        transform_data(sample_two)]).encode('utf-8')),
                       ContentType='text/csv')

result = result['Body'].read().decode('utf-8')

result

# Batch Prediction
# Transform data and invoke prediction in specified batch sizes
def run_predictions(data, batch_size):
    """Transform raw records and invoke the endpoint in batches via boto3.

    data: list of raw CSV records (one string per observation)
    batch_size: maximum number of observations sent per request

    Each batch is sent as newline separated CSV rows. Returns a list with one
    predicted count per value returned by the endpoint. The model was trained
    with log1p(count), so expm1 is applied to every returned value.
    """
    predictions = []

    transformed_data = [transform_data(row.strip()) for row in data]

    # The xgboost 1.2-2 container returns predicted values with inconsistent
    # delimiters (comma, newline or both), so pull out each number instead of
    # splitting on a separator.
    # The pattern matches a full float literal: optional sign, digits with an
    # optional decimal point, and an optional exponent. Splitting on
    # non-numeric characters would lose the sign of a negative prediction and
    # cut a value such as 1.5e-05 into two numbers.
    pattern = re.compile(r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?')

    for i in range(0, len(data), batch_size):

        print(i,i+batch_size)

        result = client.invoke_endpoint(EndpointName=endpoint_name,
                       Body=('\n'.join(transformed_data[i : i + batch_size]).encode('utf-8')),
                       ContentType='text/csv')

        result = result['Body'].read().decode('utf-8')

        predictions += [math.expm1(float(r)) for r in pattern.findall(result)]

    return predictions

run_predictions([sample_one,sample_two,sample_three],10)

# Run a batch prediction on Test.CSV File
# Read the file content
data = []
with open('test.csv','r') as f:
    # skip header
    f.readline()
    # Read remaining lines
    data = f.readlines()

len(data)

%%time
predictions = run_predictions(data,100)

len(predictions),len(data)

#Invoke as Microservices with Lambda
# Boto3 SageMaker Invoke Endpoint
# This example shows how to invoke SageMaker Endpoint from outside of AWS environment using Boto3 SDK
# Boto is the Amazon Web Services (AWS) SDK for Python
# https://boto3.amazonaws.com/v1/documentation/api/latest/index.html

# Common Data Formats
# https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-inference.html

# Endpoint: XGBoost - Kaggle Bike Rental - Regressor Trained in XGBoost Lectures
# Makesure Endpoint is deployed before running this example
#
# Reference:
#  https://github.com/awslabs/amazon-sagemaker-examples

# NOTE: SageMaker SDK now requires additional permissions DescribeEndpoint, DescribeEndpointConfig in-addition to InvokeEndpoint
#   boto3 SDK requires just InvokeEndpoint permission.
#   Please update SageMakerInvokeEndpoint permissions to reflect this policy document:
#   Logon with my_admin account and update permissions (IAM->Policies->SageMakerInvokeEndpoint->Edit Policy)
#
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Sid": "VisualEditor0",
            "Effect": "Allow",
            "Action": [
                "sagemaker:DescribeEndpointConfig",
                "sagemaker:DescribeEndpoint",
                "sagemaker:InvokeEndpoint"
            ],
            "Resource": "*"
        }
    ]
}

import boto3
import math
import dateutil
import json
import re

# Establish a session with AWS
# Specify credentials and region to be used for this session.
# We will use a ml_user_predict credentials that has limited privileges
boto_session = boto3.Session(profile_name='ml_user_predict',region_name='us-east-1')

# Acquire a SageMaker Runtime client for us-east-1 region
client = boto_session.client(service_name='sagemaker-runtime',region_name='us-east-1')

# Specify Your Endpoint Name
endpoint_name = 'xgboost-biketrain-v1'

# Raw Data
#datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
# Actual=562
sample_one = ['2012-12-19 17:00:00',4,0,1,1,16.4,20.455,50,26.0027]
# Actual=569
sample_two = ['2012-12-19 18:00:00',4,0,1,1,15.58,19.695,50,23.9994]
# Actual=4
sample_three = ['2012-12-10 01:00:00',4,0,1,2,14.76,18.94,100,0]

# Single Observation
request = {
    "instances": [
        # First instance.
        {
            "features": sample_one
        }
    ]
}

print(json.dumps(request,indent=2))

# Multiple Observations
request = {
    "instances": [
        # First instance.
        {
            "features": sample_one
        },
        # Second instance.
        {
            "features": sample_two
        },
        # Third instance.
        {
            "features": sample_three
        }
    ]
}

print(json.dumps(request,indent=2))

# Raw Data Structure:
# datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count

# Model expects data in this format (it was trained with these features):
# season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour

def transform_data(data):
    """Convert one raw observation into the CSV feature string sent to the model.

    data: sequence (list or tuple) whose first element is a datetime string,
          followed by season, holiday, workingday, weather, temp, atemp,
          humidity, windspeed

    Returns the remaining values followed by year, month, day, dayofweek and
    hour derived from the datetime, joined with commas. The datetime element
    itself is dropped and the caller's sequence is left unmodified.
    """
    # Import the submodule explicitly: a bare "import dateutil" does not
    # guarantee that dateutil.parser has been loaded.
    from dateutil import parser as date_parser

    # Work on a copy; list() also accepts tuples, unlike data.copy()
    features = list(data)

    # Extract year, month, day, dayofweek, hour
    dt = date_parser.parse(features[0])
    features += [dt.year, dt.month, dt.day, dt.weekday(), dt.hour]

    # Return the transformed data. skip datetime field
    return ','.join(str(feature) for feature in features[1:])

print('Raw Data:\n',sample_one)
print('Transformed Data:\n',transform_data(sample_one))

# Single with error
request = {
    "instances": [
        # First instance.
        {
            "features": ["hi there",0,2]
        }
    ]
}

try:
    transformed_data = [transform_data(instance['features']) for instance in request["instances"]]
except Exception as err:
    print('Error when transforming: {0}'.format(err))

# Single Observation
request = {
    "instances": [
        # First instance.
        {
            "features": sample_one
        }
    ]
}

# Let's invoke prediction now
result = client.invoke_endpoint(EndpointName=endpoint_name,
                       Body=transform_data(request['instances'][0]['features']).encode('utf-8'),
                       ContentType='text/csv')

result = result['Body'].read().decode('utf-8')

# Model was trained with log1p(count)
# So, we need to apply inverse transformation to get the actual count
# Predicted Count looks much better now
print ('Predicted Count', math.expm1(float(result)))

# Multiple Observations
request = {
    "instances": [
        # First instance.
        {
            "features": sample_one
        },
        # Second instance.
        {
            "features": sample_two
        },
        # Third instance.
        {
            "features": sample_three
        }
    ]
}

for instance in request["instances"]:
    print(instance)
    print('Transformed:')
    print(' ', transform_data(instance['features']))

# XGBoost accepts data in CSV. It does not support JSON.
# So, we need to submit the request in CSV format
# Prediction for multiple observations in the same call
result = client.invoke_endpoint(EndpointName=endpoint_name,
                       Body=('\n'.join(
                           [transform_data(instance['features'])
                                for instance in request["instances"]]).encode('utf-8')),
                       ContentType='text/csv')

result = result['Body'].read().decode('utf-8')

print(result)

# The xgboost 1.2-2 container returns predicted values with inconsistent
# delimiters (comma, newline or both), so extract the numbers with a regular
# expression instead of relying on a fixed separator.

# pattern matches one complete float literal: optional sign, digits with an
# optional decimal point, and an optional exponent.
# Splitting on "non-numeric characters" would drop the minus sign of a negative
# prediction and break a value such as 1.5e-05 into two separate numbers.
pattern = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'
result = re.findall(pattern,result)
predictions = [math.expm1(float(r)) for r in result]

predictions

#Invoke API Gateway
# Invoke API Gateway Endpoint
# This example shows how to invoke SageMaker Endpoint from outside of AWS environment using API Gateway
# Ref: https://stackoverflow.com/questions/17301938/making-a-request-to-a-restful-api-using-python

# Common Data Formats
# https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-inference.html

# Endpoint: XGBoost - Kaggle Bike Rental - Regressor Trained in XGBoost Lectures
# Makesure Endpoint is deployed before running this example

import requests
import json

# Update the URL to point to your API Gateway endpoint
url = 'https://bjygvuald0.execute-api.us-east-1.amazonaws.com/beta'

# Raw Data
#datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
# Actual=562
sample_one = ['2012-12-19 17:00:00',4,0,1,1,16.4,20.455,50,26.0027]
# Actual=569
sample_two = ['2012-12-19 18:00:00',4,0,1,1,15.58,19.695,50,23.9994]
# Actual=4
sample_three = ['2012-12-10 01:00:00',4,0,1,2,14.76,18.94,100,0]

# Single Observation
request = {
    "instances": [
        {
            "features": sample_one
        }
    ]
}

request

# A timeout keeps the notebook from hanging indefinitely if the API does not answer
response = requests.post(url, data=json.dumps(request), timeout=30)
result = response.json()

# The response payload carries its own 'statusCode' and 'body' fields.
# Use .get() so that an error payload without these keys is printed
# instead of raising KeyError.
status_code = result.get('statusCode')
if status_code == 200:
    predictions = json.loads(result['body'])
    print('Predicted Count: ', predictions)
else:
    print('Error', status_code, result.get('body', result))

# Multiple Observations
request = {
    "instances": [
        # First instance.
        {
            "features": sample_one
        },
        # Second instance.
        {
            "features": sample_two
        },
        # Third instance.
        {
            "features": sample_three
        }
    ]
}

# A timeout keeps the notebook from hanging indefinitely if the API does not answer
response = requests.post(url, data=json.dumps(request), timeout=30)

result = response.json()

# The response payload carries its own 'statusCode' and 'body' fields.
# Use .get() so that an error payload without these keys is printed
# instead of raising KeyError.
status_code = result.get('statusCode')
if status_code == 200:
    predictions = json.loads(result['body'])
    print('Predicted Count: ', predictions)
else:
    print('Error', status_code, result.get('body', result))

