## Import Libraries

In [2]:
import numpy as np
import boto3
import sagemaker
import io
import os
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role
import pandas as pd
from io import StringIO

# S3 bucket that receives the training data uploaded later in this notebook.
bucket = 'ml-exam-bucket-ishaan'

# Load the day-level bike sharing CSV from the local data directory.
dataset = pd.read_csv(filepath_or_buffer='./data/bike+sharing+dataset/day.csv')

# A bare trailing expression renders the frame as the cell's rich output.
dataset

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.200000,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.229270,0.436957,0.186900,82,1518,1600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,727,2012-12-27,1,1,12,0,4,1,2,0.254167,0.226642,0.652917,0.350133,247,1867,2114
727,728,2012-12-28,1,1,12,0,5,1,2,0.253333,0.255046,0.590000,0.155471,644,2451,3095
728,729,2012-12-29,1,1,12,0,6,0,2,0.253333,0.242400,0.752917,0.124383,159,1182,1341
729,730,2012-12-30,1,1,12,0,0,0,1,0.255833,0.231700,0.483333,0.350754,364,1432,1796


In [3]:
# Strip the hyphens from the date strings ('2011-01-01' -> '20110101') so the
# column holds digit-only text that the later float32 cast can parse.
compact_dates = dataset['dteday'].str.replace('-', '')
dataset['dteday'] = compact_dates

# Preview the first rows to confirm the new date format.
dataset.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,20110101,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,20110102,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,20110103,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,20110104,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,20110105,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [4]:
# Shuffle every row with a fixed seed so the split is reproducible, then cut
# at the 70% mark: the first part is the training set, the remainder the test set.
# Plain .iloc slicing is used instead of np.split, because np.split on a
# DataFrame goes through the deprecated DataFrame.swapaxes (FutureWarning on
# recent pandas releases).
shuffled = dataset.sample(frac=1, random_state=1729)
split_at = int(0.7 * len(dataset))
train_data, test_data = shuffled.iloc[:split_at], shuffled.iloc[split_at:]
print(train_data.shape, test_data.shape)

(511, 16) (220, 16)


In [5]:
# Get the features and labels for the training split.

# Predictor columns, in the order they appear in the feature matrix.
# NOTE(review): in the rows displayed above, casual + registered == cnt
# (e.g. 331 + 654 == 985), so these two columns leak the target — confirm
# whether they should really be fed to the model.
# NOTE(review): dteday values such as 20110101 exceed 2**24 and cannot be
# represented exactly in float32; 'instant' already carries the day order.
feature_columns = [
    'instant', 'dteday', 'season', 'yr', 'mnth', 'holiday', 'weekday',
    'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
    'casual', 'registered',
]
feature_dataset = train_data[feature_columns]
label_dataset = train_data[['cnt']]

# Both matrices are cast to float32 before being serialized further down.
features = feature_dataset.values.astype('float32')
labels = label_dataset.values.astype('float32')

# Collapse the (n, 1) label column into a flat 1-D vector.
labels_vec = labels.squeeze()

In [6]:
# Inspect the flattened label vector (training targets taken from the 'cnt' column).
# NOTE(review): this displays the whole array; a slice such as labels_vec[:10]
# would keep the output short.
labels_vec

array([3485., 5870., 2493., 5572., 4097., 1851., 3641., 4334., 5138.,
       5424., 1913., 4270., 6969., 1471., 4634., 2431., 7350., 4917.,
       3071., 5743., 2277., 4649., 3598., 3351., 4608., 3873., 7013.,
       1812., 5478., 6569., 5225., 2417., 3614., 3392., 4128., 7393.,
       5305., 5058., 1683., 6227., 3272., 2765., 1872., 4773., 3429.,
       5515., 2496., 3068., 8714., 7733., 1098., 6691., 2689., 4169.,
       1461., 7109., 5115., 3523., 5936., 6664., 4040., 3606., 4068.,
       1501., 2832., 2659., 3915., 3831., 3095., 2947., 7697., 1606.,
       4318., 3709., 1969., 6606., 6824., 5047., 4669., 1526., 1985.,
       4266., 4036., 7290., 6591., 1317., 3840., 4575.,  920., 6031.,
       4679., 4127., 3974., 5729., 4486., 2933., 5445., 2115., 1416.,
       7442., 3141., 5409., 7359., 5170., 5298., 6779., 6312., 6296.,
       4795., 5319., 7534., 4717., 4694., 3422., 6889., 7129., 4151.,
       4186., 3974., 3855., 5323., 7264., 5260., 5823., 4153., 5923.,
       1115., 1817.,

In [8]:
# Serialize the training features and labels into an in-memory buffer using
# the SageMaker dense-tensor (protobuf) writer, then upload it to S3.

buffer = io.BytesIO()
smac.write_numpy_to_dense_tensor(buffer, features, labels_vec)
buffer.seek(0)  # rewind so the upload reads the buffer from the beginning

# NOTE(review): 'realestate' looks like a leftover from another notebook —
# confirm the intended prefix for this bike sharing data.
prefix = 'realestate'
key = 'linearlearner'

# Build the object key once with '/' separators: S3 keys are not filesystem
# paths, so os.path.join (OS-dependent separator) could disagree with the URI.
train_key = '{}/train/{}'.format(prefix, key)
boto3.resource('s3').Bucket(bucket).Object(train_key).upload_fileobj(buffer)
s3_train_data = 's3://{}/{}'.format(bucket, train_key)