In [1]:
import os
import boto3
import io
import sagemaker

%matplotlib inline 

import pandas as pd
import numpy as np
import mxnet as mx
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
matplotlib.style.use('ggplot')
import pickle, gzip, urllib, json
import csv

In [2]:
from sagemaker import get_execution_role
role = get_execution_role()

In [3]:
role

'arn:aws:iam::340187602989:role/service-role/AmazonSageMaker-ExecutionRole-20191215T215711'

In [4]:
s3_client = boto3.client('s3')
data_bucket_name='ccprojectbucket1'

In [5]:
obj_list=s3_client.list_objects(Bucket=data_bucket_name)
file=[]
for contents in obj_list['Contents']:
    file.append(contents['Key'])
print(file)

['datasets/', 'datasets/output/ccproject-kmeans2019-12-17-06-24-23/output/model.tar.gz', 'datasets/train/', 'datasets/train/sagemaker_train_data.csv', 'flavor/kmeans-2019-12-17-03-52-22-116/output/model.tar.gz', 'flavor/kmeans-2019-12-17-04-10-43-591/output/model.tar.gz', 'flavor/kmeans-2019-12-17-04-30-32-969/output/model.tar.gz', 'flavor/kmeans-2019-12-17-19-05-52-361/output/model.tar.gz', 'sagemaker_train_data.csv']


In [6]:
file_data='sagemaker_train_data.csv'

In [7]:
response = s3_client.get_object(Bucket=data_bucket_name, Key=file_data)
response_body = response["Body"].read()
train_data = pd.read_csv(io.BytesIO(response_body), header=None, delimiter=",", low_memory=False)

In [8]:
train_data.head()

Unnamed: 0,0,1,2,3,4,5
0,0.166667,0.166667,0.833333,0.166667,0.166667,0.0
1,0.166667,0.166667,0.833333,0.833333,0.333333,0.833333
2,0.166667,0.333333,0.833333,0.5,0.833333,0.166667
3,0.166667,0.166667,0.833333,0.833333,0.5,0.0
4,0.166667,0.166667,0.666667,0.666667,0.666667,0.0


In [9]:
train_data.shape

(11390, 6)

In [10]:
from sagemaker import KMeans

bucket='ccprojectbucket1'
num_clusters = 6

kmeans = KMeans(role=role,
                train_instance_count=1,
                train_instance_type='ml.m4.xlarge',
                output_path='s3://'+ bucket +'/flavor/',              
                k=num_clusters)

In [11]:
train_data = train_data.values.astype('float32')

In [12]:
print(train_data[1])

[0.16666667 0.16666667 0.8333333  0.8333333  0.33333334 0.8333333 ]


In [13]:
%%time
kmeans.fit(kmeans.record_set(train_data))

2019-12-18 20:30:07 Starting - Starting the training job...
2019-12-18 20:30:08 Starting - Launching requested ML instances......
2019-12-18 20:31:16 Starting - Preparing the instances for training......
2019-12-18 20:32:22 Downloading - Downloading input data...
2019-12-18 20:33:08 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34m[12/18/2019 20:33:11 INFO 140150121895744] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'_enable_profiler': u'false', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'local_lloyd_num_trials': u'auto', u'_log_level': u'info', u'_kvstore': u'auto', u'local_lloyd_init_method': u'kmeans++', u'force_dense': u'true', u'epochs': u'1', u'init_method': u'random', u'local_lloyd_tol': u'0.0001', u'local_lloyd_max_iter': u'300', u'_disable_wait_to_read': u'false', u'extra_center_factor': u'auto', u'eval_metric


2019-12-18 20:33:21 Uploading - Uploading generated training model
2019-12-18 20:33:21 Completed - Training job completed
Training seconds: 59
Billable seconds: 59
CPU times: user 808 ms, sys: 14 ms, total: 822 ms
Wall time: 3min 42s


In [14]:
%%time
kmeans_predictor = kmeans.deploy(initial_instance_count=1, 
                                 instance_type='ml.t2.medium')

----------------------------------------------------------------------------------------------------------------------------------------------!CPU times: user 746 ms, sys: 36.3 ms, total: 782 ms
Wall time: 11min 57s


In [15]:

%%time
result=kmeans_predictor.predict(train_data)

CPU times: user 743 ms, sys: 2.85 ms, total: 746 ms
Wall time: 1.35 s


In [16]:
result[0]

label {
  key: "closest_cluster"
  value {
    float32_tensor {
      values: 0.0
    }
  }
}
label {
  key: "distance_to_cluster"
  value {
    float32_tensor {
      values: 0.5728551745414734
    }
  }
}

In [17]:
result[1]

label {
  key: "closest_cluster"
  value {
    float32_tensor {
      values: 2.0
    }
  }
}
label {
  key: "distance_to_cluster"
  value {
    float32_tensor {
      values: 0.32184845209121704
    }
  }
}

In [18]:

cluster_labels = [r.label['closest_cluster'].float32_tensor.values[0] for r in result]

In [19]:
pd.DataFrame(cluster_labels)[0].value_counts()

0.0    2590
3.0    2021
5.0    1886
4.0    1822
2.0    1782
1.0    1289
Name: 0, dtype: int64

In [20]:
train_data[0]

array([0.16666667, 0.16666667, 0.8333333 , 0.16666667, 0.16666667,
       0.        ], dtype=float32)

In [21]:
result[0]

label {
  key: "closest_cluster"
  value {
    float32_tensor {
      values: 0.0
    }
  }
}
label {
  key: "distance_to_cluster"
  value {
    float32_tensor {
      values: 0.5728551745414734
    }
  }
}

In [28]:
cluster_labels[6]

2.0

In [29]:
len(cluster_labels)

11390

In [32]:
with open('predicted_labels.csv', mode='w') as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=',')
    i = 0
    for label in cluster_labels:
        csv_writer.writerow([i, cluster_labels[i]])
        i += 1
    print(i)

11390


In [36]:
from collections import Counter
counter1 = Counter()
with open('predicted_labels.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for line in csv_reader:
        # print(line)
        label = float(line[1])
        counter1[label] += 1
print(counter1)

Counter({0.0: 2590, 3.0: 2021, 5.0: 1886, 4.0: 1822, 2.0: 1782, 1.0: 1289})


In [37]:
job_name='kmeans-2019-12-18-20-30-07-551'
model_key = "flavor/" + job_name + "/output/model.tar.gz"

boto3.resource('s3').Bucket(bucket).download_file(model_key, 'model.tar.gz')
os.system('tar -zxvf model.tar.gz')
os.system('unzip model_algo-1')

2304

In [38]:
Kmeans_model_params = mx.ndarray.load('model_algo-1')

In [39]:
cluster_centroids=pd.DataFrame(Kmeans_model_params[0].asnumpy())

In [40]:
cluster_centroids

Unnamed: 0,0,1,2,3,4,5
0,0.184647,0.322558,0.302091,0.21761,0.259755,0.100293
1,0.249609,0.339738,0.788035,0.749922,0.726648,0.136051
2,0.22671,0.395833,0.776565,0.698963,0.467158,0.742358
3,0.190003,0.24586,0.322945,0.23788,0.570084,0.712285
4,0.230551,0.56095,0.742665,0.650577,0.205565,0.084706
5,0.195973,0.242355,0.341658,0.2991,0.738447,0.149337
