In [1]:
## download training data and unzip
!unzip news-sample.zip

Archive:  news-sample.zip
   creating: news-sample/
  inflating: news-sample/%2F2021%2F05%2F01%2Fhealth%2Fus-vaccine-sharing-ethics%2Findex.html.json  
  inflating: news-sample/%2F2021%2F05%2F01%2Fentertainment%2Felliot-page-oprah-interview-intl%2Findex.html.json  
  inflating: news-sample/%2F2021%2F05%2F02%2Feconomy%2Fbiden-green-jobs%2Findex.html.json  
  inflating: news-sample/%2F2021%2F05%2F02%2Fpolitics%2Fanita-dunn-schools-reopen-biden-cnntv%2Findex.html.json  
  inflating: news-sample/%2F2021%2F04%2F06%2Fsuccess%2Ffinding-a-sponsor-workplace%2Findex.html.json  
  inflating: news-sample/%2F2021%2F04%2F29%2Fpolitics%2Fflorida-passes-elections-bill-voting-restrictions%2Findex.html.json  
  inflating: news-sample/%2F2021%2F04%2F30%2Fpolitics%2Ftennessee-bill-businesses-trans-bathroom-policy%2Findex.html.json  
  inflating: news-sample/%2F2021%2F05%2F03%2Fafrica%2Fsouth-africa-lion-breeding-intl-scli%2Findex.html.json  
  inflating: news-sample/%2F2021%2F05%2F03%2Famericas%

In [2]:
source_dir = "news-sample"

In [9]:
import sys
import numpy as np
import pandas as pd
import re
import gc

from gensim.models import KeyedVectors
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
from tqdm import tqdm
tqdm.pandas()

from util import text_to_wordlist
from util import text2vec

# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text

from os import listdir
from os.path import isfile, join
import json

import sagemaker
from sagemaker import get_execution_role
from sagemaker import KMeans
sess = sagemaker.Session()
bucket = sess.default_bucket()

source_files = [join(source_dir, f) for f in listdir(source_dir) if isfile(join(source_dir, f))]
news_jsons = []
for f in source_files:
  news_jsons.append(json.loads(open(f).read()))
news_df = pd.json_normalize(news_jsons)
news_df['ori_text'] = news_df[['title', 'body']].agg(' '.join, axis=1)
news_df['words'] = news_df.ori_text.progress_apply(text_to_wordlist)

## Load Google pretrained model
# wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
# gzip -d GoogleNews-vectors-negative300.bin.gz
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
def gtext2vec(text):
    return text2vec(model,text)
news_df['vectors'] = news_df.words.progress_apply(gtext2vec)

## Clustering and generating scatter
X = np.concatenate(news_df['vectors'].values)


100%|██████████| 249/249 [00:00<00:00, 1166.40it/s]
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
100%|██████████| 249/249 [00:00<00:00, 498.96it/s]


In [11]:
role = get_execution_role()

In [12]:
num_clusters = 10
kmeans = KMeans(
    role=role,
    train_instance_count=1,
    train_instance_type="ml.m5.4xlarge",
    output_path="s3://" + bucket + "/news_kmeans/",
    k=num_clusters,
)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [15]:
X

array([[ 0.05031236,  0.02700881,  0.00100662, ..., -0.0636446 ,
         0.04336041, -0.0470434 ],
       [ 0.01382729,  0.03272541,  0.02469097, ..., -0.04095681,
        -0.01075038, -0.00682084],
       [ 0.00031461,  0.0278891 ,  0.02355221, ..., -0.00412034,
         0.03624998, -0.03510499],
       ...,
       [ 0.03302171,  0.04249801,  0.01532259, ..., -0.03813257,
         0.04462177, -0.02753673],
       [ 0.0124541 ,  0.04760894,  0.02909537, ..., -0.01528124,
         0.03458426, -0.01920893],
       [-0.00318729,  0.04919301, -0.00867569, ..., -0.05455039,
         0.04025878, -0.00922522]], dtype=float32)

In [13]:
kmeans.fit(kmeans.record_set(X))

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


2021-05-15 22:18:01 Starting - Starting the training job...
2021-05-15 22:18:24 Starting - Launching requested ML instancesProfilerReport-1621117081: InProgress
.........
2021-05-15 22:19:44 Starting - Preparing the instances for training......
2021-05-15 22:20:44 Downloading - Downloading input data...
2021-05-15 22:21:26 Training - Training image download completed. Training in progress.
2021-05-15 22:21:26 Uploading - Uploading generated training model[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[05/15/2021 22:21:23 INFO 140474586662720] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'init_method': 'random', 'mini_batch_size': '5000', 'epochs': '1', 'extra_center_factor': 'auto', 'local_lloyd_max_iter': '300', 'local_lloyd_tol': '0.0001', 'local_lloyd_init_method': 'kmeans++', 'local_lloyd_num_trials': 'auto', 'half_life_time_size': '0', 'e

In [17]:
kmeans_predictor = kmeans.deploy(initial_instance_count=1, instance_type="ml.t2.medium")

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


---------------------!

In [None]:
## Save News with its clusters
news_df['cluster'] = kmeans_predictor.predict(X)
news_df = news_df.drop(["ori_text", "words"], axis=1)
news_df.to_pickle('news_df.pkl')

# Test Predict

In [19]:
ENDPOINT_NAME="kmeans-2021-05-15-22-24-43-993"

In [35]:
import boto3
runtime= boto3.client('runtime.sagemaker')

In [118]:
from sagemaker import KMeansPredictor
kmeans_endpoint = KMeansPredictor(ENDPOINT_NAME)

def sagemaker_kmeans_predict(vec):
    response = kmeans_endpoint.predict(vec)
    #print(response[0].label)
    return response[0].label.get("closest_cluster").float32_tensor.values[0]

In [119]:
def infer_cluster(text):
  vec = text2vec(model, text)
  return [sagemaker_kmeans_predict(vec), vec]

In [120]:
def list_methods(res):  
    print([method_name for method_name in dir(res) if callable(getattr(res, method_name))])

In [124]:
cluster = infer_cluster("Bill Gate divorced, Indian covid is bad")
print(cluster[0])

9.0
