In [1]:
## download training data and unzip
!wget https://source-data-nyu-final-cc.s3.amazonaws.com/news-sample.zip
!unzip news-sample.zip

--2021-05-21 05:05:34--  https://source-data-nyu-final-cc.s3.amazonaws.com/news-sample.zip
Resolving source-data-nyu-final-cc.s3.amazonaws.com (source-data-nyu-final-cc.s3.amazonaws.com)... 52.216.245.44
Connecting to source-data-nyu-final-cc.s3.amazonaws.com (source-data-nyu-final-cc.s3.amazonaws.com)|52.216.245.44|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1787577 (1.7M) [application/zip]
Saving to: ‘news-sample.zip.1’


2021-05-21 05:05:35 (67.1 MB/s) - ‘news-sample.zip.1’ saved [1787577/1787577]

Archive:  news-sample.zip
   creating: news-sample/
  inflating: news-sample/%2F2021%2F05%2F01%2Fhealth%2Fus-vaccine-sharing-ethics%2Findex.html.json  
  inflating: news-sample/%2F2021%2F05%2F01%2Fentertainment%2Felliot-page-oprah-interview-intl%2Findex.html.json  
  inflating: news-sample/%2F2021%2F05%2F02%2Feconomy%2Fbiden-green-jobs%2Findex.html.json  
  inflating: news-sample/%2F2021%2F05%2F02%2Fpolitics%2Fanita-dunn-schools-reopen-biden-cnntv%2Findex.htm

In [2]:
source_dir = "news-sample"

In [3]:
import sys
import numpy as np
import pandas as pd
import re
import gc

from gensim.models import KeyedVectors
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
from tqdm import tqdm
tqdm.pandas()

from util import text_to_wordlist
from util import text2vec

# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text

from os import listdir
from os.path import isfile, join
import json

import sagemaker
from sagemaker import get_execution_role
from sagemaker import KMeans
sess = sagemaker.Session()
bucket = sess.default_bucket()

source_files = [join(source_dir, f) for f in listdir(source_dir) if isfile(join(source_dir, f))]
news_jsons = []
for f in source_files:
  news_jsons.append(json.loads(open(f).read()))
news_df = pd.json_normalize(news_jsons)
news_df['ori_text'] = news_df[['title', 'body']].agg(' '.join, axis=1)
news_df['words'] = news_df.ori_text.progress_apply(text_to_wordlist)

## Load Google pretrained model
# wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
# gzip -d GoogleNews-vectors-negative300.bin.gz
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
def gtext2vec(text):
    return text2vec(model,text)
news_df['vectors'] = news_df.words.progress_apply(gtext2vec)

## Clustering and generating scatter
X = np.concatenate(news_df['vectors'].values)


100%|██████████| 249/249 [00:00<00:00, 1177.34it/s]
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
100%|██████████| 249/249 [00:00<00:00, 498.37it/s]


In [4]:
role = get_execution_role()

In [5]:
num_clusters = 10
kmeans = KMeans(
    role=role,
    train_instance_count=1,
    train_instance_type="ml.m5.4xlarge",
    output_path="s3://" + bucket + "/news_kmeans/",
    k=num_clusters,
)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [6]:
X

array([[-0.01515687,  0.01863076,  0.03876204, ..., -0.02226893,
         0.04162458, -0.01257528],
       [ 0.03044853,  0.03381789,  0.03785017, ..., -0.0323432 ,
         0.03854426,  0.00232266],
       [ 0.0318165 ,  0.07535027,  0.04228247, ..., -0.01504497,
         0.02487498, -0.01117863],
       ...,
       [ 0.02225916,  0.02586881,  0.03080497, ..., -0.02945436,
         0.01095364, -0.01230877],
       [ 0.022015  ,  0.03737322,  0.03860334, ..., -0.07735461,
         0.01477447, -0.01427768],
       [ 0.02433206,  0.03272655,  0.02314244, ..., -0.05226618,
         0.01890649,  0.01904838]], dtype=float32)

In [7]:
kmeans.fit(kmeans.record_set(X))

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


2021-05-21 05:06:24 Starting - Starting the training job...
2021-05-21 05:06:48 Starting - Launching requested ML instancesProfilerReport-1621573584: InProgress
.........
2021-05-21 05:08:08 Starting - Preparing the instances for training......
2021-05-21 05:09:20 Downloading - Downloading input data...
2021-05-21 05:09:55 Training - Training image download completed. Training in progress.
2021-05-21 05:09:55 Uploading - Uploading generated training model
2021-05-21 05:09:55 Completed - Training job completed
[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[05/21/2021 05:09:44 INFO 140216410638144] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'init_method': 'random', 'mini_batch_size': '5000', 'epochs': '1', 'extra_center_factor': 'auto', 'local_lloyd_max_iter': '300', 'local_lloyd_tol': '0.0001', 'local_lloyd_init_method': 'kmeans++', 'local_l

In [26]:
kmeans_predictor = kmeans.deploy(initial_instance_count=1, instance_type="ml.t2.medium")

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


---------------------!

# Test Predict

In [27]:
ENDPOINT_NAME=kmeans_predictor.endpoint

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [28]:
def response2cluster(response):
    return response[0].label.get("closest_cluster").float32_tensor.values[0]

In [29]:
from sagemaker import KMeansPredictor
kmeans_endpoint = KMeansPredictor(ENDPOINT_NAME)

def sagemaker_kmeans_predict(vec):
    response = kmeans_endpoint.predict(vec)
    return response2cluster(response)

In [30]:
def infer_cluster(text):
  vec = text2vec(model, text)
  return [sagemaker_kmeans_predict(vec), vec]

In [31]:
[cluster, vec] = infer_cluster("Bill Gate divorced, Indian covid is bad")
print(cluster)

2.0


## Save News with its clusters

In [32]:
news_df['cluster'] = kmeans_predictor.predict(X)

In [34]:
for i in range(news_df['cluster'].shape[0]):
    news_df['cluster'][i] = news_df['cluster'][i].label.get("closest_cluster").float32_tensor.values[0]
news_df

Unnamed: 0,title,time,body,link,vectors,cluster
0,Full FDA approval of Covid-19 vaccines could h...,"Updated 11:50 AM ET, Fri April 30, 2021",(CNN)As a condition of the emergency use auth...,/2021/04/29/health/fda-approval-covid-19-vacci...,"[[-0.015156866, 0.018630762, 0.038762044, 0.08...",4
1,Bush says if GOP stands for 'White Anglo-Saxon...,"Updated 3:59 PM ET, Mon May 3, 2021",Washington (CNN)Former President George W. Bus...,/2021/05/03/politics/bush-gop-white-anglo-saxo...,"[[0.030448528, 0.033817895, 0.037850168, 0.089...",8
2,Hubble spies rare giant star battling against ...,"Updated 2:37 PM ET, Mon May 3, 2021",(CNN)Some stars live fast and look beautiful ...,/2021/05/03/world/hubble-star-destruction-scn/...,"[[0.031816505, 0.07535027, 0.042282466, 0.0533...",1
3,Three-time Indy 500 winner Bobby Unser dies at 87,"Updated 1:52 PM ET, Mon May 3, 2021","(CNN)Bobby Unser, winner of the 1968, 1975 an...",/2021/05/03/us/bobby-unser-racing-driver-dies/...,"[[0.019731479, 0.024557102, 0.004227914, 0.051...",7
4,Keegan-Michael Key among final 'SNL' hosts of ...,"Updated 5:52 PM ET, Mon May 3, 2021","(CNN)""Saturday Night Live"" has set the stage ...",/2021/05/03/entertainment/snl-hosts/index.html,"[[0.027429659, 0.044627406, 0.026173107, 0.060...",7
...,...,...,...,...,...,...
244,'That Damn Michael Che' trailer has arrived,"Updated 10:57 AM ET, Wed April 28, 2021",(CNN)Welcome to the mind of Michael Che.That'...,/2021/04/28/entertainment/michael-che-show-tra...,"[[0.013547319, 0.04926485, -0.0023994446, 0.09...",7
245,You can now buy a seat on a Blue Origin rocket,"Updated 1:51 PM ET, Thu April 29, 2021","New York (CNN Business)Blue Origin, the Jeff B...",/2021/04/29/tech/blue-origin-ticket-sales/inde...,"[[0.025821665, 0.021468738, 0.016823582, 0.040...",1
246,Social media is a lifeline for desperate India...,"Updated 8:37 PM ET, Sat May 1, 2021","On most days, Network Capital, a business netw...",/2021/05/01/tech/india-covid-twitter-modi-face...,"[[0.022259163, 0.025868809, 0.03080497, 0.0739...",3
247,Trump allies worry Giuliani raid sent 'strong ...,"Updated 1:30 PM ET, Sat May 1, 2021",(CNN)A Wednesday raid by federal agents of an...,/2021/04/30/politics/giuliani-raid-donald-trum...,"[[0.022015003, 0.037373222, 0.03860334, 0.0328...",8


In [35]:
news_df.to_pickle('news_df.pkl')

In [36]:
import boto3
s3 = boto3.resource('s3')
s3_bucket_name = "nyu-cc-final-recommend-news"
object = s3.Object(s3_bucket_name, 'news_df.pkl')
object.put(Body=open("news_df.pkl", "rb").read())

{'ResponseMetadata': {'RequestId': '4VDF085RWY0B84MV',
  'HostId': '0cJeVMlzdD4QYZVRrhRRv48XnFhx6KPJVWFelz0sJSiESE7Xy+zgoZEsPm0HNCVuDTVcL6C5hxg=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '0cJeVMlzdD4QYZVRrhRRv48XnFhx6KPJVWFelz0sJSiESE7Xy+zgoZEsPm0HNCVuDTVcL6C5hxg=',
   'x-amz-request-id': '4VDF085RWY0B84MV',
   'date': 'Fri, 21 May 2021 05:47:43 GMT',
   'etag': '"5e0b3bc3ff6592d417054276fe1d0cba"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"5e0b3bc3ff6592d417054276fe1d0cba"'}