In [1]:
## download training data and unzip
!unzip news-sample.zip

Archive:  news-sample.zip
   creating: news-sample/
  inflating: news-sample/%2F2021%2F05%2F01%2Fhealth%2Fus-vaccine-sharing-ethics%2Findex.html.json  
  inflating: news-sample/%2F2021%2F05%2F01%2Fentertainment%2Felliot-page-oprah-interview-intl%2Findex.html.json  
  inflating: news-sample/%2F2021%2F05%2F02%2Feconomy%2Fbiden-green-jobs%2Findex.html.json  
  inflating: news-sample/%2F2021%2F05%2F02%2Fpolitics%2Fanita-dunn-schools-reopen-biden-cnntv%2Findex.html.json  
  inflating: news-sample/%2F2021%2F04%2F06%2Fsuccess%2Ffinding-a-sponsor-workplace%2Findex.html.json  
  inflating: news-sample/%2F2021%2F04%2F29%2Fpolitics%2Fflorida-passes-elections-bill-voting-restrictions%2Findex.html.json  
  inflating: news-sample/%2F2021%2F04%2F30%2Fpolitics%2Ftennessee-bill-businesses-trans-bathroom-policy%2Findex.html.json  
  inflating: news-sample/%2F2021%2F05%2F03%2Fafrica%2Fsouth-africa-lion-breeding-intl-scli%2Findex.html.json  
  inflating: news-sample/%2F2021%2F05%2F03%2Famericas%

In [2]:
source_dir = "news-sample"

In [3]:
import sys
import numpy as np
import pandas as pd
import re
import gc

from gensim.models import KeyedVectors
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
from tqdm import tqdm
tqdm.pandas()

from util import text_to_wordlist
from util import text2vec

# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text

from os import listdir
from os.path import isfile, join
import json

import sagemaker
from sagemaker import get_execution_role
from sagemaker import KMeans
sess = sagemaker.Session()
bucket = sess.default_bucket()

source_files = [join(source_dir, f) for f in listdir(source_dir) if isfile(join(source_dir, f))]
news_jsons = []
for f in source_files:
  news_jsons.append(json.loads(open(f).read()))
news_df = pd.json_normalize(news_jsons)
news_df['ori_text'] = news_df[['title', 'body']].agg(' '.join, axis=1)
news_df['words'] = news_df.ori_text.progress_apply(text_to_wordlist)

## Load Google pretrained model
# wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
# gzip -d GoogleNews-vectors-negative300.bin.gz
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
def gtext2vec(text):
    return text2vec(model,text)
news_df['vectors'] = news_df.words.progress_apply(gtext2vec)

## Clustering and generating scatter
X = np.concatenate(news_df['vectors'].values)


100%|██████████| 249/249 [00:00<00:00, 1182.89it/s]
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
100%|██████████| 249/249 [00:00<00:00, 490.87it/s]


In [4]:
role = get_execution_role()

In [5]:
num_clusters = 10
kmeans = KMeans(
    role=role,
    train_instance_count=1,
    train_instance_type="ml.m5.4xlarge",
    output_path="s3://" + bucket + "/news_kmeans/",
    k=num_clusters,
)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [6]:
X

array([[ 2.0458702e-02,  5.0533723e-02,  4.7280885e-02, ...,
        -2.3224665e-02,  4.1228846e-02, -5.3300083e-02],
       [ 2.7912447e-02,  2.6746163e-02, -8.4744439e-05, ...,
        -3.1031473e-02,  2.7302010e-02, -7.1553229e-03],
       [ 1.7667541e-02,  3.0833457e-02,  1.3139738e-02, ...,
        -2.3138842e-02,  4.8962448e-02, -1.6316909e-02],
       ...,
       [-2.0126782e-03,  3.3508014e-02,  3.6321182e-02, ...,
        -2.0768417e-02,  5.1217217e-02, -3.6849391e-03],
       [ 3.4615543e-02,  4.4535678e-02, -8.9113526e-03, ...,
        -3.4623798e-02,  5.8771107e-02,  5.0990521e-03],
       [ 1.8009610e-02,  4.4489905e-02,  3.6183905e-02, ...,
        -7.2793521e-02,  2.3015549e-02, -2.2173600e-04]], dtype=float32)

In [7]:
kmeans.fit(kmeans.record_set(X))

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


2021-05-19 15:10:27 Starting - Starting the training job...
2021-05-19 15:10:50 Starting - Launching requested ML instancesProfilerReport-1621437026: InProgress
......
2021-05-19 15:11:50 Starting - Preparing the instances for training.........
2021-05-19 15:13:21 Downloading - Downloading input data...
2021-05-19 15:13:54 Training - Training image download completed. Training in progress.
2021-05-19 15:13:54 Uploading - Uploading generated training model[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[05/19/2021 15:13:52 INFO 140058891200320] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'init_method': 'random', 'mini_batch_size': '5000', 'epochs': '1', 'extra_center_factor': 'auto', 'local_lloyd_max_iter': '300', 'local_lloyd_tol': '0.0001', 'local_lloyd_init_method': 'kmeans++', 'local_lloyd_num_trials': 'auto', 'half_life_time_size': '0', 'e

In [8]:
kmeans_predictor = kmeans.deploy(initial_instance_count=1, instance_type="ml.t2.medium")

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


---------------------!

# Test Predict

In [10]:
ENDPOINT_NAME="kmeans-2021-05-19-15-14-39-730"

In [16]:
def response2cluster(response):
    return response[0].label.get("closest_cluster").float32_tensor.values[0]

In [18]:
from sagemaker import KMeansPredictor
kmeans_endpoint = KMeansPredictor(ENDPOINT_NAME)

def sagemaker_kmeans_predict(vec):
    response = kmeans_endpoint.predict(vec)
    return response2cluster(response)

In [21]:
def infer_cluster(text):
  vec = text2vec(model, text)
  return [sagemaker_kmeans_predict(vec), vec]

In [23]:
[cluster, vec] = infer_cluster("Bill Gate divorced, Indian covid is bad")
print(cluster)

8.0


## Save News with its clusters

In [None]:
news_df['cluster'] = kmeans_predictor.predict(X)

In [35]:
news_df = news_df.drop(["ori_text", "words"], axis=1)
for i in range(news_df['cluster'].shape[0]):
    news_df['cluster'][i] = news_df['cluster'][i].label.get("closest_cluster").float32_tensor.values[0]
news_df

Unnamed: 0,title,time,body,link,vectors,cluster
0,Biden uses his long history with Amtrak to put...,"Updated 4:35 PM ET, Fri April 30, 2021",(CNN)President Joe Biden on Friday pitched hi...,/2021/04/30/politics/amtrak-joe-biden/index.html,"[[0.020458702, 0.050533723, 0.047280885, 0.077...",4
1,Woman with rare muscular disease sings to exer...,"Updated 7:47 AM ET, Sat May 1, 2021",(CNN)Songwriter Tabitha Haly doesn't just sin...,/2021/04/30/health/tabitha-haly-spinal-muscula...,"[[0.027912447, 0.026746163, -8.474444e-05, 0.0...",3
2,Oscar nominee Glenn Close is on a mission to e...,"Updated 7:23 PM ET, Thu April 22, 2021",(CNN)Glenn Close is up for an Oscar this Sund...,/2021/04/22/health/oscars-glenn-close-mental-h...,"[[0.017667541, 0.030833457, 0.013139738, 0.095...",2
3,"UV lights, cars with Trump bumper stickers and...","Updated 6:27 PM ET, Fri April 30, 2021",Phoenix (CNN)At the Arizona State Fairgrounds ...,/2021/04/30/politics/arizona-ballot-audit-repu...,"[[0.017553689, 0.030668557, 0.03280953, 0.0661...",4
4,This Boston Marathon bombing survivor is on a ...,"Updated 9:12 PM ET, Wed April 14, 2021","(CNN)Eight years ago, on April 15, 2013, Heat...",/2021/04/14/us/boston-marathon-bombing-survivo...,"[[0.031652648, 0.034140192, 0.023112735, 0.095...",3
...,...,...,...,...,...,...
244,The pandemic disrupted organ transplants. But ...,"Updated 2:45 PM ET, Thu April 29, 2021",(CNN)As coronavirus shut down the world last ...,/2021/04/29/health/organ-donations-coronavirus...,"[[0.014508941, 0.055765923, 0.029397067, 0.081...",2
245,Arrests made in connection to attack on Lady G...,"Updated 5:43 PM ET, Thu April 29, 2021",(CNN)Five people have been arrested and charg...,/2021/04/29/entertainment/lady-gaga-dog-walker...,"[[0.02791615, 0.03122727, 0.033650886, 0.01534...",7
246,What women should know about the Johnson & Joh...,"Updated 5:32 AM ET, Tue April 27, 2021",(CNN)The US Centers for Disease Control and P...,/2021/04/27/health/women-johnson-johnson-vacci...,"[[-0.0020126782, 0.033508014, 0.03632118, 0.10...",1
247,Lebanon battles swarms of locusts after wind c...,"Updated 6:35 AM ET, Tue April 27, 2021","Beirut, LebanonArmy helicopters are spraying a...",/2021/04/27/middleeast/lebanon-locust-intl-scl...,"[[0.034615543, 0.044535678, -0.008911353, 0.03...",5


In [36]:
news_df.to_pickle('news_df.pkl')

In [39]:
import boto3
s3 = boto3.resource('s3')
s3_bucket_name = "nyu-cc-final-recommend-news"
object = s3.Object(s3_bucket_name, 'news_df.pkl')
object.put(Body=open("news_df.pkl", "rb").read())

{'ResponseMetadata': {'RequestId': 'C9F62SAFJA5VJJ90',
  'HostId': 'scOi5i9DSo0syCxUqm6/yiqWOt1h5lfRWgM/5o5PZuBAzyBB9J94qPBGPhIadzd3pHFWLX95Ku8=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'scOi5i9DSo0syCxUqm6/yiqWOt1h5lfRWgM/5o5PZuBAzyBB9J94qPBGPhIadzd3pHFWLX95Ku8=',
   'x-amz-request-id': 'C9F62SAFJA5VJJ90',
   'date': 'Wed, 19 May 2021 15:56:25 GMT',
   'etag': '"ccaca0df839b81cfd7f74f9bd7dbaca6"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"ccaca0df839b81cfd7f74f9bd7dbaca6"'}