In [1]:
# Fetch the Hacker News sample archive; -nc skips the download when
# hackernews.zip is already present in the working directory.
!wget -nc https://s3-us-west-2.amazonaws.com/ray-tutorials/hackernews.zip
# Extract the submission-*.json files; -o overwrites earlier copies without prompting.
!unzip -o hackernews.zip
# Peek at the first two lines: each line is one JSON record.
!head -n 2 submission-1.json

--2023-11-24 07:45:52--  https://s3-us-west-2.amazonaws.com/ray-tutorials/hackernews.zip
Resolving s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)... 52.92.248.40, 52.218.232.144, 52.92.160.152, ...
Connecting to s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)|52.92.248.40|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 56402193 (54M) [application/zip]
Saving to: ‘hackernews.zip’


2023-11-24 07:45:55 (26.5 MB/s) - ‘hackernews.zip’ saved [56402193/56402193]

Archive:  hackernews.zip
  inflating: submission-1.json       
  inflating: submission-2.json       
  inflating: submission-3.json       
  inflating: submission-4.json       
{"body": {"descendants": 0, "url": "http://markpincus.blogspot.com/2005/03/peopleweb-i-believe-we-are-close-to.html", "text": "", "title": "The PeopleWeb | Mark Pincus Blog (March 2005)", "by": "sayemm", "score": 3, "time": 1286515576, "type": "story", "id": 1770734}, "source": "firebase", "id": 1770734, "retrieved_a

In [3]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import json
import numpy as np
import pandas as pd
import time

In [4]:
def parse_hn_submissions(path):
    """Load one Hacker News dump (one JSON object per line) into a DataFrame.

    Each non-blank line must be a JSON object whose ``"body"`` entry holds
    the submission's ``"title"`` and ``"score"``.

    Args:
        path: Path of the JSON-lines file to read.

    Returns:
        pandas.DataFrame with columns ``data`` (the title) and ``score``,
        one row per submission in file order. An empty file yields an empty
        frame that still carries both columns.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``body``, ``title`` or ``score``.
    """
    records = []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        # Iterate lazily instead of readlines() so the raw text of the whole
        # file is never held in memory alongside the parsed records.
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            body = json.loads(line)["body"]
            records.append({"data": body["title"], "score": body["score"]})
    # Explicit columns keep the schema stable even when no records were read.
    return pd.DataFrame(records, columns=["data", "score"])

In [5]:
# Time how long it takes to parse the four dumps one after another.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()

files = ["submission-{}.json".format(i) for i in range(1, 5)]
records = [parse_hn_submissions(file) for file in files]
# NOTE: concat keeps each file's own row labels, so index values repeat
# across the four files; pass ignore_index=True if unique labels are needed.
df = pd.concat(records)

end_time = time.perf_counter()
duration = end_time - start_time
print("Took {} seconds to parse the hackernews submissions".format(duration))

df.head()

Took 2.7181010246276855 seconds to parse the hackernews submissions


Unnamed: 0,data,score
0,The PeopleWeb | Mark Pincus Blog (March 2005),3
1,Computer science and programming are two separ...,1
2,Don't Go It Alone: Create an Advisory Board,1
3,Wikileaks Secret Dreams,1
4,MakeMyTrip.com: Is eCommerce in India Finall...,1


In [6]:
# Report row count, per-column dtype / non-null count and memory use of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400000 entries, 0 to 99999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   data    400000 non-null  object
 1   score   400000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 9.2+ MB


In [7]:
# Summary statistics of the numeric columns (only `score` at this point).
df.describe()

Unnamed: 0,score
count,400000.0
mean,9.294667
std,37.853269
min,0.0
25%,1.0
50%,1.0
75%,3.0
max,2376.0


In [8]:
# Median score; the literal 1.0 used as the label threshold in the next cell equals this value.
df["score"].median()

1.0

In [9]:
# Binary label: True when a submission scored strictly above 1.0.
df["target"] = df["score"].gt(1.0)

In [10]:
# Confirm that the boolean `target` column was added next to `data` and `score`.
df.head()

Unnamed: 0,data,score,target
0,The PeopleWeb | Mark Pincus Blog (March 2005),3,True
1,Computer science and programming are two separ...,1,False
2,Don't Go It Alone: Create an Advisory Board,1,False
3,Wikileaks Secret Dreams,1,False
4,MakeMyTrip.com: Is eCommerce in India Finall...,1,False


In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# shuffle deterministic, so "Restart & Run All" reproduces the same split
# (and therefore comparable accuracy numbers) on every run.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [12]:
# Inspect the training split (pandas truncates the display of large frames).
train

Unnamed: 0,data,score,target
85111,The Best brooklyn real estate,1,False
85403,"Kim Dotcom Eligible for U.S. Extradition, N.Z....",2,True
40213,System Engineers are Software Developers,2,True
75495,How Your Privacy Policy Affects Sign-Ups – Sur...,1,False
92501,Zuta labs pocket printer,1,False
...,...,...,...
5488,Self-Programming,1,False
51866,I-Swarm Micro Robots are Up and Running,12,True
98648,Quadcopter Physics by Opera Woman.,1,False
78528,Home Chip Fab,2,True


In [13]:
# Inspect the held-out test split (pandas truncates the display of large frames).
test

Unnamed: 0,data,score,target
16043,Louis C.K. sees ticket scalping drop over 96% ...,330,True
33978,Foundations of Computer Science,437,True
95199,FUNNY VIDEOS Funny Cats Videos Funny Animals F...,1,False
15017,Cisco shares rise as profits strengthen – FT.com,1,False
61789,Dashboard Ads Coming to Tumblr,1,False
...,...,...,...
56296,Blockchain Company Align Commerce Closes $12.5...,1,False
1055,Enormous 160mph 'super typhoon' pictured from ISS,1,False
47497,I Still Don't Believe Wireless Is Safer,1,False
69119,Sony's debut Android tablet to launch in Septe...,1,False


In [14]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier

# Bag-of-words features from the titles feeding a linear SVM (hinge loss)
# trained with stochastic gradient descent.
pipeline = Pipeline([
    ("vect", CountVectorizer()),
    ("sgd", SGDClassifier(loss="hinge", penalty="l2",
                          alpha=0.001,
                          max_iter=10000, tol=1e-3,
                          warm_start=True,
                          # SGD shuffles the training data between epochs;
                          # pin the seed so the fit is reproducible.
                          random_state=42))])
# Pipeline.fit returns the pipeline itself, so `result` and `pipeline`
# name the same fitted object.
result = pipeline.fit(train.data, train.target)

predicted = result.predict(train.data)
print("Accuracy on the training set is {}".format(np.mean(predicted == train.target)))

Accuracy on the training set is 0.584765625


In [15]:
# Score the fitted pipeline on the held-out titles: fraction of predictions
# that match the true labels.
predicted = pipeline.predict(test.data)
test_accuracy = np.mean(predicted == test.target)
print("Accuracy on the test set is {}".format(test_accuracy))

Accuracy on the test set is 0.58085


In [16]:
# Spot-check the fitted pipeline on two hand-written headlines.
sample_titles = [
    "Iconic consoles of the IBM System/360 mainframes, 55 years old today",
    "Are Banned Drugs in Your Meat?",
]
pipeline.predict(sample_titles)

array([ True, False])

In [17]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

In [18]:
import os
import pickle

In [20]:
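# NOTE(review): disabled call — `tuner` is not defined in any visible cell;
# restore the tuning cell before re-enabling this line.
#tuner.get_results()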

In [21]:
# NOTE(review): the code below is disabled by wrapping it in a string literal,
# so the cell only evaluates to (and displays) that string.
# It references `all_trials`, which is not defined in any visible cell —
# presumably the result of a hyper-parameter search; confirm before re-enabling.
# Caution: pickle.load executes arbitrary code from the file, so only load
# model.pkl files this notebook produced itself.
'''
best_result_path = os.path.join(all_trials.get_best_logdir("mean_accuracy"), "model.pkl")
with open(best_result_path, "rb") as f:
    pipeline = pickle.load(f)
print("Best result was {}".format(np.mean(pipeline.predict(test.data) == test.target)))
print("Best result path is {}".format(best_result_path))
'''

'\nbest_result_path = os.path.join(all_trials.get_best_logdir("mean_accuracy"), "model.pkl")\nwith open(best_result_path, "rb") as f:\n    pipeline = pickle.load(f)\nprint("Best result was {}".format(np.mean(pipeline.predict(test.data) == test.target)))\nprint("Best result path is {}".format(best_result_path))\n'