In [None]:
import html
import re
import string

import pandas as pd
from tqdm import tqdm

# # Load the data
# df = pd.read_parquet("hacker_news_ml_ready.parquet")

# Drop rows with missing or empty titles
df = df[df['title'].notnull() & (df['title'].str.strip() != '')]
print(f"Remaining rows after filtering: {len(df)}")

Remaining rows after filtering: 4920775


In [2]:
def fast_clean_and_tokenize(text):
    if not isinstance(text, str):
        return None
    text = html.unescape(text)
    text = re.sub(r"&", "and", text)
    text = re.sub(r"\(.*?\)", "", text)
    text = re.sub(r"\[.*?\]", "", text)
    text = text.replace("-", " ")  # <-- NEW: break up compound words
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\s+", " ", text)
    tokens = text.lower().strip().split()
    tokens = [t for t in tokens if t.isalnum()]
    return tokens if tokens else None


In [3]:
df

Unnamed: 0,title,type,by,time,score,url,domain
0,"""What May Happen in the Next Hundred Years"", f...",story,ColinWright,2011-10-24 16:27:00,19,http://www.howtobearetronaut.com/wp-content/up...,howtobearetronaut.com
1,Getting Started with JavaScript Unit Testing,story,hncj,2012-01-23 11:39:25,1,http://blogs.lessthandot.com/index.php/WebDev/...,blogs.lessthandot.com
2,"Armstrong, the Django-based and open-source ne...",story,andymboyle,2011-10-24 16:27:36,2,http://www.marketwatch.com/story/the-bay-citiz...,marketwatch.com
3,Why Web Reviewers Make Up Bad Things,story,digisth,2013-07-16 05:16:26,1,http://bits.blogs.nytimes.com/2013/07/15/why-w...,bits.blogs.nytimes.com
4,You Weren't Meant to Have a Boss: The Cliff Notes,story,jazzdev,2008-03-30 09:46:25,1,http://paulgraham.com/bossnotes.html,paulgraham.com
...,...,...,...,...,...,...,...
5148742,Northeast Blackout of 2003,story,sandwichsphinx,2024-10-13 23:41:46,1,https://en.wikipedia.org/wiki/Northeast_blacko...,en.wikipedia.org
5148743,PieChartMaster- Unlock your Pie/Rose chart cre...,story,emperinter,2024-10-13 23:42:21,1,https://apps.apple.com/us/app/piechartmaster-u...,apps.apple.com
5148744,The Greatest Checkmate Ever Given,story,mellosouls,2024-10-13 23:45:21,1,https://www.youtube.com/watch?v=UULlFap1Zko,youtube.com
5148745,"Scrapling: Fast, Adaptive Web Scraping for Python",story,d4vinci,2024-10-13 23:49:42,1,https://github.com/D4Vinci/Scrapling,github.com


In [4]:
from tqdm import tqdm
tqdm.pandas(desc="Tokenising titles")

df["tokens"] = df["title"].progress_apply(fast_clean_and_tokenize)

# Drop rows where tokenisation failed (too short or bad input)
df = df[df["tokens"].notnull()]


Tokenising titles:   0%|          | 0/4920775 [00:00<?, ?it/s]

Tokenising titles: 100%|██████████| 4920775/4920775 [01:01<00:00, 80311.53it/s] 


In [5]:
df['tokens'][2]

['armstrong',
 'the',
 'django',
 'based',
 'and',
 'open',
 'source',
 'news',
 'cms',
 'is',
 'now',
 'released']

Unnamed: 0,title,type,by,time,score,url,domain,tokens
0,"""What May Happen in the Next Hundred Years"", f...",story,ColinWright,2011-10-24 16:27:00,19,http://www.howtobearetronaut.com/wp-content/up...,howtobearetronaut.com,"[what, may, happen, in, the, next, hundred, ye..."
1,Getting Started with JavaScript Unit Testing,story,hncj,2012-01-23 11:39:25,1,http://blogs.lessthandot.com/index.php/WebDev/...,blogs.lessthandot.com,"[getting, started, with, javascript, unit, tes..."
2,"Armstrong, the Django-based and open-source ne...",story,andymboyle,2011-10-24 16:27:36,2,http://www.marketwatch.com/story/the-bay-citiz...,marketwatch.com,"[armstrong, the, django, based, and, open, sou..."
3,Why Web Reviewers Make Up Bad Things,story,digisth,2013-07-16 05:16:26,1,http://bits.blogs.nytimes.com/2013/07/15/why-w...,bits.blogs.nytimes.com,"[why, web, reviewers, make, up, bad, things]"
4,You Weren't Meant to Have a Boss: The Cliff Notes,story,jazzdev,2008-03-30 09:46:25,1,http://paulgraham.com/bossnotes.html,paulgraham.com,"[you, werent, meant, to, have, a, boss, the, c..."
...,...,...,...,...,...,...,...,...
5148742,Northeast Blackout of 2003,story,sandwichsphinx,2024-10-13 23:41:46,1,https://en.wikipedia.org/wiki/Northeast_blacko...,en.wikipedia.org,"[northeast, blackout, of, 2003]"
5148743,PieChartMaster- Unlock your Pie/Rose chart cre...,story,emperinter,2024-10-13 23:42:21,1,https://apps.apple.com/us/app/piechartmaster-u...,apps.apple.com,"[piechartmaster, unlock, your, pierose, chart,..."
5148744,The Greatest Checkmate Ever Given,story,mellosouls,2024-10-13 23:45:21,1,https://www.youtube.com/watch?v=UULlFap1Zko,youtube.com,"[the, greatest, checkmate, ever, given]"
5148745,"Scrapling: Fast, Adaptive Web Scraping for Python",story,d4vinci,2024-10-13 23:49:42,1,https://github.com/D4Vinci/Scrapling,github.com,"[scrapling, fast, adaptive, web, scraping, for..."


In [8]:
df["tokens"].explode().value_counts().head(100)  # most common tokens

tokens
the          1100606
to            923113
a             737235
of            660627
in            590187
              ...   
social         35095
microsoft      34613
who            34094
could          34004
go             33887
Name: count, Length: 100, dtype: int64

In [9]:
df.to_parquet("hn_posts_tokenised.parquet", engine='pyarrow')

In [3]:
# Load the data
df = pd.read_parquet("hn_posts_tokenised.parquet")

In [4]:
import pickle

with open("user_lookup.pkl", "rb") as f:
    user_lookup = pickle.load(f)


In [5]:
user_lookup["pg"]

{'created': datetime.datetime(2006, 10, 9, 18, 21, 32),
 'karma': 157316,
 'submitted_count': 15565}

In [6]:
def get_user_features(username, post_time):
    user = user_lookup.get(username)
    if not user:
        return None, None, None
    age_at_post = (post_time - user["created"]).total_seconds()
    return user["karma"], user["submitted_count"], age_at_post

In [7]:
user_df = pd.DataFrame.from_dict(user_lookup, orient='index')
user_df.index.name = 'by'
user_df.reset_index(inplace=True)

In [8]:
# Convert 'created' to datetime if it’s not already
if not pd.api.types.is_datetime64_any_dtype(user_df['created']):
    user_df['created'] = pd.to_datetime(user_df['created'])

# Convert post times in your main df
if not pd.api.types.is_datetime64_any_dtype(df['time']):
    df['time'] = pd.to_datetime(df['time'], unit='s')

In [9]:
df = df.merge(user_df, on='by', how='left')

In [10]:
df['user_age_at_post'] = (df['time'] - df['created']).dt.total_seconds()

In [11]:
df

Unnamed: 0,title,type,by,time,score,url,domain,tokens,created,karma,submitted_count,user_age_at_post
0,"""What May Happen in the Next Hundred Years"", f...",story,ColinWright,2011-10-24 16:27:00,19,http://www.howtobearetronaut.com/wp-content/up...,howtobearetronaut.com,"[what, may, happen, in, the, next, hundred, ye...",2009-12-14 16:39:30,127765.0,17656.0,58664850.0
1,Getting Started with JavaScript Unit Testing,story,hncj,2012-01-23 11:39:25,1,http://blogs.lessthandot.com/index.php/WebDev/...,blogs.lessthandot.com,"[getting, started, with, javascript, unit, tes...",2011-10-29 16:27:52,44.0,26.0,7413093.0
2,"Armstrong, the Django-based and open-source ne...",story,andymboyle,2011-10-24 16:27:36,2,http://www.marketwatch.com/story/the-bay-citiz...,marketwatch.com,"[armstrong, the, django, based, and, open, sou...",2011-05-04 14:29:09,714.0,54.0,14954307.0
3,Why Web Reviewers Make Up Bad Things,story,digisth,2013-07-16 05:16:26,1,http://bits.blogs.nytimes.com/2013/07/15/why-w...,bits.blogs.nytimes.com,"[why, web, reviewers, make, up, bad, things]",2011-01-06 10:45:47,2395.0,532.0,79641039.0
4,You Weren't Meant to Have a Boss: The Cliff Notes,story,jazzdev,2008-03-30 09:46:25,1,http://paulgraham.com/bossnotes.html,paulgraham.com,"[you, werent, meant, to, have, a, boss, the, c...",2007-09-26 09:28:50,543.0,242.0,16071455.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4919137,Northeast Blackout of 2003,story,sandwichsphinx,2024-10-13 23:41:46,1,https://en.wikipedia.org/wiki/Northeast_blacko...,en.wikipedia.org,"[northeast, blackout, of, 2003]",2024-07-10 22:37:37,14.0,22.0,8211849.0
4919138,PieChartMaster- Unlock your Pie/Rose chart cre...,story,emperinter,2024-10-13 23:42:21,1,https://apps.apple.com/us/app/piechartmaster-u...,apps.apple.com,"[piechartmaster, unlock, your, pierose, chart,...",2024-06-30 15:12:24,7.0,35.0,9102597.0
4919139,The Greatest Checkmate Ever Given,story,mellosouls,2024-10-13 23:45:21,1,https://www.youtube.com/watch?v=UULlFap1Zko,youtube.com,"[the, greatest, checkmate, ever, given]",2019-04-12 07:34:13,14490.0,3341.0,173808668.0
4919140,"Scrapling: Fast, Adaptive Web Scraping for Python",story,d4vinci,2024-10-13 23:49:42,1,https://github.com/D4Vinci/Scrapling,github.com,"[scrapling, fast, adaptive, web, scraping, for...",2024-10-13 23:48:49,1.0,2.0,53.0


In [12]:
df.to_parquet("hn_with_user_stats.parquet", engine='pyarrow')

In [20]:
feature_df = df[["time", "tokens", "domain", "user_age_at_post", "karma", "submitted_count", "score"]]

In [21]:
feature_df

Unnamed: 0,time,tokens,domain,user_age_at_post,karma,submitted_count,score
0,2011-10-24 16:27:00,"[what, may, happen, in, the, next, hundred, ye...",howtobearetronaut.com,58664850.0,127765.0,17656.0,19
1,2012-01-23 11:39:25,"[getting, started, with, javascript, unit, tes...",blogs.lessthandot.com,7413093.0,44.0,26.0,1
2,2011-10-24 16:27:36,"[armstrong, the, django, based, and, open, sou...",marketwatch.com,14954307.0,714.0,54.0,2
3,2013-07-16 05:16:26,"[why, web, reviewers, make, up, bad, things]",bits.blogs.nytimes.com,79641039.0,2395.0,532.0,1
4,2008-03-30 09:46:25,"[you, werent, meant, to, have, a, boss, the, c...",paulgraham.com,16071455.0,543.0,242.0,1
...,...,...,...,...,...,...,...
4919137,2024-10-13 23:41:46,"[northeast, blackout, of, 2003]",en.wikipedia.org,8211849.0,14.0,22.0,1
4919138,2024-10-13 23:42:21,"[piechartmaster, unlock, your, pierose, chart,...",apps.apple.com,9102597.0,7.0,35.0,1
4919139,2024-10-13 23:45:21,"[the, greatest, checkmate, ever, given]",youtube.com,173808668.0,14490.0,3341.0,1
4919140,2024-10-13 23:49:42,"[scrapling, fast, adaptive, web, scraping, for...",github.com,53.0,1.0,2.0,1


In [22]:
feature_df.to_parquet("feature_df.parquet", engine='pyarrow')