
# Hacker News Engagement Analysis

This notebook explores the combined dataset collected via Selenium, BeautifulSoup, and the Hacker News API. We perform exploratory analysis and build predictive models to understand which headline signals correlate with high engagement.


In [None]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# Read the combined scrape output; the path is relative to the notebook's
# working directory.
data_path = Path('data/processed/combined_dataset.csv')
df = pd.read_csv(data_path)

# Apply one seaborn theme up front so every later figure shares it.
sns.set_theme(style='whitegrid')

print(f"Loaded {len(df)} records from {data_path}")
# Trailing expression: rich display of the first rows.
df.head()


In [None]:

# Summary statistics for every column (numeric and non-numeric alike);
# the trailing expression is what the cell displays.
summary_stats = df.describe(include='all')
summary_stats


In [None]:

# Side-by-side boxplots of the two engagement measures, grouped by the
# `method` column.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (column to plot, panel title, y-axis label) for each panel, left to right.
panel_specs = [
    ('points', 'Points Distribution by Method', 'Points'),
    ('comments_count', 'Comment Count Distribution by Method', 'Comments'),
]

for panel_ax, (value_col, panel_title, y_label) in zip(axes, panel_specs):
    sns.boxplot(data=df, x='method', y=value_col, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Scraping Method')
    panel_ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()



## Text Features and Binary Engagement Classification

We label posts whose score (`points`) is at or above the 75th percentile as **high engagement** and train a
regularized logistic regression model on unigram and bigram TF-IDF features over the headlines.


In [None]:

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# Binary target: 1 when a post's points are at or above the 75th percentile,
# 0 otherwise.
threshold = df['points'].quantile(0.75)
df['high_engagement'] = df['points'].ge(threshold).astype(int)

# Stratify on the label so both splits keep the same class balance.
engagement_labels = df['high_engagement']
X_train, X_test, y_train, y_test = train_test_split(
    df['title'],
    engagement_labels,
    test_size=0.25,
    random_state=42,
    stratify=engagement_labels,
)

# Headline text -> unigram/bigram TF-IDF -> class-weighted logistic regression.
tfidf_step = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), min_df=2)
logreg_step = LogisticRegression(max_iter=200, class_weight='balanced')
clf = Pipeline([
    ('tfidf', tfidf_step),
    ('log_reg', logreg_step),
])
clf.fit(X_train, y_train)

# Hard predictions feed the report; positive-class probabilities feed ROC-AUC.
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print(f"ROC-AUC: {roc_auc_score(y_test, y_proba):.3f}")


In [None]:

# Inspect influential keywords
import numpy as np

# Pull the fitted steps back out of the classification pipeline so the learned
# weights can be matched to the n-grams they belong to.
vectorizer = clf.named_steps['tfidf']
log_reg = clf.named_steps['log_reg']
feature_names = vectorizer.get_feature_names_out()
# First (and, for a binary target, only) row of the coefficient matrix.
coefs = log_reg.coef_[0]

# Sort once (ascending) and slice both ends: the last 15 entries reversed are
# the largest coefficients, the first 15 are the most negative.
coef_order = np.argsort(coefs)
top_positive_idx = coef_order[-15:][::-1]
top_negative_idx = coef_order[:15]

print('Top positive signals (more likely high engagement):')
for idx in top_positive_idx:
    print(f"  {feature_names[idx]:<20} {coefs[idx]:.3f}")

# The leading newline must be an escape sequence: a single-quoted string
# cannot span physical lines, so a literal line break here is a SyntaxError.
print('\nTop negative signals (less likely high engagement):')
for idx in top_negative_idx:
    print(f"  {feature_names[idx]:<20} {coefs[idx]:.3f}")



## Regression: Predict Exact Engagement

We also fit a gradient boosting regressor to predict the exact point total from
headline TF-IDF features combined with simple tabular features (title length, a question-mark flag, and the scraping method).


In [None]:

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Work on a copy so the engineered columns are added to a fresh frame.
df = df.copy()
df['title_length'] = df['title'].str.len()
# regex=False: '?' is matched literally rather than as a regex quantifier.
df['question_mark'] = df['title'].str.contains('?', regex=False).astype(int)

X = df[['title', 'title_length', 'question_mark', 'method']]
y = df['points']

# Per-column preprocessing:
#   title                        -> TF-IDF (capped at 1000 terms)
#   title_length, question_mark  -> standardized
#   method                       -> one-hot encoded
# The 'onehot' step previously used 'passthrough', which handed the raw
# `method` values to the regressor unencoded. `handle_unknown='ignore'` keeps
# prediction from failing if a category appears only in the test split.
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=1000, stop_words='english'), 'title'),
        ('numeric', StandardScaler(), ['title_length', 'question_mark']),
        ('onehot', OneHotEncoder(handle_unknown='ignore'), ['method']),
    ],
    remainder='drop'
)

reg_model = Pipeline([
    ('prep', preprocessor),
    ('reg', GradientBoostingRegressor(random_state=42))
])

# NOTE: this rebinds X_train/X_test/y_train/y_test from the classification
# cell; from here on they refer to the regression split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
reg_model.fit(X_train, y_train)

preds = reg_model.predict(X_test)
print(f"MAE: {mean_absolute_error(y_test, preds):.2f}")
print(f"R^2: {r2_score(y_test, preds):.3f}")



## Takeaways

- API scraping provides the leanest bandwidth usage but omits rendered comment text, making it ideal for incremental polling.
- Selenium captures interaction-heavy context but at significantly higher latency.
- Headline phrasing containing terms like *Ask HN*, *Show HN*, or incident language tends to correlate with higher engagement.
