# K-Means Upvote Clustering Lab

Explore Hacker News post engagement clusters using the processed dataset. This notebook mirrors the CLI script (notebooks/kmeans_lab.py) by loading data, fitting K-Means, labelling clusters, and evaluating how well the clusters surface high-upvote behaviour.

In [None]:
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import List, Tuple

import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [None]:
# Experiment settings
# Clustering / split hyper-parameters.
N_CLUSTERS = 4
TEST_SIZE = 0.1  # passed as `test_size` to train_test_split (hold-out fraction)
RANDOM_STATE = 42  # shared seed for the split and for K-Means

# Input dataset and report locations (relative paths).
DATASET_PATH = Path('data/processed/combined_dataset.csv')
REPORTS_DIR = Path('reports')
OUTPUT_CSV = REPORTS_DIR.joinpath('kmeans_holdout_comparison.csv')


In [None]:
@dataclass
class ClusterSummary:
    """Per-cluster statistics of the `points` column plus a category label.

    Instances are produced by `label_clusters` (one per cluster) and are
    looked up by `cluster` id in `summarize_holdout`.
    """

    # Cluster id as assigned by the clustering step.
    cluster: int
    # Mean of `points` over the rows assigned to this cluster.
    mean_points: float
    # Median of `points` over the rows assigned to this cluster.
    median_points: float
    # Number of rows assigned to this cluster.
    count: int
    # One of 'High-Upvote Cluster', 'Mid-Upvote Cluster', 'Low-Upvote Cluster'.
    category: str


def load_dataset(path: Path) -> pd.DataFrame:
    """Read the processed CSV and derive the clustering features.

    Rows missing any of `title`, `comments_count`, `points` or `method` are
    dropped. Four columns are then added:

    * `title_length` - character length of `title`.
    * `top_comment_length` - character length of `top_comment_text`
      (missing comments count as length 0).
    * `author_encoded` - `author` with missing values replaced by 'unknown'.
    * `has_comment` - 1 when the top comment is non-empty, else 0.

    Args:
        path: Location of the CSV file.

    Returns:
        The filtered frame with the derived columns appended.
    """
    required = ['title', 'comments_count', 'points', 'method']
    frame = pd.read_csv(path).dropna(subset=required)

    # Computed once; both comment-derived features are based on it.
    comment_lengths = frame['top_comment_text'].fillna('').str.len()

    return frame.assign(
        title_length=frame['title'].str.len(),
        top_comment_length=comment_lengths,
        author_encoded=frame['author'].fillna('unknown'),
        has_comment=(comment_lengths > 0).astype(int),
    )


def build_pipeline(n_clusters: int, random_state: int) -> Pipeline:
    """Assemble the preprocessing + K-Means pipeline.

    The numeric columns go through `StandardScaler`; the categorical columns
    go through a dense `OneHotEncoder` configured with
    `handle_unknown='ignore'`. The transformed matrix feeds `KMeans`.

    Args:
        n_clusters: Number of clusters for K-Means.
        random_state: Seed forwarded to K-Means.

    Returns:
        An unfitted `Pipeline` with steps named 'preprocess' and 'cluster'.
    """
    scaled_columns = [
        'comments_count',
        'title_length',
        'top_comment_length',
        'has_comment',
    ]
    encoded_columns = ['method', 'author_encoded']

    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    column_prep = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), scaled_columns),
            ('cat', encoder, encoded_columns),
        ]
    )
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init='auto')

    return Pipeline(steps=[('preprocess', column_prep), ('cluster', kmeans)])


def label_clusters(train_df: pd.DataFrame, labels: np.ndarray) -> List[ClusterSummary]:
    """Summarise `points` per cluster and tag each cluster High/Mid/Low.

    Clusters are ranked by their mean `points`: a cluster whose mean is at or
    above the 0.66 quantile of the cluster means is 'High-Upvote Cluster', one
    at or below the 0.33 quantile is 'Low-Upvote Cluster', and anything in
    between is 'Mid-Upvote Cluster'.

    Args:
        train_df: Training rows; must contain a `points` column.
        labels: Cluster id for each row of `train_df`, in row order.

    Returns:
        One `ClusterSummary` per cluster, in ascending cluster-id order.
    """
    points_by_cluster = train_df.assign(cluster=labels).groupby('cluster')['points']
    stats = points_by_cluster.agg(['mean', 'median', 'count']).reset_index()

    cluster_means = stats['mean']
    upper = cluster_means.quantile(0.66)
    lower = cluster_means.quantile(0.33)

    def categorize(mean_val: float) -> str:
        # The high check runs first, so it wins when both thresholds coincide.
        if mean_val >= upper:
            return 'High-Upvote Cluster'
        return 'Low-Upvote Cluster' if mean_val <= lower else 'Mid-Upvote Cluster'

    return [
        ClusterSummary(
            cluster=int(cluster_id),
            mean_points=float(mean_val),
            median_points=float(median_val),
            count=int(size),
            category=categorize(mean_val),
        )
        for cluster_id, mean_val, median_val, size in zip(
            stats['cluster'], stats['mean'], stats['median'], stats['count']
        )
    ]


def summarize_holdout(
    holdout_df: pd.DataFrame,
    holdout_labels: np.ndarray,
    summaries: List[ClusterSummary],
) -> Tuple[pd.DataFrame, float]:
    """Attach cluster information to hold-out rows and score the labelling.

    A row counts as "actually high" when its `points` is at or above the
    median `points` of the hold-out set itself, and as "predicted high" when
    its cluster is categorised 'High-Upvote Cluster'. The returned score is
    the accuracy between those two binary vectors.

    Args:
        holdout_df: Hold-out rows; must contain a `points` column.
        holdout_labels: Predicted cluster id per hold-out row, in row order.
        summaries: Cluster summaries, looked up by cluster id.

    Returns:
        A copy of `holdout_df` with `cluster`, `cluster_mean_points` and
        `cluster_category` columns added, and the agreement accuracy.

    Raises:
        KeyError: If a hold-out label has no matching summary.
    """
    by_cluster = {}
    for summary in summaries:
        by_cluster[summary.cluster] = summary

    def attribute_of(field_name):
        # Direct indexing keeps the KeyError for an unknown cluster id.
        return lambda cluster_id: getattr(by_cluster[cluster_id], field_name)

    annotated = holdout_df.copy()
    annotated['cluster'] = holdout_labels
    cluster_ids = annotated['cluster']
    annotated['cluster_mean_points'] = cluster_ids.map(attribute_of('mean_points'))
    annotated['cluster_category'] = cluster_ids.map(attribute_of('category'))

    # The median is taken over the hold-out rows, not the training set.
    median_points = holdout_df['points'].median()
    truth = (holdout_df['points'] >= median_points).astype(int)
    guess = (annotated['cluster_category'] == 'High-Upvote Cluster').astype(int)

    return annotated, accuracy_score(truth, guess)


In [None]:
# Columns handed to the pipeline (numeric first, then categorical).
features = [
    'comments_count', 'title_length', 'top_comment_length', 'has_comment',
    'method', 'author_encoded',
]

df = load_dataset(DATASET_PATH)

# Stratify on `method` so both splits keep the same method mix.
train_df, holdout_df = train_test_split(
    df, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=df['method']
)

# Fit on the training split only; `fit` returns the pipeline itself.
pipeline = build_pipeline(N_CLUSTERS, RANDOM_STATE).fit(train_df[features])

# Training-set assignments come straight from the fitted K-Means step.
cluster_labels = pipeline.named_steps['cluster'].labels_
summaries = label_clusters(train_df.reset_index(drop=True), cluster_labels)

# Assign hold-out rows to the learned clusters and score the labelling.
holdout_labels = pipeline.predict(holdout_df[features])
holdout_results, accuracy = summarize_holdout(
    holdout_df.reset_index(drop=True), holdout_labels, summaries
)

# One row per cluster, highest mean points first.
summary_df = pd.DataFrame([vars(s) for s in summaries])
summary_df = summary_df.sort_values('mean_points', ascending=False)
summary_df = summary_df.reset_index(drop=True)

preview_columns = [
    'post_id',
    'title',
    'points',
    'comments_count',
    'method',
    'cluster',
    'cluster_category',
    'cluster_mean_points',
]

display(summary_df)
display(holdout_results[preview_columns].head())
print(f'Holdout high-upvote agreement accuracy: {accuracy:.2%}')


In [None]:
# Persist the per-post hold-out comparison next to the other reports.
export_columns = [
    'post_id',
    'title',
    'points',
    'comments_count',
    'method',
    'cluster',
    'cluster_category',
    'cluster_mean_points',
]

# Create the report directory on first run; a no-op when it already exists.
OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
holdout_results.loc[:, export_columns].to_csv(OUTPUT_CSV, index=False)
print(f'Detailed results saved to {OUTPUT_CSV}')
