# StackOverflow Score Prediction – Advanced Models (Notebook 2)

This companion notebook extends the baseline TF-IDF experiments by layering more expressive regressors, dimensionality reduction, neural nets, document embeddings, and light ensembling while keeping the original preprocessing and data splits intact.

## Load Data & Reuse Preprocessing

In [1]:
from __future__ import annotations

from pathlib import Path
from typing import Callable, Dict, List

import gensim.downloader as gensim_api
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from nltk import download as nltk_download
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, SGDRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR

import sys

# Locate the project root: the working directory when it contains `data/`,
# otherwise its parent directory.
PROJECT_ROOT = Path.cwd().resolve()
if not (PROJECT_ROOT / "data").exists():
    PROJECT_ROOT = PROJECT_ROOT.parent
NOTEBOOKS_DIR = PROJECT_ROOT / "notebooks"
# Iterate a tuple, not a set: set iteration order follows the (per-process randomised)
# path hashes, which made the relative order of these sys.path entries vary between runs.
for path in (PROJECT_ROOT, NOTEBOOKS_DIR):
    path_str = str(path)
    if path_str not in sys.path:
        sys.path.append(path_str)

from notebooks.algorithms.text_utils import (
    preprocess_text,
    tokens_to_text,
)

# Notebook-wide constants and display options.
RANDOM_STATE = 42
DATA_PATH = PROJECT_ROOT.joinpath("data", "stackexchange_dataset.csv")
pd.set_option("display.max_colwidth", 120)
plt.style.use("seaborn-v0_8")
print(f"Using data file: {DATA_PATH}")
from time import perf_counter


Using data file: C:\Users\tomasz.makowski.2\Desktop\SemesterII\AdvancedDataMining\adm-stack-inference\data\stackexchange_dataset.csv


In [2]:
# Load the raw export, then derive the modelling frame in one chain so that re-running
# this cell always starts again from `raw_df`.
raw_df = pd.read_csv(DATA_PATH)
print(f"Raw rows: {len(raw_df):,}")

df = (
    raw_df.rename(columns={"question_text": "body", "question_score": "score"})[
        ["question_id", "title", "body", "score", "num_tags", "tags"]
    ]
    .dropna(subset=["title", "body", "score"])
    # Non-numeric scores become NaN here and are removed by the second dropna.
    .assign(score=lambda frame: pd.to_numeric(frame["score"], errors="coerce"))
    .drop_duplicates()
    .dropna(subset=["score"])
    # title/body cannot contain NaN any more (dropped above), so only the cast is needed.
    .assign(
        score=lambda frame: frame["score"].astype(float),
        title=lambda frame: frame["title"].astype(str),
        body=lambda frame: frame["body"].astype(str),
    )
    .assign(
        full_text=lambda frame: (
            frame["title"].str.strip() + " " + frame["body"].str.strip()
        ).str.strip()
    )
    .loc[lambda frame: frame["full_text"].str.len() > 0]
    # Later cells add columns to `df`; take an explicit copy so that assignment never
    # targets a filtered slice of an intermediate frame.
    .copy()
)
print(f"Filtered rows: {len(df):,}")
df.head(3)

Raw rows: 100,000
Filtered rows: 99,992


Unnamed: 0,question_id,title,body,score,num_tags,tags,full_text
0,79802517,Looking for a better way using &quot;.Include&quot; in EF,I am looking for a better way to use the .Include clause of Entity Framework. I want to avoid duplicate code. I have...,2.0,2,"['c#', 'entity-framework']",Looking for a better way using &quot;.Include&quot; in EF I am looking for a better way to use the .Include clause o...
1,79802934,"NTP is moving my clock further from the correct time, rather than closer","Windows 11 Pro 10.0.26200 Build 26200. Dell XPS 8940. When I first started noticing it, a few weeks ago, my clock wa...",0.0,1,['ntp'],"NTP is moving my clock further from the correct time, rather than closer Windows 11 Pro 10.0.26200 Build 26200. Dell..."
2,79802909,Execution of pandas&#39; info in python,"I am new to pandas library in python. When I loaded a file and was printing the output of df.info into console, the ...",0.0,2,"['python', 'pandas']",Execution of pandas&#39; info in python I am new to pandas library in python. When I loaded a file and was printing ...


In [3]:
nltk_download('stopwords', quiet=True)
stopword_list = stopwords.words('english')
# Instantiated but not passed on: preprocess_text is called with stemmer=None below.
stemmer = PorterStemmer()

# One (normalized text, cleaned text) pair per document, in row order.
processed_pairs = [
    (normalized, tokens_to_text(tokens))
    for normalized, tokens in (
        preprocess_text(text, stopword_list, stemmer=None) for text in df["full_text"]
    )
]
normalized_texts: List[str] = [pair[0] for pair in processed_pairs]
clean_texts: List[str] = [pair[1] for pair in processed_pairs]

df["normalized_text"] = normalized_texts
df["clean_text"] = clean_texts
print(df[["score", "clean_text"]].head(3))

   score  \
0    2.0   
1    0.0   
2    0.0   

                                                                                                                clean_text  
0  looking better way using quot include quot ef looking better way use include clause entity framework want avoid dupl...  
1  ntp moving clock correct time rather closer windows 11 pro 10 0 26200 build 26200 dell xps 8940 first started notici...  
2  execution pandas #39 info python new pandas library python loaded file printing output df info console data getting ...  


In [4]:
# 70 / 15 / 15 split: hold out 30 % of the rows, then halve the hold-out into
# validation and test. Both calls use the same seed.
train_df, temp_df = train_test_split(df, random_state=RANDOM_STATE, test_size=0.30)
val_df, test_df = train_test_split(temp_df, random_state=RANDOM_STATE, test_size=0.50)
train_df, val_df, test_df = (
    part.reset_index(drop=True) for part in (train_df, val_df, test_df)
)
# Train + validation rows, used for the final refit before test evaluation.
train_val_df = pd.concat([train_df, val_df], axis=0, ignore_index=True)

y_train, y_val, y_test, y_train_val = (
    part["score"].values for part in (train_df, val_df, test_df, train_val_df)
)

print(f"Split sizes: train={len(train_df):,}, val={len(val_df):,}, test={len(test_df):,}")

Split sizes: train=69,994, val=14,999, test=14,999


## TF-IDF Vectorizer (Reused Settings)

In [5]:
# The vocabulary and IDF weights are learned on the training split only; every other
# split (including train+val) is transformed with that fitted vectorizer.
vectorizer = TfidfVectorizer(min_df=5, max_df=0.85, ngram_range=(1, 1))
tfidf_splits = {"train": vectorizer.fit_transform(train_df["clean_text"])}
for split_name, split_frame in (("val", val_df), ("test", test_df), ("train_val", train_val_df)):
    tfidf_splits[split_name] = vectorizer.transform(split_frame["clean_text"])

X_train_tfidf = tfidf_splits["train"]
X_val_tfidf = tfidf_splits["val"]
X_test_tfidf = tfidf_splits["test"]
X_train_val_tfidf = tfidf_splits["train_val"]

# feature_store maps feature-set name -> split name -> feature matrix.
feature_store: Dict[str, Dict[str, object]] = {"tfidf": tfidf_splits}

print("TF-IDF shapes:", {k: v.shape for k, v in feature_store["tfidf"].items()})

TF-IDF shapes: {'train': (69994, 42114), 'val': (14999, 42114), 'test': (14999, 42114), 'train_val': (84993, 42114)}


## Target Variants

In [6]:
# Raw scores are floored at this value before log1p so its argument stays positive.
# Every score at or below the floor therefore maps to log1p(-0.999) ≈ -6.91.
LOG_SCORE_CLIP_MIN = -0.999
# Upper bound that inverse_target applies to log-scale predictions.
LOG_PRED_CLIP = 6.0

raw_targets = {
    "train": y_train,
    "val": y_val,
    "test": y_test,
    "train_val": y_train_val,
}
log_targets = {
    split: np.log1p(np.clip(values, a_min=LOG_SCORE_CLIP_MIN, a_max=None))
    for split, values in raw_targets.items()
}
y_log_train = log_targets["train"]
y_log_val = log_targets["val"]
y_log_test = log_targets["test"]
y_log_train_val = log_targets["train_val"]

# target_store maps target name -> split name -> target vector.
target_store: Dict[str, Dict[str, np.ndarray]] = {"raw": raw_targets, "log": log_targets}
print("Raw target range:", (y_train.min(), y_train.max()))
print("Log target range:", (y_log_train.min(), y_log_train.max()))

Raw target range: (np.float64(-20.0), np.float64(27126.0))
Log target range: (np.float64(-6.907755278982136), np.float64(10.20828482084353))


## Helper Functions & Tracking

In [7]:
def regression_metrics(y_true, y_pred):
    """Return the MSE, RMSE, MAE and R2 of `y_pred` against `y_true` as a dict."""
    squared_error = mean_squared_error(y_true, y_pred)
    return {
        "MSE": squared_error,
        "RMSE": np.sqrt(squared_error),
        "MAE": mean_absolute_error(y_true, y_pred),
        "R2": r2_score(y_true, y_pred),
    }

# Notebook-level registries, reset whenever this cell is run.
results: List[Dict[str, object]] = list()  # one row of validation metrics per model
model_registry: Dict[str, Dict[str, object]] = dict()  # model name -> how to rebuild it
val_predictions_store: Dict[str, np.ndarray] = dict()  # model name -> validation predictions

# Human-readable label for each target key, used in the results tables.
TARGET_DESCRIPTIONS = dict(raw="raw score", log="log1p(score)")

def inverse_target(key: str, preds: np.ndarray) -> np.ndarray:
    """Map predictions made on target scale `key` back to raw score units.

    Args:
        key: Target variant the model was trained on; "log" means log1p-transformed
            scores, any other value is treated as already being on the raw scale.
        preds: Model predictions on that scale.

    Returns:
        Predictions in raw score units. For "log" the predictions are clipped to
        [log1p(LOG_SCORE_CLIP_MIN), LOG_PRED_CLIP] and passed through expm1; otherwise
        `preds` is returned unchanged.
    """
    if key == "log":
        # LOG_SCORE_CLIP_MIN is a floor on *raw* scores; the matching floor on the log
        # scale is log1p of it. Clipping at the raw constant itself made the inverse
        # unable to reproduce targets below expm1(LOG_SCORE_CLIP_MIN).
        log_floor = np.log1p(LOG_SCORE_CLIP_MIN)
        clipped = np.clip(preds, a_min=log_floor, a_max=LOG_PRED_CLIP)
        return np.expm1(clipped)
    return preds

def register_result(name: str, feature_type: str, target_key: str, metrics: Dict[str, float], notes: str = "", training_time: float | None = None):
    entry = {
        "model_name": name,
        "feature_type": feature_type,
        "target_type": TARGET_DESCRIPTIONS[target_key],
        "val_MSE": metrics["MSE"],
        "val_RMSE": metrics["RMSE"],
        "val_MAE": metrics["MAE"],
        "val_R2": metrics["R2"],
        "training_time_sec": training_time,
        "notes": notes,
    }
    results.append(entry)

def evaluate_model(name: str, feature_key: str, target_key: str, builder: Callable[[], object], feature_type: str, notes: str = ""):
    """Fit a fresh model on the training split and record its validation performance.

    The model comes from `builder()`, is fitted on feature_store[feature_key]["train"]
    against target_store[target_key]["train"], and predicts the validation split.
    Predictions are mapped back to raw scores with inverse_target, non-finite values are
    replaced by 0.0, and metrics are computed against the raw `y_val`.

    Side effects: registers the metrics via register_result, stores the build spec in
    `model_registry`, stores the validation predictions in `val_predictions_store`, and
    prints a one-line summary. Only the `fit` call is timed.
    """
    estimator = builder()
    split_features = feature_store[feature_key]
    X_fit, X_eval = split_features["train"], split_features["val"]
    y_fit = target_store[target_key]["train"]

    fit_started = perf_counter()
    estimator.fit(X_fit, y_fit)
    training_time = perf_counter() - fit_started

    raw_scale_preds = inverse_target(target_key, estimator.predict(X_eval))
    val_preds = np.nan_to_num(raw_scale_preds, nan=0.0, posinf=0.0, neginf=0.0)
    metrics = regression_metrics(y_val, val_preds)

    register_result(name, feature_type, target_key, metrics, notes, training_time=training_time)
    # Keep the builder (not the fitted model) so the model can be refitted on other splits.
    model_registry[name] = dict(
        builder=builder,
        feature_key=feature_key,
        target_key=target_key,
        feature_type=feature_type,
        notes=notes,
    )
    val_predictions_store[name] = val_preds
    print(f"{name}: RMSE={metrics['RMSE']:.3f}, R2={metrics['R2']:.3f}, train_time={training_time:.2f}s")


## LinearSVR on TF-IDF

In [8]:
def build_linear_svr(C: float = 1.0):
    """Build an unfitted scaler + LinearSVR pipeline.

    The scaler uses with_mean=False (scaling without mean-centering); `C` is passed
    through to LinearSVR as its regularisation parameter.
    """
    scaler = StandardScaler(with_mean=False)
    regressor = LinearSVR(C=C, epsilon=0.1, tol=1e-4, max_iter=5000, random_state=RANDOM_STATE)
    return Pipeline([("scaler", scaler), ("svr", regressor)])

In [9]:
# Raw target: LinearSVR fitted directly on the untransformed scores.
evaluate_model(
    name="LinearSVR (raw)",
    builder=lambda: build_linear_svr(C=0.8),
    feature_key="tfidf",
    feature_type="TF-IDF (sparse)",
    target_key="raw",
    notes="LinearSVR predicting raw scores",
)

LinearSVR (raw): RMSE=158.358, R2=-0.046, train_time=138.36s




In [10]:
# Log target: same pipeline, fitted on log1p(score) and mapped back for scoring.
evaluate_model(
    name="LinearSVR (log)",
    builder=lambda: build_linear_svr(C=0.8),
    feature_key="tfidf",
    feature_type="TF-IDF (sparse)",
    target_key="log",
    notes="LinearSVR trained on log1p scores",
)

LinearSVR (log): RMSE=184.963, R2=-0.427, train_time=179.16s




## SGDRegressor with Huber Loss

In [11]:
def build_huber_sgd(alpha: float = 1e-4):
    """Build an unfitted scaler + SGDRegressor pipeline using the Huber loss.

    The scaler uses with_mean=False; `alpha` is the L2 penalty strength passed to
    SGDRegressor.
    """
    sgd_params = dict(
        loss="huber",
        epsilon=1.5,
        penalty="l2",
        alpha=alpha,
        learning_rate="optimal",
        max_iter=5000,
        tol=1e-3,
        random_state=RANDOM_STATE,
    )
    steps = [
        ("scaler", StandardScaler(with_mean=False)),
        ("sgd", SGDRegressor(**sgd_params)),
    ]
    return Pipeline(steps)

In [12]:
# Raw target: Huber-loss SGD fitted on the untransformed scores.
evaluate_model(
    name="SGD Huber (raw)",
    builder=lambda: build_huber_sgd(alpha=1e-4),
    feature_key="tfidf",
    feature_type="TF-IDF (sparse)",
    target_key="raw",
    notes="Huber regression with SGD on raw scores",
)

SGD Huber (raw): RMSE=157.950, R2=-0.041, train_time=28.66s


In [13]:
# Log target: Huber-loss SGD on log1p(score), with a smaller L2 penalty than the raw run.
evaluate_model(
    name="SGD Huber (log)",
    builder=lambda: build_huber_sgd(alpha=5e-5),
    feature_key="tfidf",
    feature_type="TF-IDF (sparse)",
    target_key="log",
    notes="Huber regression with SGD on log scores",
)

SGD Huber (log): RMSE=265.615, R2=-1.944, train_time=26.37s


## TF-IDF → TruncatedSVD → Regression

In [14]:
# The 200-component projection is fitted on the training TF-IDF matrix only and then
# applied to every other split.
svd = TruncatedSVD(n_components=200, random_state=RANDOM_STATE)
svd_splits = {"train": svd.fit_transform(X_train_tfidf)}
svd_splits.update(
    val=svd.transform(X_val_tfidf),
    test=svd.transform(X_test_tfidf),
    train_val=svd.transform(X_train_val_tfidf),
)

X_train_svd = svd_splits["train"]
X_val_svd = svd_splits["val"]
X_test_svd = svd_splits["test"]
X_train_val_svd = svd_splits["train_val"]

feature_store["svd"] = svd_splits
print("SVD shapes:", {k: v.shape for k, v in feature_store["svd"].items()})

SVD shapes: {'train': (69994, 200), 'val': (14999, 200), 'test': (14999, 200), 'train_val': (84993, 200)}


In [15]:
def build_svd_ridge(alpha: float = 2.0):
    """Build an unfitted StandardScaler + Ridge pipeline; `alpha` is the Ridge penalty."""
    steps = [
        ("scaler", StandardScaler()),
        ("ridge", Ridge(alpha=alpha, random_state=RANDOM_STATE)),
    ]
    return Pipeline(steps)

# Ridge on the dense SVD features, raw target.
evaluate_model(
    name="SVD + Ridge",
    builder=lambda: build_svd_ridge(alpha=2.0),
    feature_key="svd",
    feature_type="TF-IDF → 200-d SVD",
    target_key="raw",
    notes="Dense LSA features with Ridge",
)

SVD + Ridge: RMSE=152.392, R2=0.031, train_time=0.25s


In [16]:
# Random Forest
def build_rf():
    """Build an unfitted RandomForestRegressor with this notebook's fixed settings."""
    forest_params = dict(
        n_estimators=400,
        max_depth=14,
        min_samples_leaf=2,
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )
    return RandomForestRegressor(**forest_params)

# Random forest on the dense SVD features, raw target.
evaluate_model(
    name="SVD + RandomForest",
    builder=build_rf,
    feature_key="svd",
    feature_type="TF-IDF → 200-d SVD",
    target_key="raw",
    notes="Random forest on dense SVD vectors",
)

SVD + RandomForest: RMSE=154.977, R2=-0.002, train_time=1316.23s


In [17]:
# Gradient Boosting
def build_gbr():
    """Build an unfitted GradientBoostingRegressor with this notebook's fixed settings."""
    boosting_params = dict(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=3,
        subsample=0.9,
        random_state=RANDOM_STATE,
    )
    return GradientBoostingRegressor(**boosting_params)

# Gradient boosting on the dense SVD features, raw target.
evaluate_model(
    name="SVD + GradientBoosting",
    builder=build_gbr,
    feature_key="svd",
    feature_type="TF-IDF → 200-d SVD",
    target_key="raw",
    notes="Gradient boosting regressor on SVD features",
)

SVD + GradientBoosting: RMSE=156.003, R2=-0.015, train_time=2422.55s


## MLPRegressor on SVD Features

In [18]:
def build_svd_mlp(alpha: float = 1e-4):
    """Build an unfitted StandardScaler + two-hidden-layer MLPRegressor pipeline.

    `alpha` is passed to MLPRegressor as its regularisation term. Early stopping is
    enabled with a patience of 10 iterations.
    """
    mlp_params = dict(
        hidden_layer_sizes=(128, 64),
        activation="relu",
        alpha=alpha,
        batch_size=256,
        learning_rate_init=1e-3,
        max_iter=200,
        early_stopping=True,
        n_iter_no_change=10,
        random_state=RANDOM_STATE,
    )
    steps = [
        ("scaler", StandardScaler()),
        ("mlp", MLPRegressor(**mlp_params)),
    ]
    return Pipeline(steps)

# Raw target: MLP on the dense SVD features.
evaluate_model(
    name="SVD + MLP (raw)",
    builder=lambda: build_svd_mlp(alpha=1e-4),
    feature_key="svd",
    feature_type="TF-IDF → 200-d SVD",
    target_key="raw",
    notes="Two-layer MLP on SVD vectors",
)

SVD + MLP (raw): RMSE=150.232, R2=0.058, train_time=16.07s


In [19]:
# Log target: same MLP on log1p(score), with a smaller regularisation term.
evaluate_model(
    name="SVD + MLP (log)",
    builder=lambda: build_svd_mlp(alpha=5e-5),
    feature_key="svd",
    feature_type="TF-IDF → 200-d SVD",
    target_key="log",
    notes="Two-layer MLP predicting log scores",
)

SVD + MLP (log): RMSE=155.398, R2=-0.008, train_time=10.84s


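## Document Embeddings → Regression

*Placeholder: no embedding-based model is evaluated in the cells of this notebook (`gensim.downloader` is imported but not used).*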

## Simple Ensemble

In [20]:
ENSEMBLE_NAME = "Mean Ensemble"

# Make the cell safe to re-run: forget any ensemble row registered by an earlier run,
# so it is neither duplicated in `results` nor considered as a member of itself.
results[:] = [row for row in results if row["model_name"] != ENSEMBLE_NAME]

results_df = pd.DataFrame(results).sort_values('val_RMSE').reset_index(drop=True)
print(results_df.head())

# Only base models (registered with a builder) are eligible members. An ensemble that
# listed itself would make predict_with_model recurse without end.
is_base_model = results_df['model_name'].map(
    lambda model_name: "builder" in model_registry.get(model_name, {})
)
ensemble_members = results_df[is_base_model].nsmallest(3, 'val_RMSE')['model_name'].tolist()
print(f"Ensembling models: {ensemble_members}")

# NOTE(review): members are selected by validation RMSE and the ensemble is scored on the
# same validation split, so the validation figure reported here is optimistic.
val_preds = np.column_stack([val_predictions_store[name] for name in ensemble_members])
ensemble_val_pred = val_preds.mean(axis=1)
ensemble_metrics = regression_metrics(y_val, ensemble_val_pred)
register_result(
    name=ENSEMBLE_NAME,
    feature_type="Averaged predictions",
    target_key="raw",
    metrics=ensemble_metrics,
    notes=f"Mean of {ensemble_members}",
)
val_predictions_store[ENSEMBLE_NAME] = ensemble_val_pred
print(f"Mean Ensemble: RMSE={ensemble_metrics['RMSE']:.3f}, R2={ensemble_metrics['R2']:.3f}")
# No builder here: the ensemble is rebuilt from its members' predictions.
model_registry[ENSEMBLE_NAME] = {
    "ensemble_members": ensemble_members,
    "feature_type": "Averaged predictions",
}


               model_name        feature_type   target_type       val_MSE  \
0         SVD + MLP (raw)  TF-IDF → 200-d SVD     raw score  22569.661099   
1             SVD + Ridge  TF-IDF → 200-d SVD     raw score  23223.469624   
2      SVD + RandomForest  TF-IDF → 200-d SVD     raw score  24017.792613   
3         SVD + MLP (log)  TF-IDF → 200-d SVD  log1p(score)  24148.557882   
4  SVD + GradientBoosting  TF-IDF → 200-d SVD     raw score  24336.808299   

     val_RMSE    val_MAE    val_R2  training_time_sec  \
0  150.232024  31.667474  0.058267          16.066344   
1  152.392485  41.161424  0.030986           0.253964   
2  154.976749  35.266423 -0.002157        1316.229247   
3  155.398063  20.483820 -0.007614          10.836792   
4  156.002591  34.606112 -0.015468        2422.552881   

                                         notes  
0                 Two-layer MLP on SVD vectors  
1                Dense LSA features with Ridge  
2           Random forest on dense SVD vectors 

## Validation Comparison

In [21]:
# Validation leaderboard, best (lowest RMSE) first, with a fresh 0..n-1 index.
results_df = pd.DataFrame(results).sort_values("val_RMSE", ignore_index=True)
results_df

Unnamed: 0,model_name,feature_type,target_type,val_MSE,val_RMSE,val_MAE,val_R2,training_time_sec,notes
0,Mean Ensemble,Averaged predictions,raw score,22275.854772,149.250979,32.882758,0.070526,,"Mean of ['SVD + MLP (raw)', 'SVD + Ridge', 'SVD + RandomForest']"
1,SVD + MLP (raw),TF-IDF → 200-d SVD,raw score,22569.661099,150.232024,31.667474,0.058267,16.066344,Two-layer MLP on SVD vectors
2,SVD + Ridge,TF-IDF → 200-d SVD,raw score,23223.469624,152.392485,41.161424,0.030986,0.253964,Dense LSA features with Ridge
3,SVD + RandomForest,TF-IDF → 200-d SVD,raw score,24017.792613,154.976749,35.266423,-0.002157,1316.229247,Random forest on dense SVD vectors
4,SVD + MLP (log),TF-IDF → 200-d SVD,log1p(score),24148.557882,155.398063,20.48382,-0.007614,10.836792,Two-layer MLP predicting log scores
5,SVD + GradientBoosting,TF-IDF → 200-d SVD,raw score,24336.808299,156.002591,34.606112,-0.015468,2422.552881,Gradient boosting regressor on SVD features
6,SGD Huber (raw),TF-IDF (sparse),raw score,24948.28815,157.950271,33.287046,-0.040983,28.655761,Huber regression with SGD on raw scores
7,LinearSVR (raw),TF-IDF (sparse),raw score,25077.202775,158.357831,31.997622,-0.046362,138.358618,LinearSVR predicting raw scores
8,LinearSVR (log),TF-IDF (sparse),log1p(score),34211.311476,184.963,53.38751,-0.427488,179.16276,LinearSVR trained on log1p scores
9,SGD Huber (log),TF-IDF (sparse),log1p(score),70551.152332,265.614669,147.860321,-1.943791,26.368483,Huber regression with SGD on log scores


## Final Test Evaluation

In [22]:
def predict_with_model(name: str, train_split: str, predict_split: str):
    """Refit a registered model on one split and predict another, in raw score units.

    Args:
        name: Key into `model_registry`.
        train_split: Split name to fit on (e.g. "train_val").
        predict_split: Split name to predict (e.g. "test").

    Returns:
        A `(model, predictions)` tuple. For an ensemble entry, `model` is None and the
        predictions are the mean of the members' predictions; each member is refitted
        by a recursive call.

    Raises:
        KeyError: If `name`, the feature key, the target key or a split is unknown.
    """
    spec = model_registry[name]
    if "ensemble_members" in spec:
        member_preds = [
            predict_with_model(member, train_split, predict_split)[1]
            for member in spec["ensemble_members"]
        ]
        stacked = np.column_stack(member_preds)
        return None, stacked.mean(axis=1)

    builder = spec["builder"]
    feature_key = spec["feature_key"]
    target_key = spec["target_key"]
    model = builder()
    X_train_split = feature_store[feature_key][train_split]
    X_predict = feature_store[feature_key][predict_split]
    y_train_split = target_store[target_key][train_split]
    model.fit(X_train_split, y_train_split)
    preds_target = model.predict(X_predict)
    preds_raw = inverse_target(target_key, preds_target)
    # Same sanitisation as evaluate_model applies to validation predictions, so that
    # validation and test metrics are computed the same way and a stray NaN/inf does
    # not make the metric functions fail.
    preds_raw = np.nan_to_num(preds_raw, nan=0.0, posinf=0.0, neginf=0.0)
    return model, preds_raw

# Refit every model from the validation table on train+val and score it on the test split.
final_candidates = results_df["model_name"].tolist()
final_rows = []

for name in final_candidates:
    test_preds = predict_with_model(name, train_split="train_val", predict_split="test")[1]
    metrics = regression_metrics(y_test, test_preds)
    # First validation row for this model.
    val_row = results_df.loc[results_df["model_name"] == name].iloc[0]

    row = {"model_name": name}
    row.update(
        {column: val_row[column] for column in ("feature_type", "target_type", "val_RMSE", "val_R2")}
    )
    row.update({f"test_{metric}": metrics[metric] for metric in ("MSE", "RMSE", "MAE", "R2")})
    final_rows.append(row)



In [23]:
final_results_df = pd.DataFrame(final_rows).sort_values('test_RMSE').reset_index(drop=True)
final_results_df

Unnamed: 0,model_name,feature_type,target_type,val_RMSE,val_R2,test_MSE,test_RMSE,test_MAE,test_R2
0,SVD + Ridge,TF-IDF → 200-d SVD,raw score,152.392485,0.030986,75692.079954,275.121937,42.580632,0.006925
1,Mean Ensemble,Averaged predictions,raw score,149.250979,0.070526,75819.440525,275.353301,35.866879,0.005254
2,SVD + MLP (log),TF-IDF → 200-d SVD,log1p(score),155.398063,-0.007614,76408.202611,276.420337,22.932554,-0.002471
3,SGD Huber (raw),TF-IDF (sparse),raw score,157.950271,-0.040983,76846.048946,277.211199,31.265185,-0.008215
4,LinearSVR (raw),TF-IDF (sparse),raw score,158.357831,-0.046362,76861.438384,277.238955,28.378842,-0.008417
5,SVD + RandomForest,TF-IDF → 200-d SVD,raw score,154.976749,-0.002157,77656.747041,278.669602,37.512205,-0.018851
6,SVD + MLP (raw),TF-IDF → 200-d SVD,raw score,150.232024,0.058267,78339.495774,279.891936,35.74854,-0.027809
7,SVD + GradientBoosting,TF-IDF → 200-d SVD,raw score,156.002591,-0.015468,78928.498803,280.942163,37.424328,-0.035537
8,LinearSVR (log),TF-IDF (sparse),log1p(score),184.963,-0.427488,81815.923707,286.03483,42.515413,-0.073419
9,SGD Huber (log),TF-IDF (sparse),log1p(score),265.614669,-1.943791,123119.494553,350.883876,151.63792,-0.61532
