In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.compose import TransformedTargetRegressor

from sentence_transformers import SentenceTransformer
from textblob import TextBlob

import matplotlib.pyplot as plt


2026-01-16 10:18:19.844430: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768558699.873475     164 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768558699.882060     164 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768558699.904441     164 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768558699.904503     164 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768558699.904506     164 computation_placer.cc:177] computation placer alr

In [2]:
# Load the merged dataset, keep only rows where both regression targets are
# present, and replace missing text fields with empty strings so later
# string operations never see NaN.
df = (
    pd.read_csv('/kaggle/input/dataset/Merged_data.csv')
    .dropna(subset=['score', 'num_comments'])
    .assign(
        title=lambda frame: frame['title'].fillna(''),
        body_text=lambda frame: frame['body_text'].fillna(''),
    )
)


In [3]:
# Character-length features for the two text fields.
for source_col, feature_col in (('title', 'title_length'), ('body_text', 'body_length')):
    df[feature_col] = df[source_col].map(len)

# Whitespace-separated token count over the title and body joined by one space.
combined_text = df['title'] + ' ' + df['body_text']
df['word_count'] = combined_text.map(lambda text: len(text.split()))

# TextBlob sentiment polarity of the title only (the body is not scored).
df['title_sentiment'] = df['title'].map(lambda text: TextBlob(text).sentiment.polarity)


In [None]:
# Sentence-embedding model used to vectorise post titles.
# NOTE(review): loading by model name contacts huggingface.co, so the kernel
# needs network access (or, presumably, an already-cached copy of the model).
text_model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every title in batches of 64, showing a progress bar while it runs.
titles = df['title'].tolist()
title_embeddings = text_model.encode(titles, batch_size=64, show_progress_bar=True)


'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json (Caused by NameResolutionError("HTTPSConnection(host=\'huggingface.co\', port=443): Failed to resolve \'huggingface.co\' ([Errno -3] Temporary failure in name resolution)"))'), '(Request ID: 7d7d4fcf-3e41-4d84-ab23-8b1e2609a214)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json (Caused by NameResolutionError("HTTPSConnection(host=\'huggingface.co\', port=443): Failed to resolve \'huggingface.co\' ([Errno -3] Temporary failure in name resolution)"))'), '(Request ID: 58d4e025-3cca-476d-87cb-7b99494ba7a0)')' thrown while requesting HEAD https://huggingface.

In [None]:
# One-hot encode the subreddit column into a dense array.
# Categories unseen at fit time are encoded as all zeros (handle_unknown='ignore').
#
# scikit-learn renamed `sparse=` to `sparse_output=` in 1.2 and removed the old
# keyword in 1.4, where passing it raises TypeError. Try the current keyword
# first and fall back to the legacy one for older installs.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

subreddit_ohe = encoder.fit_transform(df[['subreddit']])


In [None]:
# Handcrafted numeric features, in the column order they appear in X.
handcrafted_cols = ['title_length', 'body_length', 'word_count', 'title_sentiment']
handcrafted_features = df[handcrafted_cols].values

# Feature matrix: title embeddings, then subreddit one-hot columns, then the
# handcrafted features, concatenated column-wise.
feature_blocks = [title_embeddings, subreddit_ohe, handcrafted_features]
X = np.hstack(feature_blocks)

# Regression targets, row-aligned with X.
y_score = df['score'].values
y_comments = df['num_comments'].values


In [None]:
# Split the features and both targets in a single call so that all three arrays
# share exactly the same row partition. train_test_split accepts any number of
# equally long arrays and returns a (train, test) pair for each, in order.
# This replaces a second split that re-shuffled and copied X only to throw the
# result away, and that stayed aligned solely because test_size/random_state
# were repeated verbatim.
(
    X_train, X_test,
    y_score_train, y_score_test,
    y_comm_train, y_comm_test,
) = train_test_split(
    X, y_score, y_comments, test_size=0.2, random_state=42
)


In [None]:
def _make_log_target_regressor(random_state=42):
    """Build a gradient-boosting regressor that is fitted on log1p(y).

    The target is transformed with ``np.log1p`` before fitting and predictions
    are mapped back with ``np.expm1``, so callers work in the original units.

    Parameters
    ----------
    random_state : int, default=42
        Seed forwarded to ``HistGradientBoostingRegressor`` so that any internal
        randomness (e.g. the validation split used by early stopping) is
        reproducible across runs.

    Returns
    -------
    TransformedTargetRegressor
        An unfitted estimator.
    """
    # NOTE(review): np.log1p is not finite for targets <= -1; confirm that
    # score / num_comments are non-negative in this dataset.
    return TransformedTargetRegressor(
        regressor=HistGradientBoostingRegressor(max_iter=500, random_state=random_state),
        func=np.log1p,
        inverse_func=np.expm1,
    )


# One independent model per target, both built with the same configuration.
score_model = _make_log_target_regressor()
comment_model = _make_log_target_regressor()

score_model.fit(X_train, y_score_train)
comment_model.fit(X_train, y_comm_train)


In [None]:
# Predict both targets on the held-out rows.
pred_score = score_model.predict(X_test)
pred_comments = comment_model.predict(X_test)

# Report R^2 and MAE per target. The leading newline in the second heading
# keeps a blank line between the two report sections.
evaluations = (
    ("SCORE", y_score_test, pred_score),
    ("\nCOMMENTS", y_comm_test, pred_comments),
)
for heading, actual, predicted in evaluations:
    print(heading)
    print("R2:", r2_score(actual, predicted))
    print("MAE:", mean_absolute_error(actual, predicted))


In [None]:
# Actual vs. predicted score on the test split; semi-transparent markers make
# overlapping points easier to see.
fig, ax = plt.subplots()
ax.scatter(y_score_test, pred_score, alpha=0.5)
ax.set_xlabel("Actual Score")
ax.set_ylabel("Predicted Score")
ax.set_title("Score Prediction")
plt.show()


### Final Observations

Transformer-based text embeddings capture semantic information from post titles effectively.
Combining embeddings with simple handcrafted features improves regression performance.
Score prediction achieves higher accuracy than comment-count prediction, likely because scores are less noisy than comment counts.
