# Fingle Interest-based Recommender (Hybrid)

This notebook builds a hybrid recommender combining:
- Content signals: overlap between user interests and post tags
- Collaborative signals: latent factors from implicit feedback via TruncatedSVD
- Popularity prior: global engagement rate per post

It evaluates with a user-wise holdout split and reports Precision@3, Recall@3, MAP@3, and NDCG@3.


In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from collections import defaultdict

# Input files (resolved relative to the notebook's working directory)
USERS_PATH = 'Users.csv'
POSTS_PATH = 'Posts.csv'
ENG_PATH = 'Engagements.csv'

# Seed NumPy's global RNG so the random holdout drawn later is repeatable
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Read the three tables in a fixed order: users, posts, engagements
users, posts, eng = (
    pd.read_csv(csv_path) for csv_path in (USERS_PATH, POSTS_PATH, ENG_PATH)
)
users.head(), posts.head(), eng.head()


(  user_id  age gender          top_3_interests  past_engagement_score
 0      U1   24      F      sports, art, gaming                   0.61
 1      U2   32      F    travel, food, fashion                   0.93
 2      U3   28  Other  sports, travel, fashion                   0.40
 3      U4   25      M     fashion, music, tech                   0.53
 4      U5   24      M   fashion, food, fitness                   0.80,
   post_id creator_id content_type            tags
 0      P1        U44        video    sports, food
 1      P2        U26        video   music, travel
 2      P3        U32         text  sports, travel
 3      P4         U6        image   music, gaming
 4      P5        U32        image   food, fashion,
   user_id post_id  engagement
 0      U1     P52           1
 1      U1     P44           0
 2      U1      P1           1
 3      U1      P4           1
 4      U1     P65           0)

## Preprocess columns
- Normalize IDs to categorical indices.
- Parse user interests and post tags.
- Build mappings.

In [2]:
# Clean strings and split lists
def split_list(s):
    """Split a comma-separated string into a list of clean, lower-cased tokens.

    Parameters
    ----------
    s : str or missing value
        Raw cell value such as ``"sports, art, gaming"``. Anything that is not
        a string is converted with ``str()`` before splitting.

    Returns
    -------
    list of str
        Tokens with surrounding whitespace removed and lower-cased. Missing
        values (``None`` / ``NaN``) yield ``[]``. Empty tokens produced by
        trailing or doubled commas (``"a,,b,"``) or by an empty string are
        dropped so they never appear as a bogus ``''`` tag.
    """
    if pd.isna(s):
        return []
    cleaned = (token.strip().lower() for token in str(s).split(','))
    # Discard empty tokens; otherwise '' would count as a real tag/interest
    return [token for token in cleaned if token]

# Parse the comma-separated interest / tag strings into token lists
users['interests'] = users['top_3_interests'].apply(split_list)
posts['tag_list'] = posts['tags'].apply(split_list)

# Categorical index mappings: each distinct id gets a dense integer code
user_ids = users['user_id'].astype('category')
post_ids = posts['post_id'].astype('category')
uid2idx = dict(zip(user_ids, user_ids.cat.codes))
pid2idx = dict(zip(post_ids, post_ids.cat.codes))
idx2uid = {v:k for k,v in uid2idx.items()}
idx2pid = {v:k for k,v in pid2idx.items()}

users['uidx'] = users['user_id'].map(uid2idx)
posts['pidx'] = posts['post_id'].map(pid2idx)

# Join engagement limited to known users/posts.
# Take an explicit copy so the column assignments below write to an independent
# frame (no SettingWithCopyWarning), and reset the index so row labels coincide
# with row positions after the filter.
known_rows = eng['user_id'].isin(uid2idx) & eng['post_id'].isin(pid2idx)
eng = eng.loc[known_rows].reset_index(drop=True).copy()
eng['uidx'] = eng['user_id'].map(uid2idx)
eng['pidx'] = eng['post_id'].map(pid2idx)
eng['engagement'] = eng['engagement'].astype(int)

n_users = users.shape[0]
n_items = posts.shape[0]
n_users, n_items


(50, 100)

## Train/test split (user-wise holdout)
For each user with at least one positive (engagement==1), hold out one positive as test; train on the rest.
Negatives remain in train to provide contrastive signal.

In [3]:
# Build per-user positives: uidx -> list of pidx the user engaged with
pos_by_user = eng[eng['engagement'] == 1].groupby('uidx')['pidx'].apply(list).to_dict()
test_pairs = []
train_mask = np.ones(len(eng), dtype=bool)

# Positional (not label-based) views of the columns. `train_mask` is indexed by
# row position, so rows must be matched by position as well; using index labels
# would mask the wrong rows whenever `eng` does not carry a 0..n-1 index.
uidx_values = eng['uidx'].to_numpy()
pidx_values = eng['pidx'].to_numpy()
is_positive = eng['engagement'].to_numpy() == 1

for uidx, plist in pos_by_user.items():
    if len(plist) == 0:
        continue
    # randomly hold out one positive for test
    held = np.random.choice(plist)
    # Remove every positive row of the held-out (user, post) pair from train,
    # so a duplicated engagement row cannot leak the test item into training.
    held_rows = is_positive & (uidx_values == uidx) & (pidx_values == held)
    train_mask[held_rows] = False
    test_pairs.append((uidx, held))

train_eng = eng[train_mask].copy()
test_pairs[:5], train_eng.shape


([(0, np.int64(53)),
  (1, np.int64(16)),
  (2, np.int64(28)),
  (3, np.int64(60)),
  (4, np.int64(60))],
 (950, 5))

## Build implicit user-item matrix and train SVD
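We weight positives as 1.0 and observed negatives as 0.1; user–post pairs with no recorded engagement stay at 0. Note that this places observed negatives *above* unobserved pairs, so the 0.1 weight acts as a weak exposure signal rather than as a penalty.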

In [4]:
# Dense user x item matrix (the dataset is small, so no sparse format is used).
# Cells without a training row stay at 0; negatives get 0.1 and positives 1.0.
implicit_mat = np.zeros((n_users, n_items), dtype=np.float32)
row_idx = train_eng['uidx'].to_numpy(dtype=np.intp)
col_idx = train_eng['pidx'].to_numpy(dtype=np.intp)
cell_weights = np.where(train_eng['engagement'].to_numpy() == 1, 1.0, 0.1).astype(np.float32)
# Unbuffered element-wise maximum: if a (user, post) pair occurs more than
# once, the largest weight wins, exactly like a running max over the rows.
np.maximum.at(implicit_mat, (row_idx, col_idx), cell_weights)

# TruncatedSVD for latent factors: 20 components when both dimensions exceed
# 20, otherwise a smaller rank bounded to the range [2, 10].
smallest_dim = min(n_users, n_items)
if smallest_dim > 20:
    k = 20
else:
    k = max(2, min(10, smallest_dim - 1))
svd = TruncatedSVD(n_components=k, random_state=RANDOM_SEED)
U = svd.fit_transform(implicit_mat)  # one k-dimensional row per user
V = svd.components_.T               # one k-dimensional row per item
# normalize() is called with its defaults on both factor matrices
U = normalize(U)
V = normalize(V)
U.shape, V.shape


((50, 20), (100, 20))

## Content features and popularity prior
- Content: Jaccard similarity between user interests and post tags.
- Popularity: global positive rate per post (Laplace-smoothed).
- User bias: scale by user `past_engagement_score` as a mild multiplier.

In [5]:
# Content similarity
# Lookup tables keyed by dense index: uidx -> interest list, pidx -> tag list
interests_by_user = users.set_index('uidx')['interests']
tags_by_post = posts.set_index('pidx')['tag_list']
interest_sets = interests_by_user.to_dict()
tag_sets = tags_by_post.to_dict()

def jaccard(a, b):
    """Jaccard similarity between two collections, compared as sets.

    Returns ``|A ∩ B| / (|A ∪ B| + 1e-9)``. The small constant in the
    denominator means identical non-empty inputs score marginally below 1.
    When both inputs are empty the result is defined as 0.0.
    """
    left = set(a)
    right = set(b)
    union_size = len(left.union(right))
    if union_size == 0:
        # Both collections are empty: no evidence of similarity
        return 0.0
    shared = len(left.intersection(right))
    return shared / (union_size + 1e-9)

# User x item matrix of Jaccard similarity between interests and post tags
content_mat = np.zeros((n_users, n_items), dtype=np.float32)
# Resolve each post's tags once instead of once per (user, post) pair
tags_per_item = [tag_sets.get(item_idx, []) for item_idx in range(n_items)]
for user_idx in range(n_users):
    user_interests = interest_sets.get(user_idx, [])
    content_mat[user_idx, :] = [jaccard(user_interests, item_tags) for item_tags in tags_per_item]

# Popularity prior (Laplace smoothing): (positives + alpha) / (rows + 2*alpha)
post_pos = train_eng.groupby('pidx')['engagement'].sum()
post_cnt = train_eng.groupby('pidx')['engagement'].count()
alpha = 1.0
pop = (post_pos + alpha) / (post_cnt + 2*alpha)
# Posts with no training rows receive the smoothed rate for zero observations,
# alpha / (2*alpha), rather than 0. A zero would rank never-seen posts below
# posts that were shown and always ignored.
popularity = np.full(n_items, alpha / (2*alpha), dtype=np.float32)
popularity[pop.index.to_numpy(dtype=np.intp)] = pop.to_numpy(dtype=np.float32)

# Per-user engagement propensity: missing scores default to 0.5 and values are
# clipped to [0, 1]; users absent from the index also fall back to 0.5.
user_bias = users.set_index('uidx')['past_engagement_score'].fillna(0.5).clip(0,1).astype(float)
user_bias = user_bias.reindex(range(n_users), fill_value=0.5).to_numpy().astype(np.float32)
user_bias[:5], popularity[:5]


(array([0.61, 0.94, 0.85, 0.71, 0.9 ], dtype=float32),
 array([0.75      , 0.2       , 0.25      , 0.46153846, 0.6666667 ],
       dtype=float32))

## Hybrid scoring and recommendation
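Final score per user–post pair:
`score = w_cf * (U · Vᵀ) + w_content * content + w_pop * popularity`.
The score is then scaled by `(0.75 + 0.5 * user_bias)` to reflect the user's propensity to engage. Because this factor is positive and constant for a given user, it changes the magnitude of the exported scores but not the order of that user's recommendations.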

In [6]:
# Hyperparameters: blend weights for the collaborative, content and
# popularity signals
w_cf = 0.6
w_content = 0.3
w_pop = 0.1

# Collaborative signal: dot product of every user factor with every item factor
cf_scores = (U @ V.T)
cf_term = w_cf * cf_scores
content_term = w_content * content_mat
pop_term = w_pop * popularity  # 1-D per-item vector, broadcast across users
scores = cf_term + content_term + pop_term

# Per-user multiplier applied to the whole row of that user
scales = 0.75 + 0.5 * user_bias[:, np.newaxis]
scores = scores * scales

# Do not recommend items a user already engaged positively with in train
positive_train = train_eng.loc[train_eng['engagement'] == 1]
train_pos = positive_train.groupby('uidx')['pidx'].apply(set).to_dict()
def recommend_top_k(u, k=3):
    """Return up to ``k`` post indices for user index ``u``, best score first.

    Reads the module-level ``scores`` matrix and ``train_pos`` mapping. Posts
    the user already engaged with positively in training are never returned,
    so the result can hold fewer than ``k`` entries when not enough other
    posts exist.

    Parameters
    ----------
    u : int
        Row index of the user in ``scores``.
    k : int, default 3
        Maximum number of recommendations. Values <= 0 yield an empty array.

    Returns
    -------
    numpy.ndarray
        Integer post indices ordered by descending score.
    """
    s = scores[u].copy()
    excluded = np.zeros(len(s), dtype=bool)
    for p in train_pos.get(u, set()):
        # Push already-engaged posts to the bottom and remember them so they
        # can be dropped even if they end up inside the top-k window.
        s[int(p)] = -1e9
        excluded[int(p)] = True
    k = min(int(k), len(s))
    if k <= 0:
        return np.empty(0, dtype=np.intp)
    # Select the k best positions, then order just those k by score
    top = np.argpartition(-s, range(k))[:k]
    top = top[np.argsort(-s[top])]
    return top[~excluded[top]]

# Example: post ids recommended to the first three users (fewer if n_users < 3)
preview_users = range(min(3, n_users))
[[idx2pid[int(p)] for p in recommend_top_k(u, 3)] for u in preview_users]


[['P83', 'P65', 'P25'], ['P29', 'P90', 'P14'], ['P43', 'P33', 'P79']]

## Evaluation (P@3, R@3, MAP@3, NDCG@3)
We evaluate against the held-out positive item per user (where available).
Users without a held-out positive are skipped in metric aggregation.

In [7]:
def precision_at_k(recs, ground_truth, k=3):
    """Fraction of the first ``k`` recommendations found in ``ground_truth``.

    The denominator is always ``k``, even when ``recs`` holds fewer than ``k``
    entries. Returns 0.0 when ``k`` is 0.
    """
    if k == 0:
        return 0.0
    hit_count = 0
    for item in recs[:k]:
        if item in ground_truth:
            hit_count += 1
    return hit_count / k

def recall_at_k(recs, ground_truth, k=3):
    """Share of ``ground_truth`` items that appear in the first ``k`` recommendations.

    Returns 0.0 when ``ground_truth`` is empty.
    """
    total_relevant = len(ground_truth)
    if total_relevant == 0:
        return 0.0
    found = [item for item in recs[:k] if item in ground_truth]
    return len(found) / total_relevant

def ap_at_k(recs, ground_truth, k=3):
    """Average precision over the first ``k`` recommendations.

    Sums precision@i at every rank ``i`` (1-based) where a relevant item
    appears, then divides by ``min(len(ground_truth), k)``.

    Returns 0.0 when ``ground_truth`` is empty or ``k`` is not positive; the
    latter avoids a division by zero and matches ``precision_at_k``'s
    handling of ``k == 0``.
    """
    if k <= 0 or not ground_truth:
        return 0.0
    hits = 0
    ap = 0.0
    for i, p in enumerate(recs[:k], start=1):
        if p in ground_truth:
            hits += 1
            ap += hits / i  # precision at this cut-off
    return ap / min(len(ground_truth), k)

def ndcg_at_k(recs, ground_truth, k=3):
    """Normalised discounted cumulative gain at ``k`` with binary relevance.

    DCG adds ``1 / log2(rank + 1)`` for every relevant item at 1-based
    ``rank`` within the first ``k`` recommendations. The ideal DCG assumes
    ``min(len(ground_truth), k)`` relevant items occupy the top ranks, so it
    is derived from the ground truth rather than from the hits that happened
    to be retrieved.

    Returns 0.0 when ``ground_truth`` is empty or ``k`` is not positive.
    """
    def dcg(xs):
        # enumerate() is 0-based, hence log2(i + 2) for the 1-based rank i + 1
        return sum(rel/np.log2(i+2) for i, rel in enumerate(xs))

    if k <= 0 or not ground_truth:
        return 0.0
    rel = [1.0 if p in ground_truth else 0.0 for p in recs[:k]]
    # Best achievable ranking: every relevant item that can fit in k slots
    ideal = [1.0] * min(len(ground_truth), k)
    idcg = dcg(ideal)
    return dcg(rel) / idcg if idcg > 0 else 0.0

# Evaluate: score each user's top-3 against their single held-out positive
p_list, r_list, map_list, ndcg_list = [], [], [], []
metric_fns = (precision_at_k, recall_at_k, ap_at_k, ndcg_at_k)
metric_lists = (p_list, r_list, map_list, ndcg_list)
for u, held in test_pairs:
    recs = recommend_top_k(u, k=3)
    gt = {held}
    for metric_fn, bucket in zip(metric_fns, metric_lists):
        bucket.append(metric_fn(recs, gt, 3))

# Average each metric over the evaluated users (None if nobody was evaluated)
metrics = {'users_evaluated': len(p_list)}
for label, values in (
    ('Precision@3', p_list),
    ('Recall@3', r_list),
    ('MAP@3', map_list),
    ('NDCG@3', ndcg_list),
):
    metrics[label] = float(np.mean(values)) if values else None
metrics


{'users_evaluated': 50,
 'Precision@3': 0.006666666666666666,
 'Recall@3': 0.02,
 'MAP@3': 0.006666666666666666,
 'NDCG@3': 0.01}

## Generate Top-3 recommendations per user and save
Writes `recommendations_top3.csv` with columns: `user_id,post_id,rank,score`.

In [8]:
# One record per (user, recommended post); rank 1 is the best-scoring post
rows = [
    {
        'user_id': idx2uid[u],
        'post_id': idx2pid[int(p)],
        'rank': rank,
        'score': float(scores[u, int(p)]),
    }
    for u in range(n_users)
    for rank, p in enumerate(recommend_top_k(u, k=3), start=1)
]
rec_df = pd.DataFrame(rows)
# Persist without the DataFrame index so the CSV holds only the four columns
rec_df.to_csv('recommendations_top3.csv', index=False)
rec_df.head()


Unnamed: 0,user_id,post_id,rank,score
0,U1,P83,1,0.403907
1,U1,P65,2,0.393694
2,U1,P25,3,0.346324
3,U10,P29,1,0.453882
4,U10,P90,2,0.421442
