In [5]:
import random
import numpy as np
import pandas as pd
import math
from tqdm import tqdm
from scipy.sparse import csr_matrix, coo_matrix, hstack
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from implicit.als import AlternatingLeastSquares
from lightfm import LightFM

In [6]:
# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Constants
K_VALUES = [5, 10, 20]

In [None]:
df_train = pd.read_csv("../data/train.txt", sep="\t", names=["user_id", "movie_id", "weight"])
df_test  = pd.read_csv("../data/test.txt",  sep="\t", names=["user_id", "movie_id", "weight"])
df_meta  = pd.read_csv('../data/movies_metadata.csv')
df_users = pd.read_csv('../data/users.csv')

# Helper Functions

## Stable Mappings & Interaction Encoding

In [8]:
def build_mappings(df, user_col='user_id', item_col='movie_id'):
    # sort to ensure deterministic ordering
    users = sorted(df[user_col].unique())
    items = sorted(df[item_col].unique())
    user2idx = {u: i for i, u in enumerate(users)}
    item2idx = {i: j for j, i in enumerate(items)}
    return user2idx, item2idx

user2idx, item2idx = build_mappings(df_train)
idx2user = {v: k for k, v in user2idx.items()}
idx2item = {v: k for k, v in item2idx.items()}
n_users, n_items = len(user2idx), len(item2idx)

def encode_interactions(df):
    df = df.copy()
    df['user_idx'] = df['user_id'].map(user2idx)
    df['item_idx'] = df['movie_id'].map(item2idx)
    return df.dropna(subset=['user_idx','item_idx'])\
             .astype({'user_idx':int, 'item_idx':int, 'weight':float})

df_train_enc = encode_interactions(df_train)
df_test_enc  = encode_interactions(df_test)

## Feature Engineering

In [9]:
def build_item_features(df_meta, item2idx, n_items):
    df = df_meta[df_meta.movie_id.isin(item2idx)].copy()
    df['item_idx'] = df['movie_id'].map(item2idx)
    df = df.set_index('item_idx').reindex(range(n_items))

    mats = []
    for col in ['genres','actor','language','country']:
        if col in df:
            mlb = MultiLabelBinarizer(sparse_output=True)
            # ensure each entry is a list (replace NaN or others with empty list)
            lists = df[col].apply(lambda x: x if isinstance(x, list) else [])
            mats.append(mlb.fit_transform(lists))

    tfidf = TfidfVectorizer(max_features=500, stop_words='english')
    mats.append(tfidf.fit_transform(df['title'].fillna('')))

    bins = np.arange(1900, 2025, 10)
    df['year_bin'] = pd.cut(df['year'], bins=bins, labels=bins[:-1])
    ohe = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
    mats.append(ohe.fit_transform(df[['year_bin']]))

    return hstack(mats)

item_features = build_item_features(df_meta, item2idx, n_items)

# Build user features from df_users
def build_user_features(df_users, user2idx, n_users):
    df = df_users[df_users.user_id.isin(user2idx)].copy()
    df['user_idx'] = df['user_id'].map(user2idx)
    df = df.set_index('user_idx').reindex(range(n_users))

    mats = []
    for col in ['gender','occupation']:
        ohe = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
        mats.append(ohe.fit_transform(df[[col]]))

    bins = [0,18,25,35,45,50,56,101]
    df['age_bin'] = pd.cut(df['age'], bins=bins)
    ohe_age = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
    mats.append(ohe_age.fit_transform(df[['age_bin']]))

    return hstack(mats)

user_features = build_user_features(df_users, user2idx, n_users)

## Evaluation Metrics

In [10]:
def evaluate_metrics_by_user(recommend_fn, df_test, K_values=K_VALUES):
    user_truth = df_test.groupby('user_idx')['item_idx'].apply(list).to_dict()
    stats = {K:{'prec':[],'rec':[],'hr':[],'ndcg':[],'ap':[]} for K in K_VALUES}

    for u, true_items in tqdm(user_truth.items(), total=len(user_truth)):
        true_items = [int(x) for x in true_items]
        recs = recommend_fn(u, max(K_VALUES))
        recs = [int(x) for x in np.asarray(recs).ravel()]

        for K in K_VALUES:
            topk = recs[:K]
            hits = [i for i in topk if i in true_items]
            n, R = len(hits), len(true_items)
            stats[K]['prec'].append(n/K)
            stats[K]['rec'].append(n/R if R else 0)
            stats[K]['hr'].append(int(n>0))
            if n>0:
                dcg = sum(1/math.log2(topk.index(i)+2) for i in hits)
                ideal = sum(1/math.log2(r+2) for r in range(min(R,K)))
                stats[K]['ndcg'].append(dcg/ideal if ideal else 0)
                num_rel=0; s=0
                for idx,item in enumerate(topk,1):
                    if item in true_items:
                        num_rel+=1; s+=num_rel/idx
                stats[K]['ap'].append(s/min(R,K))
            else:
                stats[K]['ndcg'].append(0); stats[K]['ap'].append(0)

    return {f"{m}@{K}": np.mean(v) for K,metrics in stats.items() for m,v in metrics.items()}

# Recommender Classes

In [11]:
class PopularityRecommender:
    def fit(self, df, item_features=None, user_features=None):
        self.pop = df.movie_id.value_counts().index.map(item2idx).tolist()
    def recommend(self, u, N):
        return self.pop[:N]

class MemoryCF:
    def __init__(self, mode='user', K=10):
        self.mode = mode
        self.K = K

    def fit(self, df, item_features=None, user_features=None):
        rows = df.user_idx
        cols = df.item_idx
        w    = df.weight
        self.mat = csr_matrix((w.values, (rows, cols)), shape=(n_users, n_items))
        matrix = self.mat if self.mode == 'user' else self.mat.T
        self.sim = cosine_similarity(matrix)

    def recommend(self, u, N):
        seen = set(self.mat[u].indices)
        if self.mode == 'user':
            sims = self.sim[u]
            neighbors = np.argsort(sims)[::-1][1:self.K+1]
            scores = self.mat[neighbors].sum(axis=0).A1
        else:
            liked = self.mat[u].indices
            scores = self.sim[liked].sum(axis=0)
            scores[liked] = 0
        ranking = np.argsort(scores)[::-1]
        return [i for i in ranking if i not in seen][:N]

class ALSRecommender:
    def __init__(self, factors=64, regularization=0.01,
                 iterations=30, alpha=40):
        self.params = dict(factors=factors,
                           regularization=regularization,
                           iterations=iterations,
                           alpha=alpha,
                           random_state=SEED,
                           num_threads=1)   # force single-thread in ALS
    def fit(self, df, item_features=None, user_features=None):
        # **DO NOT pre-scale** data here; let ALS handle alpha internally
        rows = df.user_idx
        cols = df.item_idx
        w    = df.weight
        ui = coo_matrix((w.values, (rows, cols)),
                        shape=(n_users, n_items)).tocsr()
        self.model = AlternatingLeastSquares(**self.params)
        self.model.fit(ui)
        self.ui = ui

    def recommend(self, u, N):
        item_ids, scores = self.model.recommend(
            userid=u,
            user_items=self.ui[u],
            N=N,
            filter_already_liked_items=True
        )
        return [int(i) for i in item_ids]

class LightFMHybrid:
    def __init__(self, **kw):
        self.kw = kw

    def fit(self, df, item_features=None, user_features=None):
        rows = df.user_idx
        cols = df.item_idx
        inter = coo_matrix((np.ones(len(df)), (rows, cols)), shape=(n_users, n_items))
        self.model = LightFM(**self.kw)
        self.model.fit(
            inter,
            item_features=item_features,
            user_features=user_features,
            epochs=10,
            num_threads=4,
            verbose=True
        )

    def recommend(self, u, N):
        scores = self.model.predict(
            np.repeat(u, n_items),
            np.arange(n_items),
            item_features=item_features,
            user_features=user_features
        )
        return [int(i) for i in np.argsort(-scores)[:N]]

# Train & Evaluate

In [12]:
models = {
    'Popularity': PopularityRecommender(),
    'UserCF':    MemoryCF('user', 10),
    'ItemCF':    MemoryCF('item', 10),
    'ALS':       ALSRecommender(factors=64, regularization=0.01, iterations=30, alpha=40),
    'LightFM':   LightFMHybrid(loss='warp', no_components=128, learning_rate=0.05, random_state=SEED)
}

for name, model in models.items():
    print(f"\n▶️ {name}")
    model.fit(df_train_enc, item_features=item_features, user_features=user_features)
    metrics = evaluate_metrics_by_user(model.recommend, df_test_enc)
    print()
    for metric, score in metrics.items():
        print(f"{metric}: {score:.4f}")



▶️ Popularity


100%|██████████| 5390/5390 [00:00<00:00, 13713.19it/s]



prec@5: 0.0383
rec@5: 0.0137
hr@5: 0.1538
ndcg@5: 0.0385
ap@5: 0.0201
prec@10: 0.0380
rec@10: 0.0267
hr@10: 0.2672
ndcg@10: 0.0410
ap@10: 0.0157
prec@20: 0.0397
rec@20: 0.0562
hr@20: 0.4445
ndcg@20: 0.0517
ap@20: 0.0154

▶️ UserCF


100%|██████████| 5390/5390 [00:16<00:00, 324.51it/s]



prec@5: 0.1030
rec@5: 0.0350
hr@5: 0.3499
ndcg@5: 0.1054
ap@5: 0.0612
prec@10: 0.0940
rec@10: 0.0629
hr@10: 0.5074
ndcg@10: 0.1058
ap@10: 0.0479
prec@20: 0.0842
rec@20: 0.1101
hr@20: 0.6716
ndcg@20: 0.1161
ap@20: 0.0433

▶️ ItemCF


100%|██████████| 5390/5390 [00:10<00:00, 497.59it/s]


prec@5: 0.1072
rec@5: 0.0355
hr@5: 0.3521
ndcg@5: 0.1114
ap@5: 0.0667
prec@10: 0.0981
rec@10: 0.0640
hr@10: 0.5024
ndcg@10: 0.1108
ap@10: 0.0521
prec@20: 0.0864
rec@20: 0.1083
hr@20: 0.6568
ndcg@20: 0.1188
ap@20: 0.0459

▶️ ALS



  check_blas_config()


  0%|          | 0/30 [00:00<?, ?it/s]

100%|██████████| 5390/5390 [00:01<00:00, 3109.25it/s]



prec@5: 0.0815
rec@5: 0.0328
hr@5: 0.3221
ndcg@5: 0.0829
ap@5: 0.0439
prec@10: 0.0764
rec@10: 0.0592
hr@10: 0.4878
ndcg@10: 0.0867
ap@10: 0.0357
prec@20: 0.0734
rec@20: 0.1117
hr@20: 0.6931
ndcg@20: 0.1042
ap@20: 0.0355

▶️ LightFM


Epoch: 100%|██████████| 10/10 [00:41<00:00,  4.12s/it]
100%|██████████| 5390/5390 [00:21<00:00, 249.54it/s]


prec@5: 0.0317
rec@5: 0.0100
hr@5: 0.1371
ndcg@5: 0.0329
ap@5: 0.0170
prec@10: 0.0295
rec@10: 0.0174
hr@10: 0.2288
ndcg@10: 0.0325
ap@10: 0.0120
prec@20: 0.0281
rec@20: 0.0332
hr@20: 0.3675
ndcg@20: 0.0363
ap@20: 0.0102



