In [10]:

import json, math, os, random
from pathlib import Path
import pandas as pd, numpy as np
import scipy.sparse as sps
from collections import defaultdict
print("pandas", pd.__version__, "numpy", np.__version__)
from pathlib import Path

# All input CSVs are read relative to this directory.
DATA_DIR = Path(".")
print("Data dir:", DATA_DIR)


def _parse_prerequisites(raw):
    """Decode one `prerequisites` cell: JSON text -> Python object, missing/blank -> []."""
    if pd.notna(raw) and raw.strip() != '':
        return json.loads(raw)
    return []


courses = pd.read_csv(DATA_DIR / "courses.csv")
interactions = pd.read_csv(DATA_DIR / "interactions.csv")

# The prerequisites column is stored as JSON text; turn it into Python objects.
courses['prerequisites'] = courses['prerequisites'].apply(_parse_prerequisites)

print("courses:", courses.shape, "interactions:", interactions.shape)
courses.head(3)


pandas 2.2.3 numpy 2.1.3
Data dir: .
courses: (200, 6) interactions: (19491, 5)


Unnamed: 0,course_id,title,topic,difficulty,duration_mins,prerequisites
0,c1000,Advanced Machine Learning,Data Science,Beginner,180,[]
1,c1001,Intro to Machine Learning,DevOps,Intermediate,90,[]
2,c1002,Hands-on Kubernetes,DevOps,Beginner,60,[]


In [11]:

# Quick EDA: headline counts, event-type mix, and distinct courses per intern.
n_unique_interns = interactions['intern_id'].nunique()
n_unique_courses = interactions['course_id'].nunique()
print("Unique interns:", n_unique_interns)
print("Unique courses:", n_unique_courses)

print("Event types distribution:")
event_type_counts = interactions['event_type'].value_counts()
print(event_type_counts)

# Summary statistics of how many distinct courses each intern interacted with.
courses_per_intern = interactions.groupby('intern_id')['course_id'].nunique()
per_intern = courses_per_intern.describe()
per_intern


Unique interns: 500
Unique courses: 200
Event types distribution:
event_type
view        12613
start        3573
complete     2271
rating       1034
Name: count, dtype: int64


count    500.000000
mean      10.118000
std        2.857342
min        4.000000
25%        8.000000
50%       10.000000
75%       12.000000
max       20.000000
Name: course_id, dtype: float64

In [12]:
import numpy as np
import scipy.sparse as sps

# Build the user x item interaction matrix R from the `ui` table.
# NOTE(review): `ui` is not defined in any cell visible above this one; this
# cell only works if an earlier/deleted cell left it in the kernel (hidden
# state). It is read here as a DataFrame with columns `intern_id`,
# `course_id` and `strength` -- TODO add the cell that derives it so the
# notebook survives Restart & Run All.

# Ensure consistent dtypes
# (these assignments write into `ui` itself, so the frame is mutated in place)
ui['intern_id'] = ui['intern_id'].astype(str)
ui['course_id'] = ui['course_id'].astype(str)

# Build mappings (intern_id → index, course_id → index)
# Index order is first-appearance order in `ui`; user_ids / item_ids are the
# inverse lookups (index -> id) for the matrix rows / columns.
user_ids = ui['intern_id'].unique().tolist()
item_ids = ui['course_id'].unique().tolist()

user2idx = {u: i for i, u in enumerate(user_ids)}
item2idx = {i: j for j, i in enumerate(item_ids)}  # course_id → index

# Map safely, dropping rows that don't map
rows = ui['intern_id'].map(user2idx)
cols = ui['course_id'].map(item2idx)

# Drop any NaNs caused by unmatched IDs
# (both mappings were built from `ui` itself just above, so this is a
# defensive filter; note that it rebinds `ui` to the filtered frame)
mask_valid = rows.notna() & cols.notna()
ui = ui[mask_valid]
rows = rows[mask_valid].astype(int)
cols = cols[mask_valid].astype(int)

# Build sparse matrix
# NOTE(review): SciPy sums duplicate (row, col) entries when building from
# coordinates -- presumably `ui` holds one row per (intern, course) pair; confirm.
data = ui['strength'].astype(float)
R = sps.csr_matrix((data, (rows, cols)), shape=(len(user_ids), len(item_ids)))

print(" Built sparse matrix successfully!")
print("Shape:", R.shape, "  Non-zero entries:", R.nnz)


 Built sparse matrix successfully!
Shape: (500, 200)   Non-zero entries: 5059


In [13]:

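# Simple ALS for implicit feedback (Hu, Koren & Volinsky, 2008).
# Confidence:  c_ui = 1 + alpha * r_ui
# Preference:  p_ui = 1 if r_ui > 0, else 0
# Each sweep alternates two closed-form regularized least-squares solves,
# one per user (item factors Y held fixed) and one per item (user factors X held fixed):
#   (Y^T C_u Y + lambda I) x_u = Y^T C_u p_u
#   (X^T C_i X + lambda I) y_i = X^T C_i p_i
# where C_u / C_i are the diagonal confidence matrices of user u / item i.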

import numpy.linalg as la
def implicit_als(R, factors=32, regularization=0.1, alpha=40, iterations=10, verbose=True, seed=None):
    """Factorize an implicit-feedback matrix with alternating least squares.

    Follows Hu, Koren & Volinsky (2008): every cell gets a binary preference
    ``p = 1 if r > 0 else 0`` and a confidence ``c = 1 + alpha * r``; user and
    item factors are then updated in turn by solving, per row,
    ``(F^T C F + regularization * I) x = F^T C p`` with the other side's
    factors ``F`` held fixed.

    Parameters
    ----------
    R : scipy.sparse matrix or array-like, shape (users, items)
        Raw interaction strengths. The matrix is densified internally, so this
        is only suitable for small problems.
    factors : int
        Dimensionality of the latent space.
    regularization : float
        L2 penalty added to the diagonal of every normal-equation system.
    alpha : float
        Scale applied to the raw strength when computing confidence.
    iterations : int
        Number of full (users, then items) sweeps.
    verbose : bool
        Print a progress line at the start of every sweep.
    seed : int or None
        If given, the initial factors are drawn from a private
        ``RandomState(seed)`` so the result is reproducible. If None (the
        default) they are drawn from NumPy's global RNG, as before, so an
        earlier ``np.random.seed(...)`` call still controls the outcome.

    Returns
    -------
    X : ndarray, shape (users, factors)
        User factors.
    Y : ndarray, shape (items, factors)
        Item factors.
    """
    # Densify once; accept either a scipy sparse matrix or a plain array.
    R_arr = R.toarray() if hasattr(R, "toarray") else np.asarray(R)
    R_arr = np.asarray(R_arr, dtype=float)
    users, items = R_arr.shape

    P = (R_arr > 0).astype(float)   # binary preference
    C = 1 + alpha * R_arr           # confidence

    # Draw X first, then Y, to keep the same RNG consumption order as before.
    rng = np.random if seed is None else np.random.RandomState(seed)
    X = rng.normal(scale=0.01, size=(users, factors))
    Y = rng.normal(scale=0.01, size=(items, factors))

    reg_eye = regularization * np.eye(factors)  # loop-invariant ridge term

    for it in range(iterations):
        if verbose: print(f"ALS iter {it+1}/{iterations}")
        # Update user factors. `Y.T * C[u]` scales column j of Y^T by c_uj,
        # i.e. it equals Y^T diag(C[u]) without materializing the
        # (items x items) diagonal matrix.
        for u in range(users):
            YtCu = Y.T * C[u]
            X[u] = la.solve(YtCu.dot(Y) + reg_eye, YtCu.dot(P[u]))
        # Update item factors, symmetric to the user step.
        for i in range(items):
            XtCi = X.T * C[:, i]
            Y[i] = la.solve(XtCi.dot(X) + reg_eye, XtCi.dot(P[:, i]))
    return X, Y

# Train (this may take a moment)
# implicit_als draws its initial factors from NumPy's global RNG, so seed it
# first; otherwise every Restart & Run All yields different factors.
np.random.seed(42)
X, Y = implicit_als(R, factors=32, regularization=0.1, alpha=40, iterations=8, verbose=True)
print("Trained factors shapes:", X.shape, Y.shape)


ALS iter 1/8
ALS iter 2/8
ALS iter 3/8
ALS iter 4/8
ALS iter 5/8
ALS iter 6/8
ALS iter 7/8
ALS iter 8/8
Trained factors shapes: (500, 32) (200, 32)


In [14]:

# Recommendation function and Precision@K evaluation (leave-one-out per user)
from heapq import nlargest
def recommend_for_user(u_idx, X, Y, known_items, topk=10):
    """Return the best-scoring items a user has not interacted with yet.

    Parameters
    ----------
    u_idx : int
        Row index of the user in ``X``.
    X : ndarray, shape (users, factors)
        User factors.
    Y : ndarray, shape (items, factors)
        Item factors.
    known_items : array-like of int
        Item indices to exclude (may be empty).
    topk : int
        Maximum number of recommendations.

    Returns
    -------
    top_idx : ndarray of int
        Item indices ordered by decreasing score. Holds ``min(topk, number of
        items not in known_items)`` entries, so excluded items are never
        returned; empty when ``topk <= 0``.
    top_scores : ndarray of float
        Scores aligned with ``top_idx``.
    """
    scores = Y.dot(X[u_idx]).astype(float)  # item scores (float so -inf is assignable)

    # Mask out known items and count how many real candidates remain.
    known = np.asarray(known_items, dtype=int)
    is_candidate = np.ones(scores.shape[0], dtype=bool)
    is_candidate[known] = False
    scores[known] = -np.inf

    # Clamp k: `[-0:]` would select *every* item, and a k larger than the
    # candidate pool would leak masked (-inf) known items into the result.
    k = min(int(topk), int(is_candidate.sum()))
    if k <= 0:
        return np.empty(0, dtype=int), np.empty(0, dtype=float)

    top_idx = np.argsort(scores)[-k:][::-1]
    return top_idx, scores[top_idx]

# Leave-one-out split: hold out one positive item per user for testing.
# Users without positives are skipped; a user with exactly one positive has
# that item held out and contributes nothing to the training matrix.
rng = np.random.default_rng(42)
train_mask = []
test_pairs = {}
train_rows, train_cols, train_data = [], [], []

for u in range(R.shape[0]):
    row = R[u].toarray().ravel()
    pos = np.where(row > 0)[0]
    if len(pos) == 0:
        continue
    if len(pos) > 1:
        # Randomly pick the held-out item; everything else stays in train.
        test_i = rng.choice(pos)
        train_pos = [p for p in pos if p != test_i]
    else:
        test_i = pos[0]
        train_pos = []
    test_pairs[u] = test_i
    for i in train_pos:
        train_rows.append(u)
        train_cols.append(i)
        train_data.append(row[i])

R_train = sps.csr_matrix((train_data, (train_rows, train_cols)), shape=R.shape)
print("Train nnz:", R_train.nnz, "Test pairs:", len(test_pairs))

# Fit a fresh model on the training interactions only.
X_tr, Y_tr = implicit_als(
    R_train,
    factors=32,
    regularization=0.1,
    alpha=40,
    iterations=6,
    verbose=False,
)

# Evaluate Precision@K
def precision_at_k(X, Y, R_train, test_pairs, K=10):
    """Score leave-one-out recommendations at cutoff K.

    For every ``(user, held_out_item)`` entry in ``test_pairs`` the user's
    training items (non-zero columns of ``R_train[user]``) are excluded, the
    top-K items are requested from ``recommend_for_user``, and the user counts
    as 1.0 if the held-out item is among them, else 0.0. The mean of these
    per-user values is returned, i.e. the fraction of users whose held-out
    item was hit (a hit rate, despite the function's name).
    """
    hits = []
    for user_idx, held_out in test_pairs.items():
        seen = set(R_train[user_idx].nonzero()[1].tolist())
        seen_idx = np.array(list(seen), dtype=int)
        ranked, _ = recommend_for_user(user_idx, X, Y, known_items=seen_idx, topk=K)
        hits.append(1.0 if held_out in ranked else 0.0)
    return np.mean(hits)

# Report the leave-one-out metric on the train-only model at several cutoffs.
for K in (5, 10, 20):
    p = precision_at_k(X_tr, Y_tr, R_train, test_pairs, K=K)
    print(f"Precision@{K}: {p:.4f}")


Train nnz: 4559 Test pairs: 500
Precision@5: 0.0280
Precision@10: 0.0540
Precision@20: 0.1040


In [15]:

# Show example recommendations for a few interns
sample_users = list(range(5))

# Build the course_id -> title lookup once; re-indexing `courses` for every
# printed line repeated the same work for each recommendation.
course_titles = courses.set_index('course_id')['title']

for u in sample_users:
    # Exclude courses the intern already has in the training matrix.
    known = set(R_train[u].nonzero()[1].tolist())
    topk, scores = recommend_for_user(u, X_tr, Y_tr, known_items=np.array(list(known), dtype=int), topk=10)
    print("Intern:", user_ids[u])
    for rank, (idx, score) in enumerate(zip(topk, scores), start=1):
        course_id = item_ids[idx]
        print(f"  {rank}. {course_id} - {course_titles.loc[course_id]} (score {score:.4f})")
    print()

# Save factor matrices for later use
np.save(DATA_DIR / 'user_factors.npy', X_tr)
np.save(DATA_DIR / 'item_factors.npy', Y_tr)
print('Saved factor matrices to data dir.')


Intern: i2000
  1. c1049 - Hands-on Neural Networks (score 0.9187)
  2. c1053 - Practical Data Visualization (score 0.7723)
  3. c1033 - Crash Course in Data Visualization (score 0.7670)
  4. c1135 - Crash Course in Git (score 0.7532)
  5. c1007 - Advanced Data Visualization (score 0.7318)
  6. c1167 - Practical Docker (score 0.7277)
  7. c1172 - Advanced AWS (score 0.7183)
  8. c1069 - Crash Course in AWS (score 0.7166)
  9. c1042 - Advanced Azure (score 0.7123)
  10. c1079 - Hands-on Machine Learning (score 0.6689)

Intern: i2001
  1. c1084 - Intro to Kubernetes (score 0.8719)
  2. c1168 - Advanced Linux (score 0.8258)
  3. c1194 - Crash Course in Azure (score 0.8060)
  4. c1190 - Hands-on Kubernetes (score 0.7739)
  5. c1063 - Practical Docker (score 0.7471)
  6. c1021 - Crash Course in Linux (score 0.7362)
  7. c1016 - Intro to Azure (score 0.7235)
  8. c1146 - Advanced SQL (score 0.7068)
  9. c1159 - Fundamentals of Azure (score 0.6938)
  10. c1195 - Intro to AWS (score 0.6394)

I