# In [None]:
import os
import zipfile
import urllib.request
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from numba import njit, prange

# -----------------------------
# 1. Download and Extract Dataset
# -----------------------------
def download_and_extract_movielens(url, extract_path):
    """Download the MovieLens archive (if needed) and unpack it (if needed).

    Args:
        url: Location of the zip archive to download.
        extract_path: Directory that should exist once the archive has been
            unpacked. The archive is extracted into the *parent* of this path,
            so the archive's top-level folder is assumed to have the same name
            as ``extract_path``'s last component (true for ``ml-25m``).

    The archive itself is cached as ``ml-25m.zip`` in the current working
    directory and is not downloaded again when that file already exists.
    """
    zip_path = 'ml-25m.zip'
    if not os.path.exists(zip_path):
        print("Downloading MovieLens 25M dataset...")
        # Download under a temporary name so an interrupted transfer never
        # leaves a truncated file that later runs would mistake for a
        # complete archive.
        partial_path = zip_path + '.part'
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, zip_path)
        print("Download completed.")
    else:
        print("Zip file already exists.")

    if not os.path.exists(extract_path):
        print("Extracting dataset...")
        # Unpack next to extract_path rather than unconditionally into the
        # CWD; for a bare relative name such as "ml-25m" this is still the CWD.
        target_dir = os.path.dirname(os.path.abspath(extract_path))
        os.makedirs(target_dir, exist_ok=True)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(target_dir)
        print("Extraction completed.")
    else:
        print("Dataset already extracted.")

# Where the archive lives and the folder it unpacks to (relative to the CWD).
dataset_url = "https://files.grouplens.org/datasets/movielens/ml-25m.zip"
data_dir = "ml-25m"
download_and_extract_movielens(url=dataset_url, extract_path=data_dir)

# -----------------------------
# 2. Load and Index Data
# -----------------------------
print("Loading ratings data...")
ratings_path = os.path.join(data_dir, "ratings.csv")
ratings_df = pd.read_csv(ratings_path)

print("Loading movies data...")
movies_path = os.path.join(data_dir, "movies.csv")
movies_df = pd.read_csv(movies_path)

# Raw IDs are remapped to dense 0-based indices, numbered in order of first
# appearance in ratings.csv, so they can be used directly as array offsets.
print("Mapping user IDs and movie IDs to indices...")
unique_user_ids = ratings_df['userId'].unique()
unique_movie_ids = ratings_df['movieId'].unique()

num_users = len(unique_user_ids)
num_movies = len(unique_movie_ids)

user_id_to_index = dict(zip(unique_user_ids, range(num_users)))
movie_id_to_index = dict(zip(unique_movie_ids, range(num_movies)))

print(f"Number of users: {num_users}")
print(f"Number of movies: {num_movies}")

# Attach the dense indices to every rating row.
ratings_df['user_idx'] = ratings_df['userId'].map(user_id_to_index)
ratings_df['movie_idx'] = ratings_df['movieId'].map(movie_id_to_index)

# -----------------------------
# 3. Split Data into Training and Test Sets
# -----------------------------
print("Splitting data into training and test sets...")
# Fixed seed keeps the 80/20 split reproducible between runs.
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=42)

# Pull the three columns the model needs out of each frame as plain arrays.
_split_columns = ('user_idx', 'movie_idx', 'rating')
train_users, train_items, train_ratings = (train_df[name].values for name in _split_columns)
test_users, test_items, test_ratings = (test_df[name].values for name in _split_columns)

# -----------------------------
# 4. Convert Training Data to CSR Matrix (Optional)
# -----------------------------
# This step is optional since ALS implementation below uses numpy arrays
# data_by_user_train = csr_matrix((train_ratings, (train_users, train_items)), shape=(num_users, num_movies))

# -----------------------------
# 5. Implement ALS with User and Item Biases using Numba
# -----------------------------
# Initialize parameters
# Hyper-parameters of the bias-only model.
num_factors = 0  # No latent factors, only biases
lambda_reg = 0.1
num_iters = 10

# Baseline prediction: the average training rating.
global_mean = train_ratings.mean()

# Both bias vectors start at zero, so the first prediction is the global mean.
user_bias, item_bias = (
    np.zeros(size, dtype=np.float32) for size in (num_users, num_movies)
)

# Precompute lists for faster access
# For each user, store the indices of items they have rated
def _fill_rows(padded, rows):
    """Copy each ragged list in ``rows`` into the start of the matching row of ``padded``."""
    for row_idx, row in enumerate(rows):
        padded[row_idx, :len(row)] = row
    return padded


# Per-user view of the training data: which items were rated, and with what value.
print("Precomputing user-item interactions...")
user_rated_items = [[] for _ in range(num_users)]
user_rated_ratings = [[] for _ in range(num_users)]
for u, i, r in zip(train_users, train_items, train_ratings):
    user_rated_items[u].append(i)
    user_rated_ratings[u].append(r)

# Per-item view of the same data: which users rated it, and with what value.
print("Precomputing item-user interactions...")
item_rated_by_users = [[] for _ in range(num_movies)]
item_rated_ratings = [[] for _ in range(num_movies)]
for u, i, r in zip(train_users, train_items, train_ratings):
    item_rated_by_users[i].append(u)
    item_rated_ratings[i].append(r)

# Numba cannot consume lists of lists, so pack them into rectangular arrays.
# Unused index slots hold -1, which the update kernels treat as end-of-row.
# NOTE(review): each array is (rows x longest row), so memory is dictated by
# the single most active user / most rated item, not by the rating count.
print("Converting lists to numpy arrays...")
max_user_rated = max(map(len, user_rated_items))
max_item_rated = max(map(len, item_rated_by_users))

user_rated_items_np = _fill_rows(
    np.full((num_users, max_user_rated), -1, dtype=np.int32), user_rated_items
)
user_rated_ratings_np = _fill_rows(
    np.zeros((num_users, max_user_rated), dtype=np.float32), user_rated_ratings
)

item_rated_by_users_np = _fill_rows(
    np.full((num_movies, max_item_rated), -1, dtype=np.int32), item_rated_by_users
)
item_rated_ratings_np = _fill_rows(
    np.zeros((num_movies, max_item_rated), dtype=np.float32), item_rated_ratings
)

# -----------------------------
# 6. Define ALS Update Functions with Numba
# -----------------------------
@njit(parallel=True)
def update_user_biases(user_rated_items, user_rated_ratings, global_mean, item_bias, user_bias, lambda_reg, num_users, max_items):
    """Recompute every user's bias in place and return the same array.

    For each user the bias becomes the sum of residuals
    ``rating - global_mean - item_bias[item]`` over the items they rated,
    divided by ``(number of rated items + lambda_reg)``. Rows of
    ``user_rated_items`` are scanned until the ``-1`` padding marker or
    ``max_items`` columns, whichever comes first. Users without any rating
    keep their current bias.
    """
    for u in prange(num_users):
        residual_total = 0.0
        slot = 0
        # Walk the row until the padding sentinel (or the row end) is reached.
        while slot < max_items and user_rated_items[u, slot] != -1:
            item = user_rated_items[u, slot]
            residual = user_rated_ratings[u, slot] - global_mean - item_bias[item]
            residual_total += residual
            slot += 1
        # ``slot`` now equals the number of ratings consumed for this user.
        if slot > 0:
            user_bias[u] = residual_total / (slot + lambda_reg)
    return user_bias

@njit(parallel=True)
def update_item_biases(item_rated_by_users, item_rated_ratings, global_mean, user_bias, item_bias, lambda_reg, num_movies, max_users):
    """Recompute every item's bias in place and return the same array.

    For each item the bias becomes the sum of residuals
    ``rating - global_mean - user_bias[user]`` over the users who rated it,
    divided by ``(number of raters + lambda_reg)``. Rows of
    ``item_rated_by_users`` are scanned until the ``-1`` padding marker or
    ``max_users`` columns, whichever comes first. Items without any rating
    keep their current bias.
    """
    for i in prange(num_movies):
        residual_total = 0.0
        slot = 0
        # Walk the row until the padding sentinel (or the row end) is reached.
        while slot < max_users and item_rated_by_users[i, slot] != -1:
            rater = item_rated_by_users[i, slot]
            residual = item_rated_ratings[i, slot] - global_mean - user_bias[rater]
            residual_total += residual
            slot += 1
        # ``slot`` now equals the number of ratings consumed for this item.
        if slot > 0:
            item_bias[i] = residual_total / (slot + lambda_reg)
    return item_bias

@njit
def compute_loss(train_users, train_items, train_ratings, global_mean, user_bias, item_bias):
    """Return the sum of squared errors of the bias model over the given ratings.

    The prediction for a (user, item) pair is
    ``global_mean + user_bias[user] + item_bias[item]``. No regularisation
    term is added to the returned value.
    """
    squared_error_total = 0.0
    n_ratings = train_ratings.shape[0]
    for k in range(n_ratings):
        predicted = global_mean + user_bias[train_users[k]] + item_bias[train_items[k]]
        residual = train_ratings[k] - predicted
        squared_error_total += residual * residual
    return squared_error_total

@njit
def compute_rmse(train_users, train_items, train_ratings, global_mean, user_bias, item_bias):
    """Return the root-mean-square error of the bias model over the given ratings.

    Uses the same prediction as the loss,
    ``global_mean + user_bias[user] + item_bias[item]``, and averages the
    squared residuals over ``len(train_ratings)`` before taking the root.
    """
    squared_error_total = 0.0
    n_ratings = len(train_ratings)
    for k in range(n_ratings):
        predicted = global_mean + user_bias[train_users[k]] + item_bias[train_items[k]]
        residual = train_ratings[k] - predicted
        squared_error_total += residual * residual
    return np.sqrt(squared_error_total / len(train_ratings))

# -----------------------------
# 7. Train ALS Model
# -----------------------------
print("Starting ALS training...")

loss_history = []
train_rmse_history = []

# Arguments that stay the same for every sweep, grouped per side.
_user_side_data = (user_rated_items_np, user_rated_ratings_np)
_item_side_data = (item_rated_by_users_np, item_rated_ratings_np)

for it in range(num_iters):
    # Alternate: user biases first (item biases held fixed), then the reverse.
    user_bias = update_user_biases(
        *_user_side_data, global_mean, item_bias, user_bias,
        lambda_reg, num_users, max_user_rated
    )
    item_bias = update_item_biases(
        *_item_side_data, global_mean, user_bias, item_bias,
        lambda_reg, num_movies, max_item_rated
    )

    # Track fit quality on the training data after each full sweep.
    _fit_args = (train_users, train_items, train_ratings, global_mean, user_bias, item_bias)
    loss = compute_loss(*_fit_args)
    rmse = compute_rmse(*_fit_args)

    loss_history.append(loss)
    train_rmse_history.append(rmse)

    print(f"Iteration {it+1}/{num_iters} - Loss: {loss:.4f}, Train RMSE: {rmse:.4f}")

print("ALS training completed.")

# -----------------------------
# 8. Evaluate on Test Set
# -----------------------------
@njit
def compute_rmse_test(test_users, test_items, test_ratings, global_mean, user_bias, item_bias):
    """Return the root-mean-square error of the bias model on held-out ratings.

    Each prediction is ``global_mean + user_bias[user] + item_bias[item]``;
    squared residuals are averaged over ``len(test_ratings)``.
    """
    squared_error_total = 0.0
    n_ratings = len(test_ratings)
    for k in range(n_ratings):
        predicted = global_mean + user_bias[test_users[k]] + item_bias[test_items[k]]
        residual = test_ratings[k] - predicted
        squared_error_total += residual * residual
    return np.sqrt(squared_error_total / len(test_ratings))


# Score the trained biases on the 20% of ratings held out from training.
_test_args = (test_users, test_items, test_ratings, global_mean, user_bias, item_bias)
test_rmse = compute_rmse_test(*_test_args)
print(f"Test RMSE: {test_rmse:.4f}")

# -----------------------------
# 9. Plot Loss and RMSE
# -----------------------------
# Two side-by-side panels: training loss (left) and RMSE (right).
plt.figure(figsize=(12, 5))

# Derive the x-axis from what was actually recorded so the plot still works
# if the history is shorter than num_iters.
iterations = range(1, len(loss_history) + 1)

plt.subplot(1, 2, 1)
plt.plot(iterations, loss_history, marker='o')
plt.title('Loss over Iterations')
plt.xlabel('Iteration')
# compute_loss returns the *sum* of squared errors, not their mean.
plt.ylabel('Loss (SSE)')
plt.grid(True)

plt.subplot(1, 2, 2)
plt.plot(iterations, train_rmse_history, marker='o', label='Train RMSE')
# The test RMSE is a single post-training number, drawn as a reference line.
plt.axhline(y=test_rmse, color='r', linestyle='--', label='Test RMSE')
plt.title('RMSE over Iterations')
plt.xlabel('Iteration')
plt.ylabel('RMSE')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

# -----------------------------
# 10. Make Recommendations for a Dummy User
# -----------------------------
print("Making recommendations for a dummy user who likes 'The Lord of the Rings: The Fellowship of the Ring'...")

# Identify the movie index for "The Lord of the Rings: The Fellowship of the Ring"
lotr_title = "The Lord of the Rings: The Fellowship of the Ring"
# NOTE(review): MovieLens titles are expected in the form
# "Lord of the Rings: The Fellowship of the Ring, The (2001)" (article moved
# to the end, year appended), so an exact match on lotr_title never hits.
# A literal substring match without the leading article covers both spellings.
lotr_query = "Lord of the Rings: The Fellowship of the Ring"
lotr_movie = movies_df[movies_df['title'].str.contains(lotr_query, regex=False, na=False)]
if lotr_movie.empty:
    print(f"Movie '{lotr_title}' not found in the dataset.")
else:
    lotr_movie_id = lotr_movie['movieId'].iloc[0]
    # A movie listed in movies.csv may have no ratings and therefore no index.
    lotr_movie_idx = movie_id_to_index.get(lotr_movie_id, -1)

    if lotr_movie_idx == -1:
        print(f"Movie ID for '{lotr_title}' not found.")
    else:
        # Create a dummy user. With a bias-only model a single liked movie
        # cannot change the ranking; the vector is kept for reference only.
        dummy_user_bias = 0.0
        dummy_user_ratings = np.zeros(num_movies, dtype=np.float32)
        dummy_user_ratings[lotr_movie_idx] = 5.0  # Liked "The Lord of the Rings: The Fellowship of the Ring"

        # Predict ratings for all movies for the dummy user
        dummy_user_pred = global_mean + dummy_user_bias + item_bias + 0.0  # No user bias since it's a dummy user
        dummy_user_pred[lotr_movie_idx] = -np.inf  # Exclude the liked movie from recommendations

        # Get top 10 recommendations (fewer if the catalogue is smaller).
        top_n = 10
        n_picks = min(top_n, dummy_user_pred.size)
        recommended_indices = np.argpartition(dummy_user_pred, -n_picks)[-n_picks:]
        recommended_indices = recommended_indices[np.argsort(-dummy_user_pred[recommended_indices])]
        recommended_movie_ids = [unique_movie_ids[i] for i in recommended_indices]

        # A bare .isin() filter returns rows in movies_df order; re-sort them
        # so the printed list follows the predicted-rating ranking.
        rank_of_movie = {movie_id: rank for rank, movie_id in enumerate(recommended_movie_ids)}
        recommended_movies = movies_df[movies_df['movieId'].isin(recommended_movie_ids)]
        recommended_movies = recommended_movies.iloc[
            np.argsort(recommended_movies['movieId'].map(rank_of_movie).to_numpy())
        ]

        print("Top 10 Recommendations:")
        print(recommended_movies[['movieId', 'title']].to_string(index=False))

# -----------------------------
# 11. Identify Polarizing Movies
# -----------------------------
print("Identifying polarizing movies based on rating variance...")


def compute_rating_variance(ratings, num_movies, items=None):
    """Return the per-movie sample variance of ``ratings``.

    Args:
        ratings: 1-D array of rating values.
        num_movies: Number of movies; the result has one entry per movie index.
        items: 1-D array of movie indices aligned with ``ratings``. When
            omitted, the module-level ``test_items`` array is used, matching
            the way this function has always been called.

    Returns:
        ``float32`` array of length ``num_movies`` holding the unbiased
        (n - 1) variance of each movie's ratings; movies with fewer than two
        ratings get ``0.0``.

    Runs in a single pass over the ratings via ``np.bincount`` instead of
    rescanning every rating once per movie.
    """
    if items is None:
        items = test_items  # Using test set for demonstration
    movie_idx = np.asarray(items, dtype=np.intp)
    values = np.asarray(ratings, dtype=np.float64)

    counts = np.bincount(movie_idx, minlength=num_movies)
    sums = np.bincount(movie_idx, weights=values, minlength=num_movies)
    means = np.divide(sums, counts, out=np.zeros(len(counts)), where=counts > 0)

    # Sum of squared deviations from each rating's own movie mean.
    squared_dev = np.bincount(
        movie_idx, weights=(values - means[movie_idx]) ** 2, minlength=num_movies
    )

    variances = np.zeros(num_movies, dtype=np.float32)
    has_spread = counts[:num_movies] > 1
    variances[has_spread] = (
        squared_dev[:num_movies][has_spread] / (counts[:num_movies][has_spread] - 1)
    )
    return variances

rating_variances = compute_rating_variance(test_ratings, num_movies)

# Get top 10 polarizing movies (largest rating variance first).
top_polarizing_indices = np.argsort(-rating_variances)[:10]
top_polarizing_movie_ids = [unique_movie_ids[i] for i in top_polarizing_indices]

# A bare .isin() filter returns rows in movies_df order; re-sort them so the
# printed list follows the variance ranking.
_polarizing_rank = {movie_id: rank for rank, movie_id in enumerate(top_polarizing_movie_ids)}
top_polarizing_movies = movies_df[movies_df['movieId'].isin(top_polarizing_movie_ids)]
top_polarizing_movies = top_polarizing_movies.iloc[
    np.argsort(top_polarizing_movies['movieId'].map(_polarizing_rank).to_numpy())
]

print("Top 10 Polarizing Movies:")
print(top_polarizing_movies[['movieId', 'title']].to_string(index=False))

# -----------------------------
# 12. Addressing the Cold Start Problem with Features (Optional)
# -----------------------------
# For simplicity, this step is not implemented in this script. Incorporating features would require additional data preprocessing
# and modifications to the ALS algorithm to include feature-based regularization or hybrid models.
# However, this script provides a foundation upon which such enhancements can be built.


# In [None]:
import os
import zipfile
import urllib.request
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from numba import njit, prange
from sklearn.preprocessing import MultiLabelBinarizer
import seaborn as sns

# -----------------------------
# 1. Download and Extract Dataset
# -----------------------------
def download_and_extract_movielens(url, extract_path):
    """Download the MovieLens archive (if needed) and unpack it (if needed).

    Args:
        url: Location of the zip archive to download.
        extract_path: Directory that should exist once the archive has been
            unpacked. The archive is extracted into the *parent* of this path,
            so the archive's top-level folder is assumed to have the same name
            as ``extract_path``'s last component (true for ``ml-25m``).

    The archive itself is cached as ``ml-25m.zip`` in the current working
    directory and is not downloaded again when that file already exists.
    """
    zip_path = 'ml-25m.zip'
    if not os.path.exists(zip_path):
        print("Downloading MovieLens 25M dataset...")
        # Download under a temporary name so an interrupted transfer never
        # leaves a truncated file that later runs would mistake for a
        # complete archive.
        partial_path = zip_path + '.part'
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, zip_path)
        print("Download completed.")
    else:
        print("Zip file already exists.")

    if not os.path.exists(extract_path):
        print("Extracting dataset...")
        # Unpack next to extract_path rather than unconditionally into the
        # CWD; for a bare relative name such as "ml-25m" this is still the CWD.
        target_dir = os.path.dirname(os.path.abspath(extract_path))
        os.makedirs(target_dir, exist_ok=True)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(target_dir)
        print("Extraction completed.")
    else:
        print("Dataset already extracted.")

# Where the archive lives and the folder it unpacks to (relative to the CWD).
dataset_url = "https://files.grouplens.org/datasets/movielens/ml-25m.zip"
data_dir = "ml-25m"
download_and_extract_movielens(url=dataset_url, extract_path=data_dir)

# -----------------------------
# 2. Load and Index Data
# -----------------------------
print("Loading ratings data...")
ratings_path = os.path.join(data_dir, "ratings.csv")
ratings_df = pd.read_csv(ratings_path)

print("Loading movies data...")
movies_path = os.path.join(data_dir, "movies.csv")
movies_df = pd.read_csv(movies_path)

# Genres arrive as one pipe-separated string per movie; turn each into a list.
movies_df['genres'] = movies_df['genres'].map(lambda genre_string: genre_string.split('|'))

# One 0/1 column per distinct genre, one row per movies_df row.
mlb = MultiLabelBinarizer()
genre_features = mlb.fit_transform(movies_df['genres'])
genre_feature_names = mlb.classes_
num_genres = genre_features.shape[1]
print(f"Number of genres: {num_genres}")

# Raw IDs are remapped to dense 0-based indices, numbered in order of first
# appearance in ratings.csv, so they can be used directly as array offsets.
print("Mapping user IDs and movie IDs to indices...")
unique_user_ids = ratings_df['userId'].unique()
unique_movie_ids = ratings_df['movieId'].unique()

num_users = len(unique_user_ids)
num_movies = len(unique_movie_ids)

user_id_to_index = dict(zip(unique_user_ids, range(num_users)))
movie_id_to_index = dict(zip(unique_movie_ids, range(num_movies)))

print(f"Number of users: {num_users}")
print(f"Number of movies: {num_movies}")

# Attach the dense indices to every rating row.
ratings_df['user_idx'] = ratings_df['userId'].map(user_id_to_index)
ratings_df['movie_idx'] = ratings_df['movieId'].map(movie_id_to_index)

# -----------------------------
# 3. Split Data into Training and Test Sets
# -----------------------------
print("Splitting data into training and test sets...")
# Fixed seed keeps the 80/20 split reproducible between runs.
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=42)

# Narrow the dtypes up front: 32-bit indices and ratings are what the padded
# interaction arrays and the model parameters use.
_split_columns = (('user_idx', np.int32), ('movie_idx', np.int32), ('rating', np.float32))
train_users, train_items, train_ratings = (
    train_df[name].values.astype(dtype) for name, dtype in _split_columns
)
test_users, test_items, test_ratings = (
    test_df[name].values.astype(dtype) for name, dtype in _split_columns
)

# -----------------------------
# 4. Precompute Interactions
# -----------------------------
def _fill_rows(padded, rows):
    """Copy each ragged list in ``rows`` into the start of the matching row of ``padded``."""
    for row_idx, row in enumerate(rows):
        padded[row_idx, :len(row)] = row
    return padded


print("Precomputing user-item and item-user interactions...")

# For ALS with biases and latent factors: group the training ratings once by
# user and once by item. A single sweep fills both groupings.
user_rated_items = [[] for _ in range(num_users)]
user_rated_ratings = [[] for _ in range(num_users)]
item_rated_by_users = [[] for _ in range(num_movies)]
item_rated_ratings = [[] for _ in range(num_movies)]
for u, i, r in zip(train_users, train_items, train_ratings):
    user_rated_items[u].append(i)
    user_rated_ratings[u].append(r)
    item_rated_by_users[i].append(u)
    item_rated_ratings[i].append(r)

# Numba cannot consume lists of lists, so pack them into rectangular arrays.
# Unused index slots hold -1, which the update kernels treat as end-of-row.
# NOTE(review): each array is (rows x longest row), so memory is dictated by
# the single most active user / most rated item, not by the rating count.
print("Converting lists to numpy arrays...")
max_user_rated = max(map(len, user_rated_items))
max_item_rated = max(map(len, item_rated_by_users))

user_rated_items_np = _fill_rows(
    np.full((num_users, max_user_rated), -1, dtype=np.int32), user_rated_items
)
user_rated_ratings_np = _fill_rows(
    np.zeros((num_users, max_user_rated), dtype=np.float32), user_rated_ratings
)

item_rated_by_users_np = _fill_rows(
    np.full((num_movies, max_item_rated), -1, dtype=np.int32), item_rated_by_users
)
item_rated_ratings_np = _fill_rows(
    np.zeros((num_movies, max_item_rated), dtype=np.float32), item_rated_ratings
)

# -----------------------------
# 5. Initialize ALS Parameters
# -----------------------------
# Hyper-parameters of the factor model.
num_factors = 20  # Number of latent factors
lambda_reg = 0.1
num_iters = 10

# Baseline prediction: the average training rating.
global_mean = train_ratings.mean()

# Biases start at zero.
user_bias = np.zeros(num_users, dtype=np.float32)
item_bias = np.zeros(num_movies, dtype=np.float32)

# Factors start as small Gaussian noise; U is drawn before V.
_user_factor_shape = (num_users, num_factors)
_item_factor_shape = (num_movies, num_factors)
U = np.random.normal(scale=0.1, size=_user_factor_shape).astype(np.float32)
V = np.random.normal(scale=0.1, size=_item_factor_shape).astype(np.float32)

# -----------------------------
# 6. Define ALS Update Functions with Numba
# -----------------------------
@njit(parallel=True)
def update_user_biases(num_users, max_items, user_rated_items, user_rated_ratings, global_mean, item_bias, U, V, lambda_reg, num_factors):
    """Return freshly computed user biases for the factor model.

    For each user the bias is the sum of
    ``rating - global_mean - item_bias[i] - U[u]·V[i]`` over the items they
    rated, divided by ``(number of rated items + lambda_reg)``. Rows of
    ``user_rated_items`` are scanned until the ``-1`` padding marker or
    ``max_items`` columns.

    Returns:
        A new ``float32`` array of length ``num_users``. Users with no
        ratings get ``0.0``.
    """
    # The output is allocated here rather than written through a module-level
    # ``user_bias``: Numba freezes globals when it compiles a function, so the
    # kernel must not depend on one for its result.
    user_bias = np.zeros(num_users, dtype=np.float32)
    for u in prange(num_users):
        sum_bias = 0.0
        count = 0
        for j in range(max_items):
            i = user_rated_items[u, j]
            if i == -1:
                break
            sum_bias += user_rated_ratings[u, j] - global_mean - item_bias[i] - np.dot(U[u, :], V[i, :])
            count += 1
        if count > 0:
            user_bias[u] = sum_bias / (count + lambda_reg)
    return user_bias

@njit(parallel=True)
def update_item_biases(num_movies, max_users, item_rated_by_users, item_rated_ratings, global_mean, user_bias, U, V, lambda_reg, num_factors):
    """Return freshly computed item biases for the factor model.

    For each item the bias is the sum of
    ``rating - global_mean - user_bias[u] - U[u]·V[i]`` over the users who
    rated it, divided by ``(number of raters + lambda_reg)``. Rows of
    ``item_rated_by_users`` are scanned until the ``-1`` padding marker or
    ``max_users`` columns.

    Returns:
        A new ``float32`` array of length ``num_movies``. Items with no
        ratings get ``0.0``.
    """
    # The output is allocated here rather than written through a module-level
    # ``item_bias``: Numba freezes globals when it compiles a function, so the
    # kernel must not depend on one for its result.
    item_bias = np.zeros(num_movies, dtype=np.float32)
    for i in prange(num_movies):
        sum_bias = 0.0
        count = 0
        for j in range(max_users):
            u = item_rated_by_users[i, j]
            if u == -1:
                break
            sum_bias += item_rated_ratings[i, j] - global_mean - user_bias[u] - np.dot(U[u, :], V[i, :])
            count += 1
        if count > 0:
            item_bias[i] = sum_bias / (count + lambda_reg)
    return item_bias

@njit(parallel=True)
def _update_U_kernel(num_users, max_items, user_rated_items, user_rated_ratings, global_mean, user_bias, item_bias, V, U, lambda_reg, num_factors):
    """Solve the ridge-regression normal equations for every user's factor row."""
    for u in prange(num_users):
        A = np.zeros((num_factors, num_factors), dtype=np.float32)
        b = np.zeros(num_factors, dtype=np.float32)
        count = 0
        for j in range(max_items):
            i = user_rated_items[u, j]
            if i == -1:
                break
            # Target for the factor term: the rating with every *non-factor*
            # part of the prediction removed. The current U[u]·V[i] must not
            # be subtracted, otherwise the solve yields a correction step
            # rather than the new factor vector.
            target = user_rated_ratings[u, j] - global_mean - user_bias[u] - item_bias[i]
            for p in range(num_factors):
                b[p] += V[i, p] * target
                for q in range(num_factors):
                    A[p, q] += V[i, p] * V[i, q]
            count += 1
        if count > 0:
            # Ridge term on the diagonal keeps A invertible.
            for p in range(num_factors):
                A[p, p] += lambda_reg
            U[u, :] = np.linalg.solve(A, b)
    return U


def update_U(num_users, max_items, user_rated_items, user_rated_ratings, global_mean, user_bias, V, U, lambda_reg, num_factors, item_bias=None):
    """Update the user factor matrix ``U`` in place with V and the biases held fixed.

    For every user with at least one rating this solves
    ``(sum_i V[i] V[i]^T + lambda_reg * I) x = sum_i V[i] * (r_ui - global_mean - user_bias[u] - item_bias[i])``
    and stores ``x`` in ``U[u]``. Users without ratings are left unchanged.

    Args:
        item_bias: Item bias vector. When omitted, the module-level
            ``item_bias`` is looked up at call time, so existing call sites
            keep working while always seeing the current values.

    Returns:
        ``U`` (the same array that was passed in).

    Raises:
        KeyError: if ``item_bias`` is omitted and no module-level
            ``item_bias`` exists.
    """
    if item_bias is None:
        # Resolved here in plain Python: inside a jitted function a global
        # array is captured once at compile time and would go stale.
        item_bias = globals()["item_bias"]
    return _update_U_kernel(
        num_users, max_items, user_rated_items, user_rated_ratings,
        global_mean, user_bias, item_bias, V, U, lambda_reg, num_factors
    )

@njit(parallel=True)
def _update_V_kernel(num_movies, max_users, item_rated_by_users, item_rated_ratings, global_mean, user_bias, item_bias, U, V, lambda_reg, num_factors):
    """Solve the ridge-regression normal equations for every item's factor row."""
    for i in prange(num_movies):
        A = np.zeros((num_factors, num_factors), dtype=np.float32)
        b = np.zeros(num_factors, dtype=np.float32)
        count = 0
        for j in range(max_users):
            u = item_rated_by_users[i, j]
            if u == -1:
                break
            # Target for the factor term: the rating with every *non-factor*
            # part of the prediction removed. The current U[u]·V[i] must not
            # be subtracted, otherwise the solve yields a correction step
            # rather than the new factor vector.
            target = item_rated_ratings[i, j] - global_mean - user_bias[u] - item_bias[i]
            for p in range(num_factors):
                b[p] += U[u, p] * target
                for q in range(num_factors):
                    A[p, q] += U[u, p] * U[u, q]
            count += 1
        if count > 0:
            # Ridge term on the diagonal keeps A invertible.
            for p in range(num_factors):
                A[p, p] += lambda_reg
            V[i, :] = np.linalg.solve(A, b)
    return V


def update_V(num_movies, max_users, item_rated_by_users, item_rated_ratings, global_mean, item_bias, U, V, lambda_reg, num_factors, user_bias=None):
    """Update the item factor matrix ``V`` in place with U and the biases held fixed.

    For every item with at least one rating this solves
    ``(sum_u U[u] U[u]^T + lambda_reg * I) x = sum_u U[u] * (r_ui - global_mean - user_bias[u] - item_bias[i])``
    and stores ``x`` in ``V[i]``. Items without ratings are left unchanged.

    Args:
        user_bias: User bias vector. When omitted, the module-level
            ``user_bias`` is looked up at call time, so existing call sites
            keep working while always seeing the current values.

    Returns:
        ``V`` (the same array that was passed in).

    Raises:
        KeyError: if ``user_bias`` is omitted and no module-level
            ``user_bias`` exists.
    """
    if user_bias is None:
        # Resolved here in plain Python: inside a jitted function a global
        # array is captured once at compile time and would go stale.
        user_bias = globals()["user_bias"]
    return _update_V_kernel(
        num_movies, max_users, item_rated_by_users, item_rated_ratings,
        global_mean, user_bias, item_bias, U, V, lambda_reg, num_factors
    )

@njit
def compute_loss(train_users, train_items, train_ratings, global_mean, user_bias, item_bias, U, V):
    """Return the sum of squared errors of the factor model over the given ratings.

    The prediction for a (user, item) pair is
    ``global_mean + user_bias[u] + item_bias[i] + U[u]·V[i]``. No
    regularisation term is added to the returned value.
    """
    squared_error_total = 0.0
    n_ratings = train_ratings.shape[0]
    for k in range(n_ratings):
        user = train_users[k]
        item = train_items[k]
        predicted = global_mean + user_bias[user] + item_bias[item] + np.dot(U[user, :], V[item, :])
        residual = train_ratings[k] - predicted
        squared_error_total += residual * residual
    return squared_error_total

@njit
def compute_rmse(train_users, train_items, train_ratings, global_mean, user_bias, item_bias, U, V):
    """Return the root-mean-square error of the factor model over the given ratings.

    Uses the prediction
    ``global_mean + user_bias[u] + item_bias[i] + U[u]·V[i]`` and averages
    the squared residuals over ``len(train_ratings)`` before taking the root.
    """
    squared_error_total = 0.0
    n_ratings = len(train_ratings)
    for k in range(n_ratings):
        user = train_users[k]
        item = train_items[k]
        predicted = global_mean + user_bias[user] + item_bias[item] + np.dot(U[user, :], V[item, :])
        residual = train_ratings[k] - predicted
        squared_error_total += residual * residual
    return np.sqrt(squared_error_total / len(train_ratings))

# -----------------------------
# 7. Train ALS Model with Biases and Latent Factors
# -----------------------------
print("Starting ALS training with biases and latent factors...")

loss_history = []
train_rmse_history = []

# Leading arguments shared by the two user-side and the two item-side updates.
_user_side = (num_users, max_user_rated, user_rated_items_np, user_rated_ratings_np)
_item_side = (num_movies, max_item_rated, item_rated_by_users_np, item_rated_ratings_np)

for it in range(num_iters):
    # One sweep: user biases, item biases, user factors, item factors.
    # Each step sees the results of the steps before it.
    user_bias = update_user_biases(*_user_side, global_mean, item_bias, U, V, lambda_reg, num_factors)
    item_bias = update_item_biases(*_item_side, global_mean, user_bias, U, V, lambda_reg, num_factors)
    U = update_U(*_user_side, global_mean, user_bias, V, U, lambda_reg, num_factors)
    V = update_V(*_item_side, global_mean, item_bias, U, V, lambda_reg, num_factors)

    # Track fit quality on the training data after each full sweep.
    _fit_args = (train_users, train_items, train_ratings, global_mean, user_bias, item_bias, U, V)
    loss = compute_loss(*_fit_args)
    rmse = compute_rmse(*_fit_args)

    loss_history.append(loss)
    train_rmse_history.append(rmse)

    print(f"Iteration {it+1}/{num_iters} - Loss: {loss:.4f}, Train RMSE: {rmse:.4f}")

print("ALS training with biases and latent factors completed.")

# -----------------------------
# 8. Evaluate on Test Set
# -----------------------------
@njit
def compute_rmse_test(test_users, test_items, test_ratings, global_mean, user_bias, item_bias, U, V):
    """Return the root-mean-square error of the factor model on held-out ratings.

    Each prediction is
    ``global_mean + user_bias[u] + item_bias[i] + U[u]·V[i]``; squared
    residuals are averaged over ``len(test_ratings)``.
    """
    squared_error_total = 0.0
    n_ratings = len(test_ratings)
    for k in range(n_ratings):
        user = test_users[k]
        item = test_items[k]
        predicted = global_mean + user_bias[user] + item_bias[item] + np.dot(U[user, :], V[item, :])
        residual = test_ratings[k] - predicted
        squared_error_total += residual * residual
    return np.sqrt(squared_error_total / len(test_ratings))


# Score the trained model on the 20% of ratings held out from training.
_test_args = (test_users, test_items, test_ratings, global_mean, user_bias, item_bias, U, V)
test_rmse = compute_rmse_test(*_test_args)
print(f"Test RMSE after training: {test_rmse:.4f}")

# -----------------------------
# 9. Plot Loss and RMSE
# -----------------------------
# Two side-by-side panels: training loss (left) and RMSE (right).
plt.figure(figsize=(14, 6))

# Derive the x-axis from what was actually recorded so the plot still works
# if the history is shorter than num_iters.
iterations = range(1, len(loss_history) + 1)

plt.subplot(1, 2, 1)
plt.plot(iterations, loss_history, marker='o', color='blue')
plt.title('Loss over Iterations')
plt.xlabel('Iteration')
# compute_loss returns the *sum* of squared errors, not their mean.
plt.ylabel('Loss (SSE)')
plt.grid(True)

plt.subplot(1, 2, 2)
plt.plot(iterations, train_rmse_history, marker='o', label='Train RMSE', color='green')
# The test RMSE is a single post-training number, drawn as a reference line.
plt.axhline(y=test_rmse, color='red', linestyle='--', label='Test RMSE')
plt.title('RMSE over Iterations')
plt.xlabel('Iteration')
plt.ylabel('RMSE')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

# -----------------------------
# 10. Make Recommendations for a Dummy User
# -----------------------------
print("Making recommendations for a dummy user who likes 'The Lord of the Rings: The Fellowship of the Ring'...")

# Identify the movie index for "The Lord of the Rings: The Fellowship of the Ring"
lotr_title = "The Lord of the Rings: The Fellowship of the Ring"
# NOTE(review): MovieLens titles are expected in the form
# "Lord of the Rings: The Fellowship of the Ring, The (2001)" (article moved
# to the end, year appended), so an exact match on lotr_title never hits.
# A literal substring match without the leading article covers both spellings.
lotr_query = "Lord of the Rings: The Fellowship of the Ring"
lotr_movie = movies_df[movies_df['title'].str.contains(lotr_query, regex=False, na=False)]
if lotr_movie.empty:
    print(f"Movie '{lotr_title}' not found in the dataset.")
else:
    lotr_movie_id = lotr_movie['movieId'].iloc[0]
    lotr_movie_idx = movie_id_to_index.get(lotr_movie_id, -1)

    if lotr_movie_idx == -1:
        print(f"Movie ID for '{lotr_title}' not found.")
    else:
        # Fit the dummy user's latent vector from their single 5-star rating
        # with the same ridge solve the user-factor update uses, keeping the
        # user bias at zero. (Multiplying V by an all-zero vector, as before,
        # made the liked movie irrelevant to the ranking.)
        liked_factors = V[lotr_movie_idx].astype(np.float64)
        liked_residual = 5.0 - global_mean - item_bias[lotr_movie_idx]
        gram = np.outer(liked_factors, liked_factors) + lambda_reg * np.eye(num_factors)
        dummy_user_factors = np.linalg.solve(gram, liked_factors * liked_residual)

        dummy_user_pred = global_mean + item_bias + np.dot(V, dummy_user_factors)

        # Exclude the liked movie from recommendations
        dummy_user_pred[lotr_movie_idx] = -np.inf

        # Get top 10 recommendations (fewer if the catalogue is smaller).
        top_n = 10
        n_picks = min(top_n, dummy_user_pred.size)
        recommended_indices = np.argpartition(dummy_user_pred, -n_picks)[-n_picks:]
        recommended_indices = recommended_indices[np.argsort(-dummy_user_pred[recommended_indices])]
        recommended_movie_ids = [unique_movie_ids[i] for i in recommended_indices]

        # A bare .isin() filter returns rows in movies_df order; re-sort them
        # so the printed list follows the predicted-rating ranking.
        rank_of_movie = {movie_id: rank for rank, movie_id in enumerate(recommended_movie_ids)}
        recommended_movies = movies_df[movies_df['movieId'].isin(recommended_movie_ids)]
        recommended_movies = recommended_movies.iloc[
            np.argsort(recommended_movies['movieId'].map(rank_of_movie).to_numpy())
        ]

        print("Top 10 Recommendations:")
        print(recommended_movies[['movieId', 'title']].to_string(index=False))

# -----------------------------
# 11. Identify Polarizing Movies
# -----------------------------
print("Identifying polarizing movies based on rating variance...")


def compute_rating_variance(train_users, train_items, train_ratings, num_movies):
    """Return the per-movie sample variance of the given ratings.

    Args:
        train_users: Unused; kept so existing call sites keep working.
        train_items: 1-D array of movie indices, aligned with ``train_ratings``.
        train_ratings: 1-D array of rating values.
        num_movies: Number of movies; the result has one entry per movie index.

    Returns:
        ``float32`` array of length ``num_movies`` holding the unbiased
        (n - 1) variance of each movie's ratings; movies with fewer than two
        ratings get ``0.0``.

    The per-movie sums are built with ``np.bincount``. Accumulating them with
    ``means[i] += r`` inside a parallel ``prange`` loop lets several threads
    update the same element at once, which silently loses contributions.
    """
    movie_idx = np.asarray(train_items, dtype=np.intp)
    values = np.asarray(train_ratings, dtype=np.float64)

    counts = np.bincount(movie_idx, minlength=num_movies)
    sums = np.bincount(movie_idx, weights=values, minlength=num_movies)
    means = np.divide(sums, counts, out=np.zeros(len(counts)), where=counts > 0)

    # Sum of squared deviations from each rating's own movie mean.
    squared_dev = np.bincount(
        movie_idx, weights=(values - means[movie_idx]) ** 2, minlength=num_movies
    )

    variances = np.zeros(num_movies, dtype=np.float32)
    has_spread = counts[:num_movies] > 1
    variances[has_spread] = (
        squared_dev[:num_movies][has_spread] / (counts[:num_movies][has_spread] - 1)
    )
    return variances

rating_variances = compute_rating_variance(train_users, train_items, train_ratings, num_movies)

# Get top 10 polarizing movies (largest rating variance first).
top_polarizing_indices = np.argsort(-rating_variances)[:10]
top_polarizing_movie_ids = [unique_movie_ids[i] for i in top_polarizing_indices]

# A bare .isin() filter returns rows in movies_df order; re-sort them so the
# printed list follows the variance ranking.
_polarizing_rank = {movie_id: rank for rank, movie_id in enumerate(top_polarizing_movie_ids)}
top_polarizing_movies = movies_df[movies_df['movieId'].isin(top_polarizing_movie_ids)]
top_polarizing_movies = top_polarizing_movies.iloc[
    np.argsort(top_polarizing_movies['movieId'].map(_polarizing_rank).to_numpy())
]

print("Top 10 Polarizing Movies:")
print(top_polarizing_movies[['movieId', 'title']].to_string(index=False))

# -----------------------------
# 12. ALS with Features Added (Handling Cold Start)
# -----------------------------
print("Starting ALS training with features to handle cold start...")

# Item vectors are widened to hold the genre indicators next to the learned
# latent factors: columns [0, num_genres) carry the (scaled) genre flags and
# columns [num_genres, total_factors) carry the latent factors.
# This is a simplistic approach; more sophisticated methods can be employed.
total_factors = num_factors + num_genres

# Initialize new V
V_feat = np.random.normal(scale=0.1, size=(num_movies, total_factors)).astype(np.float32)

# genre_features has one row per movies_df row, in movies_df order, whereas
# V_feat rows follow the dense movie index built from ratings.csv. Look up
# the movies_df row of every indexed movie so each genre vector lands on the
# right item (and so differing row counts do not break the assignment).
genre_row_of_movie = pd.Index(movies_df['movieId']).get_indexer(unique_movie_ids)
has_genre_row = genre_row_of_movie >= 0

# Incorporate genre features; movies absent from movies_df get all-zero flags.
V_feat[:, :num_genres] = 0.0
V_feat[has_genre_row, :num_genres] = (
    genre_features[genre_row_of_movie[has_genre_row]].astype(np.float32) * 0.1  # Scale appropriately
)

# Update functions to handle augmented V
@njit(parallel=True)
def update_user_biases_feat(num_users, max_items, user_rated_items, user_rated_ratings, global_mean, item_bias, U, V_feat, lambda_reg, num_factors, num_genres):
    """Return freshly computed user biases for the genre-augmented model.

    Only the latent part of each item vector, ``V_feat[i, num_genres:]``,
    enters the prediction. For each user the bias is the sum of
    ``rating - global_mean - item_bias[i] - U[u]·V_feat[i, num_genres:]``
    over the items they rated, divided by
    ``(number of rated items + lambda_reg)``.

    Returns:
        A new ``float32`` array of length ``num_users``. Users with no
        ratings get ``0.0``.
    """
    # The output is allocated here rather than written through a module-level
    # ``user_bias``: Numba freezes globals when it compiles a function, so the
    # kernel must not depend on one for its result.
    user_bias = np.zeros(num_users, dtype=np.float32)
    for u in prange(num_users):
        sum_bias = 0.0
        count = 0
        for j in range(max_items):
            i = user_rated_items[u, j]
            if i == -1:
                break
            sum_bias += user_rated_ratings[u, j] - global_mean - item_bias[i] - np.dot(U[u, :], V_feat[i, num_genres:])
            count += 1
        if count > 0:
            user_bias[u] = sum_bias / (count + lambda_reg)
    return user_bias

@njit(parallel=True)
def update_item_biases_feat(num_movies, max_users, item_rated_by_users, item_rated_ratings, global_mean, user_bias, U, V_feat, lambda_reg, num_factors, num_genres):
    """Return freshly computed item biases for the genre-augmented model.

    Only the latent part of each item vector, ``V_feat[i, num_genres:]``,
    enters the prediction. For each item the bias is the sum of
    ``rating - global_mean - user_bias[u] - U[u]·V_feat[i, num_genres:]``
    over the users who rated it, divided by
    ``(number of raters + lambda_reg)``.

    Returns:
        A new ``float32`` array of length ``num_movies``. Items with no
        ratings get ``0.0``.
    """
    # The output is allocated here rather than written through a module-level
    # ``item_bias``: Numba freezes globals when it compiles a function, so the
    # kernel must not depend on one for its result.
    item_bias = np.zeros(num_movies, dtype=np.float32)
    for i in prange(num_movies):
        sum_bias = 0.0
        count = 0
        for j in range(max_users):
            u = item_rated_by_users[i, j]
            if u == -1:
                break
            sum_bias += item_rated_ratings[i, j] - global_mean - user_bias[u] - np.dot(U[u, :], V_feat[i, num_genres:])
            count += 1
        if count > 0:
            item_bias[i] = sum_bias / (count + lambda_reg)
    return item_bias

@njit(parallel=True)
def _update_U_feat_kernel(num_users, max_items, user_rated_items, user_rated_ratings, global_mean, user_bias, item_bias, V_feat, U, lambda_reg, num_factors, num_genres):
    """Solve the ridge-regression normal equations for every user's factor row."""
    for u in prange(num_users):
        A = np.zeros((num_factors, num_factors), dtype=np.float32)
        b = np.zeros(num_factors, dtype=np.float32)
        count = 0
        for j in range(max_items):
            i = user_rated_items[u, j]
            if i == -1:
                break
            # Target for the factor term: the rating with every *non-factor*
            # part of the prediction removed. The current factor dot product
            # must not be subtracted, otherwise the solve yields a correction
            # step rather than the new factor vector.
            target = user_rated_ratings[u, j] - global_mean - user_bias[u] - item_bias[i]
            for p in range(num_factors):
                # Latent factors live after the genre columns of V_feat.
                v_p = V_feat[i, num_genres + p]
                b[p] += v_p * target
                for q in range(num_factors):
                    A[p, q] += v_p * V_feat[i, num_genres + q]
            count += 1
        if count > 0:
            # Ridge term on the diagonal keeps A invertible.
            for p in range(num_factors):
                A[p, p] += lambda_reg
            U[u, :] = np.linalg.solve(A, b)
    return U


def update_U_feat(num_users, max_items, user_rated_items, user_rated_ratings, global_mean, user_bias, V_feat, U, lambda_reg, num_factors, num_genres, item_bias=None):
    """Update the user factor matrix ``U`` in place for the genre-augmented model.

    With ``v_i = V_feat[i, num_genres:]``, every user with at least one
    rating gets the solution of
    ``(sum_i v_i v_i^T + lambda_reg * I) x = sum_i v_i * (r_ui - global_mean - user_bias[u] - item_bias[i])``
    stored in ``U[u]``. Users without ratings are left unchanged.

    Args:
        item_bias: Item bias vector. When omitted, the module-level
            ``item_bias`` is looked up at call time, so existing call sites
            keep working while always seeing the current values.

    Returns:
        ``U`` (the same array that was passed in).

    Raises:
        KeyError: if ``item_bias`` is omitted and no module-level
            ``item_bias`` exists.
    """
    if item_bias is None:
        # Resolved here in plain Python: inside a jitted function a global
        # array is captured once at compile time and would go stale.
        item_bias = globals()["item_bias"]
    return _update_U_feat_kernel(
        num_users, max_items, user_rated_items, user_rated_ratings,
        global_mean, user_bias, item_bias, V_feat, U, lambda_reg, num_factors, num_genres
    )

@njit(parallel=True)
def _update_V_feat_kernel(num_movies, max_users, item_rated_by_users, item_rated_ratings, global_mean, user_bias, item_bias, U, V_feat, lambda_reg, num_factors, num_genres):
    """Solve the ridge-regression normal equations for every item's latent factors."""
    for i in prange(num_movies):
        A = np.zeros((num_factors, num_factors), dtype=np.float32)
        b = np.zeros(num_factors, dtype=np.float32)
        count = 0
        for j in range(max_users):
            u = item_rated_by_users[i, j]
            if u == -1:
                break
            # Target for the factor term: the rating with every *non-factor*
            # part of the prediction removed. The current factor dot product
            # must not be subtracted, otherwise the solve yields a correction
            # step rather than the new factor vector.
            target = item_rated_ratings[i, j] - global_mean - user_bias[u] - item_bias[i]
            for p in range(num_factors):
                b[p] += U[u, p] * target
                for q in range(num_factors):
                    A[p, q] += U[u, p] * U[u, q]
            count += 1
        if count > 0:
            # Ridge term on the diagonal keeps A invertible.
            for p in range(num_factors):
                A[p, p] += lambda_reg
            # Only the latent columns are learned; the genre columns stay as set.
            V_feat[i, num_genres:] = np.linalg.solve(A, b)
    return V_feat


def update_V_feat(num_movies, max_users, item_rated_by_users, item_rated_ratings, global_mean, item_bias, U, V_feat, lambda_reg, num_factors, num_genres, user_bias=None):
    """Update the latent columns of ``V_feat`` in place with U and the biases held fixed.

    Every item with at least one rating gets the solution of
    ``(sum_u U[u] U[u]^T + lambda_reg * I) x = sum_u U[u] * (r_ui - global_mean - user_bias[u] - item_bias[i])``
    stored in ``V_feat[i, num_genres:]``. The genre columns
    ``V_feat[i, :num_genres]`` and items without ratings are left unchanged.

    Args:
        user_bias: User bias vector. When omitted, the module-level
            ``user_bias`` is looked up at call time, so existing call sites
            keep working while always seeing the current values.

    Returns:
        ``V_feat`` (the same array that was passed in).

    Raises:
        KeyError: if ``user_bias`` is omitted and no module-level
            ``user_bias`` exists.
    """
    if user_bias is None:
        # Resolved here in plain Python: inside a jitted function a global
        # array is captured once at compile time and would go stale.
        user_bias = globals()["user_bias"]
    return _update_V_feat_kernel(
        num_movies, max_users, item_rated_by_users, item_rated_ratings,
        global_mean, user_bias, item_bias, U, V_feat, lambda_reg, num_factors, num_genres
    )

# -----------------------------
# 13. Train ALS Model with Features (Handling Cold Start)
# -----------------------------
# Alternating training loop for the feature-augmented model. Each pass runs
# four update steps in a fixed order (user biases, item biases, U, V_feat) and
# then records the training loss and RMSE for that pass.
print("Starting ALS training with features to handle cold start...")

# One entry is appended to each list per iteration.
loss_history_feat = []
train_rmse_history_feat = []

for it in range(num_iters):
    # Update user biases
    user_bias = update_user_biases_feat(
        num_users, max_user_rated, user_rated_items_np, user_rated_ratings_np,
        global_mean, item_bias, U, V_feat, lambda_reg, num_factors, num_genres
    )
    
    # Update item biases
    item_bias = update_item_biases_feat(
        num_movies, max_item_rated, item_rated_by_users_np, item_rated_ratings_np,
        global_mean, user_bias, U, V_feat, lambda_reg, num_factors, num_genres
    )
    
    # Update user latent factors U
    U = update_U_feat(
        num_users, max_user_rated, user_rated_items_np, user_rated_ratings_np,
        global_mean, user_bias, V_feat, U, lambda_reg, num_factors, num_genres
    )
    
    # Update item latent factors V_feat
    V_feat = update_V_feat(
        num_movies, max_item_rated, item_rated_by_users_np, item_rated_ratings_np,
        global_mean, item_bias, U, V_feat, lambda_reg, num_factors, num_genres
    )
    
    # Compute loss and RMSE
    # Only the latent columns of V_feat (from num_genres onward) are passed to
    # the metric helpers; the leading genre columns are sliced off.
    loss = compute_loss(train_users, train_items, train_ratings, global_mean, user_bias, item_bias, U, V_feat[:, num_genres:])
    rmse = compute_rmse(train_users, train_items, train_ratings, global_mean, user_bias, item_bias, U, V_feat[:, num_genres:])
    
    loss_history_feat.append(loss)
    train_rmse_history_feat.append(rmse)
    
    print(f"Feature Iteration {it+1}/{num_iters} - Loss: {loss:.4f}, Train RMSE: {rmse:.4f}")

print("ALS training with features completed.")

# -----------------------------
# 14. Evaluate on Test Set with Features
# -----------------------------
@njit
def compute_rmse_test_feat(test_users, test_items, test_ratings, global_mean, user_bias, item_bias, U, V_feat, num_genres):
    # Root-mean-squared error of the feature-augmented model over the given
    # (user, item, rating) triples. Only the latent columns of V_feat (those
    # from num_genres onward) take part in the prediction.
    n_obs = len(test_ratings)
    squared_error_total = 0.0
    for k in range(n_obs):
        user = test_users[k]
        item = test_items[k]
        predicted = global_mean + user_bias[user] + item_bias[item] + np.dot(U[user, :], V_feat[item, num_genres:])
        residual = test_ratings[k] - predicted
        squared_error_total += residual * residual
    return np.sqrt(squared_error_total / n_obs)

# Score the trained feature-augmented model once on the held-out test triples.
test_rmse_feat = compute_rmse_test_feat(test_users, test_items, test_ratings, global_mean, user_bias, item_bias, U, V_feat, num_genres)
print(f"Test RMSE after training with features: {test_rmse_feat:.4f}")

# -----------------------------
# 15. Plot Loss and RMSE for Feature-Enhanced ALS
# -----------------------------
# Two side-by-side panels: training loss per iteration (left) and training
# RMSE per iteration with the final test RMSE as a horizontal line (right).
plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
plt.plot(range(1, num_iters + 1), loss_history_feat, marker='o', color='purple')
plt.title('Loss over Iterations with Features')
plt.xlabel('Iteration')
plt.ylabel('Loss (MSE)')
plt.grid(True)

plt.subplot(1, 2, 2)
plt.plot(range(1, num_iters + 1), train_rmse_history_feat, marker='o', label='Train RMSE with Features', color='orange')
# The test RMSE is a single number (computed after training), hence a flat line.
plt.axhline(y=test_rmse_feat, color='red', linestyle='--', label='Test RMSE with Features')
plt.title('RMSE over Iterations with Features')
plt.xlabel('Iteration')
plt.ylabel('RMSE')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

# -----------------------------
# 16. Recommendations for a Dummy User with Features
# -----------------------------
print("Making recommendations for a dummy user who likes 'The Lord of the Rings: The Fellowship of the Ring' with features...")

# Identify the movie index for "The Lord of the Rings: The Fellowship of the Ring"
if lotr_movie.empty:
    print(f"Movie '{lotr_title}' not found in the dataset.")
else:
    lotr_movie_id = lotr_movie.iloc[0]['movieId']
    lotr_movie_idx = movie_id_to_index.get(lotr_movie_id, -1)
    
    if lotr_movie_idx == -1:
        print(f"Movie ID for '{lotr_title}' not found.")
    else:
        # Create a dummy user's predicted ratings using features
        # Assuming the dummy user has no biases (new user), but using item features to predict
        # Since we don't have user features, the dummy user is represented by zero vector in U
        # NOTE: with a zero user vector the latent term vanishes, so the score
        # of every movie is simply global_mean + item_bias.
        dummy_U = np.zeros(num_factors, dtype=np.float32)
        dummy_user_pred_feat = global_mean + item_bias + np.dot(V_feat[:, num_genres:], dummy_U)
        
        # Exclude the liked movie from recommendations
        dummy_user_pred_feat[lotr_movie_idx] = -np.inf
        
        # Get top 10 recommendations
        top_n = 10
        # argpartition finds the top_n candidates without a full sort; the
        # second line orders just those candidates by descending score.
        recommended_indices_feat = np.argpartition(dummy_user_pred_feat, -top_n)[-top_n:]
        recommended_indices_feat = recommended_indices_feat[np.argsort(-dummy_user_pred_feat[recommended_indices_feat])]
        recommended_movie_ids_feat = [unique_movie_ids[i] for i in recommended_indices_feat]
        # Filtering movies_df with isin() would return the rows in movies_df
        # order and lose the ranking. A left merge keeps the rows in the order
        # of the ranked ID list, so the table is printed best-first.
        recommended_movies_feat = pd.DataFrame({'movieId': recommended_movie_ids_feat}).merge(
            movies_df, on='movieId', how='left'
        )
        
        print("Top 10 Recommendations with Features:")
        print(recommended_movies_feat[['movieId', 'title']].to_string(index=False))

# -----------------------------
# 17. Identify Polarizing Movies (Optional)
# -----------------------------
print("Identifying polarizing movies based on rating variance...")

def compute_rating_variance_train(train_users, train_items, train_ratings, num_movies):
    """
    Compute the sample variance (ddof=1) of the ratings of every movie.

    The previous implementation accumulated into means[i] / counts[i] /
    variances[i] inside parallel prange loops over the ratings. Several threads
    can hit the same movie index at once, so those element-wise updates raced
    and could lose contributions. The per-movie sums are now computed with
    np.bincount, which is deterministic.

    Parameters:
    - train_users: User index per rating (unused; kept for interface compatibility).
    - train_items: Integer movie index per rating.
    - train_ratings: Rating value per rating, aligned with train_items.
    - num_movies: Total number of movies (minimum length of the result).

    Returns:
    - float32 array with one variance per movie; movies with fewer than two
      ratings get 0.0.
    """
    items = np.asarray(train_items)
    # Accumulate in float64 to limit rounding error over millions of ratings.
    ratings = np.asarray(train_ratings, dtype=np.float64)

    counts = np.bincount(items, minlength=num_movies)
    rating_sums = np.bincount(items, weights=ratings, minlength=num_movies)
    # Movies without ratings have a zero sum, so dividing by 1 leaves their mean at 0.
    means = rating_sums / np.maximum(counts, 1)

    squared_dev = np.bincount(items, weights=(ratings - means[items]) ** 2, minlength=num_movies)

    variances = np.zeros(len(counts), dtype=np.float64)
    has_variance = counts > 1
    variances[has_variance] = squared_dev[has_variance] / (counts[has_variance] - 1)
    return variances.astype(np.float32)

rating_variances_train = compute_rating_variance_train(train_users, train_items, train_ratings, num_movies)

# Get top 10 polarizing movies
top_polarizing_indices_train = np.argsort(-rating_variances_train)[:10]
top_polarizing_movie_ids_train = [unique_movie_ids[i] for i in top_polarizing_indices_train]
# Keep each movie's variance in the same row as its ID and title. Filtering
# movies_df with isin() returns rows in movies_df order, which would pair the
# titles with the wrong variances in the bar plot below. A left merge keeps
# the rows in ranked (highest-variance-first) order.
top_polarizing_movies_train = pd.DataFrame({
    'movieId': top_polarizing_movie_ids_train,
    'rating_variance': rating_variances_train[top_polarizing_indices_train],
}).merge(movies_df, on='movieId', how='left')

print("Top 10 Polarizing Movies:")
print(top_polarizing_movies_train[['movieId', 'title']].to_string(index=False))

# Optional: Visualize Polarizing Movies
plt.figure(figsize=(12, 8))
sns.barplot(x='title', y='rating_variance', data=top_polarizing_movies_train)
plt.xticks(rotation=90)
plt.title('Top 10 Polarizing Movies by Rating Variance')
plt.xlabel('Movie Title')
plt.ylabel('Rating Variance')
plt.tight_layout()
plt.show()


In [None]:
# -----------------------------
# 18. Visualize Embeddings and Save as PDFs
# -----------------------------
# Project the user factor matrix U to 2-D with PCA, scatter-plot a random
# subset of users and write the figure to 'user_embeddings.pdf'.
print("Visualizing user and item embeddings and saving as PDFs...")

# Perform PCA on user embeddings
print("Performing PCA on user embeddings...")
pca_users = PCA(n_components=2)
U_pca = pca_users.fit_transform(U)

# Sample a subset of users for visualization
sample_size_users = 1000
if num_users > sample_size_users:
    # Fixed seed so the same users are drawn on every run.
    # Note: this reseeds NumPy's global random state.
    np.random.seed(42)
    sample_indices_users = np.random.choice(num_users, sample_size_users, replace=False)
    U_pca_sample = U_pca[sample_indices_users]
else:
    # Few enough users to plot all of them.
    U_pca_sample = U_pca

# Plot User Embeddings
plt.figure(figsize=(8,6))
plt.scatter(U_pca_sample[:,0], U_pca_sample[:,1], alpha=0.5, s=10, color='blue')
plt.title('User Embeddings Visualization (PCA)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.grid(True)
plt.tight_layout()
# The figure is saved to disk and closed rather than shown inline.
plt.savefig('user_embeddings.pdf')
plt.close()
print("User embeddings plot saved as 'user_embeddings.pdf'.")

# Perform PCA on item embeddings (excluding genre features)
print("Performing PCA on item embeddings...")
# Only the latent columns of V_feat are projected; the leading genre columns
# are used solely to colour the points.
V_only = V_feat[:, num_genres:]
pca_items = PCA(n_components=2)
V_pca = pca_items.fit_transform(V_only)

# Sample a subset of items for visualization
sample_size_items = 1000
if num_movies > sample_size_items:
    # Fixed seed so the same items are drawn on every run.
    np.random.seed(42)
    sample_indices_items = np.random.choice(num_movies, sample_size_items, replace=False)
    V_pca_sample = V_pca[sample_indices_items]
    genres_sample = genre_features[sample_indices_items]
else:
    V_pca_sample = V_pca
    genres_sample = genre_features

# Assign a primary genre for coloring: the first column holding the row maximum
primary_genres = np.argmax(genres_sample, axis=1)

# Create a color palette with one colour per genre actually present
unique_genres_sample = np.unique(primary_genres)
palette = sns.color_palette("hsv", len(unique_genres_sample))

# Plot Item Embeddings with Genre Colors
plt.figure(figsize=(8,6))
# Pair colours with genres by position. The palette only has as many entries
# as there are genres present, so indexing it with the raw genre column id
# fails (or picks the wrong colour) whenever some genre is absent.
for color, genre in zip(palette, unique_genres_sample):
    idx = primary_genres == genre
    plt.scatter(V_pca_sample[idx,0], V_pca_sample[idx,1], 
                alpha=0.5, s=10, label=genre_feature_names[genre], color=color)
plt.title('Item Embeddings Visualization (PCA)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title='Genres', bbox_to_anchor=(1.05, 1), loc='upper left', markerscale=2, fontsize='small')
plt.grid(True)
plt.tight_layout()
plt.savefig('item_embeddings.pdf')
plt.close()
print("Item embeddings plot saved as 'item_embeddings.pdf'.")

In [None]:
from joblib import Parallel, delayed

def evaluate_model(params):
    """
    Train a fresh ALS model for one hyper-parameter setting and score it.

    Parameters:
    - params: Dictionary with keys 'num_factors', 'lambda_reg' and 'num_iters'.

    Returns:
    - Tuple (rmse, params): RMSE on the test triples and the unchanged input dict.
    """
    n_factors = params['num_factors']
    reg = params['lambda_reg']
    n_iters = params['num_iters']

    # Fresh parameters for this run: small random factors (U drawn before V)
    # and zero biases.
    factor_shape_users = (num_users, n_factors)
    factor_shape_items = (num_movies, n_factors)
    U_temp = np.random.normal(scale=0.1, size=factor_shape_users).astype(np.float32)
    V_temp = np.random.normal(scale=0.1, size=factor_shape_items).astype(np.float32)
    user_bias_temp = np.zeros(num_users, dtype=np.float32)
    item_bias_temp = np.zeros(num_movies, dtype=np.float32)

    # Arguments shared by the user-side and item-side update calls.
    user_side = (num_users, max_user_rated, user_rated_items_np, user_rated_ratings_np, global_mean)
    item_side = (num_movies, max_item_rated, item_rated_by_users_np, item_rated_ratings_np, global_mean)

    # Alternate the four update steps for the requested number of passes.
    for _ in range(n_iters):
        user_bias_temp = update_user_biases(*user_side, item_bias_temp, U_temp, V_temp, reg, n_factors)
        item_bias_temp = update_item_biases(*item_side, user_bias_temp, U_temp, V_temp, reg, n_factors)
        U_temp = update_U(*user_side, user_bias_temp, V_temp, U_temp, reg, n_factors)
        V_temp = update_V(*item_side, item_bias_temp, U_temp, V_temp, reg, n_factors)

    # Score the trained parameters on the test triples.
    test_rmse = compute_rmse_test(
        test_users, test_items, test_ratings,
        global_mean, user_bias_temp, item_bias_temp, U_temp, V_temp
    )
    return test_rmse, params

# Define a list of hyperparameter combinations
# Each entry is one complete hyper-parameter setting passed to evaluate_model.
param_grid = [
    {'num_factors': 20, 'lambda_reg': 0.1, 'num_iters': 10},
    {'num_factors': 30, 'lambda_reg': 0.05, 'num_iters': 20},
    # Add more combinations as needed
]

# Parallel evaluation
# n_jobs=-1 asks joblib to use all available CPU cores; `results` holds one
# (rmse, params) tuple per grid entry.
results = Parallel(n_jobs=-1)(
    delayed(evaluate_model)(params) for params in param_grid
)

# Find the best parameters
# NOTE(review): the RMSE returned by evaluate_model is computed on the test
# set, so the winning setting is tuned on test data — consider a separate
# validation split.
best_rmse = float('inf')
best_params = {}
for rmse, params in results:
    print(f"Parameters: {params} => Test RMSE: {rmse:.4f}")
    # Strict '<' keeps the first setting when two settings tie.
    if rmse < best_rmse:
        best_rmse = rmse
        best_params = params

print(f"Best RMSE: {best_rmse:.4f} with parameters: {best_params}")


Key Hyperparameters in ALS
a. Number of Latent Factors (num_factors)
Definition: The dimensionality of the latent feature space for both users and items.
Impact:
Too Low: The model may be too simplistic, failing to capture underlying patterns, leading to underfitting.
Too High: The model may capture noise, leading to overfitting and increased computational cost.
b. Regularization Parameter (lambda_reg)
Definition: Controls the extent to which the model penalizes large weights in the latent factor matrices.
Impact:
High Value: Prevents overfitting by discouraging complex models but might lead to underfitting.
Low Value: Allows the model to fit the training data more closely but risks overfitting.
c. Number of Iterations (num_iters)
Definition: The number of times the ALS algorithm iteratively updates user and item factors.
Impact:
Too Few: The model may not converge, leading to suboptimal performance.
Too Many: Increases computation time with diminishing returns once convergence is achieved.
d. Early Stopping Criteria (Optional)
Definition: A condition to halt training early if the model stops improving.
Impact: Saves computational resources and prevents overfitting.
e. Feature-Related Parameters (if using features)
Definition: Parameters related to the incorporation of additional features (e.g., genre weights).
Impact: Enhances the model's ability to handle cold start problems but adds complexity.

optl

In [None]:
import numpy as np
import pandas as pd
import random
import csv
import os
from datetime import datetime
from scipy.stats import ttest_ind

# -----------------------------
# 1. ALS Model Functions
# -----------------------------

def initialize_als_model(num_users, num_items, num_factors=20, lambda_reg=0.1, down_weight_bias=False):
    """
    Create the starting parameters of an ALS model.

    Biases start at zero; latent factors are drawn from a normal distribution
    with standard deviation 0.1 (user factors are drawn before item factors).
    All arrays are float32.

    Parameters:
    - num_users: Total number of users.
    - num_items: Total number of items.
    - num_factors: Number of latent factors.
    - lambda_reg: Regularization parameter (accepted but not used here).
    - down_weight_bias: Model-version flag, passed through unchanged.

    Returns:
    - Tuple (user_bias, item_bias, U, V, down_weight_bias).
    """
    def _small_random_factors(rows):
        # One row of latent factors per entity.
        return np.random.normal(scale=0.1, size=(rows, num_factors)).astype(np.float32)

    user_factors = _small_random_factors(num_users)
    item_factors = _small_random_factors(num_items)
    zero_user_bias = np.zeros(num_users, dtype=np.float32)
    zero_item_bias = np.zeros(num_items, dtype=np.float32)
    return zero_user_bias, zero_item_bias, user_factors, item_factors, down_weight_bias

def train_als(user_bias, item_bias, U, V, train_data, num_users, num_items, num_factors=20, lambda_reg=0.1, num_iters=10, down_weight_bias=False):
    """
    Train ALS model by updating biases and latent factors.

    The ratings are indexed once by user and by item, so every update step
    touches only the ratings it needs instead of rescanning train_data.
    user_bias, item_bias, U and V are modified in place and also returned.

    Parameters:
    - user_bias: User bias vector.
    - item_bias: Item bias vector.
    - U: User latent factor matrix.
    - V: Item latent factor matrix.
    - train_data: List of tuples (user_idx, item_idx, rating).
    - num_users: Total number of users.
    - num_items: Total number of items.
    - num_factors: Number of latent factors.
    - lambda_reg: Regularization parameter.
    - num_iters: Number of iterations.
    - down_weight_bias: If True, apply a tweak to down-weigh item biases.

    Returns:
    - Updated user_bias, item_bias, U, V
    """
    # Build both lookup tables in a single pass, preserving the order in which
    # the ratings appear in train_data.
    ratings_by_user = {}
    ratings_by_item = {}
    for user, item, rating in train_data:
        ratings_by_user.setdefault(user, []).append((item, rating))
        ratings_by_item.setdefault(item, []).append((user, rating))

    reg_eye = lambda_reg * np.eye(num_factors, dtype=np.float32)

    for it in range(num_iters):
        # Update user biases
        for u in range(num_users):
            rated = ratings_by_user.get(u)
            if not rated:
                continue
            residual = sum(rating - item_bias[i] - np.dot(U[u], V[i]) for i, rating in rated)
            user_bias[u] = residual / (len(rated) + lambda_reg)

        # Update item biases
        for i in range(num_items):
            raters = ratings_by_item.get(i)
            if not raters:
                continue
            residual = sum(rating - user_bias[u] - np.dot(U[u], V[i]) for u, rating in raters)
            item_bias[i] = residual / (len(raters) + lambda_reg)

        # Update user latent factors U
        for u in range(num_users):
            rated = ratings_by_user.get(u)
            if not rated:
                continue
            A = np.zeros((num_factors, num_factors), dtype=np.float32)
            b = np.zeros(num_factors, dtype=np.float32)
            for i, rating in rated:
                V_i = V[i]
                # The regulariser is added once per rating, so its strength
                # grows with the number of ratings (kept from the original).
                A += np.outer(V_i, V_i) + reg_eye
                b += (rating - user_bias[u] - item_bias[i]) * V_i
            U[u] = np.linalg.solve(A, b)

        # Update item latent factors V
        for i in range(num_items):
            raters = ratings_by_item.get(i)
            if not raters:
                continue
            A = np.zeros((num_factors, num_factors), dtype=np.float32)
            b = np.zeros(num_factors, dtype=np.float32)
            for u, rating in raters:
                U_u = U[u]
                A += np.outer(U_u, U_u) + reg_eye
                b += (rating - user_bias[u] - item_bias[i]) * U_u
            V[i] = np.linalg.solve(A, b)

        # Apply tweak for Version B if needed
        if down_weight_bias:
            item_bias *= 0.8  # Down-weigh item biases by 20%

        print(f"Iteration {it+1}/{num_iters} completed.")

    return user_bias, item_bias, U, V

def predict_rating(user_idx, item_idx, user_bias, item_bias, U, V, down_weight_bias=False):
    """
    Predict rating for a given user and item.

    The prediction is user bias + item bias + dot(U[user_idx], V[item_idx]).
    With down_weight_bias the item bias alone is scaled by 0.8; previously the
    user bias was scaled as well, which contradicted the documented tweak.

    Parameters:
    - user_idx: Index of the user.
    - item_idx: Index of the item.
    - user_bias: User bias vector.
    - item_bias: Item bias vector.
    - U: User latent factor matrix.
    - V: Item latent factor matrix.
    - down_weight_bias: If True, apply a tweak to down-weigh item biases.

    Returns:
    - Predicted rating.
    """
    item_term = item_bias[item_idx]
    if down_weight_bias:
        item_term = item_term * 0.8  # Down-weight the item bias by 20%
    latent = np.dot(U[user_idx], V[item_idx])
    return user_bias[user_idx] + item_term + latent

# -----------------------------
# 2. Recommendation Function
# -----------------------------

def get_candidate_items(user_id, num_items=100):
    """
    Build the list of items that may be recommended to a user.

    Placeholder implementation: every item numbered 1..num_items is a
    candidate, regardless of user_id. A real system would select candidates
    from user history or item popularity.

    Parameters:
    - user_id: ID of the user (currently ignored).
    - num_items: Number of candidate items to retrieve.

    Returns:
    - List of dictionaries with 'item_id' and 'item_idx' (both equal to the
      1-based item number).
    """
    item_numbers = range(1, num_items + 1)
    return [dict(item_id=number, item_idx=number) for number in item_numbers]

def createreco(user_id, version_id, models, num_users, num_items):
    """
    Produce the ten best-scoring candidate items for a user under one model version.

    Parameters:
    - user_id: Unique identifier for the user (1-based; converted to a 0-based row index).
    - version_id: 'A' for control or 'B' for variant.
    - models: Dictionary mapping a version ID to a dict with the keys
      'user_bias', 'item_bias', 'U', 'V' and 'down_weight_bias'.
    - num_users: Total number of users (not used by this function).
    - num_items: Number of candidate items to score.

    Returns:
    - List of up to 10 tuples (item_id, item_title), highest predicted rating first.

    Raises:
    - ValueError: If version_id has no model in models.
    """
    chosen = models.get(version_id)
    if chosen is None:
        raise ValueError(f"Invalid version_id: {version_id}")

    user_row = user_id - 1  # user IDs are assumed to start at 1

    def _score(candidate):
        # Item indices reported by get_candidate_items are 1-based as well.
        return predict_rating(
            user_row, candidate['item_idx'] - 1,
            chosen['user_bias'], chosen['item_bias'], chosen['U'], chosen['V'],
            chosen['down_weight_bias'],
        )

    scored = [
        (candidate['item_id'], f"Item {candidate['item_id']}", _score(candidate))
        for candidate in get_candidate_items(user_id, num_items=num_items)
    ]

    # Stable descending sort on the predicted rating, then keep the top ten.
    ranked = sorted(scored, key=lambda entry: entry[2], reverse=True)
    return [(item_id, title) for item_id, title, _ in ranked[:10]]

# -----------------------------
# 3. Logging Function
# -----------------------------

# Path of the CSV file that collects all logged feedback rows (relative to the
# current working directory).
LOG_FILE = 'feedback_log.csv'

# Initialize the CSV file with headers if it doesn't exist
# This runs at import time. An existing file is left untouched, so rows from
# earlier runs remain in the log.
if not os.path.exists(LOG_FILE):
    with open(LOG_FILE, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['timestamp', 'user_id', 'item_id', 'feedback', 'version_id'])

def log_feedback(user_id, item_id, feedback, version_id):
    """
    Append one feedback record to the CSV log at LOG_FILE.

    The row is written as: UTC timestamp (ISO 8601), user_id, item_id,
    feedback, version_id.

    Parameters:
    - user_id: Unique identifier for the user.
    - item_id: Unique identifier for the item.
    - feedback: User's feedback on the item (e.g., rating).
    - version_id: 'A' or 'B' indicating the model version used.
    """
    record = [datetime.utcnow().isoformat(), user_id, item_id, feedback, version_id]
    # newline='' lets the csv module control line endings itself.
    with open(LOG_FILE, mode='a', newline='', encoding='utf-8') as handle:
        csv.writer(handle).writerow(record)

# -----------------------------
# 4. User Assignment
# -----------------------------

def assign_users_to_groups(user_ids, seed=42):
    """
    Split users between the two experiment arms by independent coin flips.

    The module-level random generator is reseeded with `seed`, so the same
    inputs always give the same assignment.

    Parameters:
    - user_ids: List of user IDs.
    - seed: Random seed for reproducibility.

    Returns:
    - Dictionary mapping user IDs to group ('A' or 'B').
    """
    random.seed(seed)
    arms = ['A', 'B']
    # One draw per user, in the order the IDs are given.
    return {uid: random.choice(arms) for uid in user_ids}

# -----------------------------
# 5. Simulate Feedback
# -----------------------------

def simulate_feedback(recommendations, true_preferences):
    """
    Generate a noisy rating for every recommended item.

    Each rating is the user's true preference for the item (3.0 when unknown)
    plus Gaussian noise with standard deviation 0.5, clipped to [1.0, 5.0].

    Parameters:
    - recommendations: List of recommended items (item_id, item_title).
    - true_preferences: Dictionary mapping item_id to user's true preference score.

    Returns:
    - List of simulated feedback scores, one per recommendation, in order.
    """
    def _noisy_rating(item_id):
        baseline = true_preferences.get(item_id, 3.0)  # neutral when unknown
        jitter = np.random.normal(0, 0.5)
        return np.clip(baseline + jitter, 1.0, 5.0)

    return [_noisy_rating(item_id) for item_id, _ in recommendations]

# -----------------------------
# 6. Run A/B Test
# -----------------------------

def run_ab_test(user_group_mapping, models, true_preferences_mapping, num_users, num_items):
    """
    Serve recommendations to every user, simulate their ratings and log them.

    For each user the model of the assigned group produces the recommendations,
    a rating is simulated for every recommended item, and one log row is
    written per (user, item) pair.

    Parameters:
    - user_group_mapping: Dictionary mapping user IDs to group ('A' or 'B').
    - models: Dictionary mapping version IDs to model parameters.
    - true_preferences_mapping: Dictionary mapping user IDs to their true preferences (dicts).
    - num_users: Total number of users.
    - num_items: Total number of items.
    """
    for user_id, group in user_group_mapping.items():
        served = createreco(user_id, group, models, num_users, num_items)
        # Users without recorded preferences fall back to an empty mapping.
        known_preferences = true_preferences_mapping.get(user_id, {})
        scores = simulate_feedback(served, known_preferences)
        for (item_id, _title), score in zip(served, scores):
            log_feedback(user_id, item_id, score, group)

# -----------------------------
# 7. Evaluation Functions
# -----------------------------

def evaluate_ab_test(log_file):
    """
    Summarise the logged feedback separately for groups A and B.

    Parameters:
    - log_file: Path to the CSV file containing logged feedback.

    Returns:
    - Dictionary keyed by 'A' and 'B'; each value holds 'mean_rating',
      'std_rating', 'count' and 'rating_distribution' (relative frequency of
      each feedback value, sorted by value).
    """
    log_df = pd.read_csv(log_file)

    def _summarise(version):
        # All rows served by this model version.
        rows = log_df[log_df['version_id'] == version]
        scores = rows['feedback']
        return {
            'mean_rating': scores.mean(),
            'std_rating': scores.std(),
            'count': rows.shape[0],
            'rating_distribution': scores.value_counts(normalize=True).sort_index(),
        }

    return {version: _summarise(version) for version in ('A', 'B')}

def compare_groups(metrics, log_file):
    """
    Print the per-group summary and a Welch t-test on the logged feedback.

    Parameters:
    - metrics: Dictionary containing evaluation metrics for both groups.
    - log_file: Path to the CSV file containing logged feedback.
    """
    print("A/B Test Evaluation Results:")
    for label in ('A', 'B'):
        summary = metrics[label]
        print(f"\nGroup {label}:")
        print(f"  Number of Feedbacks: {summary['count']}")
        print(f"  Mean Rating: {summary['mean_rating']:.2f}")
        print(f"  Std Rating: {summary['std_rating']:.2f}")
        print("  Rating Distribution:")
        print(summary['rating_distribution'])

    # The raw feedback is re-read from the log for the significance test.
    log_df = pd.read_csv(log_file)
    is_group_a = log_df['version_id'] == 'A'
    is_group_b = log_df['version_id'] == 'B'
    feedback_a = log_df[is_group_a]['feedback']
    feedback_b = log_df[is_group_b]['feedback']

    # equal_var=False: the two groups are not assumed to share a variance.
    t_stat, p_val = ttest_ind(feedback_a, feedback_b, equal_var=False)
    print("\nStatistical Test (Independent t-test):")
    print(f"  t-statistic: {t_stat:.4f}")
    print(f"  p-value: {p_val:.4f}")

    conclusion = (
        "  Conclusion: Significant difference between Group A and B."
        if p_val < 0.05
        else "  Conclusion: No significant difference between Group A and B."
    )
    print(conclusion)

# -----------------------------
# 8. Example Workflow
# -----------------------------

def main():
    """
    Run the complete A/B demonstration on synthetic data.

    Initialises and trains a control model (A) and a variant with down-weighted
    item biases (B), assigns ten users to the two groups, simulates and logs
    their feedback, and prints the evaluation.
    """
    # Experiment size and ALS settings (kept small for demonstration purposes).
    num_users, num_items = 10, 100
    num_factors, lambda_reg, num_iters = 20, 0.1, 5

    # Step 1: starting parameters, control first and variant second.
    user_bias_A, item_bias_A, U_A, V_A, _ = initialize_als_model(
        num_users, num_items, num_factors, lambda_reg, down_weight_bias=False)
    user_bias_B, item_bias_B, U_B, V_B, _ = initialize_als_model(
        num_users, num_items, num_factors, lambda_reg, down_weight_bias=True)

    # Step 2: placeholder training data, one uniformly random rating in [1, 5)
    # for every (user, item) pair. Replace with real ratings in practice.
    train_data = [
        (u, i, np.random.uniform(1, 5))
        for u in range(num_users)
        for i in range(num_items)
    ]

    # Step 3: train both versions on the same data.
    user_bias_A, item_bias_A, U_A, V_A = train_als(
        user_bias_A, item_bias_A, U_A, V_A, train_data, num_users, num_items,
        num_factors, lambda_reg, num_iters, down_weight_bias=False)
    user_bias_B, item_bias_B, U_B, V_B = train_als(
        user_bias_B, item_bias_B, U_B, V_B, train_data, num_users, num_items,
        num_factors, lambda_reg, num_iters, down_weight_bias=True)

    # Step 4: bundle the parameters of each version.
    models = {
        'A': dict(user_bias=user_bias_A, item_bias=item_bias_A, U=U_A, V=V_A, down_weight_bias=False),
        'B': dict(user_bias=user_bias_B, item_bias=item_bias_B, U=U_B, V=V_B, down_weight_bias=True),
    }

    # Step 5: user IDs run from 1 to num_users; each is assigned to A or B.
    all_user_ids = list(range(1, num_users + 1))
    user_group_mapping = assign_users_to_groups(all_user_ids, seed=42)

    # Step 6: hand-picked true preferences used to simulate feedback.
    true_preferences_mapping = {
        1: {1: 5.0, 2: 3.0, 3: 4.0},
        2: {4: 2.0, 5: 5.0, 6: 3.5},
        3: {7: 4.5, 8: 2.5, 9: 3.0},
        4: {10: 3.0, 11: 4.0, 12: 2.5},
        5: {13: 5.0, 14: 3.5, 15: 4.5},
        6: {16: 2.5, 17: 4.0, 18: 3.0},
        7: {19: 3.5, 20: 2.0, 21: 4.5},
        8: {22: 4.0, 23: 3.0, 24: 5.0},
        9: {25: 1.5, 26: 4.0, 27: 3.5},
        10: {28: 2.0, 29: 3.5, 30: 4.0},
    }

    # Step 7: serve recommendations and log the simulated feedback.
    run_ab_test(user_group_mapping, models, true_preferences_mapping, num_users, num_items)

    # Step 8: summarise the log and compare the two groups.
    compare_groups(evaluate_ab_test(LOG_FILE), LOG_FILE)


if __name__ == "__main__":
    main()


In [None]:
python ab_test_als.py