# Part 2: Content-Based Recommendation

## 3. Feature Extraction and Vector Space Model

In this section, we implement **Text Feature Extraction** using a manually constructed **TF-IDF (Term Frequency-Inverse Document Frequency)** vectorizer. 

We will not use high-level libraries like `sklearn`'s `TfidfVectorizer` for the core logic. Instead, we will build the process step-by-step:
1.  **Text Preprocessing**: Tokenization and stop-word removal.
2.  **Vocabulary Building**: Identifying unique terms and their document frequencies.
3.  **IDF Computation**: Calculating the inverse document frequency weights.
4.  **Vector Construction**: Transforming text data into a sparse TF-IDF matrix.


In [None]:
import numpy as np
import pandas as pd
import re
import math
from collections import Counter
from scipy.sparse import csr_matrix, hstack, save_npz, vstack, diags
import os

# Directory that receives every artifact written by this notebook.
RESULTS_DIR = "../results"
# exist_ok=True makes the creation idempotent and avoids the race between
# checking for the directory and creating it.
os.makedirs(RESULTS_DIR, exist_ok=True)

### Subtask 1: Define the text source for items

We will extract the relevant text data from our items. We will combine `title` and `categories` into a single text string for each item.

In [None]:
def get_item_text_data(df_items):
    """
    Build one text document per item by joining its title and categories.

    Side effects (kept for backward compatibility): missing values in the
    `title` and `categories` columns of `df_items` are replaced with '' in
    place, and a `text_source` column is written to `df_items`.

    Args:
        df_items: DataFrame with `item_id`, `title` and `categories` columns.
            It is expected to hold one row per item_id (not checked here).

    Returns:
        A `(texts, item_ids)` tuple of two parallel lists in row order.
    """
    print("Extracting item text data...")

    df_items['title'] = df_items['title'].fillna('')
    df_items['categories'] = df_items['categories'].fillna('')

    def clean_cat(c):
        # Work on the string form so that a real list and a stringified list
        # such as "['Books', 'Fiction']" are cleaned the same way.
        s = str(c)
        if s.startswith("[") and s.endswith("]"):
            return s.replace("'", "").replace("[", "").replace("]", "").replace(",", " ")
        return s

    # fillna('') does not change the type of non-missing values, so cast
    # explicitly: a numeric title would otherwise fail on string concatenation.
    titles = df_items['title'].astype(str)
    df_items['text_source'] = titles + " " + df_items['categories'].apply(clean_cat)

    print(f"Created text corpus for {len(df_items)} items.")
    return df_items['text_source'].tolist(), df_items['item_id'].tolist()

### Subtask 2: Basic text preprocessing

We perform tokenization (splitting text into words) and remove common stop-words.

In [None]:
def manual_tokenize_and_clean(text_corpus):
    """
    Lowercase, tokenize and stop-word-filter every document in the corpus.

    A token is a run of two or more ASCII letters delimited by word
    boundaries; digits and punctuation never form tokens.

    Returns one list of surviving tokens per input document, in input order.
    """
    print("Preprocessing text (Tokenization & Stop-word removal)...")

    # Basic English stop words, kept as a whitespace-separated table.
    stop_words = frozenset("""
        i me my myself we our ours ourselves you your yours
        yourself yourselves he him his himself she her hers
        herself it its itself they them their theirs themselves
        what which who whom this that these those am is are
        was were be been being have has had having do does
        did doing a an the and but if or because as until
        while of at by for with about against between into
        through during before after above below to from up down
        in out on off over under again further then once here
        there when where why how all any both each few more
        most other some such no nor not only own same so
        than too very s t can will just don should now &
    """.split())

    word_pattern = re.compile(r'\b[a-z]{2,}\b')

    processed_corpus = [
        [tok for tok in word_pattern.findall(doc.lower()) if tok not in stop_words]
        for doc in text_corpus
    ]

    print(f"Preprocessed {len(processed_corpus)} documents.")
    return processed_corpus

### Subtask 3 & 4: Configure and Fit TF-IDF Vectorizer

We will build the vocabulary from our processed corpus, filtering rare words to keep dimensionality manageable. Then, we calculate the IDF for each word.

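$$ IDF(t) = \log \left( \frac{N + 1}{DF(t) + 1} \right) + 1 $$
where $N$ is the total number of documents and $DF(t)$ is the document frequency of term $t$. This is the smoothed form used in the code below: the added ones keep the weight positive even for a term that appears in every document.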

In [None]:
def build_vocabulary_and_idf(processed_corpus, min_df=5):
    """
    Derive the vocabulary index and the IDF weight of every kept term.

    A term is kept when it occurs in at least `min_df` documents. Kept terms
    receive consecutive column indices in alphabetical order.

    Returns:
        (vocab, idf_scores): `vocab` maps term -> column index and
        `idf_scores` maps term -> smoothed IDF, log((N + 1) / (df + 1)) + 1.
    """
    print(f"Building vocabulary (min_df={min_df})...")

    n_docs = len(processed_corpus)

    # Document frequency: each document contributes at most once per term.
    doc_freqs = Counter()
    for tokens in processed_corpus:
        doc_freqs.update(set(tokens))

    kept_terms = [term for term in sorted(doc_freqs) if doc_freqs[term] >= min_df]

    vocab = {term: col for col, term in enumerate(kept_terms)}
    idf_scores = {
        term: math.log((n_docs + 1) / (doc_freqs[term] + 1)) + 1
        for term in kept_terms
    }

    print(f"Validation: Processed {n_docs} documents. Found {len(doc_freqs)} unique terms. "
          f"Kept {len(vocab)} terms with frequency >= {min_df}.")
    return vocab, idf_scores

### Subtask 4 (Continued): Transform to TF-IDF Matrix

We now calculate the TF-IDF vector for each document and store it in a sparse CSR matrix.
$$ TF\hbox{-}IDF(t, d) = TF(t, d) \times IDF(t) $$

In [None]:
def compute_tf_idf_matrix(processed_corpus, vocab, idf_scores):
    """
    Turn the tokenized corpus into a row-normalized sparse TF-IDF matrix.

    Each entry is (raw term count) * IDF; terms absent from `vocab` are
    ignored. Every non-empty row is then scaled to unit L2 length, while
    all-zero rows are left as zeros.

    Returns a sparse matrix of shape (len(processed_corpus), len(vocab)).
    """
    print("Computing TF-IDF matrix...")
    row_idx, col_idx, values = [], [], []

    for doc_no, tokens in enumerate(processed_corpus):
        for term, tf in Counter(tokens).items():
            if term not in vocab:
                continue
            row_idx.append(doc_no)
            col_idx.append(vocab[term])
            values.append(tf * idf_scores[term])

    raw = csr_matrix(
        (values, (row_idx, col_idx)),
        shape=(len(processed_corpus), len(vocab)),
    )

    print("Performing L2 Normalization...")
    norms = np.sqrt(np.array(raw.power(2).sum(axis=1))).flatten()
    # Empty documents have norm 0; dividing by 1 keeps them as zero rows.
    norms[norms == 0] = 1.0
    norm_matrix = diags(1.0 / norms) @ raw

    print(f"TF-IDF matrix computation complete. Shape: {norm_matrix.shape}")
    if norm_matrix.shape[0] > 0:
        first_row_norm = np.sqrt(np.sum(norm_matrix[0].data**2))
    else:
        first_row_norm = 0
    print(f"Validation: L2 norm of the first document vector: {first_row_norm:.4f}")

    return norm_matrix

### Subtask 5 & 6: Inspect and Validate

We check the sparsity of our matrix and save a summary of the vocabulary to the results.

In [None]:
def validate_and_save_features(matrix, vocab, item_ids):
    """
    Print shape/sparsity statistics of the feature matrix and save the vocabulary.

    Args:
        matrix: sparse matrix exposing `.shape` and `.nnz`.
        vocab: dict mapping term -> column index.
        item_ids: accepted for interface compatibility; not used here.

    Returns:
        DataFrame with columns ['Term', 'Index'], sorted by index. The full
        table and its first 100 rows are also written as CSV files under
        RESULTS_DIR.
    """
    print("\n--- Feature Space Validation ---")
    n_docs, n_terms = matrix.shape
    nnz = matrix.nnz
    # Guard against an empty matrix (no documents or no kept terms), which
    # would otherwise raise ZeroDivisionError.
    total_cells = n_docs * n_terms
    sparsity = 1.0 - (nnz / total_cells) if total_cells else 1.0

    print(f"Matrix Shape: ({n_docs}, {n_terms})")
    print(f"Non-zero elements: {nnz}")
    print(f"Sparsity: {sparsity*100:.4f}%")

    # Save Vocabulary Summary (ordered by column index)
    vocab_list = sorted(vocab.items(), key=lambda x: x[1])
    df_vocab = pd.DataFrame(vocab_list, columns=['Term', 'Index'])

    full_vocab_path = os.path.join(RESULTS_DIR, "tfidf_vocabulary_full.csv")
    sample_vocab_path = os.path.join(RESULTS_DIR, "tfidf_vocabulary_sample.csv")

    df_vocab.to_csv(full_vocab_path, index=False)
    # The sample is the first 100 rows in index order, not a frequency ranking.
    df_vocab.head(100).to_csv(sample_vocab_path, index=False)

    print(f"Saved full vocabulary to {full_vocab_path}")
    print(f"Saved top 100 vocabulary terms to {sample_vocab_path}")

    return df_vocab

## 3.2. Additional Features

We incorporate **numerical features** (`price`, `average_rating`) and **categorical features** (`categories`) to enrich the item representation.

### Subtask 1: Identify available additional features
We extract `price` and `average_rating` from the metadata. `price` often requires cleaning (removing '$', converting to float).

In [None]:
def load_additional_features(df_items):
    """
    Extract cleaned, imputed `price` and `average_rating` columns.

    Works on a copy, so `df_items` itself is not modified. Missing `price`,
    `average_rating` or `categories` columns are created with empty values.
    Unparseable or missing numbers are replaced with the column median
    (falling back to 0.0 for price and 3.0 for rating when no value parses).

    Args:
        df_items: DataFrame that must contain `item_id`.

    Returns:
        DataFrame with columns ['item_id', 'price_num', 'rating_num', 'categories'].
    """
    print("\n--- Extracting Additional Features ---")

    # Copy so that the new columns never leak into the caller's frame.
    df_features = df_items.copy()

    for col in ('price', 'average_rating'):
        if col not in df_features.columns:
            df_features[col] = np.nan
    if 'categories' not in df_features.columns:
        df_features['categories'] = ''

    def clean_price(p):
        # numpy scalars can appear inside object columns, so accept them too.
        if isinstance(p, (int, float, np.integer, np.floating)):
            return float(p)
        if isinstance(p, str):
            # Drop thousands separators ("1,299.99" -> "1299.99"). Only commas
            # followed by exactly three digits are removed, so "12,50" is left alone.
            normalized = re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', p)
            match = re.search(r'(\d+\.?\d*)', normalized)
            if match:
                return float(match.group(1))
        return np.nan

    def clean_rating(r):
        try:
            return float(r)
        except (TypeError, ValueError):
            # None, non-numeric strings and other unconvertible values.
            return np.nan

    df_features['price_num'] = df_features['price'].apply(clean_price)
    df_features['rating_num'] = df_features['average_rating'].apply(clean_rating)

    # Impute missing values with the median of the parsed values.
    price_median = df_features['price_num'].median()
    rating_median = df_features['rating_num'].median()

    if pd.isna(price_median):
        price_median = 0.0
    if pd.isna(rating_median):
        rating_median = 3.0

    df_features['price_num'] = df_features['price_num'].fillna(price_median)
    df_features['rating_num'] = df_features['rating_num'].fillna(rating_median)

    print(f"Extracted numerical features. Imputed Price Median: {price_median}, Rating Median: {rating_median}")
    return df_features[['item_id', 'price_num', 'rating_num', 'categories']]

### Subtask 2-4: Process features

**1. Numerical Features**: We normalize `price` and `rating` using **Min-Max Scaling** manually to bring them to [0, 1] range.

**2. Categorical Features**: We perform a simplified **One-Hot Encoding** for the top `K` most frequent categories.

In [None]:
def process_numerical_features(df_features):
    """
    Min-max scale `price_num` and `rating_num` into the [0, 1] range.

    Adds `price_scaled` and `rating_scaled` columns to `df_features` in place.
    A column without spread (max not greater than min) is filled with a
    constant instead: 0.0 for price and 0.5 for rating.

    Returns a 2-column array: [price_scaled, rating_scaled].
    """
    print("Processing numerical features (Min-Max Scaling)...")

    def _min_max(column, constant_fill):
        lo, hi = column.min(), column.max()
        if hi > lo:
            return (column - lo) / (hi - lo)
        return constant_fill

    df_features['price_scaled'] = _min_max(df_features['price_num'], 0.0)
    df_features['rating_scaled'] = _min_max(df_features['rating_num'], 0.5)

    return df_features[['price_scaled', 'rating_scaled']].values

def process_categorical_features(df_features, top_k=20):
    """
    One-hot encode the `top_k` most frequent category tokens.

    Each `categories` entry is split into alphanumeric tokens. Tokens of
    length <= 2 are discarded *before* ranking, so short noise words cannot
    take up slots among the top `top_k`. Missing entries contribute no tokens.

    A column is set to 1.0 for an item when the item's entry contains that
    token as a whole word (case-insensitive), so "Art" does not fire on
    "Martial".

    Args:
        df_features: DataFrame with a `categories` column.
        top_k: maximum number of category columns to create.

    Returns:
        (cat_matrix, top_cats): dense array of shape (n_items, len(top_cats))
        and the list of category tokens naming its columns.
    """
    print(f"Processing categorical features (Top {top_k} OHE)...")

    # fillna first: astype(str) alone would turn NaN into the token "nan".
    cat_series = df_features['categories'].fillna('').astype(str)
    token_lists = [
        re.sub(r'[^a-zA-Z0-9\s]', ' ', entry).split()
        for entry in cat_series
    ]

    cat_counts = Counter(
        tok for tokens in token_lists for tok in tokens if len(tok) > 2
    )
    top_cats = [cat for cat, _ in cat_counts.most_common(top_k)]

    print(f"Top categories identified: {top_cats}")

    # Several categories may share a lowercase form (e.g. "Fiction" and
    # "fiction"), so each lowercase token maps to a list of columns.
    columns_by_token = {}
    for j, cat in enumerate(top_cats):
        columns_by_token.setdefault(cat.lower(), []).append(j)

    cat_matrix = np.zeros((len(df_features), len(top_cats)))
    for i, tokens in enumerate(token_lists):
        for tok in {t.lower() for t in tokens}:
            for j in columns_by_token.get(tok, ()):
                cat_matrix[i, j] = 1.0

    return cat_matrix, top_cats

### Subtask 5: Combine features

We concatenate the sparse TF-IDF matrix with the dense numerical and categorical matrices.

In [None]:
def combine_features(tfidf_matrix, num_features, cat_features):
    """
    Stack the TF-IDF, numerical and categorical blocks side by side.

    The two dense blocks are converted to CSR first. The stacked CSR matrix
    is written to `feature_matrix.npz` under RESULTS_DIR and returned.
    """
    print("Combining all features...")

    blocks = [tfidf_matrix, csr_matrix(num_features), csr_matrix(cat_features)]
    final_matrix = hstack(blocks, format='csr')

    print(f"Final Feature Matrix Shape: {final_matrix.shape}")

    # Persist the combined matrix for later stages.
    save_npz_path = os.path.join(RESULTS_DIR, "feature_matrix.npz")
    save_npz(save_npz_path, final_matrix)
    print(f"Feature matrix saved to {save_npz_path}")

    return final_matrix

## 3.3. Create item-feature matrix and document your feature selection

We have combined the features. Now we must **validate** the consistency of the feature space and **document** our rationale.

### Subtask 5 & 6: Validate and Check Scaling
We check whether the different feature groups (TF-IDF vs. numerical) have vastly different magnitudes. While each TF-IDF row has unit length (L2), the numerical features lie in [0, 1]. In high dimensions, the individual components of a unit vector are small, so we verify the statistics.

In [None]:
def validate_item_feature_matrix(matrix):
    """
    Print shape, sparsity and value statistics of the final feature matrix.

    Value statistics are computed on a random sample of at most 1000 rows,
    so the printed numbers can differ between runs. An empty matrix is
    reported and skipped instead of raising ZeroDivisionError.

    Args:
        matrix: sparse item-feature matrix.

    Returns:
        None; results are only printed.
    """
    print("\n--- Validating Item-Feature Matrix ---")

    # 1. Shape and Sparsity
    n_items, n_features = matrix.shape
    print(f"Final Matrix Shape: ({n_items}, {n_features})")
    if n_items == 0 or n_features == 0:
        print("WARNING: Feature matrix is empty; skipping validation.")
        return

    nnz = matrix.nnz
    sparsity = 1.0 - (nnz / (n_items * n_features))
    print(f"Sparsity: {sparsity*100:.4f}%")

    # 2. Statistics of values on a random sample of rows, to spot exploding values.
    sample_indices = np.random.choice(n_items, size=min(1000, n_items), replace=False)
    sample_matrix = matrix[sample_indices]

    max_val = sample_matrix.max()
    mean_val = sample_matrix.mean()

    print(f"Max value in sample: {max_val:.4f}")
    print(f"Mean value in sample: {mean_val:.6f} (Expected to be low due to sparsity)")

    # 3. Row norms: TF-IDF rows have norm 1; the extra [0, 1] features push it above 1.
    row_sums = np.array(sample_matrix.power(2).sum(axis=1))
    row_norms = np.sqrt(row_sums).flatten()

    print(f"Average Row L2 Norm: {np.mean(row_norms):.4f} (TF-IDF base was 1.0)")

    if max_val > 10.0:
        print("WARNING: Some features have very high values. scaling might be off.")
    else:
        print("Scaling consistency check passed (Values roughly in expected range).")

### Subtask 7: Document feature selection rationale

We save an explanation of why we chose these features.

In [None]:
def save_feature_selection_rationale():
    """
    Write the feature selection rationale to a Markdown file.

    The text is saved as `feature_selection_rationale.md` under RESULTS_DIR.
    The file is opened with an explicit UTF-8 encoding so the output does not
    depend on the platform's default encoding.

    Returns:
        None.
    """
    print("Saving feature selection rationale...")

    rationale = """
# Feature Selection Rationale for Book Recommendation

## 1. Text Features (TF-IDF)
- **Source**: 'title' combined with 'categories'.
- **Method**: TF-IDF (Term Frequency-Inverse Document Frequency).
- **Reason**: Books are content-rich items. Title and categories provide strong semantic signals about the book's topic/genre. TF-IDF downweights common words ensuring unique keywords drive similarity.

## 2. Numerical Features
- **Features**: 'price', 'average_rating'.
- **Method**: Min-Max Scaling [0, 1].
- **Reason**: 
    - **Price**: Users often prefer books in specific price ranges. Normalization ensures it doesn't dominate the vector space.
    - **Average Rating**: Acts as a proxy for quality. Higher rated items might be more 'summable' with other high quality items.

## 3. Categorical Features
- **Method**: One-Hot Encoding (Top 20 frequent categories).
- **Reason**: While 'categories' are in the text indices, explicit dimensions for major genres (Fiction, Mystery, etc.) allow the model to strictly cluster items of the same 'type' even if titles are very different.

## 4. Combination
- We stack these vectors. The final representation mixes semantic similarity (Text) with property similarity (Price/Quality/Genre).
"""

    path = os.path.join(RESULTS_DIR, "feature_selection_rationale.md")
    with open(path, "w", encoding="utf-8") as f:
        f.write(rationale)

    print(f"Rationale saved to {path}")

    print("Feature Matrix Creation and Documentation Complete.")

# -------------------------------------------------------------------------
# 4. User Profile Construction
# -------------------------------------------------------------------------

We construct user profiles by computing the **weighted average** of the feature vectors of items they have rated. 

### Subtask 1-4: Build Profiles (Batched)
**Correction:** Due to large number of users, we process in batches and save intermediate sparse matrices to avoid memory errors.

For each user $u$:
$$ \vec{p}_u = \frac{\sum_{i \in I_u} r_{ui} \cdot \vec{f}_i}{\sum_{i \in I_u} r_{ui}} $$
where $\vec{f}_i$ is the item feature vector and $r_{ui}$ is the rating.

In [None]:
def _save_profile_batch(vectors, user_ids, parts_dir, batch_idx, label="batch"):
    """L2-normalize one batch of user vectors and write it, plus its user ids, to disk."""
    batch_matrix = vstack(vectors)

    # Row-wise L2 normalization; all-zero rows keep norm 1 so they stay zero.
    row_sums = np.array(batch_matrix.power(2).sum(axis=1))
    row_norms = np.sqrt(row_sums).flatten()
    row_norms[row_norms == 0] = 1.0
    batch_matrix = diags(1.0 / row_norms) @ batch_matrix

    filename = f"user_profiles_part_{batch_idx}.npz"
    path = os.path.join(parts_dir, filename)
    save_npz(path, batch_matrix)

    # Swap only the file extension; a plain str.replace would also rewrite
    # any ".npz" that happened to occur in a directory name.
    id_path = os.path.splitext(path)[0] + "_ids.csv"
    pd.DataFrame(user_ids, columns=['user_id']).to_csv(id_path, index=False)

    print(f"Saved {label} {batch_idx}: {batch_matrix.shape} to {filename}")
    return path


def build_user_profiles(df_interactions, item_feature_matrix, df_items_map, batch_size=5000):
    """
    Construct user profiles in batches to manage memory.

    Each profile is the rating-weighted average of the feature vectors of the
    items the user rated; items missing from `df_items_map` are ignored. A
    user with no known items gets an all-zero vector. Every batch is
    L2-normalized row by row and saved to results/user_profiles_parts/ as
    `user_profiles_part_<k>.npz` with a matching `_ids.csv` of user ids.

    Args:
        df_interactions: DataFrame with `user_id`, `item_id`, `rating` columns.
        item_feature_matrix: sparse item-feature matrix (one row per item).
        df_items_map: object whose 'item_id' entry lists item ids in the row
            order of `item_feature_matrix`.
        batch_size: number of users per saved chunk.

    Returns:
        List of paths to the saved `.npz` chunks, in save order.
    """
    import gc  # local import: only used to release memory between batches

    print("\n--- User Profile Construction (Batched) ---")

    parts_dir = os.path.join(RESULTS_DIR, "user_profiles_parts")
    os.makedirs(parts_dir, exist_ok=True)

    # 1. Map Item IDs -> Matrix Indices
    print("Mapping item IDs to feature matrix indices...")
    item_to_idx = {iid: idx for idx, iid in enumerate(df_items_map['item_id'])}

    # 2. Group interactions
    user_groups = df_interactions.groupby('user_id')
    n_users = len(user_groups)
    print(f"Total users to process: {n_users}")

    current_batch_vectors = []
    current_batch_ids = []
    saved_batches = []
    n_features = item_feature_matrix.shape[1]

    for count, (uid, group) in enumerate(user_groups, start=1):
        valid_indices = []
        ratings = []

        # Iterate the two columns directly; iterrows() builds a Series per
        # row and is far slower for the same result.
        for iid, r in zip(group['item_id'], group['rating']):
            if iid in item_to_idx:
                valid_indices.append(item_to_idx[iid])
                ratings.append(r)

        if not valid_indices:
            # No rated item is known to the feature matrix -> zero vector.
            user_vec_sparse = csr_matrix((1, n_features))
        else:
            item_vecs = item_feature_matrix[valid_indices]
            ratings_arr = np.array(ratings).reshape(-1, 1)
            # Weighted sum over items: sum(r_i * v_i); the result is dense (1 x F).
            weighted_sum_dense = item_vecs.multiply(ratings_arr).sum(axis=0)

            total_rating = np.sum(ratings)
            if total_rating > 0:
                # Not in place: an integer-typed sum cannot be true-divided in place.
                weighted_sum_dense = weighted_sum_dense / total_rating

            # Convert back to sparse immediately to keep the batch small.
            user_vec_sparse = csr_matrix(weighted_sum_dense)

        current_batch_vectors.append(user_vec_sparse)
        current_batch_ids.append(uid)

        if len(current_batch_vectors) >= batch_size:
            saved_batches.append(
                _save_profile_batch(
                    current_batch_vectors, current_batch_ids,
                    parts_dir, len(saved_batches), label="batch",
                )
            )
            current_batch_vectors = []
            current_batch_ids = []
            gc.collect()

        if count % 10000 == 0:
            print(f"Processed {count}/{n_users} users...")

    # Flush whatever is left after the last full batch.
    if current_batch_vectors:
        saved_batches.append(
            _save_profile_batch(
                current_batch_vectors, current_batch_ids,
                parts_dir, len(saved_batches), label="final batch",
            )
        )

    print(f"\nUser Profile Construction Complete. Saved {len(saved_batches)} parts.")
    return saved_batches

## 4.2. Handle cold-start users (Popular Item Features Strategy)

### Subtask 1: Define what a cold-start user is
A **Cold-Start User** is a new user who has not interacted with (rated or viewed) any items in the system yet.
Since there is no historical data to compute a personalized profile or find similar users, traditional Collaborative Filtering fails.
To address this, we use a **Popularity-Based** or **Demographic-Based** strategy to generate an initial profile.
Here, we use the **Popular Item Features** strategy: we assume a new user is likely to be interested in what the majority of people like.

In [None]:
# Subtask 2: Identify popular items
def identify_popular_items(df_interactions, top_n=50):
    """
    Return the ids of the `top_n` items with the most interaction rows.

    Items are ordered from most to least frequent in the `item_id` column.
    """
    print(f"Identifying top {top_n} popular items...")
    rating_counts = df_interactions['item_id'].value_counts()
    popular_item_ids = rating_counts.index[:top_n].tolist()
    print(f"Found {len(popular_item_ids)} popular items.")
    return popular_item_ids

In [None]:
# Subtask 3: Extract feature vectors of popular items
def extract_popular_vectors(popular_item_ids, item_feature_matrix, df_items_map):
    """
    Select the feature-matrix rows that belong to the given popular items.

    Ids that do not occur in `df_items_map['item_id']` are skipped. Rows are
    returned in the order of `popular_item_ids`; None is returned when no id
    could be matched.
    """
    print("Extracting feature vectors for popular items...")

    # Position of every item id inside the feature matrix.
    known_ids = df_items_map['item_id']
    row_of = dict(zip(known_ids, range(len(known_ids))))

    indices = [row_of[iid] for iid in popular_item_ids if iid in row_of]

    if len(indices) == 0:
        print("Warning: No popular items found in feature matrix.")
        return None

    pop_vectors = item_feature_matrix[indices]
    print(f"Extracted shape: {pop_vectors.shape}")
    return pop_vectors

In [None]:
# Subtask 4 & 5: Construct and Normalize cold-start profile
def construct_cold_start_profile(df_interactions, item_feature_matrix, df_items_map, top_n=50):
    """
    Build one profile vector from the centroid of the most popular items.

    The `top_n` most frequently rated items are looked up in the feature
    matrix, their vectors are averaged, and the average is scaled to unit
    L2 length. A zero vector is returned when none of the popular items is
    present in the feature matrix.
    """
    print("\n--- Constructing Cold-Start User Profile ---")

    popular_ids = identify_popular_items(df_interactions, top_n)
    popular_rows = extract_popular_vectors(popular_ids, item_feature_matrix, df_items_map)

    if popular_rows is None:
        # Nothing to average: fall back to an all-zero profile.
        n_features = item_feature_matrix.shape[1]
        return csr_matrix((1, n_features))

    # Centroid: column-wise sum divided by the number of popular rows.
    centroid = popular_rows.sum(axis=0) / popular_rows.shape[0]
    cold_start_vec = csr_matrix(centroid)

    # Scale to unit length unless the centroid is entirely zero.
    length = np.linalg.norm(cold_start_vec.data)
    if length > 0:
        cold_start_vec = cold_start_vec / length

    print(f"Cold-start profile constructed. Shape: {cold_start_vec.shape}")
    return cold_start_vec

### Subtask 6: Explain when this profile is used
This **Cold-Start Profile** is used whenever the system encounters a user with **zero interactions** (or fewer than a threshold, e.g., < 3).
Instead of returning random items, we use this profile to calculate cosine similarity against all items, effectively returning items that differ slightly from pure popularity but are semantically similar to the popular 'consensus'.

### Subtask 7: Justify the strategy
**Justification**:
1.  **Robustness**: Popular items are statistically significant 'safe bets' for unknown users.
2.  **Content-Aware**: By averaging *features* of popular items rather than just recommending IDs, we can recommend *niche* items that are similar content-wise to popular ones, improving diversity (Serendipity) compared to a simple "Top-N Popular" list.
3.  **Simplicity**: It effectively boosts the user into the vector space immediately without requiring expensive model retraining (SVD) or demographic data lookup.

In [None]:
# Main Execution for Cold Start (Function Wrapper)
def run_cold_start_module(df_interactions, item_feature_matrix, df_items_map):
    """
    Build the cold-start profile, store it on disk and return it.

    The profile is written to `cold_start_profile.npz` under RESULTS_DIR.
    """
    print("\n=== Running Cold-Start Module ===")
    cold_user_profile = construct_cold_start_profile(
        df_interactions,
        item_feature_matrix,
        df_items_map,
    )

    # Persist the profile so later stages can reload it.
    save_path = os.path.join(RESULTS_DIR, "cold_start_profile.npz")
    save_npz(save_path, cold_user_profile)
    print(f"Saved cold-start profile to {save_path}")

    return cold_user_profile

## 5. Similarity Computation and Recommendation

We now calculate the similarity between the user profile and all items to generate recommendations.

### Subtask 1: Ensure vector space alignment
We verify that the User Vector and Item Feature Matrix share the same number of dimensions before proceeding.

In [None]:
def check_vector_alignment(user_vec, item_matrix):
    """
    Confirm that the user vector and the item matrix have the same width.

    Returns True when the second dimensions match and raises ValueError
    otherwise.
    """
    print("\n--- Verifying Vector Alignment ---")
    n_user_features, n_item_features = user_vec.shape[1], item_matrix.shape[1]

    if n_user_features == n_item_features:
        print(f"Alignment Verified. Dimensions: {n_user_features}")
        return True

    raise ValueError(f"Dimension mismatch! User: {n_user_features}, Item: {n_item_features}")

### Subtask 2: Define cosine similarity formally

**Cosine Similarity** measures the cosine of the angle between two non-zero vectors. 
$$ \text{similarity} = \cos(\theta) = \frac{\mathbf{A} \cdot \mathbf{B}}{||\mathbf{A}|| \cdot ||\mathbf{B}||} $$

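When both vectors are L2-normalized (unit length), the denominator equals 1 and the calculation reduces to the **Dot Product**. Note that this holds exactly only for unit-length vectors: the TF-IDF rows and the user profiles are normalized during construction, but the combined item vectors add [0, 1] features on top of unit-length TF-IDF rows, so their norms exceed 1.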
$$ \text{similarity} \approx \mathbf{A} \cdot \mathbf{B} $$

In [None]:
# Subtask 3: Compute user–item similarity scores
def compute_cosine_similarity(user_vec, item_matrix):
    """
    Compute the cosine similarity between a user vector and every item row.

    The dot products are divided by the L2 norms of both sides, so the result
    is a true cosine even when the inputs are not unit length. For inputs
    that are already L2-normalized the result equals the plain dot product.
    Items (or a user vector) with zero norm get a score of 0.

    Args:
        user_vec: (1 x F) user profile, sparse or dense.
        item_matrix: (N x F) item-feature matrix, sparse or dense.

    Returns:
        1-D numpy array of N similarity scores, in item row order.
    """
    print("Computing Cosine Similarity...")

    # Normalize the input types so dense inputs work as well as sparse ones.
    user_vec = csr_matrix(user_vec)
    item_matrix = csr_matrix(item_matrix)

    # Dot product: (N x F) . (F x 1) = (N x 1), flattened to length N.
    dots = item_matrix.dot(user_vec.T).toarray().flatten()

    item_norms = np.sqrt(np.asarray(item_matrix.power(2).sum(axis=1)).flatten())
    user_norm = np.sqrt(user_vec.power(2).sum())
    denom = item_norms * user_norm

    # Divide only where the denominator is non-zero; other scores stay 0.
    similarity_scores = np.divide(
        dots, denom, out=np.zeros(dots.shape, dtype=float), where=denom > 0
    )

    print(f"Computed {len(similarity_scores)} similarity scores.")
    return similarity_scores

In [None]:
# Subtask 4: Store similarity scores properly
def save_similarity_scores(scores, item_ids, filename="similarity_scores.csv"):
    """
    Write all item scores to a CSV file under RESULTS_DIR, best score first.

    Returns the sorted DataFrame with columns ['item_id', 'score'].
    """
    print(f"Saving similarity scores to {filename}...")
    path = os.path.join(RESULTS_DIR, filename)

    # Pair every item with its score and order from highest to lowest.
    score_table = pd.DataFrame({'item_id': item_ids, 'score': scores})
    df_scores = score_table.sort_values(by='score', ascending=False)

    # The full table is saved; no truncation is applied.
    df_scores.to_csv(path, index=False)
    print("File saved.")
    return df_scores

In [None]:
# Subtask 5: Verify similarity results
def verify_similarity_results(scores):
    """
    Print the score range and whether it lies inside [-1, 1].

    A tolerance of 0.01 is allowed on both ends before a warning is printed.
    Nothing is returned.
    """
    print("Verifying similarity scores...")
    lowest, highest = scores.min(), scores.max()

    print(f"Range: [{lowest:.4f}, {highest:.4f}]")

    out_of_range = lowest < -1.01 or highest > 1.01
    message = (
        "WARNING: Scores out of expected cosine range [-1, 1]. Check normalization."
        if out_of_range
        else "Verification Passed: Scores within valid range."
    )
    print(message)

### Subtask 6: Explain what similarity scores mean

The **Similarity Score** (ranging from -1 to 1) quantifies how close the item's content is to the user's preference profile.
- **Approaching 1**: The item is very semantically similar to what the user likes (or the popular consensus in the cold-start case).
- **Approaching 0**: The item is orthogonal (unrelated) to the user's profile.
- **Approaching -1**: The item is opposite (rare in positive-only Feature spaces like TF-IDF, but possible mathematically).

In [None]:
# Main Execution for Similarity (Function Wrapper)
def run_similarity_module(user_vec, item_matrix, df_items_map,
                          filename="cold_start_similarity_scores.csv"):
    """
    Run the similarity pipeline: align, score, verify and save.

    Args:
        user_vec: (1 x F) user profile vector.
        item_matrix: (N x F) item-feature matrix.
        df_items_map: object whose 'item_id' entry lists item ids in the row
            order of `item_matrix`.
        filename: name of the CSV written under RESULTS_DIR. The default
            keeps the original cold-start file name; pass another name when
            scoring an existing user so results are not overwritten.

    Returns:
        DataFrame of item ids and scores, sorted by score descending.

    Raises:
        ValueError: if the user vector and item matrix widths differ.
    """
    print("\n=== Running Similarity Component ===")

    # 1. Check Alignment
    check_vector_alignment(user_vec, item_matrix)

    # 2. Compute
    scores = compute_cosine_similarity(user_vec, item_matrix)

    # 3. Verify
    verify_similarity_results(scores)

    # 4. Save
    df_scored = save_similarity_scores(scores, df_items_map['item_id'], filename=filename)

    return df_scored

## 5.2. Generate Top-N Recommendations

We rank items by their similarity scores and exclude items the user has already rated to generate the final Top-N list.

In [None]:

# Subtask 1: Decide which users you will generate recommendations for
def get_target_users(df_interactions, cold_start_profile, n_existing=2):
    """
    Builds the demo user set: one simulated cold-start user plus the
    `n_existing` users with the most interactions.

    Selection is deterministic (most active first), not random.

    Returns a dict {label: {'type': 'cold' | 'existing', 'data': ...}} where
    'data' is the given profile for the cold-start entry and the raw user_id
    for existing users.
    """
    print("\n--- Selecting Target Users ---")

    target_users = {
        'Cold_Start_User': {'type': 'cold', 'data': cold_start_profile},
    }

    # value_counts() sorts by interaction count, descending. Only the 50 most
    # active users are considered, so n_existing is effectively capped at 50.
    most_active = df_interactions['user_id'].value_counts().index[:50].tolist()

    # Existing users carry only their id; their vector is built later.
    target_users.update(
        (f'User_{uid}', {'type': 'existing', 'data': uid})
        for uid in most_active[:n_existing]
    )

    print(f"Selected {len(target_users)} target users: {list(target_users.keys())}")

    return target_users




In [None]:
def get_target_users_1(df_interactions, cold_start_profile, n_existing=2, user_col='user_id'):
    """
    Defensive variant of target-user selection with input validation.

    Always includes the cold-start entry; adds up to `n_existing` of the most
    active users (taken from the top 50 by interaction count) found in
    `user_col`.

    Raises:
        ValueError: if `user_col` is not a column of `df_interactions`.
    """
    print("\n--- Selecting Target Users ---")
    target_users = {'Cold_Start_User': {'type': 'cold', 'data': cold_start_profile}}

    # Nothing to select from: return the cold-start entry only.
    has_rows = df_interactions is not None and len(df_interactions) > 0
    if not has_rows:
        print("WARNING: df_interactions is empty. No existing users can be selected.")
        return target_users

    if user_col not in df_interactions.columns:
        raise ValueError(f"'{user_col}' not found in df_interactions columns: {list(df_interactions.columns)}")

    # Missing ids are dropped before counting interactions per user.
    interaction_counts = df_interactions[user_col].dropna().value_counts()
    if interaction_counts.empty:
        print(f"WARNING: No valid users found in column '{user_col}'.")
        return target_users

    most_active = interaction_counts.index[:50].tolist()
    target_users.update(
        (f'User_{uid}', {'type': 'existing', 'data': uid})
        for uid in most_active[:n_existing]
    )

    print(f"Selected {len(target_users)} target users: {list(target_users.keys())}")
    return target_users


In [None]:
# Subtask 2: Build “already-rated items” set per user
def get_user_rated_items(df_interactions, user_id):
    """
    Collects the distinct item_ids that `user_id` has interacted with.

    A `user_id` of None denotes the cold-start case and yields an empty set.
    """
    if user_id is None:
        return set()

    belongs_to_user = df_interactions['user_id'] == user_id
    return set(df_interactions.loc[belongs_to_user, 'item_id'].unique())

In [None]:
# Subtask 3 & 4: Filter and Rank candidate items
def rank_and_filter_items(similarity_scores, item_ids, rated_items_set):
    """
    Builds the ranked candidate list for one user.

    Scores and item ids are paired positionally (they are assumed to be
    index-aligned), items in `rated_items_set` are dropped, and the rest are
    returned as (score, item_id) tuples sorted by score, highest first.
    """
    print(f"Ranking items... (Total candidates: {len(item_ids)}) ")

    unseen = [
        (score, iid)
        for score, iid in zip(similarity_scores, item_ids)
        if iid not in rated_items_set
    ]

    n_removed = len(item_ids) - len(unseen)
    print(f"Filtered out {n_removed} rated items. Remaining candidates: {len(unseen)}")

    # Stable sort: items with equal scores keep their original relative order.
    return sorted(unseen, key=lambda pair: pair[0], reverse=True)

In [None]:
# Subtask 5: Select Top-10 and Top-20
def generate_top_n_recommendations(sorted_candidates, top_n_list=[10, 20]):
    """
    Cuts an already-ranked candidate list at each requested N.

    Returns a dict {n: first n entries of sorted_candidates}.
    """
    return {n: sorted_candidates[:n] for n in top_n_list}

In [None]:
# Subtask 6: Save recommendations in report-friendly format
def save_recommendations(user_label, top_n_results, df_items_map):
    """
    Saves the largest Top-N list for a user to CSV with item details.

    Args:
        user_label: Label written to the 'User' column and used in the
            output file name.
        top_n_results: Dict {n: [(score, item_id), ...]}; only the list
            stored under the largest n is saved. An empty dict is treated as
            "no recommendations".
        df_items_map: DataFrame with an 'item_id' column; 'title' and
            'categories' are looked up when present, otherwise 'Unknown'.

    Returns:
        DataFrame with columns Rank, User, Item_ID, Score, Title, Category,
        identical to what is written to RESULTS_DIR. The columns are present
        even when there are no rows, so callers can always select them.
    """
    print(f"Saving recommendations for {user_label}...")

    columns = ['Rank', 'User', 'Item_ID', 'Score', 'Title', 'Category']

    # Map item_id to its metadata row for O(1) lookups.
    item_map = df_items_map.set_index('item_id').to_dict('index')

    # Only the list for the largest N is written out.
    top_items = top_n_results[max(top_n_results)] if top_n_results else []

    data = []
    for rank, (score, iid) in enumerate(top_items, start=1):
        info = item_map.get(iid, {})
        data.append({
            'Rank': rank,
            'User': user_label,
            'Item_ID': iid,
            'Score': round(score, 6),
            'Title': info.get('title', 'Unknown'),
            'Category': info.get('categories', 'Unknown')
        })

    # Explicit columns keep the schema stable when `data` is empty.
    df_recs = pd.DataFrame(data, columns=columns)
    filename = f"recommendations_{user_label}.csv"
    path = os.path.join(RESULTS_DIR, filename)
    df_recs.to_csv(path, index=False)

    print(f"Saved to {path}")
    return df_recs

### Subtask 8: Explanation of ranking logic

The ranking is purely based on the **Cosine Similarity Score**.
1.  We calculate the similarity between the User Profile (weighted average of their history) and every Item Vector.
2.  We **exclude** items the user has already rated to ensure novelty.
3.  We **sort** the remaining items in descending order of similarity.
4.  The top items represent the "best match" in the vector space.

In [None]:
# Subtask 7: Provide example outputs for 2-3 users (Orchestrator)
def run_recommendation_pipeline(df_interactions, item_feature_matrix, df_items_map, cold_start_profile):
    """
    Runs the full recommendation process for selected users.

    For every target user (the cold-start profile plus two existing users)
    the pipeline obtains a profile vector, scores all items by cosine
    similarity, removes already-rated items, keeps the Top-10/Top-20 and
    saves the result.

    Args:
        df_interactions: DataFrame with 'user_id', 'item_id' and 'rating'.
        item_feature_matrix: Sparse item-feature matrix. Its rows are assumed
            to follow the order of df_items_map['item_id'] (not checked here).
        df_items_map: DataFrame with an 'item_id' column and item metadata.
        cold_start_profile: Ready-made profile vector for the cold-start user.

    Returns:
        List with one recommendations DataFrame per processed user.
    """
    print("\n=== Running Recommendation Pipeline ===")

    # 1. Select Users
    target_users = get_target_users(df_interactions, cold_start_profile, n_existing=2)

    # These lookups do not depend on the user, so build them once.
    item_ids = df_items_map['item_id'].tolist()
    item_to_idx = {iid: idx for idx, iid in enumerate(item_ids)}

    all_recs = []

    for label, info in target_users.items():
        print(f"\nProcessing {label}...")

        # Get User Vector
        if info['type'] == 'cold':
            user_vec = info['data']
            rated_items = set()
        else:
            # Profiles are recomputed on the fly for the selected user.
            uid = info['data']
            rated_items = get_user_rated_items(df_interactions, uid)
            user_vec = _build_user_profile_vector(
                df_interactions, uid, rated_items, item_to_idx, item_feature_matrix
            )

        # 2. Compute Similarity
        scores = compute_cosine_similarity(user_vec, item_feature_matrix)

        # 3. Filter & Rank
        candidates = rank_and_filter_items(scores, item_ids, rated_items)

        # 4. Top N
        top_results = generate_top_n_recommendations(candidates, [10, 20])

        # 5. Save
        df_rec = save_recommendations(label, top_results, df_items_map)
        all_recs.append(df_rec)

        print(f"Top 5 for {label}: ")
        print(df_rec[['Title', 'Score']].head(5))

    return all_recs


def _build_user_profile_vector(df_interactions, user_id, rated_items, item_to_idx, item_feature_matrix):
    """Builds the L2-normalised, rating-weighted average of a user's rated item vectors."""
    user_data = df_interactions[df_interactions['user_id'] == user_id]
    ratings_map = dict(zip(user_data['item_id'], user_data['rating']))

    # Keep only rated items that exist in the feature matrix.
    valid_indices = []
    valid_ratings = []
    for iid in rated_items:
        if iid in item_to_idx:
            valid_indices.append(item_to_idx[iid])
            valid_ratings.append(ratings_map[iid])

    if not valid_indices:
        # No usable history: fall back to an all-zero profile.
        return csr_matrix((1, item_feature_matrix.shape[1]))

    item_vecs = item_feature_matrix[valid_indices]
    ratings_arr = np.array(valid_ratings).reshape(-1, 1)
    weighted = item_vecs.multiply(ratings_arr).sum(axis=0)

    # Skip the averaging step when ratings sum to zero; dividing would
    # produce NaN/inf, and the L2 normalisation below rescales anyway.
    total_rating = np.sum(valid_ratings)
    if total_rating != 0:
        weighted = weighted / total_rating
    user_vec = csr_matrix(weighted)

    # Normalize
    norm = np.linalg.norm(user_vec.data)
    if norm > 0:
        user_vec = user_vec / norm
    return user_vec

# 6. k-Nearest Neighbors (k-NN)

## 6.1. Implement Item-Based k-NN

We implement a memory-efficient Item-based k-NN. Instead of computing and storing the full $N \times N$ similarity matrix (which can be huge), we compute similarities for small batches of rows at a time and only store the **Top-K** nearest neighbors for each item.

### Subtask 1: Choose the item representation
We use the **Feature Matrix** constructed in Section 3 (`final_feature_matrix`) as the item representation. It combines TF-IDF, numerical, and categorical features.

In [None]:
import numpy as np
from scipy.sparse import csr_matrix, vstack
import heapq

# Subtask 2 & 3: Compute item-item similarity and find k most similar (Memory Efficient)
def compute_top_k_similar_items(item_feature_matrix, df_items_map, k_list=(10, 20)):
    """
    Computes, for every item, its most similar items by dot product.

    Similarities are computed in batches of 100 rows against the whole
    matrix, so only one (batch x N) block is held in memory at a time. The
    dot product equals cosine similarity only if the rows are L2-normalised
    (assumed; not checked here).

    Args:
        item_feature_matrix: (N x F) item-feature matrix, sparse or dense.
        df_items_map: DataFrame whose 'item_id' column lists the ids of the
            N rows in matrix order.
        k_list: Neighbourhood sizes of interest; max(k_list) neighbours are
            kept per item.

    Returns:
        Dict {item_id: [(score, neighbor_id), ...]} sorted by score
        descending. Only neighbours with strictly positive similarity are
        kept, so a list may hold fewer than max(k_list) entries.

    Raises:
        ValueError: if the number of item ids differs from the number of rows.
    """
    print("\n--- Computing Item-Item Similarity (Top-K) ---")

    max_k = max(k_list)
    n_items = item_feature_matrix.shape[0]
    item_ids = df_items_map['item_id'].tolist()

    # Row i of the matrix is labelled with item_ids[i]; a length mismatch
    # would mislabel neighbours or fail midway with an IndexError.
    if len(item_ids) != n_items:
        raise ValueError(
            f"df_items_map has {len(item_ids)} item ids but item_feature_matrix has {n_items} rows"
        )

    # Output dictionary
    item_neighbors = {}

    batch_size = 100
    features_t = item_feature_matrix.T  # loop-invariant

    print(f"Processing {n_items} items in batches of {batch_size}...")

    for start_idx in range(0, n_items, batch_size):
        end_idx = min(start_idx + batch_size, n_items)

        # (Batch, F) @ (F, N) = (Batch, N)
        sim_batch = item_feature_matrix[start_idx:end_idx].dot(features_t)

        # Densify any sparse result, whatever its format (csr, csc, ...).
        if hasattr(sim_batch, "toarray"):
            sim_batch = sim_batch.toarray()

        for offset, scores in enumerate(sim_batch):
            current_item_idx = start_idx + offset

            # Exclude the item itself from its own neighbour list.
            scores[current_item_idx] = -1.0

            # Full argsort is O(N log N) per row; kept for simplicity and
            # for a deterministic descending order.
            top_indices = np.argsort(scores)[-max_k:][::-1]

            item_neighbors[item_ids[current_item_idx]] = [
                (float(scores[idx]), item_ids[idx])
                for idx in top_indices
                if scores[idx] > 0  # only positive similarity
            ]

        if (start_idx // batch_size) % 10 == 0:
             print(f"Processed {end_idx}/{n_items} items...")

    print("Top-K Neighbor computation complete.")
    return item_neighbors

In [None]:
# Subtask 4: Define prediction formula (Weighted Average)
def predict_rating_knn(user_id, target_item_id, item_neighbors, user_ratings_map, k=20):
    """
    Predicts a user's rating for one item with item-based k-NN.

    Formula: Sum(sim(i,j) * r_uj) / Sum(|sim(i,j)|), where j ranges over the
    first k precomputed neighbours of the target item that the user has rated.

    The neighbourhood comes from the precomputed neighbour lists, so it is
    the item's global top-k intersected with the user's rated items, not the
    k most similar items among those the user rated.

    Returns None when no neighbour in the top k has a rating from the user
    (or the similarity mass is zero), signalling that no k-NN prediction is
    possible. `user_id` is accepted for interface symmetry and is not used.
    """
    nearest = item_neighbors.get(target_item_id, [])[:k]

    # (similarity, user's rating) for each neighbour the user has rated.
    contributions = [
        (sim, user_ratings_map[neighbor_id])
        for sim, neighbor_id in nearest
        if neighbor_id in user_ratings_map
    ]

    # Edge case: the user rated none of the item's nearest neighbours.
    if not contributions:
        return None

    numerator = 0.0
    denominator = 0.0
    for sim, rating in contributions:
        numerator += sim * rating
        denominator += abs(sim)

    # Edge case: all contributing similarities are zero.
    if denominator == 0:
        return None

    return numerator / denominator

In [None]:
# Subtask 6 & 7: Generate predictions and save
def generate_knn_recommendations(target_users, item_neighbors, df_interactions, df_items_map, k_list=(10, 20)):
    """
    Generates and saves Top-20 k-NN recommendations for existing users.

    Instead of predicting a rating for every unrated item, candidates are
    limited to unrated items that appear in the neighbour lists of items the
    user has rated. Each candidate is scored with `predict_rating_knn` using
    k = max(k_list); candidates without a prediction are dropped.

    Args:
        target_users: Dict {label: {'type': ..., 'data': ...}}; entries of
            type 'cold' are skipped because k-NN needs a rating history.
        item_neighbors: Dict {item_id: [(score, neighbor_id), ...]}.
        df_interactions: DataFrame with 'user_id', 'item_id' and 'rating'.
        df_items_map: DataFrame with 'item_id' and (optionally) 'title'.
        k_list: Neighbourhood sizes; only the largest is used.

    Returns:
        List with one DataFrame per processed user (columns Rank, User,
        Item_ID, Predicted_Rating, Title, Method); each is also written to
        RESULTS_DIR as CSV.
    """
    print("\n--- Generating KNN Recommendations ---")
    all_results = []

    columns = ['Rank', 'User', 'Item_ID', 'Predicted_Rating', 'Title', 'Method']
    item_map = df_items_map.set_index('item_id').to_dict('index')

    # Use max K for finding best items
    max_k = max(k_list)

    for user_label, info in target_users.items():
        if info['type'] == 'cold':
            print(f"Skipping {user_label} for KNN (Requires history).")
            continue

        user_id = info['data']
        print(f"generating for {user_label} ({user_id})...")

        # 1. Build User Rating Map
        user_data = df_interactions[df_interactions['user_id'] == user_id]
        user_ratings_map = dict(zip(user_data['item_id'], user_data['rating']))
        rated_items = set(user_ratings_map.keys())

        # 2. Candidates: unrated neighbours of rated items. Top-K lists are
        # not guaranteed to be symmetric, so a candidate may not list the
        # rated item among its own neighbours; such candidates get no
        # prediction below and are dropped.
        candidate_set = {
            n_id
            for rated_item in rated_items
            for _, n_id in item_neighbors.get(rated_item, [])
            if n_id not in rated_items
        }

        print(f"Identified {len(candidate_set)} candidate items via neighbor expansion.")

        # 3. Predict & Rank
        candidates_scored = []
        for item_id in candidate_set:
            pred = predict_rating_knn(user_id, item_id, item_neighbors, user_ratings_map, k=max_k)
            if pred is not None:
                candidates_scored.append((pred, item_id))

        # Highest prediction first; ties are broken by item id so the output
        # does not depend on set iteration order.
        candidates_scored.sort(key=lambda x: (-x[0], str(x[1])))

        # Select Top 20
        top_20 = candidates_scored[:20]

        out_data = [
            {
                'Rank': rank,
                'User': user_id,
                'Item_ID': iid,
                'Predicted_Rating': round(score, 4),
                'Title': item_map.get(iid, {}).get('title', 'Unknown'),
                'Method': f'Item-KNN (k={max_k})'
            }
            for rank, (score, iid) in enumerate(top_20, start=1)
        ]

        # Explicit columns keep the CSV schema stable when nothing was scored.
        df_out = pd.DataFrame(out_data, columns=columns)
        filename = f"knn_recommendations_{user_id}.csv"
        path = os.path.join(RESULTS_DIR, filename)
        df_out.to_csv(path, index=False)
        print(f"Saved KNN recs to {path}")
        all_results.append(df_out)

    return all_results

## 6.2. Compare Content-Based and k-NN Approaches

We compare the two methods using a consistent evaluation setup.

In [None]:
# Subtask 1-5: Comparison Module
def evaluate_and_compare(target_users, df_interactions, item_feature_matrix, item_neighbors, df_items_map,
                         n_negatives=100, seed=None):
    """
    Evaluates CB and KNN on a 'Leave-One-Out' task for the existing users.

    Metric: Hit Rate @ 10 (is the hidden item in the Top 10?). The last
    interaction row of each user is hidden and the rest is used as history.

    NOTE(review): the two methods are ranked against different candidate
    pools. Content-Based ranks the hidden item against the whole catalogue,
    while k-NN ranks it against at most `n_negatives` sampled items, so the
    two hit rates are not strictly comparable.

    Args:
        target_users: Dict {label: {'type': ..., 'data': ...}}; only entries
            of type 'existing' with at least 2 interactions are evaluated.
        df_interactions: DataFrame with 'user_id', 'item_id' and 'rating'.
        item_feature_matrix: Sparse item-feature matrix. Its rows are assumed
            to follow the order of df_items_map['item_id'].
        item_neighbors: Dict {item_id: [(score, neighbor_id), ...]}.
        df_items_map: DataFrame with an 'item_id' column.
        n_negatives: Number of unrated items sampled (without replacement)
            as k-NN negatives; fewer are used if the catalogue is smaller.
        seed: Optional seed for the negative sampling; None keeps it
            non-reproducible, as before.

    Returns:
        DataFrame with columns User, CB_Hit_10, KNN_Hit_10, Hidden_Item
        (also written to RESULTS_DIR/method_comparison.csv).
    """
    import random

    print("\n=== Comparing Content-Based vs k-NN (Leave-One-Out Evaluation) ===")

    rng = random.Random(seed)

    # User-independent lookups, built once. `all_ids` is needed by both
    # methods, so it must exist even when the CB branch is skipped.
    all_ids = df_items_map['item_id'].tolist()
    item_to_idx = {iid: idx for idx, iid in enumerate(all_ids)}

    results = []

    for info in target_users.values():
        if info['type'] != 'existing':
            continue

        user_id = info['data']

        # 1. Get User History
        user_data = df_interactions[df_interactions['user_id'] == user_id]
        if len(user_data) < 2:
            continue

        # 2. Leave One Out (Last one)
        hidden_item = user_data.iloc[-1]['item_id']
        train_data = user_data.iloc[:-1]
        train_items = set(train_data['item_id'].unique())
        train_map = dict(zip(train_data['item_id'], train_data['rating']))

        print(f"Evaluating User {user_id}. Hidden Item: {hidden_item}")

        cb_hit = 0
        knn_hit = 0

        # --- Method A: Content-Based ---
        # Rebuild the profile from TRAIN items only.
        known_items = [iid for iid in train_items if iid in item_to_idx]
        if known_items:
            indices = [item_to_idx[iid] for iid in known_items]
            ratings_arr = np.array([train_map[iid] for iid in known_items]).reshape(-1, 1)

            # Rating-weighted average; the division is skipped when ratings
            # sum to zero to avoid NaN/inf (L2 normalisation follows anyway).
            weighted = item_feature_matrix[indices].multiply(ratings_arr).sum(axis=0)
            total_rating = np.sum(ratings_arr)
            if total_rating != 0:
                weighted = weighted / total_rating
            user_vec = csr_matrix(weighted)

            norm = np.linalg.norm(user_vec.data)
            if norm > 0:
                user_vec = user_vec / norm

            # Score all items, then rank everything outside the train set
            # (the hidden item stays among the candidates).
            scores = compute_cosine_similarity(user_vec, item_feature_matrix)
            cb_candidates = [(s, iid) for s, iid in zip(scores, all_ids) if iid not in train_items]
            cb_candidates.sort(key=lambda x: x[0], reverse=True)

            top_10 = [c[1] for c in cb_candidates[:10]]
            if hidden_item in top_10:
                cb_hit = 1

        # --- Method B: Item-KNN ---
        # Candidate set = hidden item + sampled unrated negatives, to avoid
        # scoring the whole catalogue. Sampling from the eligible pool cannot
        # loop forever when few (or no) eligible items exist.
        pool = [iid for iid in all_ids if iid not in train_items and iid != hidden_item]
        negatives = rng.sample(pool, min(n_negatives, len(pool)))

        # The hidden item only counts as a hit if k-NN can actually score it;
        # previously unscorable candidates all defaulted to 0 and the stable
        # sort kept the hidden item first, producing spurious hits.
        hidden_score = predict_rating_knn(user_id, hidden_item, item_neighbors, train_map, k=20)
        if hidden_score is not None:
            n_better = 0
            for cand in negatives:
                score = predict_rating_knn(user_id, cand, item_neighbors, train_map, k=20)
                if score is not None and score > hidden_score:
                    n_better += 1
            if n_better < 10:
                knn_hit = 1

        results.append({
            'User': user_id,
            'CB_Hit_10': cb_hit,
            'KNN_Hit_10': knn_hit,
            'Hidden_Item': hidden_item
        })

    # Subtask 4: Build Comparison Table (explicit columns so an empty result
    # still has the expected schema).
    df_res = pd.DataFrame(results, columns=['User', 'CB_Hit_10', 'KNN_Hit_10', 'Hidden_Item'])
    print("\n--- Evaluation Results (Hit Rate @ 10) ---")
    print(df_res)
    save_path = os.path.join(RESULTS_DIR, "method_comparison.csv")
    df_res.to_csv(save_path, index=False)

    if df_res.empty:
        print("\nNo existing users with at least 2 interactions; nothing to compare.")
        return df_res

    # Subtask 5: Interpret
    cb_acc = df_res['CB_Hit_10'].mean()
    knn_acc = df_res['KNN_Hit_10'].mean()

    print("\n--- Interpretation ---")
    print(f"Content-Based Hit Rate: {cb_acc:.2f}")
    print(f"Item-KNN Hit Rate: {knn_acc:.2f}")

    if knn_acc > cb_acc:
        print("Conclusion: k-NN performed better. Collaborative signals (ratings) might be stronger than content Metadata here.")
    elif cb_acc > knn_acc:
        print("Conclusion: Content-Based performed better. Metadata specificities might be more effective than sparse rating overlaps.")
    else:
        print("Conclusion: Both methods performed similarly.")

    return df_res

# 7. Complete Numerical Example

## 7.1. Step-by-Step Numerical Walkthrough
This section provides a detailed numerical example using a small subset (3-5 items) to transparently demonstrate the calculations behind the scenes.

### Subtask 1: Select 3–5 sample items

In [None]:
def select_sample_items(df_items, n=5):
    """
    Takes the first `n` items as the sample for the numerical walkthrough.

    Works on a copy, so `df_items` is not modified. If the copy has no
    'text_source' column, one is built from 'title' and 'categories'
    (missing values treated as empty strings). Prints a one-line preview per
    item and returns the sample DataFrame.
    """
    print(f"\n--- [Numerical Example] Selecting {n} Sample Items ---")

    sample = df_items.head(n).copy()

    # Rebuild the combined text locally when it is absent.
    has_text = 'text_source' in sample.columns
    if not has_text:
        title_part = sample['title'].fillna('')
        category_part = sample['categories'].fillna('')
        sample['text_source'] = title_part + " " + category_part

    # Preview: id, title and the first 50 characters of the text.
    for _, item in sample.iterrows():
        preview = item['text_source'][:50]
        print(f"Item {item['item_id']}: {item['title']} | Text: {preview}...")

    return sample

In [None]:
# Subtask 2-4: Build Vocabulary and Compute TF/IDF (Small Scale)
def run_step_by_step_tfidf(sample_df):
    """
    Manually computes TF, IDF, and TF-IDF for the sample set to show the math.

    Each intermediate result is printed: vocabulary, raw term counts,
    smoothed IDF, unnormalised TF-IDF and the L2-normalised vectors.

    Args:
        sample_df: DataFrame with 'text_source' and 'item_id' columns.

    Returns:
        DataFrame of L2-normalised TF-IDF weights (rows indexed by item_id,
        one column per vocabulary term); it is also written to
        RESULTS_DIR/numerical_example_tfidf.csv.
    """
    print("\n--- [Numerical Example] TF-IDF Calculation ---")

    # 1. Tokenize
    texts = sample_df['text_source'].tolist()
    processed_docs = manual_tokenize_and_clean(texts)

    # 2. Build Vocabulary (Subtask 2): sorted unique terms of this subset
    unique_terms = sorted(set(term for doc in processed_docs for term in doc))
    vocab = {term: i for i, term in enumerate(unique_terms)}
    print(f"\nSample Vocabulary ({len(vocab)} terms): {list(vocab.keys())}")

    # 3. Compute TF (Subtask 3): TF(t, d) = count of t in d
    tfs = []
    print("\nTerm Frequencies (TF):")
    for i, doc in enumerate(processed_docs):
        counts = Counter(doc)
        row = [counts[term] for term in unique_terms]
        tfs.append(row)
        print(f"Doc {i} ({sample_df.iloc[i]['item_id']}): {dict(zip(unique_terms, row))}")

    # 4. Compute IDF (Subtask 4): IDF(t) = log((N+1) / (df(t)+1)) + 1
    N = len(processed_docs)
    # Document frequency = number of docs containing the term. Sets make each
    # membership test O(1) instead of scanning the token list.
    doc_term_sets = [set(doc) for doc in processed_docs]
    dfs = [
        sum(1 for doc_terms in doc_term_sets if term in doc_terms)
        for term in unique_terms
    ]

    idfs = []
    print("\nInverse Document Frequencies (IDF) [log((N+1)/(df+1)) + 1]:")
    for term, doc_freq in zip(unique_terms, dfs):
        val = math.log((N + 1) / (doc_freq + 1)) + 1
        idfs.append(val)
        print(f"Term '{term}': DF={doc_freq}, IDF={val:.4f}")

    # 5. Compute TF-IDF (Subtask 5)
    print("\nTF-IDF Matrix (TF * IDF) before Normalization:")
    tfidf_matrix = []
    for i, tf_row in enumerate(tfs):
        vec = [tf * idf for tf, idf in zip(tf_row, idfs)]
        tfidf_matrix.append(vec)
        vec_str = ", ".join([f"{v:.2f}" for v in vec])
        print(f"Doc {i}: [{vec_str}]")

    # Normalize each document vector to unit L2 length (all-zero vectors are
    # left unchanged).
    print("\nL2 Normalization:")
    final_matrix = []
    for i, vec in enumerate(tfidf_matrix):
        norm = math.sqrt(sum(v*v for v in vec))
        if norm > 0:
            norm_vec = [v/norm for v in vec]
        else:
            norm_vec = vec
        final_matrix.append(norm_vec)
        norm_vec_str = ", ".join([f"{v:.3f}" for v in norm_vec])
        print(f"Doc {i} Normalized: [{norm_vec_str}]")

    # Return as DataFrame for easier handling downstream
    df_tfidf = pd.DataFrame(final_matrix, columns=unique_terms, index=sample_df['item_id'])

    # Save for report
    df_tfidf.to_csv(os.path.join(RESULTS_DIR, "numerical_example_tfidf.csv"))
    return df_tfidf


In [None]:
# Subtask 6: Define Sample User & Ratings
def define_sample_user(sample_df):
    """
    Builds the fixed ratings of the demo user over the sample items.

    The first item is rated 5.0, the second 2.0 and, when present, the third
    4.0. Returns {item_id: rating}, or an empty dict if the sample has fewer
    than two items.
    """
    print("\n--- [Numerical Example] Defining Sample User ---")

    items = sample_df['item_id'].tolist()
    if len(items) < 2:
         print("Not enough items to define sample user.")
         return {}

    # Loved item 0, disliked item 1, liked item 2 (if the sample has one).
    demo_ratings = (5.0, 2.0, 4.0)
    user_ratings = dict(zip(items, demo_ratings))

    print(f"User Ratings: {user_ratings}")
    return user_ratings

In [None]:
# Subtask 7: Construct User Profile
def construct_sample_profile(user_ratings, df_tfidf_sample):
    """
    Builds the demo user's profile as the rating-weighted average of the
    TF-IDF vectors of the items they rated, then L2-normalises it.

    Ratings for items missing from `df_tfidf_sample` are ignored. Prints each
    contribution and the five largest profile components; returns the
    profile as a 1-D NumPy array with one entry per term.
    """
    print("\n--- [Numerical Example] User Profile Construction ---")

    terms = df_tfidf_sample.columns.tolist()

    # Running totals for Sum[r * v] and Sum[r].
    accumulator = np.zeros(df_tfidf_sample.shape[1])
    rating_total = 0.0

    print("Calculation (Sum[r * v] / Sum[r]):")

    known_ids = df_tfidf_sample.index
    for iid, rating in user_ratings.items():
        if iid not in known_ids:
            continue
        accumulator += rating * df_tfidf_sample.loc[iid].values
        rating_total += rating
        print(f" + (Rating {rating}) * Vector[{iid}]")

    # Average only when there is positive rating mass to divide by.
    user_profile = accumulator / rating_total if rating_total > 0 else accumulator

    # Normalize to unit length (skipped for an all-zero profile).
    length = np.linalg.norm(user_profile)
    if length > 0:
        user_profile = user_profile / length

    print(f"\nFinal Normalized User Profile Vector (Top 5 features):")
    # Show only the five largest components for readability.
    for idx in np.argsort(user_profile)[::-1][:5]:
        print(f"'{terms[idx]}': {user_profile[idx]:.4f}")

    return user_profile

In [None]:
# Subtask 8 & 9: Compute Similarity & Rank
def compute_sample_similarity_and_rank(user_profile, df_tfidf_sample):
    """
    Scores every sample item against the user profile with a dot product,
    ranks the items by score (highest first), prints the ranking and saves
    it to RESULTS_DIR/numerical_example_results.csv.

    Returns the ranked list of (score, item_id) tuples.
    """
    print("\n--- [Numerical Example] Similarity & Ranking ---")

    scored = []
    for iid, row in df_tfidf_sample.iterrows():
        similarity = np.dot(user_profile, row.values)
        scored.append((similarity, iid))
        print(f"Item {iid}: Score = Dot(User, Item) = {similarity:.6f}")

    # Stable descending sort: equal scores keep their original order.
    results = sorted(scored, key=lambda pair: pair[0], reverse=True)

    print("\n--- Top Recommendations --- ")
    out_list = []
    for rank, (score, iid) in enumerate(results, start=1):
        print(f"{rank}. Item {iid} (Score: {score:.4f})")
        out_list.append({'Rank': rank, 'Item_ID': iid, 'Score': score})

    # Save
    output_path = os.path.join(RESULTS_DIR, "numerical_example_results.csv")
    pd.DataFrame(out_list).to_csv(output_path, index=False)
    return results

In [None]:
# Orchestrator for Section 7
def run_numerical_example_pipeline(df_items):
    """
    Orchestrates the Section 7 numerical walkthrough on a 5-item sample:
    sample selection, step-by-step TF-IDF, demo user definition, profile
    construction, and similarity ranking. Returns nothing; each stage prints
    its own output.
    """
    print(
        "\n==============================================",
        "       STARTING SECTION 7: NUMERICAL EXAMPLE       ",
        "==============================================",
        sep="\n",
    )

    # Steps 1-5: pick the sample and compute its TF-IDF table.
    sample = select_sample_items(df_items, n=5)
    df_tfidf_sample = run_step_by_step_tfidf(sample)

    # Steps 6-7: demo user ratings, then the profile built from them.
    user_profile = construct_sample_profile(define_sample_user(sample), df_tfidf_sample)

    # Steps 8-9: score and rank the sample items for that profile.
    compute_sample_similarity_and_rank(user_profile, df_tfidf_sample)

    print("\n[Numerical Example Completed Successfully]")