In [None]:
import os
import joblib
import json
import logging
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

In [None]:
# Configure the root logger at INFO level so the logging.info / logging.error
# messages emitted by the functions below are shown; the logging.debug calls in
# hybrid_recommendations stay suppressed at this level.
logging.basicConfig(level=logging.INFO)

In [None]:
# Global variables for the loaded model and auxiliary data.
# All start as None; model_fn assigns them, and hybrid_recommendations returns
# an empty list while any of them is still None.
model = None  # object loaded from 'als_model.joblib'; must expose .recommend(...)
user_inv_map = None  # lookup: raw user id -> user index used with train_sparse / model.recommend
item_map = None  # lookup (.get): item index returned by model.recommend -> original item id
train_sparse = None  # interaction matrix indexed by user index (an ndarray is converted to CSR on load)
train_data = None  # pandas DataFrame read from 'train_data.csv'; its 'visitorid' and 'itemid' columns are used
itemid_to_index = None  # lookup (.get): item id -> row index into tfidf_matrix
item_ids = None  # sequence; item_ids[i] is the item id for row i of tfidf_matrix
tfidf_matrix = None  # item feature matrix, one row per item (an ndarray is converted to CSR on load)

In [None]:
def hybrid_recommendations(user_id, alpha=0.5, top_n=10):
    """
    Generates hybrid (ALS and Content-Based) recommendations for a user.

    Each candidate item is scored as
    ``alpha * als_score + (1 - alpha) * cb_score``; an item that is missing
    from one of the two sources contributes 0 for that source. Items the user
    has already interacted with are excluded from the content-based
    candidates, mirroring ``filter_already_liked_items=True`` on the ALS side.

    Args:
        user_id: Raw user identifier. It is looked up in ``user_inv_map`` and
            matched against the ``visitorid`` column of ``train_data``.
        alpha: Weight of the ALS score; ``1 - alpha`` weights the
            content-based (TF-IDF cosine similarity) score.
        top_n: Maximum number of item IDs to return.

    Returns:
        A list of at most ``top_n`` item IDs ordered by descending blended
        score. An empty list is returned when the artifacts have not been
        loaded, ``top_n`` is not a positive integer, the user is unknown, or
        the ALS step raises. When the user has no interactions with content
        features, or ``item_ids`` and ``tfidf_matrix`` disagree in length, the
        ALS ranking alone is returned.
    """
    # Check for all required dependencies (they are module globals that
    # model_fn populates; they are only read here, so no `global` is needed).
    dependencies = (model, user_inv_map, item_map, train_sparse, train_data,
                    itemid_to_index, item_ids, tfidf_matrix)
    if any(dep is None for dep in dependencies):
        logging.error("Inference dependencies not loaded. Cannot proceed.")
        return []

    # A non-positive top_n would otherwise reach the ALS call as N <= 0 and
    # turn the final slice into "everything but the last k" for negatives.
    try:
        top_n = int(top_n)
    except (TypeError, ValueError):
        logging.error(f"Invalid top_n {top_n!r}; expected a positive integer.")
        return []
    if top_n <= 0:
        return []

    if user_id not in user_inv_map:
        logging.debug(f"User {user_id} not in training data")
        return []

    user_index = user_inv_map[user_id]

    # 1. ALS scores (Collaborative Filtering)
    try:
        # The call shape (userid, user_items, N, filter_already_liked_items)
        # and the (ids, scores) return value match the `implicit` library's
        # recommend API -- TODO confirm against the training code.
        # Twice top_n candidates are requested so blending has room to reorder.
        item_indices_als, scores = model.recommend(
            user_index,
            train_sparse[user_index],
            N=top_n * 2,
            filter_already_liked_items=True,
        )
        # Map item indices back to original item IDs, skipping unmapped ones.
        als_scores = {}
        for item_index, score in zip(item_indices_als, scores):
            mapped_id = item_map.get(item_index)
            if mapped_id is not None:
                als_scores[mapped_id] = score
    except Exception:
        # logging.exception keeps the traceback, so the concrete error type
        # (IndexError or otherwise) is still visible in the logs.
        logging.exception(f"Error in ALS recommendation for user {user_id}")
        return []

    # 2. Content-based scores
    user_interactions = train_data.loc[train_data['visitorid'] == user_id, 'itemid'].tolist()
    candidate_rows = (itemid_to_index.get(iid) for iid in user_interactions)
    item_indices_cb = [row for row in candidate_rows if row is not None]

    if not item_indices_cb:
        logging.debug(f"No valid item interactions for user {user_id}. Returning pure ALS.")
        return list(als_scores.keys())[:top_n]

    # User profile = average TF-IDF vector of the items the user interacted
    # with. np.asarray turns the np.matrix that sparse .mean() returns into a
    # plain ndarray.
    user_tfidf = np.asarray(tfidf_matrix[item_indices_cb].mean(axis=0))

    # Cosine similarity between the user profile and every item vector.
    sims = cosine_similarity(user_tfidf.reshape(1, -1), tfidf_matrix).flatten()

    if len(sims) != len(item_ids):
        logging.error(f"Mismatch: sims length {len(sims)} != item_ids length {len(item_ids)}")
        return list(als_scores.keys())[:top_n]

    # Items the user already interacted with are the most similar to their own
    # profile; without this filter they would dominate the blended ranking even
    # though the ALS side deliberately filters them out.
    seen_items = set(user_interactions)
    cb_scores = {}
    for idx, score in enumerate(sims):
        item_id = item_ids[idx]  # item_ids maps index back to original item ID
        if item_id not in seen_items:
            cb_scores[item_id] = score

    # 3. Combine scores (a missing score on either side counts as 0)
    combined_scores = {
        item: alpha * als_scores.get(item, 0) + (1 - alpha) * cb_scores.get(item, 0)
        for item in als_scores.keys() | cb_scores.keys()
    }

    # 4. Rank and return top N
    ranked_items = sorted(combined_scores.items(), key=lambda pair: pair[1], reverse=True)
    return [item for item, _ in ranked_items[:top_n]]

In [None]:
def _as_csr(matrix):
    """Return ``matrix`` in CSR format when it is a dense array or another sparse format."""
    if isinstance(matrix, np.ndarray):
        return csr_matrix(matrix)
    if hasattr(matrix, 'tocsr'):
        # COO/CSC/LIL/... -> CSR. Row indexing such as train_sparse[user_index]
        # is not supported by every sparse format (COO in particular).
        return matrix.tocsr()
    return matrix


def model_fn(model_dir):
    """
    Load the model and auxiliary files from the model directory.

    This function is called once when the container starts.
    The 'model_dir' is the path where your model artifacts from S3
    are downloaded (the 'code' directory in your compressed tarball).

    Besides returning the ALS model, this populates the module-level globals
    (user_inv_map, item_map, train_sparse, train_data, itemid_to_index,
    item_ids, tfidf_matrix) that hybrid_recommendations reads.

    Args:
        model_dir: Directory containing 'als_model.joblib', the auxiliary
            '*.joblib' files and 'train_data.csv'.

    Returns:
        The object loaded from 'als_model.joblib'.

    Raises:
        Exception: Any error raised while loading an artifact is logged and
            re-raised unchanged so the container fails fast.
    """
    global model, user_inv_map, item_map, train_sparse, train_data, itemid_to_index, item_ids, tfidf_matrix

    # NOTE(security): joblib.load unpickles arbitrary Python objects. Only
    # point model_dir at artifacts produced by a trusted training job.

    # 1. Load the ALS model artifact
    # Assuming the joblib file is named 'als_model.joblib' and is in the tarball
    try:
        model_path = os.path.join(model_dir, 'als_model.joblib')
        model = joblib.load(model_path)
        logging.info(f"Successfully loaded ALS model from {model_path}")
    except Exception as e:
        logging.error(f"Error loading als_model.joblib: {e}")
        raise  # bare raise keeps the original traceback

    # 2. Load Auxiliary Data
    # For a complex hybrid model, you MUST include all auxiliary data
    # (mappings, matrices, training data frame, etc.) in your model tarball.
    # They are assumed to be saved as joblib files, plus one CSV.
    try:
        # Adjust file names as necessary to match the tarball contents.
        user_inv_map = joblib.load(os.path.join(model_dir, 'user_inv_map.joblib'))
        item_map = joblib.load(os.path.join(model_dir, 'item_map.joblib'))
        train_sparse = joblib.load(os.path.join(model_dir, 'train_sparse.joblib'))
        train_data = pd.read_csv(os.path.join(model_dir, 'train_data.csv'))
        itemid_to_index = joblib.load(os.path.join(model_dir, 'itemid_to_index.joblib'))
        item_ids = joblib.load(os.path.join(model_dir, 'item_ids.joblib'))  # A list/array of item IDs
        tfidf_matrix = joblib.load(os.path.join(model_dir, 'tfidf_matrix.joblib'))

        logging.info("Successfully loaded all auxiliary data for hybrid model.")

        # Normalise both matrices to CSR so row indexing works at inference
        # time regardless of the format they were saved in.
        train_sparse = _as_csr(train_sparse)
        tfidf_matrix = _as_csr(tfidf_matrix)

    except Exception as e:
        logging.error(f"Error loading auxiliary files. Check the file names in your tarball: {e}")
        # The inference will fail if these are missing, which is correct behavior
        raise

    return model

In [None]:
def input_fn(request_body, request_content_type):
    """
    Parses the request body into the arguments needed for prediction.

    Expected JSON structure: {"user_id": "visitor_123", "alpha": 0.6, "top_n": 5}.
    Only 'user_id' is primarily needed; 'alpha' defaults to 0.5 and 'top_n'
    to 10 when they are absent.

    Args:
        request_body: JSON document as str or bytes.
        request_content_type: Content-Type of the request. The media type is
            compared case-insensitively and parameters such as
            '; charset=utf-8' are ignored.

    Returns:
        A tuple (user_id, alpha, top_n) with alpha as float and top_n as int.
        user_id is passed through as-is (None when the field is missing).

    Raises:
        ValueError: If the content type is not JSON, the body is not valid
            JSON (json.JSONDecodeError is a ValueError), the body is not a
            JSON object, or 'alpha' / 'top_n' are not numeric.
    """
    media_type = (request_content_type or '').split(';', 1)[0].strip().lower()
    if media_type != 'application/json':
        raise ValueError(f"Unsupported content type: {request_content_type}")

    data = json.loads(request_body)
    if not isinstance(data, dict):
        # A JSON array or scalar has no .get(); fail with a clear client error.
        raise ValueError("Request body must be a JSON object")

    user_id = data.get('user_id')
    try:
        # Coerce early so a null or malformed value is reported here rather
        # than surfacing as a TypeError deep inside the scoring code.
        alpha = float(data.get('alpha', 0.5))
        top_n = int(data.get('top_n', 10))
    except (TypeError, ValueError) as exc:
        raise ValueError(f"'alpha' must be a number and 'top_n' an integer: {exc}") from exc

    return user_id, alpha, top_n

In [None]:
def predict_fn(input_data, model):
    """
    Produces recommendations for one parsed request.

    `input_data` is unpacked as (user_id, alpha, top_n). The `model` argument
    is the return value of model_fn; it is accepted but not used here, because
    hybrid_recommendations reads the model and auxiliary data from module
    globals.
    """
    user_id, alpha, top_n = input_data
    logging.info(f"Generating recommendations for user {user_id} with alpha={alpha}, top_n={top_n}")

    # Delegate straight to the custom hybrid scorer and hand its result back.
    return hybrid_recommendations(user_id, alpha, top_n)

In [None]:
def _json_default(obj):
    """Convert NumPy scalars/arrays (anything exposing ``tolist``) to native Python values."""
    if hasattr(obj, 'tolist'):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


def output_fn(prediction, accept):
    """
    Formats the prediction result into a response body.

    Args:
        prediction: The list of recommended item IDs. NumPy scalar IDs (for
            example np.int64) are converted to native Python values, since
            json.dumps cannot serialise them on its own.
        accept: The Accept header value. It may be a comma-separated list and
            may carry parameters; JSON is produced when it allows
            'application/json', 'application/*' or '*/*'.

    Returns:
        A tuple (response_body, 'application/json') where response_body is
        the JSON string {"recommendations": [...]}.

    Raises:
        ValueError: If no accepted media type can be served as JSON.
    """
    accepted_types = {
        part.split(';', 1)[0].strip().lower()
        for part in (accept or '').split(',')
    }
    if accepted_types & {'application/json', 'application/*', '*/*'}:
        response_body = json.dumps({'recommendations': prediction}, default=_json_default)
        return response_body, 'application/json'

    # Handle other acceptable response formats
    raise ValueError(f"Unsupported accept type: {accept}")