In [2]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, split, udf, array_contains, expr, lit, array
from pyspark.sql.types import ArrayType, StringType, DoubleType, IntegerType
from pyspark.ml.feature import IndexToString, StringIndexer, Tokenizer, HashingTF, IDF, VectorAssembler, Normalizer
from pyspark.ml.recommendation import ALS
from pyspark.ml import Pipeline
from pyspark.mllib.linalg import Vectors
from pyspark.ml.linalg import SparseVector
from collections import defaultdict
import numpy as np
import json
import os
import sys
# --- FIX: Increase spark.driver.memory to handle large .collect() operations ---
# Set the driver memory higher (e.g., 8GB) to accommodate the collected track metadata.
DRIVER_MEMORY = "8g" 

spark = SparkSession.builder \
    .appName("PySparkHybridRecommender") \
    .config("spark.driver.memory", DRIVER_MEMORY) \
    .getOrCreate()

print(f"Spark Session Initialized with Driver Memory: {DRIVER_MEMORY}")

25/12/11 14:20:10 WARN Utils: Your hostname, DESKTOP-P46QK96 resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/12/11 14:20:10 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/11 14:20:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark Session Initialized with Driver Memory: 8g


In [3]:
# Cell 2: Data Loading Utilities (PySpark)

# Cell 2: Data Loading Utilities (PySpark)

def load_track_map(spark: SparkSession, map_path: str) -> pyspark.sql.DataFrame:
    """Load the Track ID <-> Song ID mapping file and return a DataFrame mapping Song ID to Track ID."""
    print(f"[LOAD] Loading Track ID <-> Song ID mapping from {map_path}...")
    
    # Trying the literal separator '<SEP>' based on the file content snippet.
    df_map = spark.read.csv(
        map_path, 
        sep='<SEP>',  # ASSUMING THE LITERAL DELIMITER IS '<SEP>'
        header=False, 
        inferSchema=True
    )

    # Temporary debugging step: print schema to see how many columns Spark found
    print(" - Debug Schema (load_track_map):")
    df_map.printSchema()
    
    # We now expect to find columns _c0, _c1, _c2, _c3
    
    try:
        df_map = df_map.select(
            col('_c0').alias('track_id'),
            col('_c1').alias('song_id')
        ).filter(col('track_id').isNotNull() & col('song_id').isNotNull())
    except Exception as e:
        print(f" - FATAL WARNING: Failed to select columns. Please check the delimiter in your file. Error: {e}")
        # Reraise the error to halt execution and let the user fix the file/path.
        raise

    df_map = df_map.dropDuplicates(['song_id']).dropDuplicates(['track_id'])
    
    print(f" - Track mapping loaded: {df_map.count()} records")
    return df_map

# ... rest of Cell 2 functions remain the same

def load_interactions(spark: SparkSession, interactions_path: str, df_map: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:
    """Load and join the interactions file (User, Song ID, Playcount) with the map to get Track IDs."""
    print(f"[LOAD] Loading interactions from {interactions_path} and joining with Track IDs...")
    
    # Load raw interactions (User ID, Song ID, Play Count)
    df_raw = spark.read.csv(interactions_path, sep='\\t', header=False, inferSchema=True)
    df_raw = df_raw.toDF('user', 'song_id', 'playcount')
    df_raw = df_raw.withColumn('playcount', col('playcount').cast(DoubleType()))
    
    # Join with the mapping file to switch from Song ID (SO*) to Track ID (TR*)
    # This join operation should also show a progress bar.
    interactions_df = df_raw.join(
        df_map,
        on='song_id',
        how='inner'
    ).select('user', 'track_id', 'playcount')
    
    # Force count to ensure the loading and join jobs run and show progress
    count = interactions_df.count()
    print(f" - Final Interactions loaded and joined: {count} records")
    return interactions_df

from ast import literal_eval
def parse_string_to_list(s):
    if not s or s == '[]': return []
    try:
        # literal_eval safely evaluates a string containing a Python structure (like a list)
        return literal_eval(s)
    except Exception:
        return []

# The UDFs needed for tags and similars remain mostly the same, but now they process
# the output of the parser, which is a list of lists/tuples, from the CSV string column.

def load_track_metadata(spark: SparkSession, csv_path: str) -> pyspark.sql.DataFrame:
    """
    Load track metadata from the SINGLE CSV file generated by the Pandas script.
    """
    print(f"[LOAD] Loading track metadata from single CSV file: {csv_path}...")
    
    # 1. Read the CSV file
    df_raw = spark.read.csv(
        csv_path,
        header=True, # The CSV has a header row from Pandas
        inferSchema=True
    )
    
    # Force count to ensure the read job runs and shows progress
    raw_count = df_raw.count()
    print(f" - CSV file loaded: {raw_count} records")

    # 2. Define UDFs for cleaning the serialized columns
    parse_list_udf = udf(parse_string_to_list, ArrayType(ArrayType(StringType()))) 

    # UDFs to extract final tags/similars structure from the parsed lists
    def extract_tags(tags_list):
        # tags_list is now a list of lists like [['tag', count], ...]
        if tags_list is None: return []
        try:
            # We assume tag is the first element (index 0)
            return [t[0] for t in tags_list if t and len(t) > 0] 
        except Exception:
            return []
            
    def extract_similars(similars_list):
        # similars_list is now a list of lists like [['track_id', score], ...]
        if similars_list is None: return []
        try:
            # We assume track_id is the first element (index 0)
            return [s[0] for s in similars_list if s and len(s) > 0]
        except Exception:
            return []

    extract_tags_udf = udf(extract_tags, ArrayType(StringType()))
    extract_similars_udf = udf(extract_similars, ArrayType(StringType()))

    # 3. Apply parsing and cleaning
    df_tracks = df_raw.select(
        col('track_id'),
        col('artist'),
        col('title'),
        extract_tags_udf(parse_list_udf(col('tags'))).alias('tag_list'),
        extract_similars_udf(parse_list_udf(col('similars'))).alias('similar_track_ids')
    ).filter(col('track_id').isNotNull())

    # 4. Clean up
    print(" - Starting cleaning and dropping duplicates. Look for a Spark job progress bar here.")
    df_tracks = df_tracks.dropDuplicates(['track_id'])
    
    cleaned_count = df_tracks.count() # Force action here too
    print(f" - Cleaned track metadata: {cleaned_count} unique tracks")

    return df_tracks

def load_and_filter_tags(spark: SparkSession, path: str, min_count: int = 10000) -> set:
    """Load the global tag list and return a set of popular tags."""
    print(f"[LOAD] Loading and filtering global tag list from {path}...")
    try:
        df_tags = spark.read.csv(path, sep='\\t', header=False, inferSchema=True).toDF('tag', 'count')
        
        # This collect() operation forces the file read job
        popular_tags_set = set(df_tags.filter(col('count') >= min_count).select('tag').rdd.flatMap(lambda x: x).collect())
        
        print(f" - Filtered tag list size: {len(popular_tags_set):,} (min_count={min_count:,})")
        return popular_tags_set
    except Exception as e:
        print(f" - WARNING: Could not load or process tag list at {path}. Proceeding without tag filtering. Error: {e}")
        return set()

In [4]:
def preprocess_for_als(interactions_df: pyspark.sql.DataFrame) -> tuple:
    """Indexes user and track IDs to prepare for PySpark MLlib ALS."""
    print(f"[PREP] Indexing User and Track IDs for ALS...")

    user_indexer = StringIndexer(inputCol="user", outputCol="user_idx")
    track_indexer = StringIndexer(inputCol="track_id", outputCol="track_idx")

    pipeline = Pipeline(stages=[user_indexer, track_indexer])
    model = pipeline.fit(interactions_df)
    indexed_df = model.transform(interactions_df)

    indexed_df.cache()
    print(f" - Indexed DataFrame prepared and cached.")

    user_indexer_model = model.stages[0]
    track_indexer_model = model.stages[1]

    return indexed_df, user_indexer_model, track_indexer_model

In [5]:
def train_pyspark_als(indexed_df: pyspark.sql.DataFrame, factors: int = 64, regularization: float = 0.01, iterations: int = 15):
    """Trains a PySpark MLlib Alternating Least Squares (ALS) model."""
    print(f"[ALS] Training PySpark MLlib ALS Model...")

    als = ALS(
        rank=factors,
        maxIter=iterations,
        regParam=regularization,
        userCol="user_idx",
        itemCol="track_idx",
        ratingCol="playcount",
        implicitPrefs=True,
        coldStartStrategy="drop",
        seed=42
    )

    model = als.fit(indexed_df)
    print(f" - ALS Model training complete.")
    return model

In [6]:
# Cell 5: build_content_features (FIXED)

def build_content_features(tracks_df: pyspark.sql.DataFrame, popular_tags: set, max_features: int = 20000):
    """
    Builds a TF-IDF matrix using track tags (filtered by popular_tags) and title.
    """
    print(f"[CONTENT] Building Content TF-IDF Features...")

    # --- FIX APPLIED HERE ---
    # Corrected access to SparkContext via sparkSession
    popular_tags_broadcast = tracks_df.sparkSession.sparkContext.broadcast(popular_tags)    
    def filter_and_flatten_tags(tags, title):
        if tags is None: tags = []
        
        if popular_tags_broadcast.value:
             # popular_tags_broadcast.value contains the set of popular tags
             filtered_tags = [t for t in tags if t in popular_tags_broadcast.value]
        else:
             filtered_tags = tags
             
        # Combine filtered tags and title into a single text document
        text = " ".join(filtered_tags) + " " + (title if title else "")
        return text.strip()

    filter_and_flatten_tags_udf = udf(filter_and_flatten_tags, StringType())

    # 1. Create the text feature column
    content_df = tracks_df.withColumn(
        "features_text",
        filter_and_flatten_tags_udf(col('tag_list'), col('title'))
    ).filter(col('features_text') != '') # Drop records where feature text is empty

    # 2. Tokenize, Hash (TF), and IDF
    tokenizer = Tokenizer(inputCol="features_text", outputCol="words")
    hashing_tf = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="raw_features", numFeatures=max_features)
    idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="content_features_vector")

    pipeline = Pipeline(stages=[tokenizer, hashing_tf, idf])
    model = pipeline.fit(content_df)
    tfidf_df = model.transform(content_df)

    # 3. Normalize the vector
    normalizer = Normalizer(inputCol="content_features_vector", outputCol="normalized_features", p=2.0)
    normalized_tfidf_df = normalizer.transform(tfidf_df)

    # Force action to materialize the TF-IDF creation job and show progress
    normalized_tfidf_df.cache()
    count = normalized_tfidf_df.count()
    print(f" - Content TF-IDF matrix built: {count} records.")

    return normalized_tfidf_df

In [7]:
def build_cooccurrence_matrix(indexed_df: pyspark.sql.DataFrame, top_k: int = 50):
    """Calculates item-item co-occurrence scores from user transactions (session-based)."""
    print(f"[COOCC] Calculating Item-Item Co-occurrence for top {top_k} similars...")

    user_tracks = indexed_df.groupBy('user_idx').agg(expr('collect_list(track_idx)').alias('tracks'))
    exploded_tracks = user_tracks.withColumn("item1", explode("tracks"))
    
    cooccurrences = exploded_tracks.alias('df1').join(
        exploded_tracks.alias('df2'),
        (col('df1.user_idx') == col('df2.user_idx')) & (col('df1.item1') != col('df2.item1'))
    ).select(
        col('df1.item1').alias('track1'),
        col('df2.item1').alias('track2')
    ).groupBy('track1', 'track2').count()

    cooccurrences.cache()

    from pyspark.sql.window import Window
    window_spec = Window.partitionBy("track1").orderBy(col("count").desc())
    ranked_coocc = cooccurrences.withColumn("rank", expr(f"row_number() over (partition by track1 order by count desc)"))
    top_coocc = ranked_coocc.filter(col("rank") <= top_k).drop("rank")

    coocc_list = top_coocc.collect()
    coocc_dict = defaultdict(list)
    for row in coocc_list:
        coocc_dict[row.track1].append((row.track2, row['count']))

    print(f" - Co-occurrence calculation complete. {len(coocc_dict)} unique tracks found.")
    return coocc_dict

In [8]:
class HybridRecommenderPySpark:
    def __init__(self, spark_session, als_model, track_indexer_model, user_indexer_model,
                 tracks_df, content_features_df, coocc_dict):
        # ... (initialization as before)
        self.spark = spark_session
        self.als_model = als_model
        self.coocc_dict = coocc_dict

        self.track_indexer = track_indexer_model
        self.user_indexer = user_indexer_model

        self.raw_track_ids = track_indexer_model.labels
        self.raw_user_ids = user_indexer_model.labels
        self.track_id_to_idx = {raw: i for i, raw in enumerate(self.raw_track_ids)}
        self.user_id_to_idx = {raw: i for i, raw in enumerate(self.raw_user_ids)}

        print("[INIT] Building local content feature and track dictionary...")
        self.content_features_dict = {
            row.track_id: row.normalized_features for row in content_features_df.select('track_id', 'normalized_features').collect()
        }
        self.track_metadata = {
            row.track_id: {'artist': row.artist, 'title': row.title, 'tag_list': row.tag_list, 'similar_track_ids': row.similar_track_ids}
            for row in tracks_df.collect()
        }

    def get_track_info(self, track_id: str) -> dict:
        return self.track_metadata.get(track_id, {'artist': 'Unknown', 'title': 'Unknown', 'tag_list': [], 'similar_track_ids': []})

    def recommend_for_user(self, user_id: str, top_k: int = 10, weights: dict = None) -> list:
        if weights is None:
            weights = {'als': 0.6, 'coocc': 0.4}

        if user_id not in self.user_id_to_idx:
            print(f"Cold start user: {user_id}. Returning dummy popular items.")
            return [('TRdummy_pop1', 1.0), ('TRdummy_pop2', 0.9)]

        user_idx = self.user_id_to_idx[user_id]
        scored = defaultdict(float)

        # 1. ALS Recommendations (The primary engine)
        if self.als_model is not None:
            try:
                users_df = self.spark.createDataFrame([(user_idx,)], [self.als_model.getUserCol()])
                recs_df = self.als_model.recommendForUserSubset(users_df, top_k*5)

                recs_row = recs_df.collect()[0] if recs_df.count() > 0 else None
                recs = recs_row.recommendations if recs_row else []

                for rec in recs:
                    tid = self.raw_track_ids[rec.track_idx]
                    scored[tid] += weights.get('als', 0.0) * float(rec.rating)
                print(f" - ALS recommendations processed.")
            except Exception as e:
                print(f"ALS recommendation failed: {e}")

        # 2. Co-occurrence Recommendations (Item-to-Item from user history - simplified)
        print(" - Skipping Co-occurrence recommendation due to need for user history lookup/logic.")

        # Final combination and sorting (based on ALS/other scores)
        final = sorted(scored.items(), key=lambda x: x[1], reverse=True)
        return final[:top_k]

In [8]:
# Cell A: Paths & Data Loading (Run this only once)

# --- CONFIGURE FILE PATHS HERE ---
# 1. THE USER INTERACTION DATA (The file you just found)
interactions_path = "data/kaggle_visible_evaluation_triplets.txt"    
# 2. THE TRACK ID <-> SONG ID MAPPING 
track_map_path = "data/unique_tracks.txt" 
# 3. THE TRACK METADATA CSV (from your converter script)
tracks_data_dir = "data/lastfm_data.csv"
# 4. THE GLOBAL TAG COUNT FILE
tag_list_path = "data/list_of_tags.txt"        

# -----------------------------------

# --- 1. Load Data & Prepare Filters ---
print("\n[STEP 1] Starting Data Loading...")
df_track_map = load_track_map(spark, track_map_path)
interactions_df = load_interactions(spark, interactions_path, df_track_map)
tracks_df = load_track_metadata(spark, tracks_data_dir)
popular_tags = load_and_filter_tags(spark, tag_list_path, min_count=10000)

print("\n\n--- DATA LOADING COMPLETE ---")


[STEP 1] Starting Data Loading...
[LOAD] Loading Track ID <-> Song ID mapping from data/unique_tracks.txt...


                                                                                

 - Debug Schema (load_track_map):
root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)



                                                                                

 - Track mapping loaded: 999056 records
[LOAD] Loading interactions from data/kaggle_visible_evaluation_triplets.txt and joining with Track IDs...


                                                                                

 - Final Interactions loaded and joined: 1450933 records
[LOAD] Loading track metadata from single CSV file: data/lastfm_data.csv...


                                                                                

 - CSV file loaded: 839122 records
 - Starting cleaning and dropping duplicates. Look for a Spark job progress bar here.


                                                                                

 - Cleaned track metadata: 825713 unique tracks
[LOAD] Loading and filtering global tag list from data/list_of_tags.txt...


[Stage 45:>                                                         (0 + 3) / 3]

 - Filtered tag list size: 86 (min_count=10,000)


--- DATA LOADING COMPLETE ---


                                                                                

In [9]:
# Cell B: Preprocessing & Feature Engineering

print("\n[STEP 2] Starting Preprocessing and Feature Engineering...")
# --- 2. Preprocess for ALS ---
indexed_df, user_indexer, track_indexer = preprocess_for_als(interactions_df)

# --- 3. Content Features ---
# This step now uses the fixed build_content_features function from Cell 5
content_features_df = build_content_features(tracks_df, popular_tags, max_features=10000)

print("\n\n--- PREPROCESSING & FEATURE ENGINEERING COMPLETE ---")


[STEP 2] Starting Preprocessing and Feature Engineering...
[PREP] Indexing User and Track IDs for ALS...


                                                                                

 - Indexed DataFrame prepared and cached.
[CONTENT] Building Content TF-IDF Features...




 - Content TF-IDF matrix built: 825713 records.


--- PREPROCESSING & FEATURE ENGINEERING COMPLETE ---


                                                                                

In [10]:
# Cell C: Model Training & Recommender Initialization

print("\n[STEP 3] Starting Model Training...")
# --- 4. Collaborative Filtering Model (ALS) ---
als_model = train_pyspark_als(indexed_df, factors=64, regularization=0.01, iterations=15)

# --- 5. Item-Item Co-occurrence ---
# --- 5. Item-Item Co-occurrence (Optimized by sampling users) ---
print("[COOCC] Sampling users for faster co-occurrence calculation...")
# Collect unique users, sample 10% of their IDs
user_sample = indexed_df.select('user_idx').distinct().sample(False, 0.1, seed=42)
# Filter the original indexed_df to only include the sampled users
sampled_indexed_df = indexed_df.join(user_sample, 'user_idx', 'inner')

coocc_dict = build_cooccurrence_matrix(sampled_indexed_df, top_k=50)
# --- 6. Initialize Hybrid Recommender ---
print("\n[STEP 4] Initializing Hybrid Recommender System...")
hreco = HybridRecommenderPySpark(
    spark_session=spark,
    als_model=als_model,
    track_indexer_model=track_indexer,
    user_indexer_model=user_indexer,
    tracks_df=tracks_df,
    content_features_df=content_features_df,
    coocc_dict=coocc_dict
)

print("\n\n--- MODEL TRAINING & SYSTEM INITIALIZATION COMPLETE ---")


[STEP 3] Starting Model Training...
[ALS] Training PySpark MLlib ALS Model...


25/12/11 13:28:52 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 13:28:58 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 13:29:00 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 13:29:02 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 13:29:04 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 13:29:07 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 13:29:08 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 13:29:10 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 13:29:11 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/12/11 13:29:12 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 13:29:14 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 13:29:16 WARN 

 - ALS Model training complete.
[COOCC] Sampling users for faster co-occurrence calculation...
[COOCC] Calculating Item-Item Co-occurrence for top 50 similars...


25/12/11 13:35:06 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 13:35:06 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 13:35:12 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 13:35:12 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 13:35:13 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 13:35:14 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 13:35:16 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 13:35:18 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 13:35:21 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 13:35:23 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 13:35:27 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
                                           

 - Co-occurrence calculation complete. 52467 unique tracks found.

[STEP 4] Initializing Hybrid Recommender System...
[INIT] Building local content feature and track dictionary...


                                                                                



--- MODEL TRAINING & SYSTEM INITIALIZATION COMPLETE ---


In [1]:
# Cell D: Test Recommendations

# --- 7. Test Recommendations ---
sample_user = indexed_df.select('user').limit(1).collect()[0].user

print("\n" + "="*60)
print("[TEST 1] Finding User Recommendations (PySpark Hybrid System)")
print(f"  - User ID: {sample_user}")
recs = hreco.recommend_for_user(sample_user, top_k=5)
print("âœ“ Recommendations found:")
for i, (track_id, score) in enumerate(recs, 1):
    info = hreco.get_track_info(track_id)
    print(f"    {i:2d}. Track: {info['title']:25s} | Artist: {info['artist']:25s} | Score: {score:.6f}")

print("\n" + "="*60)
print("System execution complete.")

# Optional: Stop the Spark session when done with all work
# spark.stop()

NameError: name 'indexed_df' is not defined

In [11]:
# Cell 8: Evaluation Utility Functions (Without scikit-learn)

from pyspark.sql.functions import col, lit
from pyspark.sql.types import StringType
from collections import defaultdict
import pandas as pd
import numpy as np
from tqdm import tqdm # Used to show progress bar during evaluation

def split_data_temporal(interactions_df: pyspark.sql.DataFrame, split_ratio: float = 0.8):
    """
    Splits the interactions data into training and testing sets by randomly splitting users.
    (Custom implementation replacing sklearn.model_selection.train_test_split)
    
    Returns:
        train_df, test_df (PySpark DataFrames)
    """
    print("[EVAL] Splitting interaction data for offline evaluation...")
    
    # 1. Collect unique users 
    users = [row.user for row in interactions_df.select('user').distinct().collect()]
    
    # --- Custom User Split using NumPy ---
    np.random.seed(42) # Set seed for reproducibility
    
    # Shuffle the list of users
    shuffled_users = np.random.permutation(users)
    
    # Calculate the split index
    train_size = int(len(shuffled_users) * split_ratio)
    
    # Split the users
    train_users = shuffled_users[:train_size].tolist()
    test_users = shuffled_users[train_size:].tolist()
    # ------------------------------------
    
    # Convert lists to PySpark DataFrames for joining
    # Note: We still need 'StringType' imported if the original user IDs were strings.
    train_users_df = interactions_df.sparkSession.createDataFrame(train_users, StringType()).toDF('user')
    test_users_df = interactions_df.sparkSession.createDataFrame(test_users, StringType()).toDF('user')

    # 2. Split the original interactions DataFrame
    train_df = interactions_df.join(train_users_df, 'user', 'inner')
    test_df = interactions_df.join(test_users_df, 'user', 'inner')

    print(f" - Train set size: {train_df.count()} interactions")
    print(f" - Test set size: {test_df.count()} interactions")
    
    return train_df, test_df

def calculate_ranking_metrics(predictions, ground_truth, k: int):
    """
    Calculates Precision@K and Recall@K for a list of predictions against ground truth.
    """
    if not ground_truth:
        return 0.0, 0.0

    predicted_k = set(predictions[:k])
    hits = len(predicted_k.intersection(ground_truth))
    
    # Precision@K: (Relevant items in top K) / (Total items in top K)
    precision = hits / k
    
    # Recall@K: (Relevant items in top K) / (Total relevant items)
    recall = hits / len(ground_truth)
    
    return precision, recall

def evaluate_model_performance(spark, model_recommender, test_df, k: int):
    """
    Main evaluation routine to measure model performance on a test set.
    Note: It samples a maximum of 100 users for quick evaluation based on previous optimization.
    """
    print(f"\n[EVAL] Starting evaluation for K={k}...")
    
    test_data_pd = test_df.toPandas()
    ground_truth = defaultdict(set)
    for index, row in test_data_pd.iterrows():
        ground_truth[row['user']].add(row['track_id'])
        
    test_users = list(ground_truth.keys())
    all_precisions = []
    all_recalls = []

    # Use 100 users as the sampled size for faster results
    MAX_TEST_USERS = min(len(test_users), 100) 
    
    print(f" - Generating Top-{k} recommendations for {MAX_TEST_USERS:,} test users...")
    
    # Sample the test users
    np.random.seed(42) # for reproducibility
    test_users_sample = np.random.choice(test_users, size=MAX_TEST_USERS, replace=False)

    for user_id in tqdm(test_users_sample, desc=f"Evaluating on {MAX_TEST_USERS} users"):
        relevant_tracks = ground_truth[user_id]
        if len(relevant_tracks) < 1:
            continue
            
        try:
            predictions = [t[0] for t in model_recommender.recommend_for_user(user_id, top_k=k)]
        except Exception:
            continue

        precision, recall = calculate_ranking_metrics(predictions, relevant_tracks, k)
        
        all_precisions.append(precision)
        all_recalls.append(recall)

    mean_precision = np.mean(all_precisions)
    mean_recall = np.mean(all_recalls)

    print(f"\n--- Evaluation Results (N={len(all_precisions)} users) ---")
    print(f"Mean Precision@{k}: {mean_precision:.4f}")
    print(f"Mean Recall@{k}:    {mean_recall:.4f}")
    print("-------------------------------------------------")

    return mean_precision, mean_recall

In [None]:
if __name__ == "__main__":
    # --- CONFIGURE FILE PATHS HERE ---
    # 1. THE USER INTERACTION DATA
    interactions_path = "data/kaggle_visible_evaluation_triplets.txt"    
    # 2. THE TRACK ID <-> SONG ID MAPPING
    track_map_path = "data/unique_tracks.txt" 
    # 3. THE TRACK METADATA JSONS
    tracks_data_dir = "data/lastfm_data.csv"
    # 4. THE GLOBAL TAG COUNT FILE
    tag_list_path = "data/list_of_tags.txt"        
    
    # -----------------------------------

    # --- 1. Load Data & Prepare Filters ---
    df_track_map = load_track_map(spark, track_map_path)
    interactions_df = load_interactions(spark, interactions_path, df_track_map)
    tracks_df = load_track_metadata(spark, tracks_data_dir)
    popular_tags = load_and_filter_tags(spark, tag_list_path, min_count=10000)

    # --- 2. Preprocess for ALS & Split Data ---
    indexed_df, user_indexer, track_indexer = preprocess_for_als(interactions_df)
    
    # Split the original (unindexed) interactions data for evaluation
    train_interactions_df, test_interactions_df = split_data_temporal(interactions_df, split_ratio=0.8)

    # --- 3. Content Features ---
    content_features_df = build_content_features(tracks_df, popular_tags, max_features=10000)

    # --- 4. Collaborative Filtering Model ---
    als_model = train_pyspark_als(indexed_df, factors=64, regularization=0.01, iterations=15)

    # --- 5. Item-Item Co-occurrence (Optimized by sampling users) ---
    print("[COOCC] Sampling users for faster co-occurrence calculation...")
    # Sample 10% of users for the co-occurrence matrix calculation
    user_sample = indexed_df.select('user_idx').distinct().sample(False, 0.1, seed=42)
    sampled_indexed_df = indexed_df.join(user_sample, 'user_idx', 'inner')

    coocc_dict = build_cooccurrence_matrix(sampled_indexed_df, top_k=50)

    # --- 6. Initialize Hybrid Recommender ---
    print("\n[STEP 6] Initializing Hybrid Recommender System...")
    hreco = HybridRecommenderPySpark(
        spark_session=spark,
        als_model=als_model,
        track_indexer_model=track_indexer,
        user_indexer_model=user_indexer,
        tracks_df=tracks_df,
        content_features_df=content_features_df,
        coocc_dict=coocc_dict
    )
    print("\n\n--- MODEL TRAINING & SYSTEM INITIALIZATION COMPLETE ---")
    
    # --- 7. Evaluate the Model ---
    K = 10 
    print(f"\n[EVAL] Starting Model Evaluation (Precision@{K} and Recall@{K})")
    evaluate_model_performance(spark, hreco, test_interactions_df, k=K)

    # --- 8. Final Cleanup ---
    # The existing test recommendation block is removed/commented out to prioritize the full evaluation.
    
    # Stop the Spark session
    spark.stop()

[LOAD] Loading Track ID <-> Song ID mapping from data/unique_tracks.txt...


                                                                                

 - Debug Schema (load_track_map):
root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)



                                                                                

 - Track mapping loaded: 999056 records
[LOAD] Loading interactions from data/kaggle_visible_evaluation_triplets.txt and joining with Track IDs...


                                                                                

 - Final Interactions loaded and joined: 1450933 records
[LOAD] Loading track metadata from single CSV file: data/lastfm_data.csv...


                                                                                

 - CSV file loaded: 839122 records
 - Starting cleaning and dropping duplicates. Look for a Spark job progress bar here.


                                                                                

 - Cleaned track metadata: 825713 unique tracks
[LOAD] Loading and filtering global tag list from data/list_of_tags.txt...


                                                                                

 - Filtered tag list size: 86 (min_count=10,000)
[PREP] Indexing User and Track IDs for ALS...


                                                                                

 - Indexed DataFrame prepared and cached.
[EVAL] Splitting interaction data for offline evaluation...


                                                                                

 - Train set size: 1159142 interactions


                                                                                

 - Test set size: 291791 interactions
[CONTENT] Building Content TF-IDF Features...


                                                                                

 - Content TF-IDF matrix built: 825713 records.
[ALS] Training PySpark MLlib ALS Model...


25/12/11 14:40:25 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 14:40:30 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 14:40:31 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 14:40:33 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 14:40:35 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 14:40:37 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 14:40:38 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 14:40:39 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 14:40:40 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/12/11 14:40:41 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 14:40:44 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 14:40:51 WARN 

 - ALS Model training complete.
[COOCC] Sampling users for faster co-occurrence calculation...
[COOCC] Calculating Item-Item Co-occurrence for top 50 similars...


25/12/11 14:46:44 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 14:46:45 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 14:46:47 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 14:46:47 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 14:46:48 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 14:46:48 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 14:46:51 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 14:46:52 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 14:46:54 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 14:46:55 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
25/12/11 14:46:57 WARN DAGScheduler: Broadcasting large task binary with size 13.5 MiB
                                           

 - Co-occurrence calculation complete. 52467 unique tracks found.

[STEP 6] Initializing Hybrid Recommender System...
[INIT] Building local content feature and track dictionary...


[Stage 986:>                                                       (0 + 8) / 15]