In [1]:
import os
import numpy as np
from datetime import datetime

from pyspark.sql import functions as F, SparkSession
from pyspark.sql.functions import concat_ws, split, col
from pyspark.sql.types import FloatType
from pyspark.ml.feature import (
    RegexTokenizer, StopWordsRemover,
    CountVectorizer
)
from pyspark.ml.recommendation import ALS

In [2]:
spark = SparkSession.builder \
    .appName("GoodreadsBookModel") \
    .config("spark.hadoop.fs.s3a.access.key", "minioadmin") \
    .config("spark.hadoop.fs.s3a.secret.key", "minioadmin") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://localhost:9000") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("WARN")

25/06/26 22:21:11 WARN Utils: Your hostname, Zwanes-MacBook.local resolves to a loopback address: 127.0.0.1; using 192.168.110.53 instead (on interface en0)
25/06/26 22:21:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/26 22:21:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
data_dir = "../streamlit-app/data/"
csv_pattern = os.path.join(data_dir, "*.csv")

print(f"▶️ Reading all CSVs from {csv_pattern} …")

# Check if files exist
import glob
csv_files = glob.glob(csv_pattern)
print(f"Found {len(csv_files)} CSV files:")
for file in csv_files[:5]:  # Show first 5 files
    print(f"  - {file}")

if csv_files:
    df = spark.read.csv(csv_pattern, header=True, inferSchema=True)
    print(f"✅ Combined DataFrame has {df.count():,} rows and {len(df.columns)} columns")
    df.printSchema()
else:
    print("❌ No CSV files found! Make sure Streamlit has generated batch files.")
    print("Please run the Streamlit app in 'Full Dataset Processing' mode first.")

▶️ Reading all CSVs from ../streamlit-app/data/*.csv …
Found 37 CSV files:
  - ../streamlit-app/data/books_batch_015_20250620_033009.csv
  - ../streamlit-app/data/books_batch_014_20250620_033008.csv
  - ../streamlit-app/data/books_batch_022_20250620_033015.csv
  - ../streamlit-app/data/books_batch_021_20250620_033014.csv
  - ../streamlit-app/data/books_batch_037_20250620_033108_final.csv


[Stage 4:>                                                          (0 + 8) / 8]

✅ Combined DataFrame has 183,780 rows and 11 columns
root
 |-- Id: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Authors: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- PublishYear: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- RatingDistTotal: string (nullable = true)
 |-- PagesNumber: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- batch_id: string (nullable = true)



                                                                                

In [4]:
total_rows = df.count()

missing_counts = df.select([
    F.sum(F.when(F.col(c).isNull(), 1).otherwise(0)).alias(c)
    for c in df.columns
]).collect()[0].asDict()

summary_rows = [
    (col, cnt, cnt / total_rows * 100)
    for col, cnt in missing_counts.items()
    if cnt > 0
]
missing_summary_df = spark.createDataFrame(
    summary_rows,
    schema=["Column", "Missing Values", "Percentage"]
).orderBy(F.desc("Missing Values"))

print("Missing Values Summary:")
missing_summary_df.show(truncate=False)

                                                                                

Missing Values Summary:




+-----------+--------------+---------------------+
|Column     |Missing Values|Percentage           |
+-----------+--------------+---------------------+
|Description|5             |0.0027206442485580586|
|timestamp  |1             |5.441288497116117E-4 |
|batch_id   |1             |5.441288497116117E-4 |
+-----------+--------------+---------------------+



                                                                                

In [5]:
cols_to_drop = ["batch_id", "timestamp", "RatingDistTotal"]
df_clean = df.drop(*cols_to_drop)
df_clean = df_clean.dropna(how="any", subset=["Description"])
print(f"▶️ After cleaning: {df_clean.count():,} rows")

▶️ After cleaning: 183,775 rows


                                                                                

In [6]:
total_rows_clean = df_clean.count()

missing_counts_clean = df_clean.select([
    F.sum(F.when(F.col(c).isNull(), 1).otherwise(0)).alias(c)
    for c in df_clean.columns
]).collect()[0].asDict()

summary_rows_clean = [
    (col, cnt, cnt / total_rows_clean * 100)
    for col, cnt in missing_counts_clean.items()
    if cnt > 0
]
if summary_rows_clean:
    missing_summary_df_clean = spark.createDataFrame(
        summary_rows_clean,
        schema=["Column", "Missing Values", "Percentage"]
    ).orderBy(F.desc("Missing Values"))
    print("Missing Values Summary for df_clean:")
    missing_summary_df_clean.show(truncate=False)
else:
    print("No missing values found in df_clean.")



No missing values found in df_clean.


                                                                                

## Data Preprocessing

In [7]:
tokenizer = RegexTokenizer(inputCol="Description", outputCol="words", pattern="\\W")
df_tok = tokenizer.transform(df_clean)
remover  = StopWordsRemover(inputCol="words", outputCol="filtered_words")
df_tok   = remover.transform(df_tok)

df_tags = df_tok.withColumn(
    "tags",
    concat_ws(" ",
        col("filtered_words"), col("Authors"), col("Publisher"),
        col("Rating").cast("string"), col("PublishYear").cast("string"), col("PagesNumber").cast("string")
    )
)
df_tags = df_tags.withColumn("tags", split(col("tags"), " "))

cv = CountVectorizer(inputCol="tags", outputCol="features", vocabSize=10000, minDF=2)
cv_model = cv.fit(df_tags)
df_feat  = cv_model.transform(df_tags)
print("▶️ Sample features:")
df_feat.select("Id","Name","features").show(5, truncate=50)

25/06/26 22:21:37 WARN StopWordsRemover: Default locale set was [en_ID]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.
                                                                                

▶️ Sample features:
+-------+--------------------------------------------------+--------------------------------------------------+
|     Id|                                              Name|                                          features|
+-------+--------------------------------------------------+--------------------------------------------------+
|4411486|      Larousse Dictionary of Beliefs and Religions|(10000,[1,11,16,19,22,59,66,114,133,134,135,152...|
|4411489|Enhancing Industrial Performance: Experiences O...|(10000,[1,2,3,7,43,45,76,85,91,172,234,356,389,...|
|4411490|   The Brass Check: A Study of American Journalism|(10000,[3,6,9,10,42,65,118,126,354,381,501,560,...|
|4411491| Transient Temperatures in Engineering and Science|(10000,[0,2,10,14,68,76,88,90,95,142,156,184,29...|
|4411496|                   Citroen 2CV: The Complete Story|(10000,[3,17,26,37,103,259,280,360,398,460,478,...|
+-------+--------------------------------------------------+------------------------

In [8]:
df_tok.select("Id","Name","Description", "words", "filtered_words").show(5, truncate=50)
df_tags.select("Id","Name","tags").show(5, truncate=False)
df_feat.select("Id","Name","features").show(5, truncate=False)


+-------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+
|     Id|                                              Name|                                       Description|                                             words|                                    filtered_words|
+-------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+
|4411486|      Larousse Dictionary of Beliefs and Religions|More comprehensive than any other single volume...|[more, comprehensive, than, any, other, single,...|[comprehensive, single, volume, reference, subj...|
|4411489|Enhancing Industrial Performance: Experiences O...|This work focuses on the implementation of soci...|[this, work, focuses, on, the, im

In [9]:
from pyspark.ml.feature import BucketedRandomProjectionLSH

# Tune LSH hyperparameters
bucket_lengths = [2.0, 3.0]
hash_tables    = [4, 6]

lsh = BucketedRandomProjectionLSH(
    inputCol="features", outputCol="hashes",
    bucketLength=2.0,
    numHashTables=6
)
lsh_model = lsh.fit(df_feat)

# Example: find top-10 similar books to a given Id
target_id  = 4411879
target_vec = df_feat.filter(col("Id")==target_id).select("features").first()[0]
neighbors  = lsh_model.approxNearestNeighbors(df_feat, target_vec, 10)
print(f"Top 10 similar books to Id={target_id}:")
neighbors.select("Id","Name","distCol").orderBy("distCol").show(truncate=False)

25/06/26 22:21:49 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/06/26 22:21:49 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


Top 10 similar books to Id=4411879:




+-------+------------------------------------------------------------------------------------------------------------+------------------+
|Id     |Name                                                                                                        |distCol           |
+-------+------------------------------------------------------------------------------------------------------------+------------------+
|4411879|Knowing What Works in Health Care: A Roadmap for the Nation                                                 |0.0               |
|4490303|Healthy Medicine: A Guide Emergence Sensible. Comprehensive Care                                            |13.114877048604   |
|4799451|Introduction to Health Care Delivery: A Primer for Pharmacists                                              |13.228756555322953|
|4207719|The Quality Revolution And Health Care: A Primer For Purchasers And Providers                               |13.30413469565007 |
|4415234|Health Care Systems in Ja

                                                                                

## API Endpoint Models

### /favorite

In [10]:
# ===== CONTENT-BASED RECOMMENDATIONS =====

books_pdf = df_feat.select("Id", "Name", "Authors", "Rating", "Publisher", "Description", "features").toPandas()
print(f"✅ Converted {len(books_pdf)} books to Pandas DataFrame")

# Content-based similarity using LSH
def recommend_similar_books_lsh(book_id, top_n=10):
    """Find similar books using LSH model"""
    try:
        # Get target book features
        target_row = df_feat.filter(col("Id") == book_id).first()
        if not target_row:
            return []
        
        target_vec = target_row["features"]
        
        # Find similar books using LSH
        neighbors = lsh_model.approxNearestNeighbors(df_feat, target_vec, top_n + 1)
        similar_books = neighbors.filter(col("Id") != book_id).select("Id", "Name", "Rating", "distCol").orderBy("distCol").limit(top_n)
        
        return similar_books.toPandas()
    except Exception as e:
        print(f"Error in LSH recommendation: {e}")
        return []

# By author recommendations
def recommend_by_author(author, top_n=10):
    """Find books by the same author"""
    return (df_feat.filter(col("Authors").contains(author))
                  .orderBy(col("Rating").cast(FloatType()).desc())
                  .select("Id", "Name", "Authors", "Rating")
                  .limit(top_n)
                  .toPandas())

# ===== COLLABORATIVE FILTERING (ALS) =====

print("▶️ Creating synthetic user-book interactions...")

# Create synthetic ratings data
from pyspark.sql.functions import rand, when
import random

# Generate synthetic user interactions
num_users = 1000
num_interactions = 50000

# Create synthetic ratings
synthetic_ratings = []
book_ids = [row.Id for row in df_clean.select("Id").collect()]

for _ in range(num_interactions):
    user_id = random.randint(1, num_users)
    book_id = random.choice(book_ids)
    # Bias ratings toward higher ratings (more realistic)
    rating = random.choices([1, 2, 3, 4, 5], weights=[5, 10, 20, 35, 30])[0]
    synthetic_ratings.append((user_id, book_id, float(rating)))

ratings_df = spark.createDataFrame(synthetic_ratings, ["userId", "bookId", "rating"])
print(f"✅ Created {ratings_df.count():,} synthetic ratings")

# Train ALS model
print("▶️ Training ALS collaborative filtering model...")
als = ALS(userCol="userId", itemCol="bookId", ratingCol="rating",
          rank=50, maxIter=10, regParam=0.1, coldStartStrategy="drop")
als_model = als.fit(ratings_df)

def recommend_for_user(user_id, top_n=10):
    try:
        user_df = spark.createDataFrame([(user_id,)], ["userId"])
        recs = als_model.recommendForUserSubset(user_df, top_n)
        
        if recs.count() > 0:
            rec_items = recs.select("recommendations").first()["recommendations"]
            book_ids = [item["bookId"] for item in rec_items]
            
            # Get book details
            recommended_books = df_feat.filter(col("Id").isin(book_ids)).select("Id", "Name", "Authors", "Rating").toPandas()
            return recommended_books
        return []
    except Exception as e:
        print(f"Error in collaborative filtering: {e}")
        return []

print("✅ Recommendation models ready!")

                                                                                

✅ Converted 183775 books to Pandas DataFrame
▶️ Creating synthetic user-book interactions...


                                                                                

✅ Created 50,000 synthetic ratings
▶️ Training ALS collaborative filtering model...


25/06/26 22:22:17 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                                

✅ Recommendation models ready!


### /popular

In [11]:
from pyspark.sql.functions import col
from pyspark.sql.types import FloatType
import pandas as pd

def get_popular_books(k=20):
    """Get the most popular books sorted by rating - Fixed version"""
    
    # Use the existing books_pdf DataFrame but fix the column alignment issue
    # The issue was that during toPandas() conversion, columns got misaligned
    
    df_copy = books_pdf.copy()
    
    # Convert Rating column to numeric, handling the case where it might contain text
    df_copy["Rating_Numeric"] = pd.to_numeric(df_copy["Rating"], errors='coerce')
    
    # If Rating_Numeric has NaN values, it means the Rating column contains non-numeric data
    # In that case, we need to find the correct numeric column
    if df_copy["Rating_Numeric"].isna().all():
        # Check each column to find the one with numeric rating-like values
        for col_name in df_copy.columns:
            if col_name != "Rating":
                try:
                    numeric_col = pd.to_numeric(df_copy[col_name], errors='coerce')
                    # Check if this column has values that look like ratings (0-5 range)
                    if not numeric_col.isna().all() and numeric_col.min() >= 0 and numeric_col.max() <= 5:
                        print(f"⚠️  Found rating values in column '{col_name}' instead of 'Rating'")
                        df_copy["Rating_Numeric"] = numeric_col
                        break
                except:
                    continue
    
    # Sort by rating and return top k
    result = df_copy.nlargest(k, "Rating_Numeric")[["Id", "Name", "Authors", "Rating_Numeric"]]
    result = result.rename(columns={"Rating_Numeric": "Rating"})
    
    return result

### /search

In [12]:
def search_books(query, df_pd, top_n=20):
    mask = (df_pd["Name"].str.contains(query, case=False, na=False) | 
            df_pd["Description"].str.contains(query, case=False, na=False))
    return df_pd[mask].head(top_n)

## Model Testing & Validation

In [13]:
# ===== COMPREHENSIVE MODEL TESTING =====

print("🧪 Testing all recommendation functions...")
print("=" * 50)

# Test 1: Popular Books
print("\n1. 📈 Testing Popular Books:")
popular = get_popular_books(k=5)
print(popular[["Id", "Name", "Rating"]].head())

# Test 2: Search Function  
print("\n2. 🔍 Testing Search Function:")
search_results = search_books("Harry Potter", books_pdf, top_n=3)
if len(search_results) > 0:
    print(search_results[["Id", "Name", "Authors"]].head())
else:
    print("No Harry Potter books found, trying 'love'...")
    search_results = search_books("love", books_pdf, top_n=3)
    print(search_results[["Id", "Name", "Authors"]].head())

# Test 3: Author-based Recommendations
print("\n3. 👤 Testing Author-based Recommendations:")
# Get a sample author
sample_author = books_pdf["Authors"].iloc[0]
print(f"Finding books by author: {sample_author}")
author_recs = recommend_by_author(sample_author, top_n=3)
print(author_recs[["Id", "Name", "Authors", "Rating"]])

# Test 4: Content-based Recommendations (LSH)
print("\n4. 📚 Testing LSH Content-based Recommendations:")
# Get a random book ID
sample_book_id = books_pdf["Id"].iloc[0]
sample_book_name = books_pdf[books_pdf["Id"] == sample_book_id]["Name"].iloc[0]
print(f"Finding books similar to: '{sample_book_name}' (ID: {sample_book_id})")

similar_books = recommend_similar_books_lsh(sample_book_id, top_n=5)
if len(similar_books) > 0:
    print(similar_books[["Id", "Name", "distCol"]])
else:
    print("No similar books found")

# Test 5: Collaborative Filtering (ALS)
print("\n5. 🤖 Testing Collaborative Filtering (ALS):")
sample_user_id = 1
print(f"Finding recommendations for user ID: {sample_user_id}")
user_recs = recommend_for_user(sample_user_id, top_n=5)
if len(user_recs) > 0:
    print(user_recs[["Id", "Name", "Authors", "Rating"]])
else:
    print("No recommendations found for this user")

print("\n" + "=" * 50)
print("✅ All tests completed! Models are ready for deployment.")

# Summary statistics
print(f"\n📊 Model Summary:")
print(f"   📚 Total books: {len(books_pdf):,}")
print(f"   🎯 Features dimension: {len(cv_model.vocabulary):,}")
print(f"   🔍 LSH hash tables: {lsh.getNumHashTables()}")
print(f"   👥 ALS rank: {als.getRank()}")
print(f"   📈 Synthetic ratings: {ratings_df.count():,}")

🧪 Testing all recommendation functions...

1. 📈 Testing Popular Books:
          Id                                               Name  Rating
3    4411491  Transient Temperatures in Engineering and Science     5.0
52   4411745                    Mr. Carteret: And Other Stories     5.0
84   4411873                  Foggy Moggy Inn (Scaredy Cats S.)     5.0
87   4411879  Knowing What Works in Health Care: A Roadmap f...     5.0
103  4411935                                    At the Seashore     5.0

2. 🔍 Testing Search Function:
            Id                                               Name  \
6356   4114701                                    A Point of View   
22571  4143960  The Art Of Investigative Interviewing: A Human...   
30420  4366752  Selections from Harry Potter and the Goblet of...   

                  Authors  
6356          Clive James  
22571  Charles L. Yeschke  
30420       Patrick Doyle  

3. 👤 Testing Author-based Recommendations:
Finding books by author: Frank Wh

                                                                                

        Id                                          Name        Authors Rating
0  4755351              Understanding the Brahma Kumaris  Frank Whaling    2.6
1  4411486  Larousse Dictionary of Beliefs and Religions  Frank Whaling    0.0
2  4721402        Theory and Method in Religious Studies  Frank Whaling    0.0

4. 📚 Testing LSH Content-based Recommendations:
Finding books similar to: 'Larousse Dictionary of Beliefs and Religions' (ID: 4411486)


                                                                                

        Id                                               Name   distCol
0  4252279               Diccionario de Santa Teresa de Jesus  7.549834
1  4155217      Energy & High Performance Facility Sourcebook  7.549834
2  4515226                              Dictionary of Flavors  7.549834
3  4022096                                            Sikhism  7.549834
4  4107059  Relations Between the European Union and Latin...  7.615773

5. 🤖 Testing Collaborative Filtering (ALS):
Finding recommendations for user ID: 1


                                                                                

        Id                                               Name  \
0  4479884                                 Rescuing Christine   
1  4710748                                          Cockatoos   
2  4507713  The Golden Age of the Passenger Train: From St...   
3  4511424           The Cambridge Companion to Edith Wharton   
4  4589050  Dear Walt: Like Moses To Midian, He Left A Pla...   

             Authors Rating  
0        Alyssa Dean   2.75  
1  Werner Lantermann   3.68  
2         C.J. Riley    1.5  
3     Millicent Bell    4.0  
4       R.D. Frazier    5.0  

✅ All tests completed! Models are ready for deployment.

📊 Model Summary:
   📚 Total books: 183,775
   🎯 Features dimension: 10,000
   🔍 LSH hash tables: 6
   👥 ALS rank: 50
   📈 Synthetic ratings: 50,000


## Model Saving & Deployment

In [14]:
# ===== SAVE MODELS FOR API DEPLOYMENT =====

import os

# Create models directory
models_dir = "../api-backend/models"
os.makedirs(models_dir, exist_ok=True)

print("💾 Saving trained models for API deployment...")

# 1. Save Spark ML models
print("   🔧 Saving Spark ML models...")
als_model.write().overwrite().save(f"{models_dir}/als_model")
lsh_model.write().overwrite().save(f"{models_dir}/lsh_model")
cv_model.write().overwrite().save(f"{models_dir}/cv_model")

# 2. Save books data and features as Parquet for fast loading
print("   📊 Saving books data...")
df_feat.write.mode("overwrite").parquet(f"{models_dir}/books_features.parquet")

# Save books without the features column for Pandas compatibility
books_simple_pdf = books_pdf.drop(columns=['features'])
books_simple_pdf.to_parquet(f"{models_dir}/books_pandas.parquet")

# 3. Save additional data structures
print("   📋 Saving additional metadata...")
import pickle

# Save vocabulary for text processing
with open(f"{models_dir}/vocabulary.pkl", "wb") as f:
    pickle.dump(cv_model.vocabulary, f)

# Save model metadata
model_metadata = {
    "total_books": len(books_pdf),
    "feature_dim": len(cv_model.vocabulary),
    "lsh_hash_tables": lsh.getNumHashTables(),
    "lsh_bucket_length": lsh.getBucketLength(),
    "als_rank": als.getRank(),
    "als_reg_param": als.getRegParam(),
    "synthetic_ratings_count": ratings_df.count(),
    "training_date": datetime.now().isoformat()
}

with open(f"{models_dir}/model_metadata.pkl", "wb") as f:
    pickle.dump(model_metadata, f)

print("✅ All models saved successfully!")
print(f"📁 Models saved to: {os.path.abspath(models_dir)}")
print("\n📋 Saved components:")
print("   - als_model/ (Collaborative Filtering)")
print("   - lsh_model/ (Content-based Similarity)")  
print("   - cv_model/ (Text Vectorization)")
print("   - books_features.parquet (Books with ML features)")
print("   - books_pandas.parquet (Books in Pandas format)")
print("   - vocabulary.pkl (Text vocabulary)")
print("   - model_metadata.pkl (Model configuration)")

# Create a simple model loading helper
model_loader_code = '''
# Model Loading Helper for API
import pickle
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALSModel
from pyspark.ml.feature import BucketedRandomProjectionLSHModel, CountVectorizerModel
import pandas as pd

def load_models(models_dir):
    """Load all trained models for the API"""
    
    # Initialize Spark
    spark = SparkSession.builder.appName("GoodreadsAPI").getOrCreate()
    
    # Load Spark models
    als_model = ALSModel.load(f"{models_dir}/als_model")
    lsh_model = BucketedRandomProjectionLSHModel.load(f"{models_dir}/lsh_model")
    cv_model = CountVectorizerModel.load(f"{models_dir}/cv_model")
    
    # Load data
    books_df = spark.read.parquet(f"{models_dir}/books_features.parquet")
    books_pdf = pd.read_parquet(f"{models_dir}/books_pandas.parquet")
    
    # Load metadata
    with open(f"{models_dir}/model_metadata.pkl", "rb") as f:
        metadata = pickle.load(f)
    
    return {
        "spark": spark,
        "als_model": als_model,
        "lsh_model": lsh_model,
        "cv_model": cv_model,
        "books_df": books_df,
        "books_pdf": books_pdf,
        "metadata": metadata
    }
'''

with open(f"{models_dir}/model_loader.py", "w") as f:
    f.write(model_loader_code)

print("   - model_loader.py (Helper for loading models in API)")
print("\n🚀 Models are ready for deployment in the Flask API!")

💾 Saving trained models for API deployment...
   🔧 Saving Spark ML models...


25/06/26 22:23:08 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/06/26 22:23:10 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

   📊 Saving books data...


25/06/26 22:23:13 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

   📋 Saving additional metadata...
✅ All models saved successfully!
📁 Models saved to: /Users/hazwanadh/Code/Sem4/bigdata/goodreads-book-recommend-bigdata/api-backend/models

📋 Saved components:
   - als_model/ (Collaborative Filtering)
   - lsh_model/ (Content-based Similarity)
   - cv_model/ (Text Vectorization)
   - books_features.parquet (Books with ML features)
   - books_pandas.parquet (Books in Pandas format)
   - vocabulary.pkl (Text vocabulary)
   - model_metadata.pkl (Model configuration)
   - model_loader.py (Helper for loading models in API)

🚀 Models are ready for deployment in the Flask API!


In [15]:
# ===== FINAL VERIFICATION =====

print("🔍 Final verification: Testing if saved models can be loaded...")

# Test loading the model loader
import sys
sys.path.append("../api-backend/models")

try:
    exec(open("../api-backend/models/model_loader.py").read())
    print("✅ Model loader script is valid")
    
    # Test loading metadata
    import pickle
    with open("../api-backend/models/model_metadata.pkl", "rb") as f:
        metadata = pickle.load(f)
    print(f"✅ Model metadata loaded: {metadata['total_books']:,} books")
    
    # Test loading vocabulary
    with open("../api-backend/models/vocabulary.pkl", "rb") as f:
        vocab = pickle.load(f)
    print(f"✅ Vocabulary loaded: {len(vocab):,} terms")
    
    # Test loading Pandas data
    import pandas as pd
    books_test = pd.read_parquet("../api-backend/models/books_pandas.parquet")
    print(f"✅ Pandas books data loaded: {len(books_test):,} books")
    
    print("\n🎉 The model is ready for deployment!")
    print("\n📊 Final Statistics:")
    print(f"   🔢 Total books processed: {metadata['total_books']:,}")
    print(f"   🧠 ML features dimension: {metadata['feature_dim']:,}")
    print(f"   🎯 LSH hash tables: {metadata['lsh_hash_tables']}")
    print(f"   ⭐ ALS model rank: {metadata['als_rank']}")
    print(f"   📈 Training ratings: {metadata['synthetic_ratings_count']:,}")
    print(f"   📅 Training completed: {metadata['training_date']}")
    
    print("\n🚀 Next steps:")
    print("   1. Update the Flask API to use these models")
    print("   2. Deploy the API with model loading")
    print("   3. Connect the React frontend to the API")
    print("   4. Test end-to-end recommendations")
    
except Exception as e:
    print(f"❌ Error during verification: {e}")
    print("Please check the model saving process.")

🔍 Final verification: Testing if saved models can be loaded...
✅ Model loader script is valid
✅ Model metadata loaded: 183,775 books
✅ Vocabulary loaded: 10,000 terms
✅ Pandas books data loaded: 183,775 books

🎉 The model is ready for deployment!

📊 Final Statistics:
   🔢 Total books processed: 183,775
   🧠 ML features dimension: 10,000
   🎯 LSH hash tables: 6
   ⭐ ALS model rank: 50
   📈 Training ratings: 50,000
   📅 Training completed: 2025-06-26T22:23:28.510503

🚀 Next steps:
   1. Update the Flask API to use these models
   2. Deploy the API with model loading
   3. Connect the React frontend to the API
   4. Test end-to-end recommendations
