
# Content-Based Book Recommender — FIXED (no lambda in Pipeline)

This version avoids the `PicklingError: Can't pickle <lambda>` by removing the
`FunctionTransformer(lambda ...)`. We directly use `CountVectorizer` on the `text` column,
so artifacts save cleanly with `joblib.dump`.


In [31]:

# ============ Config ============
from pathlib import Path

DATA_PATH = Path(r"D:\Library Management System\LibraryData\AllBooks.csv")
FALLBACK_PATH = Path("./AllBooks.csv")

ARTIFACT_DIR = Path("./artifacts")
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

COL_ID = "isbn"
COL_TITLE = "title"
COL_AUTHORS = "author"
COL_GENRES = "genres"
COL_DESC = "description"

MIN_DF = 3
NGRAM_RANGE = (1, 2)
STOP_WORDS = "english"
TOP_K = 20


In [32]:

# ============ Imports ============
import re, json, joblib, numpy as np, pandas as pd
from pathlib import Path
from typing import Optional, Iterable, List, Tuple
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse as sp

pd.set_option("display.max_colwidth", 200)


In [33]:

# ============ Load ============
def resolve_path(main: Path, fallback: Path) -> Path:
    if main.exists(): return main
    if fallback.exists(): return fallback
    raise FileNotFoundError(f"CSV not found at {main} or {fallback}")

CSV_PATH = resolve_path(DATA_PATH, FALLBACK_PATH)
books_raw = pd.read_csv(CSV_PATH, encoding="ISO-8859-1")
books_raw.head(3)


Unnamed: 0,title,series,author,rating,description,language,isbn,genres,bookFormat,pages,publisher,publishDate,numRatings,ratingsByStars,likedPercent,coverImg,bbeScore,bbeVotes,price
0,The Hunger Games,The Hunger Games #1,Suzanne Collins,4.33,"WINNING MEANS FAME AND FORTUNE.LOSING MEANS CERTAIN DEATH.THE HUNGER GAMES HAVE BEGUN. . . .In the ruins of a place once known as North America lies the nation of Panem, a shining Capitol surround...",English,9780439023481,Young Adult;Fiction;Dystopia;Fantasy;Science Fiction;Romance;Adventure;Teen;Post Apocalyptic;Action;,Hardcover,374,Scholastic Press,09/14/08,6376780,3444695;1921313;745221;171994;93557;,96.0,https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1586722975l/2767052.jpg,2993816,30516,5.09
1,Harry Potter and the Order of the Phoenix,Harry Potter #5,"J.K. Rowling, Mary GrandPrÃ© (Illustrator)",4.5,"There is a door at the end of a silent corridor. And itâs haunting Harry Pottterâs dreams. Why else would he be waking in the middle of the night, screaming in terror?Harry has a lot on his mi...",English,9780439358071,Fantasy;Young Adult;Fiction;Magic;Childrens;Adventure;Audiobook;Middle Grade;Classics;Science Fiction Fantasy;,Paperback,870,Scholastic Inc.,09/28/04,2507623,1593642;637516;222366;39573;14526;,98.0,https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1546910265l/2.jpg,2632233,26923,7.38
2,Twilight,The Twilight Saga #1,Stephenie Meyer,3.6,"About three things I was absolutely positive.\n\nFirst, Edward was a vampire.\n\nSecond, there was a part of himâand I didn't know how dominant that part might beâthat thirsted for my blood.\n...",English,9780316015844,Young Adult;Fantasy;Romance;Vampires;Fiction;Paranormal;Paranormal Romance;Supernatural;Teen;Urban Fantasy;,Paperback,501,"Little, Brown and Company",09/06/06,4964519,1751460;1113682;1008686;542017;548674;,78.0,https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1361039443l/41865.jpg,1459448,14874,2.1


In [34]:
books.columns

Index(['title', 'series', 'author', 'rating', 'description', 'language',
       'isbn', 'genres', 'bookFormat', 'pages', 'publisher', 'publishDate',
       'numRatings', 'ratingsByStars', 'likedPercent', 'coverImg', 'bbeScore',
       'bbeVotes', 'price', 'text'],
      dtype='object')

In [35]:

# ============ Build text ============
def _to_str(x): return "" if pd.isna(x) else str(x)

def clean_text(s: str) -> str:
    s = s.lower()
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def build_text_field(df: pd.DataFrame,
                     col_title: str,
                     col_authors: Optional[str] = None,
                     col_genres: Optional[str] = None,
                     col_desc: Optional[str] = None) -> pd.Series:
    parts = []
    parts.append(df[col_title].map(_to_str) if col_title in df.columns else "")
    if col_authors and col_authors in df.columns: parts.append(df[col_authors].map(_to_str))
    if col_genres and col_genres in df.columns: parts.append(df[col_genres].map(_to_str))
    if col_desc and col_desc in df.columns: parts.append(df[col_desc].map(_to_str))
    combo = parts[0]
    for p in parts[1:]: combo = combo + " " + p
    return combo.map(clean_text)

books = books_raw.copy()

# ensure unique id
if COL_ID not in books.columns:
    books[COL_ID] = np.arange(len(books), dtype=int)
else:
    books[COL_ID] = books[COL_ID].astype(str).fillna("")
    empties = books[COL_ID] == ""
    dups = books[COL_ID].duplicated(keep=False)
    if empties.any() or dups.any():
        surrogate = [f"row_{i}" for i in range(len(books))]
        books.loc[empties | dups, COL_ID] = np.array(surrogate)[(empties | dups).values]

books["text"] = build_text_field(books, COL_TITLE, COL_AUTHORS, COL_GENRES, COL_DESC)
books[[COL_ID, COL_TITLE, "text"]].head(5)


Unnamed: 0,isbn,title,text
0,9780439023481,The Hunger Games,the hunger games suzanne collins young adult fiction dystopia fantasy science fiction romance adventure teen post apocalyptic action winning means fame and fortune losing means certain death the h...
1,9780439358071,Harry Potter and the Order of the Phoenix,harry potter and the order of the phoenix j k rowling mary grandpr illustrator fantasy young adult fiction magic childrens adventure audiobook middle grade classics science fiction fantasy there i...
2,9780316015844,Twilight,twilight stephenie meyer young adult fantasy romance vampires fiction paranormal paranormal romance supernatural teen urban fantasy about three things i was absolutely positive first edward was a ...
3,9780345538376,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit and The Lord of the Rings,j r r tolkien 4 book boxed set the hobbit and the lord of the rings j r r tolkien fantasy fiction classics adventure science fiction fantasy epic fantasy high fantasy young adult literature magic ...
4,9780451527745,Alice's Adventures in Wonderland & Through the Looking-Glass,alice s adventures in wonderland through the looking glass lewis carroll john tenniel illustrator martin gardner introduction classics fantasy fiction childrens young adult literature adventure no...


In [36]:

# ============ Vectorize (NO Pipeline) ============
vectorizer = CountVectorizer(min_df=MIN_DF, ngram_range=NGRAM_RANGE, stop_words=STOP_WORDS)
X = vectorizer.fit_transform(books["text"])
X


<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 1559870 stored elements and shape (15785, 78181)>

In [37]:
# ============ Save artifacts (now picklable) ============
VECTORIZER_PATH = ARTIFACT_DIR / "vectorizer.joblib"
MATRIX_PATH = ARTIFACT_DIR / "X_csr.joblib"
ID_MAP_PATH = ARTIFACT_DIR / "id_maps.json"
ITEMS_PATH = ARTIFACT_DIR / "items.parquet"

joblib.dump(vectorizer, VECTORIZER_PATH)
joblib.dump(X.tocsr(), MATRIX_PATH)

id2idx = {bid: i for i, bid in enumerate(books[COL_ID].tolist())}
idx2id = {i: bid for bid, i in id2idx.items()}
with open(ID_MAP_PATH, "w", encoding="utf-8") as f:
    json.dump({"id2idx": id2idx, "idx2id": idx2id}, f)

# Save isbn, title, AND coverImg
books[[COL_ID, COL_TITLE, 'coverImg']].to_parquet(ITEMS_PATH, index=False)

print("Saved OK:")
for p in [VECTORIZER_PATH, MATRIX_PATH, ID_MAP_PATH, ITEMS_PATH]:
    print(" -", p)

Saved OK:
 - artifacts\vectorizer.joblib
 - artifacts\X_csr.joblib
 - artifacts\id_maps.json
 - artifacts\items.parquet


In [38]:

# ============ Recommender ============
class ContentRecommender:
    def __init__(self, X: sp.csr_matrix, id2idx: dict, idx2id: dict, items_df: pd.DataFrame):
        self.X = X
        self.id2idx = id2idx
        self.idx2id = idx2id
        self.items = items_df.set_index(COL_ID)

    @classmethod
    def from_artifacts(cls, matrix_path: Path, idmap_path: Path, items_path: Path):
        X = joblib.load(matrix_path)
        import json, pandas as pd
        with open(idmap_path, "r", encoding="utf-8") as f:
            maps = json.load(f)
        items = pd.read_parquet(items_path)
        return cls(X, maps["id2idx"], {int(k) if str(k).isdigit() else k: v for k,v in maps["idx2id"].items()}, items)

    def _topk_from_vector(self, v: sp.csr_matrix, k: int, exclude_idx: Optional[Iterable[int]] = None):
        sims = cosine_similarity(v, self.X).ravel()
        if exclude_idx:
            sims[list(exclude_idx)] = -1e9
        if k >= len(sims):
            order = np.argsort(-sims)
        else:
            order = np.argpartition(-sims, range(k))[:k]
            order = order[np.argsort(-sims[order])]
        return order.tolist(), sims[order].tolist()

    def similar_by_id(self, book_id: str, k: int = 20) -> pd.DataFrame:
        if book_id not in self.id2idx: raise KeyError(f"Unknown id: {book_id}")
        idx = self.id2idx[book_id]
        v = self.X[idx]
        top_idx, scores = self._topk_from_vector(v, k+1, exclude_idx=[idx])
        rec_ids = [self.idx2id[int(i)] for i in top_idx]
        out = self.items.loc[rec_ids].reset_index().copy()
        out["score"] = scores
        return out

    def similar_by_title(self, title_substring: str, k: int = 20) -> pd.DataFrame:
        cand = self.items[self.items[COL_TITLE].str.contains(title_substring, case=False, na=False)]
        if cand.empty: raise KeyError(f"No title matching '{title_substring}'")
        bid = cand.index[0]
        return self.similar_by_id(bid, k=k)

    def recommend_from_history(self, ids: List[str], k: int = 20) -> pd.DataFrame:
        idxs = [self.id2idx[i] for i in ids if i in self.id2idx]
        if not idxs: raise ValueError("None of the provided ids exist.")
        v = self.X[idxs].mean(axis=0)
        top_idx, scores = self._topk_from_vector(v, k=len(idxs)+k, exclude_idx=idxs)
        rec_ids = [self.idx2id[int(i)] for i in top_idx[:k]]
        out = self.items.loc[rec_ids].reset_index().copy()
        out["score"] = scores[:k]
        return out


In [39]:

# ============ Demo ============
recsys = ContentRecommender.from_artifacts(MATRIX_PATH, ID_MAP_PATH, ITEMS_PATH)
try:
    demo = recsys.similar_by_title("The Demigod Files", k=10)
    print(demo.head(10))
except Exception as e:
    print("Demo failed:", e)


            isbn  \
0  9781423121718   
1  9781423119500   
2  9781423163008   
3  9781484732786   
4  9780786856862   
5  9780979233142   
6  9781423153627   
7  9781423101475   
8  9780141382890   
9  9781423183655   

                                                                                                   title  \
0                                                      Percy Jackson & the Olympians: The Ultimate Guide   
1                                                              Percy Jackson and the Olympians Boxed Set   
2                                                                                    The Demigod Diaries   
3                                                Demigods & Magicians: Percy and Annabeth Meet the Kanes   
4                                                                                    The Sea of Monsters   
5  Demigods and Monsters: Your Favorite Authors on Rick Riordan's Percy Jackson and the Olympians Series   
6                      