In [26]:
import pandas as pd 
import numpy as np
from pathlib import Path

path = Path("../data/processed/books.csv")
df = pd.read_csv(path)
df.head()

Unnamed: 0,work_key,title,first_publish_year,authors,source_subject
0,/works/OL138052W,Alice's Adventures in Wonderland,1865,Lewis Carroll,fantasy
1,/works/OL18417W,The Wonderful Wizard of Oz,1899,L. Frank Baum,fantasy
2,/works/OL24034W,Treasure Island,1880,Robert Louis Stevenson,fantasy
3,/works/OL20600W,Gulliver's Travels,1726,Jonathan Swift,fantasy
4,/works/OL259010W,A Midsummer Night's Dream,1600,William Shakespeare,fantasy


In [27]:
df["text"] = (
    df["title"].fillna("") + " " +
    df["authors"].fillna("") + " " +
    df["source_subject"].fillna("")
)

df[["title", "text"]].head()

Unnamed: 0,title,text
0,Alice's Adventures in Wonderland,Alice's Adventures in Wonderland Lewis Carroll...
1,The Wonderful Wizard of Oz,The Wonderful Wizard of Oz L. Frank Baum fantasy
2,Treasure Island,Treasure Island Robert Louis Stevenson fantasy
3,Gulliver's Travels,Gulliver's Travels Jonathan Swift fantasy
4,A Midsummer Night's Dream,A Midsummer Night's Dream William Shakespeare ...


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=5000
)

X = vectorizer.fit_transform(df["text"])
X.shape

(734, 1854)

In [29]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(X)
similarity.shape


(734, 734)

In [36]:
def recommend(title, df, similarity, top_k=5):
    if title not in df["title"].values:
        return "Book not found"

    idx = df.index[df["title"] == title][0]
    scores = list(enumerate(similarity[idx]))
    
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    recommendations = []
    for i, score in scores[1: top_k + 1]:
        recommendations.append(df.iloc[i]["title"])

    return recommendations



In [38]:
book = df["title"].sample(1).values[0]
print(f"Recommendations for '{book}':")
rec= recommend(book, df, similarity)
print(rec)

Recommendations for 'Book of common prayer':
['The Jungle Book', 'The Book Of Three', 'The Case-Book of Sherlock Holmes', 'The sketch-book of Geoffrey Crayon, Esq', 'The Royal Book of Oz']


In [44]:
df["text_weighted"] = (
    df["title"].fillna("") + " " +
    df["authors"].fillna("") + " " +
    (df["source_subject"].fillna("") + " ") * 3
)

df[["title", "text_weighted"]].head()

Unnamed: 0,title,text_weighted
0,Alice's Adventures in Wonderland,Alice's Adventures in Wonderland Lewis Carroll...
1,The Wonderful Wizard of Oz,The Wonderful Wizard of Oz L. Frank Baum fanta...
2,Treasure Island,Treasure Island Robert Louis Stevenson fantasy...
3,Gulliver's Travels,Gulliver's Travels Jonathan Swift fantasy fant...
4,A Midsummer Night's Dream,A Midsummer Night's Dream William Shakespeare ...


In [40]:
X_w = vectorizer.fit_transform(df["text_weighted"])
X_w.shape


(734, 1854)

In [41]:
similarity_w = cosine_similarity(X_w)


In [42]:
book = df["title"].sample(1).values[0]
print(f"Recommendations for '{book}':")
print(recommend(book, df, similarity_w))


Recommendations for 'The Well at the World's End':
['The wood beyond the world', 'The house of the Wolfings', 'News from nowhere, or, An epoch of rest, being some chapters from a utopian romance', 'The Eye of the World', 'The Lost World']
