LOADING THE DATA

In [1]:
import sys
from pathlib import Path

# The notebook is started from <project>/notebooks, so the project root is the
# parent of the current working directory.
PROJECT_ROOT = Path.cwd().parent

# Put the root first on the import path so the project's own packages resolve.
sys.path.insert(0, str(PROJECT_ROOT))

print(f"PROJECT_ROOT = {PROJECT_ROOT}")

PROJECT_ROOT = /Users/mertdikdas/Documents/Okul/Ders/Projeler/Kişisel Projeler/book-recommend


In [2]:
import pandas as pd 
import numpy as np
from pathlib import Path
from src.database.pull_books_from_db import pull_books_from_db
from src.database.database import SessionLocal

# Load the processed book catalogue from the database into a DataFrame.
db = SessionLocal()
try:
    df = pull_books_from_db(db)
finally:
    # Always release the session's connection, even when the query fails.
    # Assumes SessionLocal() returns a SQLAlchemy-style session exposing close().
    db.close()
df.head()

Unnamed: 0,id,work_key,title,author,genre,description
0,1,/works/OL138052W,Alice's Adventures in Wonderland,Lewis Carroll,fantasy; science fiction; history; juvenile li...,"Alice (fictitious character : carroll), fictio..."
1,2,/works/OL18417W,The Wonderful Wizard of Oz,L. Frank Baum,fantasy; science fiction; history; juvenile li...,Witches; Toy and movable books; Spanish langua...
2,3,/works/OL24034W,Treasure Island,Robert Louis Stevenson,fantasy; history; literature; juvenile literature,Fiction; Treasure Island (Imaginary place); Tr...
3,4,/works/OL20600W,Gulliver's Travels,Jonathan Swift,fantasy; literature; short stories,YA; Young adult; Juvenile; Fiction; Fantasy; U...
4,5,/works/OL259010W,A Midsummer Night's Dream,William Shakespeare,fantasy; literature; juvenile literature; poetry,Drama; Courtship; Plays; Hippolyta (Greek myth...


PREPARING RECOMMENDATIONS FOR A SINGLE BOOK

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Build one free-text document per book by joining its descriptive columns
# with single spaces; missing values become empty strings.
text_columns = ["title", "author", "genre", "description"]
combined_text = df[text_columns[0]].fillna("")
for column in text_columns[1:]:
    combined_text = combined_text + " " + df[column].fillna("")
df["text"] = combined_text

# TF-IDF representation of the combined text (English stop words removed,
# vocabulary capped at 10000 terms).
vectorizer = TfidfVectorizer(stop_words="english", max_features=10000)
X = vectorizer.fit_transform(df["text"])

# Pairwise cosine similarity between every pair of books.
similarity = cosine_similarity(X)
similarity.shape


(19411, 19411)

RECOMMEND FUNCTION

In [4]:
# Recommendation function for a single book
def recommend(title, df, similarity, top_k=5):
    """Return the titles of the ``top_k`` books most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title to look up in ``df["title"]``. When several rows share
        the title, the first one is used as the query.
    df : pandas.DataFrame
        Book catalogue; row order must match the rows/columns of ``similarity``.
    similarity : 2-D array-like
        Square matrix where ``similarity[i][j]`` is the similarity between
        the books at row positions ``i`` and ``j`` of ``df``.
    top_k : int, default 5
        Maximum number of titles to return.

    Returns
    -------
    list of str, or the string ``"Book not found"`` when ``title`` is absent.
    """
    # Work with row *positions*, not index labels: `similarity` is addressed
    # positionally, so a filtered or re-indexed df must not shift the lookup.
    matches = np.flatnonzero((df["title"] == title).to_numpy())
    if matches.size == 0:
        return "Book not found"
    query_pos = int(matches[0])

    row = np.asarray(similarity[query_pos]).ravel()

    # Stable descending sort keeps ties in catalogue order.
    ranked = np.argsort(-row, kind="stable")

    # Drop the query book explicitly instead of assuming it is ranked first:
    # another book with an identical score (e.g. a duplicate entry) may sort
    # ahead of it, in which case slicing off position 0 would discard a real
    # neighbour and recommend the query itself.
    ranked = ranked[ranked != query_pos][:top_k]

    return df["title"].iloc[ranked].tolist()



EXAMPLE

In [5]:
# Demo: draw one random title from the catalogue and list its nearest neighbours.
book = df["title"].sample(1).iloc[0]
print(f"Recommendations for '{book}':")
rec = recommend(book, df, similarity)
print(rec)

Recommendations for 'Life':
['Are you lonesome tonight?', 'Deadly intent', 'Another Miserable Love Song', 'The heroin diaries', 'World on a string']


IMPROVING THE FIELD WEIGHTS AND VECTORIZING AGAIN

In [24]:
# Build a weighted document per book: the author is repeated twice and the
# genre three times, so those fields contribute more term frequency than the
# title and description, which appear once.
author_part = (df["author"].fillna("") + " ") * 2
genre_part = (df["genre"].fillna("") + " ") * 3
df["text_weighted"] = (
    df["title"].fillna("") + " " + author_part + genre_part + df["description"].fillna("")
)

# NOTE: this refits the shared `vectorizer`; afterwards its vocabulary belongs
# to "text_weighted" (X_w), no longer to "text" (X).
X_w = vectorizer.fit_transform(df["text_weighted"])

# Pairwise cosine similarity between books on the weighted representation.
similarity_w = cosine_similarity(X_w)



EXAMPLE WITH THE IMPROVED WEIGHTS

In [25]:
# Demo: same random-title lookup as before, but against the weighted similarity matrix.
book = df["title"].sample(1).iloc[0]
print(f"Recommendations for '{book}':")
print(recommend(book, df, similarity_w))


Recommendations for 'Mathematics':
['MathScape', 'Quick Review Math Handbook', 'Contemporary Mathematics in Context', 'MathMatters 1', 'Everyday Mathematics']


GETTING RATING DATA FROM THE ratings.csv FILE

In [26]:
import pandas as pd
from pathlib import Path

# The ratings file is addressed relative to the notebook's working directory.
# Check for it up front so a missing file fails with an explicit message.
ratings_path = Path("../data/ratings.csv")
print(f"Reading ratings from: {ratings_path}")
if ratings_path.exists():
    ratings = pd.read_csv(ratings_path)
else:
    raise FileNotFoundError(f"File not found: {ratings_path}")
ratings.head()

Reading ratings from: ../data/ratings.csv


Unnamed: 0,user_id,work_key,rating
0,1,/works/OL20600W,4
1,1,/works/OL82563W,3
2,1,/works/OL1449046W,2
3,1,/works/OL893502W,4
4,1,/works/OL2671483W,2


TEST WITH MULTIPLE BOOKS AND THEIR RATINGS

In [27]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Map each book's work_key to its row position in df. X_w was fitted on df,
# so the same position addresses the book's TF-IDF row.
work_to_idx = {k: i for i, k in enumerate(df["work_key"])}

# Ratings of the user this test is run for.
user_id = 1
user_ratings = ratings[ratings["user_id"] == user_id]

# Collect the TF-IDF vector and the rating of every rated book that exists
# in the catalogue; these form the user's profile.
item_vectors = []
weights = []
rated_idx = set()  # row positions of the books the user has already rated
for row in user_ratings.itertuples(index=False):
    idx = work_to_idx.get(row.work_key)
    if idx is None:
        # The ratings file can reference works that are absent from df;
        # they have no vector, so they cannot contribute to the profile.
        continue
    item_vectors.append(X_w[idx].toarray()[0])
    weights.append(row.rating)
    rated_idx.add(idx)

item_matrix = np.vstack(item_vectors)
print(item_matrix.shape)
weights = np.array(weights)
print(weights.shape)

# User profile = rating-weighted average of the rated books' vectors.
user_profile = np.average(item_matrix, axis=0, weights=weights)

# Similarity between the user profile and every book: one score per row of df.
scores = cosine_similarity(user_profile.reshape(1, -1), X_w).ravel()

# (index, score) pairs, excluding the books the user has already rated.
idx_scores = [(i, s) for i, s in enumerate(scores) if i not in rated_idx]

# Sort from highest to lowest score.
idx_scores = sorted(idx_scores, key=lambda x: x[1], reverse=True)

top_k = 10
top_idx = [i for i, s in idx_scores[:top_k]]

df.iloc[top_idx][["title", "author", "genre"]]

    

(11, 10000)
(11,)


Unnamed: 0,title,author,genre
1165,Deathlands,James Axler,science fiction
208,Oryx and Crake,Margaret Atwood,fantasy; science fiction; romance
2454,Animal Instincts,Gena Showalter,romance
1662,Who?,Algis Budrys,science fiction
1989,The divide,Nicholas Evans,romance; literature
2344,Winter Roses,Diana Palmer,romance
2063,Nine Coaches Waiting,Mary Stewart,romance
2365,The Calling,Caridad Pineiro; Caridad Piñeiro,romance
2157,The Game of Kings,Dorothy Dunnett,romance
1404,Rogue moon,Algis Budrys,science fiction


FUNCTION FOR RECOMMENDING FROM MULTIPLE BOOKS AND THEIR RATINGS FOR A SPECIFIC USER

In [28]:
# Recommendation function for a user based on their ratings
def recommend_for_user(user_id, df, X_w, ratings, work_to_idx, top_k):
    user_ratings = ratings[ratings["user_id"] == user_id]
    if user_ratings.empty:
        return df.sample(top_k)[["title", "authors", "source_subject"]]
    item_vectors = []
    weights = []
    for row in user_ratings.itertuples(index=False):
        work_key = row.work_key
        rating = row.rating
        idx = work_to_idx.get(work_key)

        v = X_w[idx].toarray()[0]
        item_vectors.append(v)
        weights.append(rating)
    item_matrix = np.vstack(item_vectors)
    weights = np.array(weights)
    user_profile = np.average(item_matrix, axis=0, weights=weights)
    print(pd.DataFrame(df[df["work_key"].isin(user_ratings["work_key"])])[["title", "author", "genre"]])
    similarity_scores = cosine_similarity(user_profile.reshape(1, -1), X_w).ravel()
    rated_idx = {work_to_idx[wk] for wk in user_ratings["work_key"]}
    idx_scores = list(enumerate(similarity_scores))
    idx_scores = [
        (i, s) for i, s in idx_scores
        if i not in rated_idx 
    ]
    top_idx = sorted(idx_scores, key=lambda x: x[1], reverse=True)
    top_idx = [i for i, s in top_idx[:top_k]]
    return df.iloc[top_idx][["title", "author", "genre"]]


In [29]:
# Reload the ratings so this cell can be re-run on its own, then request
# 20 recommendations for user 2.
ratings_path = Path("../data/ratings.csv")
print(f"Reading ratings from: {ratings_path}")
if ratings_path.exists():
    ratings = pd.read_csv(ratings_path)
else:
    raise FileNotFoundError(f"File not found: {ratings_path}")

recommend_for_user(
    user_id=2,
    df=df,
    X_w=X_w,
    ratings=ratings,
    work_to_idx=work_to_idx,
    top_k=20,
)


Reading ratings from: ../data/ratings.csv
            title       author       genre
3845  Martin Eden  Jack London  literature


Unnamed: 0,title,author,genre
1005,The Iron Heel,Jack London,science fiction
1095,The Scarlet Plague,Jack London,science fiction
10144,Three Lives,Gertrude Stein,short stories
4103,The ragged trousered philanthropists,Robert Tressell,literature
4102,Lanark,Alasdair Gray,literature
6961,Victorian writing and working women,Julia Swindells,autobiography
6765,Subjectivities,Regenia Gagnier,autobiography
1020,In the days of the comet,H. G. Wells,science fiction
6827,The autobiography of the working class,"Burnett, John",autobiography
4442,Summertime,J. M. Coetzee,literature
