LOADING THE DATA

In [None]:
import pandas as pd 
import numpy as np
from pathlib import Path

# Load processed data
path = Path("../data/processed/books.csv")
df = pd.read_csv(path)
df.head()

Unnamed: 0,work_key,title,first_publish_year,authors,source_subject
0,/works/OL138052W,Alice's Adventures in Wonderland,1865,Lewis Carroll,fantasy
1,/works/OL18417W,The Wonderful Wizard of Oz,1899,L. Frank Baum,fantasy
2,/works/OL24034W,Treasure Island,1880,Robert Louis Stevenson,fantasy
3,/works/OL20600W,Gulliver's Travels,1726,Jonathan Swift,fantasy
4,/works/OL259010W,A Midsummer Night's Dream,1600,William Shakespeare,fantasy


PREPARING FOR RECOMMEND FOR ONE BOOK

In [None]:
# Create a combined text field for vectorization
df["text"] = (
    df["title"].fillna("") + " " +
    df["authors"].fillna("") + " " +
    df["source_subject"].fillna("")
)

df[["title", "text"]].head()
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorize the text data
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=5000
)

X = vectorizer.fit_transform(df["text"])
X.shape
from sklearn.metrics.pairwise import cosine_similarity
# Compute cosine similarity matrix
similarity = cosine_similarity(X)
similarity.shape


(734, 1854)

RECOMMEND FUNCTION

In [None]:
# Recommendation function for a single book
def recommend(title, df, similarity, top_k=5):
    if title not in df["title"].values:
        return "Book not found"

    idx = df.index[df["title"] == title][0]
    scores = list(enumerate(similarity[idx]))
    
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    recommendations = []
    for i, score in scores[1: top_k + 1]:
        recommendations.append(df.iloc[i]["title"])

    return recommendations



EXAMPLE

In [None]:
# Example usage
book = df["title"].sample(1).values[0]
print(f"Recommendations for '{book}':")
rec= recommend(book, df, similarity)
print(rec)

Recommendations for 'Congo':
['Prey', 'Timeline', 'The Lost World', 'The Terminal Man', 'Jurassic Park']


IMPROVING WEIGHT AND PREPARING AGAIN

In [None]:
# Create a weighted text field with more emphasis on source_subject
df["text_weighted"] = (
    df["title"].fillna("") + " " +
    df["authors"].fillna("") + " " +
    (df["source_subject"].fillna("") + " ") * 3
)

df[["title", "text_weighted"]].head()
# Vectorize the weighted text data for improved recommendations
X_w = vectorizer.fit_transform(df["text_weighted"])
X_w.shape
# Compute cosine similarity matrix for weighted text
similarity_w = cosine_similarity(X_w)



(734, 1854)

EXAMPLE FOR IMPROVED

In [None]:
# Example usage with weighted similarity
book = df["title"].sample(1).values[0]
print(f"Recommendations for '{book}':")
print(recommend(book, df, similarity_w))


Recommendations for 'Flowers for Algernon':
['Foundation', 'The Gods Themselves', 'The Stand', 'Firestarter', 'On Writing']


GETTING RATING DATA FROM rating.csv FILE

In [84]:
import pandas as pd
from pathlib import Path

ratings_path = Path("../data/ratings.csv")
print(f"Reading ratings from: {ratings_path}")
if not ratings_path.exists():
    raise FileNotFoundError(f"File not found: {ratings_path}")
ratings = pd.read_csv(ratings_path)
ratings.head()

Reading ratings from: ../data/ratings.csv


Unnamed: 0,user_id,work_key,rating
0,1,/works/OL138052W,5
1,1,/works/OL18417W,4
2,1,/works/OL24034W,3


TEST FOR MULTIPLE BOOK AND WITH RATINGS

In [None]:
# work_key to index mapping
work_to_idx = {
    k: i
    for i, k in enumerate(df["work_key"])
}
list(work_to_idx.items())[:5]

# User-test ratings
user_id = 1
user_ratings = ratings[ratings["user_id"] == user_id]
user_ratings

import numpy as np
item_vectors = []
weights = []
# Building user profile vector
for row in user_ratings.itertuples(index=False):
    work_key = row.work_key
    rating = row.rating
    idx = work_to_idx.get(work_key)

    v = X_w[idx].toarray()[0]
    item_vectors.append(v)
    weights.append(rating)
    
item_matrix = np.vstack(item_vectors)
print(item_matrix.shape)
weights = np.array(weights)
print(weights.shape)
user_profile = np.average(item_matrix, axis=0, weights=weights)
user_profile.shape
from sklearn.metrics.pairwise import cosine_similarity
# Compute similarity scores between user profile and all items
scores = cosine_similarity(user_profile.reshape(1, -1), X_w).ravel()
scores.shape  # (734,)


# Kullanıcının zaten rated ettiği kitapların indexleri
rated_idx = {work_to_idx[wk] for wk in user_ratings["work_key"]}

# (index, skor) listesi
idx_scores = list(enumerate(scores))

# zaten rated olanları filtrele
idx_scores = [
    (i, s) for i, s in idx_scores
    if i not in rated_idx
]

# yüksekten düşüğe sırala
idx_scores = sorted(idx_scores, key=lambda x: x[1], reverse=True)

top_k = 10
top_idx = [i for i, s in idx_scores[:top_k]]

df.iloc[top_idx][["title", "authors", "source_subject"]]

    

(3, 1854)
(3,)


(1854,)

FUNCTION FOR MULTIPLE BOOKS AND ITS RATINGS FOR SPECIFIC USER

In [None]:
# Recommendation function for a user based on their ratings
def recommend_for_user(user_id, df, X_w, ratings, work_to_idx, top_k):
    user_ratings = ratings[ratings["user_id"] == user_id]
    if user_ratings.empty:
        return df.sample(top_k)[["title", "authors", "source_subject"]]
    item_vectors = []
    weights = []
    for row in user_ratings.itertuples(index=False):
        work_key = row.work_key
        rating = row.rating
        idx = work_to_idx.get(work_key)

        v = X_w[idx].toarray()[0]
        item_vectors.append(v)
        weights.append(rating)
    item_matrix = np.vstack(item_vectors)
    weights = np.array(weights)
    user_profile = np.average(item_matrix, axis=0, weights=weights)

    similarity_scores = cosine_similarity(user_profile.reshape(1, -1), X_w).ravel()
    rated_idx = {work_to_idx[wk] for wk in user_ratings["work_key"]}
    print(df[df["work_key"].isin(user_ratings["work_key"])].title)
    idx_scores = list(enumerate(similarity_scores))
    idx_scores = [
        (i, s) for i, s in idx_scores
        if i not in rated_idx 
    ]
    top_idx = sorted(idx_scores, key=lambda x: x[1], reverse=True)
    top_idx = [i for i, s in top_idx[:top_k]]
    return df.iloc[top_idx][["title", "authors", "source_subject"]]
recommend_for_user(1, df, X_w, ratings, work_to_idx, top_k=10)

0    Alice's Adventures in Wonderland
1          The Wonderful Wizard of Oz
2                     Treasure Island
Name: title, dtype: object


Unnamed: 0,title,authors,source_subject
18,Alice's Adventures in Wonderland / Through the...,Lewis Carroll,fantasy
126,The Nursery Alice,Lewis Carroll,fantasy
6,Through the Looking-Glass,Lewis Carroll,fantasy
95,The Last Battle,C. S. Lewis,fantasy
57,Sylvie and Bruno,Lewis Carroll,fantasy
81,Prince Caspian,C. S. Lewis,fantasy
23,Sky Island,L. Frank Baum,fantasy
157,The Chronicles of Narnia,C. S. Lewis,fantasy
66,The Horse and His Boy,C. S. Lewis,fantasy
71,The Magician's Nephew,C. S. Lewis,fantasy


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  user_ratings["rating"][1]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_ratings["rating"][1]=1


Unnamed: 0,user_id,work_key,rating
0,1,/works/OL138052W,5
1,1,/works/OL18417W,1
2,1,/works/OL24034W,3


In [128]:
for row in user_ratings.itertuples():
    print(df.iloc[work_to_idx[row.work_key]][["title", "authors", "source_subject"]])
    print("Rating:", row.rating)

title             Alice's Adventures in Wonderland
authors                              Lewis Carroll
source_subject                             fantasy
Name: 0, dtype: object
Rating: 5
title             The Wonderful Wizard of Oz
authors                        L. Frank Baum
source_subject                       fantasy
Name: 1, dtype: object
Rating: 1
title                    Treasure Island
authors           Robert Louis Stevenson
source_subject                   fantasy
Name: 2, dtype: object
Rating: 3


In [126]:
recommend_for_user(1, df, X_w, ratings, work_to_idx, top_k=10)


Unnamed: 0,title,authors,source_subject
18,Alice's Adventures in Wonderland / Through the...,Lewis Carroll,fantasy
126,The Nursery Alice,Lewis Carroll,fantasy
14,Dorothy and the Wizard in Oz,L. Frank Baum,fantasy
26,The Magic of Oz,L. Frank Baum,fantasy
19,The Road to Oz,L. Frank Baum,fantasy
11,Ozma of Oz,L. Frank Baum,fantasy
24,The Scarecrow of Oz,L. Frank Baum,fantasy
28,Rinkitink in Oz,L. Frank Baum,fantasy
33,Glinda of Oz,L. Frank Baum,fantasy
89,Little Wizard stories of Oz,L. Frank Baum,fantasy
