LOADING THE DATA

In [14]:
import pandas as pd 
import numpy as np
from pathlib import Path

# Load processed data
path = Path("../data/processed/books.csv")
df = pd.read_csv(path)
df.head()

Unnamed: 0,work_key,title,first_publish_year,authors,source_subject,subjects
0,/works/OL138052W,Alice's Adventures in Wonderland,1865,Lewis Carroll,fantasy,"Alice (fictitious character : carroll), fictio..."
1,/works/OL18417W,The Wonderful Wizard of Oz,1899,L. Frank Baum,fantasy,Witches; Toy and movable books; Spanish langua...
2,/works/OL24034W,Treasure Island,1880,Robert Louis Stevenson,fantasy,Fiction; Treasure Island (Imaginary place); Tr...
3,/works/OL20600W,Gulliver's Travels,1726,Jonathan Swift,fantasy,YA; Young adult; Juvenile; Fiction; Fantasy; U...
4,/works/OL259010W,A Midsummer Night's Dream,1600,William Shakespeare,fantasy,Drama; Courtship; Plays; Hippolyta (Greek myth...


PREPARING FOR RECOMMEND FOR ONE BOOK

In [15]:
# Create a combined text field for vectorization
df["text"] = (
    df["title"].fillna("") + " " +
    df["authors"].fillna("") + " " +
    df["source_subject"].fillna("") + " " +
    df["subjects"].fillna("")
)

df[["title", "text"]].head()
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorize the text data
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=10000
)

X = vectorizer.fit_transform(df["text"])
X.shape
from sklearn.metrics.pairwise import cosine_similarity
# Compute cosine similarity matrix
similarity = cosine_similarity(X)
similarity.shape


(9252, 9252)

RECOMMEND FUNCTION

In [16]:
# Recommendation function for a single book
def recommend(title, df, similarity, top_k=5):
    if title not in df["title"].values:
        return "Book not found"

    idx = df.index[df["title"] == title][0]
    scores = list(enumerate(similarity[idx]))
    
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    recommendations = []
    for i, score in scores[1: top_k + 1]:
        recommendations.append(df.iloc[i]["title"])

    return recommendations



EXAMPLE

In [17]:
# Example usage
book = df["title"].sample(1).values[0]
print(f"Recommendations for '{book}':")
rec= recommend(book, df, similarity)
print(rec)

Recommendations for 'Armageddon':
['The Dangerous Days of Daniel X', 'Demons and druids', 'Watch the skies', 'Game over', 'Planet of the Damned']


IMPROVING WEIGHT AND PREPARING AGAIN

In [34]:
# Create a weighted text field with more emphasis on source_subject
df["text_weighted"] = (
    df["title"].fillna("") + " " +
    df["authors"].fillna("") + " " +
    df["source_subject"].fillna("") + " " +
    df["subjects"].fillna("") 
)

df[["title", "text_weighted"]].head()
# Vectorize the weighted text data for improved recommendations
X_w = vectorizer.fit_transform(df["text_weighted"])
X_w.shape
# Compute cosine similarity matrix for weighted text
similarity_w = cosine_similarity(X_w)



EXAMPLE FOR IMPROVED

In [35]:
# Example usage with weighted similarity
book = df["title"].sample(1).values[0]
print(f"Recommendations for '{book}':")
print(recommend(book, df, similarity_w))


Recommendations for 'Twilight':
["Firestar's Quest", 'Midnight', 'Sunset', 'The Darkest Hour', 'Starlight']


GETTING RATING DATA FROM rating.csv FILE

In [40]:
import pandas as pd
from pathlib import Path

ratings_path = Path("../data/ratings.csv")
print(f"Reading ratings from: {ratings_path}")
if not ratings_path.exists():
    raise FileNotFoundError(f"File not found: {ratings_path}")
ratings = pd.read_csv(ratings_path)
ratings.head()

Reading ratings from: ../data/ratings.csv


Unnamed: 0,user_id,work_key,rating
0,1,/works/OL20600W,4
1,1,/works/OL82563W,3
2,1,/works/OL1449046W,2
3,1,/works/OL893502W,4
4,1,/works/OL2671483W,2


TEST FOR MULTIPLE BOOK AND WITH RATINGS

In [41]:
# work_key to index mapping
work_to_idx = {
    k: i
    for i, k in enumerate(df["work_key"])
}
list(work_to_idx.items())[:5]

# User-test ratings
user_id = 1
user_ratings = ratings[ratings["user_id"] == user_id]
user_ratings

import numpy as np
item_vectors = []
weights = []
# Building user profile vector
for row in user_ratings.itertuples(index=False):
    work_key = row.work_key
    rating = row.rating
    idx = work_to_idx.get(work_key)

    v = X_w[idx].toarray()[0]
    item_vectors.append(v)
    weights.append(rating)
    
item_matrix = np.vstack(item_vectors)
print(item_matrix.shape)
weights = np.array(weights)
print(weights.shape)
user_profile = np.average(item_matrix, axis=0, weights=weights)
user_profile.shape
from sklearn.metrics.pairwise import cosine_similarity
# Compute similarity scores between user profile and all items
scores = cosine_similarity(user_profile.reshape(1, -1), X_w).ravel()
scores.shape  # (734,)


# Kullanıcının zaten rated ettiği kitapların indexleri
rated_idx = {work_to_idx[wk] for wk in user_ratings["work_key"]}

# (index, skor) listesi
idx_scores = list(enumerate(scores))

# zaten rated olanları filtrele
idx_scores = [
    (i, s) for i, s in idx_scores
    if i not in rated_idx
]

# yüksekten düşüğe sırala
idx_scores = sorted(idx_scores, key=lambda x: x[1], reverse=True)

top_k = 10
top_idx = [i for i, s in idx_scores[:top_k]]

df.iloc[top_idx][["title", "authors", "source_subject"]]

    

(11, 10000)
(11,)


Unnamed: 0,title,authors,source_subject
1665,Who?,Algis Budrys,science_fiction
2344,Winter Roses,Diana Palmer,romance
2760,Suspicious,Heather Graham,romance
1989,The divide,Nicholas Evans,romance
2157,The Game of Kings,Dorothy Dunnett,romance
2651,Untamed,Diana Palmer,romance
2993,Le roman du masque de fer,Alexandre Dumas,history
2769,Secret Fantasy,Carly Phillips,romance
2771,Fortune is a woman,Elizabeth Adler,romance
2809,Rosevean,Iris Bromige,romance


FUNCTION FOR MULTIPLE BOOKS AND ITS RATINGS FOR SPECIFIC USER

In [53]:
# Recommendation function for a user based on their ratings
def recommend_for_user(user_id, df, X_w, ratings, work_to_idx, top_k):
    user_ratings = ratings[ratings["user_id"] == user_id]
    if user_ratings.empty:
        return df.sample(top_k)[["title", "authors", "source_subject"]]
    item_vectors = []
    weights = []
    for row in user_ratings.itertuples(index=False):
        work_key = row.work_key
        rating = row.rating
        idx = work_to_idx.get(work_key)

        v = X_w[idx].toarray()[0]
        item_vectors.append(v)
        weights.append(rating)
    item_matrix = np.vstack(item_vectors)
    weights = np.array(weights)
    user_profile = np.average(item_matrix, axis=0, weights=weights)
    print(pd.DataFrame(df[df["work_key"].isin(user_ratings["work_key"])])[["title", "authors", "source_subject"]])
    similarity_scores = cosine_similarity(user_profile.reshape(1, -1), X_w).ravel()
    rated_idx = {work_to_idx[wk] for wk in user_ratings["work_key"]}
    idx_scores = list(enumerate(similarity_scores))
    idx_scores = [
        (i, s) for i, s in idx_scores
        if i not in rated_idx 
    ]
    top_idx = sorted(idx_scores, key=lambda x: x[1], reverse=True)
    top_idx = [i for i, s in top_idx[:top_k]]
    return df.iloc[top_idx][["title", "authors", "source_subject"]]


In [56]:
recommends=recommend_for_user(1, df, X_w, ratings, work_to_idx, top_k=20)


                                         title             authors  \
3                           Gulliver's Travels      Jonathan Swift   
39    Harry Potter and the Philosopher's Stone       J. K. Rowling   
61                The Story of Doctor Dolittle        Hugh Lofting   
1230                          Heretics of Dune       Frank Herbert   
1417                              Killing time          Caleb Carr   
1717                   A Second Chance at Eden   Peter F. Hamilton   
2317                                 Checkmate     Dorothy Dunnett   
2501                           To Be A Husband     Carole Mortimer   
2625         The Brooding Frenchman’s Proposal     Rebecca Winters   
2653                            Montana Creeds   Linda Lael Miller   
3157                               Ailsa Paige  Robert W. Chambers   

       source_subject  
3             fantasy  
39            fantasy  
61            fantasy  
1230  science_fiction  
1417  science_fiction  
1717  science_f

In [57]:
print(recommends)

                          title                         authors  \
1665                       Who?                    Algis Budrys   
2344               Winter Roses                    Diana Palmer   
2760                 Suspicious                  Heather Graham   
1989                 The divide                  Nicholas Evans   
2157          The Game of Kings                 Dorothy Dunnett   
2651                    Untamed                    Diana Palmer   
2993  Le roman du masque de fer                 Alexandre Dumas   
2769             Secret Fantasy                  Carly Phillips   
2771         Fortune is a woman                 Elizabeth Adler   
2809                   Rosevean                    Iris Bromige   
1165                 Deathlands                     James Axler   
1464                   Restoree                  Anne McCaffrey   
1065                  Star Born                    Andre Norton   
1628               Soul Catcher                   Frank Herber