<a href="https://colab.research.google.com/github/Marcin19721205/BDCaseStudy/blob/main/BDCSZad7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [41]:
from pathlib import Path
import pandas as pd
import re
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from sklearn.preprocessing import normalize
from scipy import sparse

In [42]:
# wczytanie danych, zamiana  "-" -->"."
DATA_DIR = Path("sample_data")

books_path   = DATA_DIR / "Books.csv"
users_path   = DATA_DIR / "Users.csv"
ratings_path = DATA_DIR / "Ratings.csv"

books = pd.read_csv(books_path, sep=";", encoding="latin-1", engine="python", on_bad_lines="skip")
users = pd.read_csv(users_path, sep=";", encoding="latin-1", engine="python", on_bad_lines="skip")
ratings = pd.read_csv(ratings_path, sep=";", encoding="latin-1", engine="python", on_bad_lines="skip")

books.columns = books.columns.str.replace("-", ".", regex=False)
users.columns = users.columns.str.replace("-", ".", regex=False)
ratings.columns = ratings.columns.str.replace("-", ".", regex=False)

print("books  :", books.shape)
print("users  :", users.shape)
print("ratings:", ratings.shape)


books  : (270491, 8)
users  : (278177, 3)
ratings: (1149766, 3)


In [43]:
# naprawa Users.csv -> zapis kopii bez \"

DATA_DIR = Path("sample_data")
users_path = DATA_DIR / "Users.csv"
users_fixed_path = DATA_DIR / "Users_fixed.csv"

raw = users_path.read_text(encoding="latin-1", errors="ignore")  # w latin-1
raw_fixed = raw.replace('\\"', "")  # usuń \" z pól tekstowych

users_fixed_path.write_text(raw_fixed, encoding="latin-1")  # zapisz poprawioną kopię
print("saved:", users_fixed_path)


saved: sample_data/Users_fixed.csv


In [44]:
# wczytanie users z poprawionej kopii + '-' -> '.'
users = pd.read_csv(users_fixed_path, sep=";", encoding="latin-1", engine="python", on_bad_lines="skip")
users.columns = users.columns.str.replace("-", ".", regex=False)

print("users:", users.shape)


users: (278858, 3)


In [45]:
print("books  :", books.shape)
print("users  :", users.shape)
print("ratings:", ratings.shape)

books  : (270491, 8)
users  : (278858, 3)
ratings: (1149766, 3)


In [46]:
books.head()

Unnamed: 0,ISBN,Book.Title,Book.Author,Year.Of.Publication,Publisher,Image.URL.S,Image.URL.M,Image.URL.L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [47]:
users.head()

Unnamed: 0,User.ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [48]:
ratings.head()

Unnamed: 0,User.ID,ISBN,Book.Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [49]:
# content-based musi mieć treść książki (title/author/publisher/year) przypiętą do ocen
# dlatego łacze ratings z books po ISBN bez tego nie zbuduje wektorów TF-IDF
# W BX ocena 0 zwykle oznacza brak jawnej oceny, więc ją wyrzucamy, żeby nie psuć profilu użytkownika.
# Braki w metadanych zamien na "" żeby czyszczenie tekstu/TF-IDF nie wywaliło się na NaN.


ratings = ratings[ratings["Book.Rating"].between(1, 10)].copy()  # wyrzuć crap
ratings["ISBN"] = ratings["ISBN"].astype(str)  # ISBN jako tekst

books["ISBN"] = books["ISBN"].astype(str)  # ISBN jako tekst
books = books.drop_duplicates(subset=["ISBN"]).copy()  # duplikaty ISBN

for c in ["Book.Title", "Book.Author", "Publisher", "Year.Of.Publication"]:
    if c not in books.columns:  # fail fast jeśli nazwa kolumny inna
        raise KeyError(f"Brak kolumny w books: {c}")
    books[c] = books[c].fillna("").astype(str)  # braki -> pusty string

ratings_books = ratings.merge(
    books[["ISBN", "Book.Title", "Book.Author", "Publisher", "Year.Of.Publication"]],  # tylko potrzebne kolumny
    on="ISBN",
    how="inner",
)
print("ratings filtered:", ratings.shape)
print("books filtered     :", books.shape)
print("ratings_books   :", ratings_books.shape)


ratings filtered: (433659, 3)
books filtered     : (270491, 8)
ratings_books   : (382827, 7)


In [50]:
# jedno pole tekstowe "content" + normalizacja -lower, znaki spec, spacje
#na kopii

books_feat = books.copy()

books_feat["content"] = (  # title + author + publisher + year
    books_feat["Book.Title"].fillna("").astype(str) + " " +
    books_feat["Book.Author"].fillna("").astype(str) + " " +
    books_feat["Publisher"].fillna("").astype(str) + " " +
    books_feat["Year.Of.Publication"].fillna("").astype(str)
)

books_feat["content"] = books_feat["content"].str.lower()  # lower
books_feat["content"] = books_feat["content"].str.replace(r"[^a-z0-9\s]+", " ", regex=True)  # usuń znaki spec
books_feat["content"] = books_feat["content"].str.replace(r"\s+", " ", regex=True).str.strip()  # kompresja spacji

print(books_feat[["ISBN", "content"]].head())


         ISBN                                            content
0  0195153448  classical mythology mark p o morford oxford un...
1  0002005018  clara callan richard bruce wright harperflamin...
2  0060973129  decision in normandy carlo d este harperperenn...
3  0374157065  flu the story of the great influenza pandemic ...
4  0393045218  the mummies of urumchi e j w barber w w norton...


In [51]:
# sim do wektora profilu (N×1).
tfidf = TfidfVectorizer(min_df=2, max_df=0.8)
X_tfidf = tfidf.fit_transform(books_feat["content"])

print("X_tfidf:", X_tfidf.shape)

X_tfidf: (270491, 61857)


In [52]:
# wybór usera + liked (rating>=8) + profil = średnia ważona TF-IDF
user_id = ratings["User.ID"].value_counts().idxmax()  # najaktywniejszy user
u = ratings[ratings["User.ID"] == user_id].copy()  # oceny usera
liked = u[u["Book.Rating"] >= 8].copy()  # liked set (>=8)
if liked.empty:  # awaryjnie gdy brak >=8
    liked = u.sort_values("Book.Rating", ascending=False).head(20).copy()  # top-20

isbn2idx = pd.Series(books_feat.index.values, index=books_feat["ISBN"]).to_dict()  # ISBN -> indeks w books_feat
idx = [isbn2idx[i] for i in liked["ISBN"].astype(str) if i in isbn2idx]  # indeksy lubianych książek
w = liked.loc[liked["ISBN"].astype(str).isin(isbn2idx.keys()), "Book.Rating"].astype(float).values  # wagi = oceny

X_liked = X_tfidf[idx]  # (n_liked, n_terms)
user_profile = (X_liked.multiply(w.reshape(-1, 1)).sum(axis=0) / w.sum())  # (1, n_terms) średnia ważona

print("user_id naj naj:", user_id)
print("polubione:", len(idx))
print("user_profile shape:", user_profile.shape)


user_id naj naj: 11676
polubione: 3739
user_profile shape: (1, 61857)


In [53]:
# score każdej książki vs profil usera

rated_isbn = set(u["ISBN"].astype(str))  # ocenione
mask_unrated = ~books_feat["ISBN"].astype(str).isin(rated_isbn)  # nieocenione

X_unrated = X_tfidf[mask_unrated.values]  # (n_unrated, n_terms)
unrated_idx = books_feat.index[mask_unrated].to_numpy()  # indeksy

up = sparse.csr_matrix(user_profile)  # np.matrix

X_unrated_n = normalize(X_unrated, norm="l2", axis=1)  # L2
up_n = normalize(up, norm="l2", axis=1)  # L2

scores = (X_unrated_n @ up_n.T).toarray().ravel()

topk = 5  # top5
top_pos = np.argpartition(-scores, topk - 1)[:topk]  # top-k
top_pos = top_pos[np.argsort(-scores[top_pos])]  # sort top5

rec_idx = unrated_idx[top_pos]  # indeksy rekomendacji
recommendation = books_feat.loc[rec_idx, ["Book.Title", "Book.Author", "Year.Of.Publication", "Publisher"]].copy()  # metadane
recommendation["score"] = scores[top_pos]  # score

print(recommendation)


                                               Book.Title       Book.Author  \
236595  The Hallowed Isle : The Book Of The Sword And ...   Diana L. Paxson   
42974                                 The Children of Men        P.D. James   
107162  The Lost Books of the Bible and the Forgotten ...       Frank Crane   
152345  The Return of the King (The Lord of the Rings,...  J. R. R. Tolkien   
265422  The Return of the King (The Lord of the Rings,...  J. R. R. Tolkien   

       Year.Of.Publication                Publisher     score  
236595                2000                      Eos  0.322120  
42974                 2002             Warner Books  0.321934  
107162                1994  A &amp; B Book Dist Inc  0.316238  
152345                2001           Recorded Books  0.307669  
265422                2001           Recorded Books  0.307669  


In [54]:
# bez duplikatow
rated_isbn = set(u["ISBN"].astype(str))  # ocenione
mask_unrated = ~books_feat["ISBN"].astype(str).isin(rated_isbn)  # nieocenione

X_cand = X_tfidf[mask_unrated.values]  # (n_cand, n_terms)
cand_idx = books_feat.index[mask_unrated].to_numpy()  # indeksy

up = sparse.csr_matrix(user_profile)   # np.matrix

X_cand_n = normalize(X_cand, norm="l2", axis=1)  # L2
up_n = normalize(up, norm="l2", axis=1)  # L2

scores = (X_cand_n @ up_n.T).toarray().ravel()

order = np.argsort(-scores)  # malejąco
rec = books_feat.loc[cand_idx[order], ["Book.Title", "Book.Author", "Year.Of.Publication", "Publisher"]].copy()  # metadane
rec["score"] = scores[order]  # score

rec = rec.drop_duplicates(subset=["Book.Title", "Book.Author"]).head(5).copy()  # bez duplikatów
rec["category"] = "n/a"  # placeholder

print("---REKOMENDACJE---")
for i in range(len(rec)):
    r = rec.iloc[i]
    print(f"{i+1}: {r['Book.Title']}, {r['Book.Author']}, {r['Year.Of.Publication']}, {r['Publisher']}, {r['category']}")


---REKOMENDACJE---
1: The Hallowed Isle : The Book Of The Sword And The Book Of The Spear (The Book of the Sword and the Book of the Spear, Books 1 and 2), Diana L. Paxson, 2000, Eos, n/a
2: The Children of Men, P.D. James, 2002, Warner Books, n/a
3: The Lost Books of the Bible and the Forgotten Books of Eden, Frank Crane, 1994, A &amp; B Book Dist Inc, n/a
4: The Return of the King (The Lord of the Rings, Book 3), J. R. R. Tolkien, 2001, Recorded Books, n/a
5: The Heart of the Matter (Penguin Great Books of the 20th Century), Graham Greene, 1999, Penguin Books, n/a
