<a href="https://colab.research.google.com/github/Marcin19721205/BDCaseStudy/blob/main/BDCSZad8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pathlib import Path
import pandas as pd
import re
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from sklearn.preprocessing import normalize
from scipy import sparse
from scipy.sparse import csr_matrix


In [2]:
# Load the three source tables and replace '-' with '.' in their column names.
DATA_DIR = Path("sample_data")

books_path = DATA_DIR / "Books.csv"
users_path = DATA_DIR / "Users.csv"
ratings_path = DATA_DIR / "Ratings.csv"

# One shared set of reader options: ';'-separated, latin-1 encoded, and any
# line the python engine cannot parse is skipped rather than raising.
csv_opts = dict(sep=";", encoding="latin-1", engine="python", on_bad_lines="skip")

books = pd.read_csv(books_path, **csv_opts)
users = pd.read_csv(users_path, **csv_opts)
ratings = pd.read_csv(ratings_path, **csv_opts)

# Rename columns on each frame, e.g. "Book-Title" -> "Book.Title".
for frame in (books, users, ratings):
    frame.columns = frame.columns.str.replace("-", ".", regex=False)

for label, frame in (("books  :", books), ("users  :", users), ("ratings:", ratings)):
    print(label, frame.shape)


books  : (106608, 8)
users  : (278177, 3)
ratings: (1149766, 3)


In [3]:
# Repair Users.csv: write a copy with every backslash-quote sequence (\")
# removed from the text, leaving the original file untouched.

DATA_DIR = Path("sample_data")
users_path = DATA_DIR / "Users.csv"
# Same directory as the source file, different file name.
users_fixed_path = users_path.with_name("Users_fixed.csv")

# Read and write with the same latin-1 encoding used when loading the CSVs.
raw = users_path.read_text(encoding="latin-1", errors="ignore")
raw_fixed = raw.replace('\\"', "")
users_fixed_path.write_text(raw_fixed, encoding="latin-1")

print("saved:", users_fixed_path)


saved: sample_data/Users_fixed.csv


In [4]:
# Reload users from the repaired copy and rename columns ('-' -> '.').
users = (
    pd.read_csv(
        users_fixed_path,
        sep=";",
        encoding="latin-1",
        engine="python",
        on_bad_lines="skip",
    )
    .rename(columns=lambda name: name.replace("-", "."))
)

print("users:", users.shape)


users: (278858, 3)


In [5]:
# Re-check the table sizes after users was reloaded from the repaired file.
for label, frame in (("books  :", books), ("users  :", users), ("ratings:", ratings)):
    print(label, frame.shape)

books  : (106608, 8)
users  : (278858, 3)
ratings: (1149766, 3)


In [6]:
# Preview the first rows of the books table.
books.head()

Unnamed: 0,ISBN,Book.Title,Book.Author,Year.Of.Publication,Publisher,Image.URL.S,Image.URL.M,Image.URL.L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [7]:
# Preview the first rows of the users table (reloaded from the repaired copy).
users.head()

Unnamed: 0,User.ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [8]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,User.ID,ISBN,Book.Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [9]:
# A content-based approach needs the book text (title/author/publisher/year)
# attached to each rating, so ratings are joined with books on ISBN.
# Author's note: in this dataset a rating of 0 usually means "no explicit
# rating", so those rows are dropped to avoid distorting the user profile.
# Missing metadata becomes "" so later text processing never sees NaN.

# Keep only explicit ratings in the 1..10 range; ISBN is handled as text.
explicit_mask = ratings["Book.Rating"].between(1, 10)
ratings = ratings.loc[explicit_mask].copy()
ratings["ISBN"] = ratings["ISBN"].astype(str)

# One row per ISBN in books, with ISBN as text so the join keys match.
books["ISBN"] = books["ISBN"].astype(str)
books = books.drop_duplicates(subset=["ISBN"]).copy()

text_cols = ["Book.Title", "Book.Author", "Publisher", "Year.Of.Publication"]
for c in text_cols:
    # Fail fast if the column naming differs from what is expected here.
    if c not in books.columns:
        raise KeyError(f"Brak kolumny w books: {c}")
    books[c] = books[c].fillna("").astype(str)

# Inner join: ratings whose ISBN has no row in books are dropped.
ratings_books = ratings.merge(books[["ISBN"] + text_cols], on="ISBN", how="inner")

print("ratings filtered:", ratings.shape)
print("books filtered     :", books.shape)
print("ratings_books   :", ratings_books.shape)


ratings filtered: (433659, 3)
books filtered     : (106608, 8)
ratings_books   : (288976, 7)


In [10]:
# Build a single normalized text field "content" per book, working on a copy
# so the books frame itself is not modified.

books_feat = books.copy()

# title + author + publisher + year, each coerced to a NaN-free string
text_parts = [
    books_feat[col].fillna("").astype(str)
    for col in ("Book.Title", "Book.Author", "Publisher", "Year.Of.Publication")
]

books_feat["content"] = (
    text_parts[0]
    .str.cat(text_parts[1:], sep=" ")               # join the four fields with spaces
    .str.lower()                                    # lowercase
    .str.replace(r"[^a-z0-9\s]+", " ", regex=True)  # anything outside a-z, 0-9, whitespace -> space
    .str.replace(r"\s+", " ", regex=True)           # collapse whitespace runs
    .str.strip()                                    # trim both ends
)

print(books_feat[["ISBN", "content"]].head())


         ISBN                                            content
0  0195153448  classical mythology mark p o morford oxford un...
1  0002005018  clara callan richard bruce wright harperflamin...
2  0060973129  decision in normandy carlo d este harperperenn...
3  0374157065  flu the story of the great influenza pandemic ...
4  0393045218  the mummies of urumchi e j w barber w w norton...


In [12]:
# Build the user-item rating matrix R in sparse CSR form.

# Keep only the three needed columns, with explicit dtypes.
ratings_ui = ratings[["User.ID", "ISBN", "Book.Rating"]].astype(
    {"User.ID": int, "ISBN": str, "Book.Rating": float}
)

# Unique ids in order of first appearance; position = matrix index.
user_ids = ratings_ui["User.ID"].unique()
isbn_ids = ratings_ui["ISBN"].unique()

user2i = dict(zip(user_ids, range(len(user_ids))))  # user id -> row index
isbn2j = dict(zip(isbn_ids, range(len(isbn_ids))))  # ISBN    -> column index

rows = ratings_ui["User.ID"].map(user2i).to_numpy()
cols = ratings_ui["ISBN"].map(isbn2j).to_numpy()
data = ratings_ui["Book.Rating"].to_numpy()

# Shape is (n_users, n_items); each stored value is a rating.
R = csr_matrix((data, (rows, cols)), shape=(len(user_ids), len(isbn_ids)))

print("R:", R.shape, "nnz:", R.nnz)


R: (77797, 185963) nnz: 433659


In [39]:
# Pick the most active user and that user's seed items.
# Seed = the user's ratings at or above SEED_MIN_RATING; if none qualify,
# fall back to the user's SEED_FALLBACK_TOP_N highest-rated items so the
# similarity step never receives an empty seed.
# NOTE(review): the original header comment said ">=8" and the inline comment
# ">=9", while the code used 9.5. The code's threshold is kept; confirm intent.
SEED_MIN_RATING = 9.5  # the sample rows shown are whole numbers, so this presumably keeps only 10s
SEED_FALLBACK_TOP_N = 10

user_id = ratings_ui["User.ID"].value_counts().idxmax()  # user with the most ratings
u_hist = ratings_ui[ratings_ui["User.ID"] == user_id].copy()  # that user's full history

seed = u_hist[u_hist["Book.Rating"] >= SEED_MIN_RATING].copy()
if seed.empty:
    # Documented-but-missing fallback: use the user's best-rated items instead.
    seed = u_hist.nlargest(SEED_FALLBACK_TOP_N, "Book.Rating").copy()

seed = seed.sort_values("Book.Rating", ascending=False).copy()  # highest ratings first
print("user_id:", user_id)
print("ratings user:", len(u_hist))
print("seed items:", len(seed))
print(seed[["ISBN", "Book.Rating"]].head(10))


user_id: 11676
ratings user: 8522
seed items: 1287
             ISBN  Book.Rating
58923  X000000000         10.0
45350  000225929X         10.0
45380  0006481213         10.0
45384  0006490344         10.0
45390  0006498493         10.0
45394  0006512046         10.0
45463  0028625757         10.0
58735  9500718863         10.0
58731  9307166813         10.0
58675  9053905375         10.0


In [40]:
# Cosine similarity from the seed items to all items, computed on sparse
# matrices; the full item x item matrix is never built.

# R.tocsc() needs no extra import. The notebook imports only csr_matrix, so
# the previous csc_matrix(R) call raised NameError on a fresh kernel.
R_csc = R.tocsc()  # (n_users, n_items)

# Map seed ISBNs to column indices and keep ratings aligned with them: the
# same mask filters both arrays, so dropping an unmapped ISBN can never shift
# seed_r relative to seed_cols.
seed_col_map = seed["ISBN"].map(isbn2j)  # NaN where the ISBN is not in isbn2j
seed_known = seed_col_map.notna().to_numpy()
seed_cols = seed_col_map[seed_known].astype(int).to_numpy()  # seed column indices
seed_r = seed["Book.Rating"].to_numpy().astype(float)[seed_known]  # user's ratings for the seed

R_n = normalize(R_csc, norm="l2", axis=0)  # L2-normalize each column (item)
# Dot products of unit-length columns = cosine similarity, seed -> every item.
# The result is densified to (n_seed, n_items); with many seed items this
# array is large, so watch memory.
S_seed = (R_n[:, seed_cols].T @ R_n).toarray()

print("S_seed:", S_seed.shape)


S_seed: (1287, 185963)


In [41]:
# Score every item from S_seed and the user's seed ratings, normalize, drop
# already-rated and zero-score items, then take the top-k.

rated_cols = u_hist["ISBN"].map(isbn2j).dropna().astype(int).to_numpy()  # items the user already rated
num = (seed_r.reshape(-1, 1) * S_seed).sum(axis=0)  # sum(sim * rating) -> (n_items,)
den = np.abs(S_seed).sum(axis=0)  # sum(|sim|) -> (n_items,)

scores = num / np.where(den == 0, 1.0, den)  # weighted mean; den == 0 yields score 0, not a division error
# NOTE(review): when all seed ratings are equal, num/den equals that rating for
# every item with any similarity, so the order among them is decided only by
# floating-point rounding. Consider a tie-break (e.g. by den) if that matters.

scores[rated_cols] = -np.inf  # exclude items the user already rated
scores[np.isclose(scores, 0.0)] = -np.inf  # exclude items with no similarity signal

topk = 5  # requested number of recommendations
# Never ask for more items than there are real candidates: argpartition raises
# when kth is out of range, and excluded (-inf) items must not be returned.
n_candidates = int(np.isfinite(scores).sum())
k = min(topk, n_candidates)
if k > 0:
    top_idx = np.argpartition(-scores, k - 1)[:k]  # k best column indices, unordered
    top_idx = top_idx[np.argsort(-scores[top_idx])]  # order them best-first
else:
    top_idx = np.array([], dtype=int)  # nothing to recommend

top_isbn = np.array(isbn_ids)[top_idx]  # ISBNs of the selected items
top_scores = scores[top_idx]  # their scores

print(list(zip(top_isbn, top_scores)))


[('0316601950', np.float64(10.000000000000014)), ('0439136350', np.float64(10.000000000000014)), ('0679785892', np.float64(10.000000000000012)), ('0142001740', np.float64(10.00000000000001)), ('0679723161', np.float64(10.000000000000009))]


In [42]:
# Turn the top-k ISBNs and scores into a frame and attach book metadata.
top_df = pd.DataFrame({"ISBN": top_isbn.astype(str), "score": top_scores})

books["ISBN"] = books["ISBN"].astype(str)  # same key dtype on both sides of the join
meta_cols = ["ISBN", "Book.Title", "Book.Author", "Year.Of.Publication", "Publisher"]

# Left join keeps every recommended ISBN even when books has no row for it.
recommendation = top_df.merge(books[meta_cols], on="ISBN", how="left")

# Metadata missing after the join becomes "" rather than NaN.
for c in meta_cols[1:]:
    recommendation[c] = recommendation[c].fillna("").astype(str)

recommendation = recommendation.sort_values("score", ascending=False).copy()  # best score first
print(recommendation)


         ISBN  score                                         Book.Title  \
0  0316601950   10.0                         The Pilot's Wife : A Novel   
1  0439136350   10.0  Harry Potter and the Prisoner of Azkaban (Book 3)   
2  0679785892   10.0  Fear and Loathing in Las Vegas : A Savage Jour...   
3  0142001740   10.0                            The Secret Life of Bees   
4  0679723161   10.0                     Lolita (Vintage International)   

          Book.Author Year.Of.Publication       Publisher  
0        Anita Shreve                1999  Back Bay Books  
1       J. K. Rowling                1999      Scholastic  
2  HUNTER S. THOMPSON                1998   Vintage Books  
3       Sue Monk Kidd                2003   Penguin Books  
4    VLADIMIR NABOKOV                1989         Vintage  


In [43]:
# Recommendation printout
def visualizar_recomendacion(recommendation, n_books=5):
    """Print the first ``n_books`` rows of ``recommendation`` as a numbered list.

    Args:
        recommendation: DataFrame with the columns "Book.Title", "Book.Author",
            "Year.Of.Publication" and "Publisher". A "category" column is
            optional; when absent, "n/a" is printed in its place.
        n_books: Maximum number of rows to print; capped at the frame length.

    Returns:
        None. Output goes to stdout; the caller's frame is not modified.
    """
    shown = min(n_books, len(recommendation))

    table = recommendation
    if "category" not in table.columns:
        # Placeholder column added on a new frame, so the input stays untouched.
        table = table.assign(category="n/a")

    print("---REKOMENDACJE---")
    fields = ("Book.Title", "Book.Author", "Year.Of.Publication", "Publisher", "category")
    for pos in range(shown):
        row = table.iloc[pos]
        details = ", ".join(f"{row[name]}" for name in fields)
        print(f"{pos + 1}: {details}")

# Print the top 5 rows of the recommendation frame.
visualizar_recomendacion(recommendation, n_books=5)


---REKOMENDACJE---
1: The Pilot's Wife : A Novel, Anita Shreve, 1999, Back Bay Books, n/a
2: Harry Potter and the Prisoner of Azkaban (Book 3), J. K. Rowling, 1999, Scholastic, n/a
3: Fear and Loathing in Las Vegas : A Savage Journey to the Heart of the American Dream, HUNTER S. THOMPSON, 1998, Vintage Books, n/a
4: The Secret Life of Bees, Sue Monk Kidd, 2003, Penguin Books, n/a
5: Lolita (Vintage International), VLADIMIR NABOKOV, 1989, Vintage, n/a
