<a href="https://colab.research.google.com/github/Marcin19721205/BDCaseStudy/blob/main/BDCSZad9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pathlib import Path
import pandas as pd
import re
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from sklearn.preprocessing import normalize
from scipy import sparse
from scipy.sparse import csr_matrix


In [2]:
# Load the raw CSV exports and rename columns so that every '-' becomes '.'.
DATA_DIR = Path("sample_data")

books_path = DATA_DIR.joinpath("Books.csv")
users_path = DATA_DIR.joinpath("Users.csv")
ratings_path = DATA_DIR.joinpath("Ratings.csv")

# Semicolon-separated, latin-1 encoded files; rows the python engine cannot
# parse are skipped rather than raising.
_csv_opts = dict(sep=";", encoding="latin-1", engine="python", on_bad_lines="skip")
books = pd.read_csv(books_path, **_csv_opts)
users = pd.read_csv(users_path, **_csv_opts)
ratings = pd.read_csv(ratings_path, **_csv_opts)

# Rename in place on each frame: "Book-Title" -> "Book.Title", etc.
for _frame in (books, users, ratings):
    _frame.columns = _frame.columns.str.replace("-", ".", regex=False)

# Quick shape check of what was actually loaded.
for _label, _frame in (("books  :", books), ("users  :", users), ("ratings:", ratings)):
    print(_label, _frame.shape)


books  : (22270, 8)
users  : (167274, 3)
ratings: (362805, 3)


In [3]:
# Repair Users.csv: write a copy of the file with every \" sequence removed.

DATA_DIR = Path("sample_data")
users_path = DATA_DIR.joinpath("Users.csv")
users_fixed_path = users_path.with_name("Users_fixed.csv")

# Read the whole file as latin-1 text.
raw = users_path.read_text(encoding="latin-1", errors="ignore")
# Drop backslash-escaped double quotes from the text fields.
raw_fixed = raw.replace('\\"', "")

# Persist the cleaned copy next to the original and report where it went.
users_fixed_path.write_text(raw_fixed, encoding="latin-1")
print("saved:", users_fixed_path)


saved: sample_data/Users_fixed.csv


In [4]:
# Reload users from the repaired copy and rename columns ('-' -> '.').
users = pd.read_csv(
    users_fixed_path,
    sep=";",
    encoding="latin-1",
    engine="python",
    on_bad_lines="skip",
)
users = users.rename(columns=lambda name: name.replace("-", "."))

print("users:", users.shape)


users: (278858, 3)


In [5]:
# Shapes of the three frames after the users reload.
for _label, _frame in (("books  :", books), ("users  :", users), ("ratings:", ratings)):
    print(_label, _frame.shape)

books  : (22270, 8)
users  : (278858, 3)
ratings: (362805, 3)


In [6]:
# Preview the first five book rows.
books.head(n=5)

Unnamed: 0,ISBN,Book.Title,Book.Author,Year.Of.Publication,Publisher,Image.URL.S,Image.URL.M,Image.URL.L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [7]:
# Preview the first five user rows.
users.head(n=5)

Unnamed: 0,User.ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [8]:
# Preview the first five rating rows.
ratings.head(n=5)

Unnamed: 0,User.ID,ISBN,Book.Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [9]:
# Keep explicit feedback only (rating 0 = implicit / no rating), then attach titles by ISBN.
_is_explicit = ratings["Book.Rating"].astype(int) != 0
ratings = ratings.loc[_is_explicit].copy()  # drop the zero ratings
ratings["Book.Rating"] = ratings["Book.Rating"].astype(np.int16)  # narrower dtype, less RAM

# Left join keeps every rating even when the ISBN has no match in books.
ratings_books = pd.merge(
    ratings,
    books[["ISBN", "Book.Title", "Book.Author"]],
    on="ISBN",
    how="left",
)


In [10]:
# Thresholds on user activity and book popularity.
min_user_ratings = 10  # minimum ratings per user
min_book_ratings = 10  # minimum ratings per book

uc = ratings["User.ID"].value_counts()  # ratings per user
bc = ratings["ISBN"].value_counts()     # ratings per ISBN

keep_users = uc.loc[uc.ge(min_user_ratings)].index
keep_books = bc.loc[bc.ge(min_book_ratings)].index

# Both conditions are applied in a single pass, using the counts computed above
# (counts are not recomputed after the filter).
_user_ok = ratings["User.ID"].isin(keep_users)
_book_ok = ratings["ISBN"].isin(keep_books)
ratings = ratings.loc[_user_ok & _book_ok].copy()

# Optional: cap the number of users (if there are still too many).
max_users = 30000  # None / 0 disables the cap
if max_users and ratings["User.ID"].nunique() > max_users:
    np.random.seed(42)  # reproducible sample
    sampled_users = np.random.choice(
        ratings["User.ID"].unique(), size=max_users, replace=False
    )
    ratings = ratings.loc[ratings["User.ID"].isin(sampled_users)].copy()

# Refresh the ratings+titles join for the filtered ratings.
ratings_books = pd.merge(
    ratings,
    books[["ISBN", "Book.Title", "Book.Author"]],
    on="ISBN",
    how="left",
)


In [11]:
# Map User.ID / ISBN to contiguous indices 0..n-1 and build the sparse
# user-item rating matrix R (CSR): rows = users, columns = ISBNs.
u_ids = ratings["User.ID"].astype(int).unique()                 # distinct users, first-seen order
i_ids = ratings["ISBN"].astype(str).unique()                    # distinct ISBNs, first-seen order

u_map = pd.Series(np.arange(len(u_ids), dtype=np.int32), index=u_ids)  # User.ID -> row
i_map = pd.Series(np.arange(len(i_ids), dtype=np.int32), index=i_ids)  # ISBN -> col

rows = u_map.loc[ratings["User.ID"].astype(int)].to_numpy()     # row index per rating
cols = i_map.loc[ratings["ISBN"].astype(str)].to_numpy()        # column index per rating
vals = ratings["Book.Rating"].astype(np.float32).to_numpy()     # rating values

# csr_matrix sums duplicate (row, col) entries. If the same user/ISBN pair
# appears more than once in `ratings`, the cell would silently hold the SUM of
# those ratings (possibly above the rating scale). Keep exactly one rating per
# pair -- the last one in file order -- before building the matrix.
# With no duplicate pairs this is a no-op.
_triplets = pd.DataFrame({"row": rows, "col": cols, "val": vals}).drop_duplicates(
    subset=["row", "col"], keep="last"
)
rows = _triplets["row"].to_numpy()
cols = _triplets["col"].to_numpy()
vals = _triplets["val"].to_numpy()

R = csr_matrix((vals, (rows, cols)), shape=(len(u_ids), len(i_ids)))  # sparse user-item matrix


In [12]:
# User-user similarity: cosine on L2-normalised user vectors (the rows of R).
# After row-wise L2 normalisation, cosine similarity is just a dot product.
R_norm = normalize(R, norm="l2", axis=1)
# (n_users x n_users) similarity matrix.
UU = R_norm.dot(R_norm.transpose())


In [21]:
# Choose the target user and print a few sanity values.
target_user_id = None  # e.g. 276726; None -> fall back to the most active user

uc2 = ratings["User.ID"].value_counts()  # ratings per user, most active first
target_user_id = int(uc2.index[0]) if target_user_id is None else target_user_id

# Row index of the target user in R / UU.
target_u = int(u_map.loc[int(target_user_id)])

for _label, _value in (
    ("target_user_id:", target_user_id),
    ("target_u:", target_u),
    ("target_user_ratings:", int(uc2.loc[target_user_id])),
):
    print(_label, _value)


target_user_id: 11676
target_u: 250
target_user_ratings: 712


In [22]:
# Top-K neighbours by cosine(target, u) taken from UU, excluding the target
# itself and keeping only strictly positive similarities.
K = 50  # number of neighbours requested

s = UU.getrow(target_u).toarray().ravel()  # similarities target -> every user
s[target_u] = -np.inf                      # exclude the target user itself

if K < s.size:
    # Fast top-K selection without a full sort.
    nbr_idx = np.argpartition(-s, K)[:K]
else:
    # np.argpartition raises ValueError when kth is out of bounds, i.e. when
    # there are not more users than K. In that case every user is a candidate;
    # the target is removed below because its similarity is -inf.
    nbr_idx = np.arange(s.size)
nbr_idx = nbr_idx[np.argsort(-s[nbr_idx])] # order the selected users by similarity, descending

mask_pos = s[nbr_idx] > 0                  # keep positive similarities only
nbr_idx = nbr_idx[mask_pos]
nbr_sim = s[nbr_idx]

print("K_requested:", K)
print("K_used:", len(nbr_idx))
print("top5_sim:", nbr_sim[:5])


K_requested: 50
K_used: 50
top5_sim: [0.29212576 0.22743845 0.21418695 0.20559958 0.1982519 ]


In [23]:
# Score books as the similarity-weighted mean of neighbour ratings, restricted
# to books the target has not rated and that at least `min_votes` neighbours rated.
min_votes = 2  # minimum number of neighbours that must have rated a book

# Column indices (ISBNs) already rated by the target user.
target_items = set(R.getrow(target_u).indices)

# Per-item accumulators:
#   num   -> sum(sim * rating)
#   den   -> sum(|sim|)
#   votes -> number of neighbours who rated the item
num, den, votes = {}, {}, {}

for u_idx, sim in zip(nbr_idx, nbr_sim):
    row = R.getrow(int(u_idx))  # sparse rating row of this neighbour
    weight = float(sim)
    for j, r in zip(row.indices, row.data):
        if j not in target_items:  # skip anything the target already rated
            num[j] = num.get(j, 0.0) + weight * float(r)
            den[j] = den.get(j, 0.0) + abs(weight)
            votes[j] = votes.get(j, 0) + 1

# Keep items with enough votes and a non-zero denominator, then score them.
cand_items = [j for j in votes if votes[j] >= min_votes and den.get(j, 0.0) > 0.0]
cand_scores = np.array([num[j] / den[j] for j in cand_items], dtype=np.float32)

# Sort candidates by score, descending.
order = np.argsort(-cand_scores)
cand_items = np.array(cand_items, dtype=np.int32)[order]
cand_scores = cand_scores[order]

print("candidates:", len(cand_items))
print("top5_scores:", cand_scores[:5])
print("top5_votes:", [votes[int(j)] for j in cand_items[:5]])


candidates: 236
top5_scores: [10. 10. 10. 10. 10.]
top5_votes: [2, 3, 2, 2, 2]


In [24]:
# Top-5 ISBNs -> titles/authors. First build the reverse lookup: column index -> ISBN.
i_rev = pd.Series(i_map.index.to_numpy(), index=i_map.to_numpy())

def visualizar_recommendacion(top_n=5):
    """Return the ``top_n`` best-scored candidate books as a list of dicts.

    Reads the notebook globals ``cand_items`` (column indices ordered by
    score), ``cand_scores`` (scores in the same order), ``i_rev``
    (column index -> ISBN) and ``books`` (ISBN/title/author lookup).

    Parameters
    ----------
    top_n : int, default 5
        Number of leading candidates to return.

    Returns
    -------
    list[dict]
        One record per recommended ISBN, in score order, with keys
        ``"ISBN"``, ``"Book.Title"``, ``"Book.Author"`` and ``"score"``.
        Title/author are missing (NaN) when the ISBN is not in ``books``.
    """
    top_items = cand_items[:top_n]  # column indices of the best candidates
    top_isbn = i_rev.loc[top_items].astype(str).tolist()  # translate to ISBNs

    # One metadata row per ISBN: a left merge against a lookup with repeated
    # ISBNs would multiply rows and return more than `top_n` records.
    book_meta = books[["ISBN", "Book.Title", "Book.Author"]].drop_duplicates(
        subset="ISBN", keep="first"
    )
    out = (pd.DataFrame({"ISBN": top_isbn, "score": cand_scores[:top_n]})
             .merge(book_meta, on="ISBN", how="left"))
    return out[["ISBN", "Book.Title", "Book.Author", "score"]].to_dict("records")

# Materialise the top five recommendations and show them.
reco5 = visualizar_recommendacion(top_n=5)

for _label, _value in (("reco5_len:", len(reco5)), ("reco5:", reco5)):
    print(_label, _value)


reco5_len: 5
reco5: [{'ISBN': '1400032717', 'Book.Title': 'The Curious Incident of the Dog in the Night-Time (Vintage Contemporaries)', 'Book.Author': 'Mark Haddon', 'score': 10.0}, {'ISBN': '0743235150', 'Book.Title': "Everything's Eventual : 14 Dark Tales", 'Book.Author': 'Stephen King', 'score': 10.0}, {'ISBN': '059030271X', 'Book.Title': "Charlotte's Web", 'Book.Author': 'E. B. White', 'score': 10.0}, {'ISBN': '0425140032', 'Book.Title': 'Dragon Tears', 'Book.Author': 'Dean R. Koontz', 'score': 10.0}, {'ISBN': '0425129586', 'Book.Title': 'And Then There Were None', 'Book.Author': 'Agatha Christie', 'score': 10.0}]


In [25]:
# Sanity check: no recommended ISBN may already have been rated by the target user.
_target_cols = R.getrow(target_u).indices
target_isbn = set(i_rev.loc[_target_cols].astype(str))  # ISBNs rated by the target
reco_isbn = [rec["ISBN"] for rec in reco5]              # ISBNs in the recommendation

# Recommended ISBNs that the target already rated ("leaks").
bad = list(filter(target_isbn.__contains__, reco_isbn))

for _label, _value in (
    ("target_rated_isbn_cnt:", len(target_isbn)),
    ("reco_isbn:", reco_isbn),
    ("leak_isbn:", bad),
):
    print(_label, _value)

assert not bad, f"LEAK: rekomendujesz ISBN już ocenione: {bad}"

# Hint for an empty result: raise K or loosen the thresholds.
if not reco5:
    print("EMPTY_RECO: zwiększ K albo poluzuj min_user_ratings/min_book_ratings/min_votes")


target_rated_isbn_cnt: 712
reco_isbn: ['1400032717', '0743235150', '059030271X', '0425140032', '0425129586']
leak_isbn: []


In [27]:
# Print the recommendations as a numbered list (1..N).
print("reco5_len:", len(reco5))
for k, d in enumerate(reco5, start=1):
    # Missing keys fall back to an empty string.
    title, author, isbn, score = (
        d.get(key, '') for key in ('Book.Title', 'Book.Author', 'ISBN', 'score')
    )
    print(f"{k}. {title} | {author} | ISBN={isbn} | score={score}")


reco5_len: 5
1. The Curious Incident of the Dog in the Night-Time (Vintage Contemporaries) | Mark Haddon | ISBN=1400032717 | score=10.0
2. Everything's Eventual : 14 Dark Tales | Stephen King | ISBN=0743235150 | score=10.0
3. Charlotte's Web | E. B. White | ISBN=059030271X | score=10.0
4. Dragon Tears | Dean R. Koontz | ISBN=0425140032 | score=10.0
5. And Then There Were None | Agatha Christie | ISBN=0425129586 | score=10.0


/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////