In [1]:
# --- 1. SETUP ---
import pandas as pd
import numpy as np
import os

# Instalacja bibliotek do AI
os.system('pip install sentence-transformers')
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# --- 2. LOAD DATA (your files) ---
# Pipeline overview: read the two CSV inputs, attach rating/gender columns to
# the catalogue by normalised perfume name, fill gaps, build a hybrid
# (description embedding + notes TF-IDF) similarity matrix, and write both the
# merged table and the matrix to disk.
print("--- LOADING ---")
file_grail = "perfumes_cleaned.csv"  # "Grail" catalogue (images)
file_stats = "fra_perfumes.csv"      # Statistics (ratings)

if not os.path.exists(file_grail) or not os.path.exists(file_stats):
    # Report the missing inputs and skip the whole pipeline (no exception raised).
    print("❌ BŁĄD: Wgraj pliki 'perfumes_cleaned.csv' i 'fra_perfumes.csv'")
else:
    df_grail = pd.read_csv(file_grail)
    df_stats = pd.read_csv(file_stats)
    print(f"✅ Wczytano: Graal ({len(df_grail)}) | Statystyki ({len(df_stats)})")

    # --- 3. PRECISE MERGE ---
    print("\n--- ŁĄCZENIE DANYCH ---")

    # 1. Normalise names (lower-case, trimmed) to build the join key.
    df_grail['join_key'] = df_grail['Name'].astype(str).str.lower().str.strip()
    df_stats['join_key'] = df_stats['Name'].astype(str).str.lower().str.strip()

    # 2. Keep only what is needed from the statistics table
    #    ('Rating Value' is exposed as 'Score', 'Gender' is kept as-is).
    #    - Rows without a name are excluded: astype(str) turns a missing name
    #      into the literal 'nan', which would otherwise match unnamed
    #      catalogue rows.
    #    - Keys are de-duplicated so the left merge below cannot multiply
    #      catalogue rows when the same name occurs more than once in the
    #      statistics file. The first occurrence wins.
    #      NOTE(review): a name-only key cannot tell apart same-named perfumes
    #      from different brands — confirm whether a brand column is available
    #      in the statistics file to tighten the key.
    stats_subset = (
        df_stats.loc[df_stats['Name'].notna(), ['join_key', 'Rating Value', 'Gender']]
        .rename(columns={'Rating Value': 'Score'})
        .drop_duplicates(subset='join_key', keep='first')
    )

    # 3. Merge: the catalogue is the base and ratings are attached to it.
    #    how='left' keeps every catalogue row; validate='many_to_one' makes
    #    pandas raise if the right-hand keys are ever not unique.
    df_final = pd.merge(
        df_grail,
        stats_subset,
        on='join_key',
        how='left',
        validate='many_to_one',
    )

    # --- 4. CLEANING AND FILLING ---
    # Fill gaps for perfumes that were not found in the ratings table.
    df_final['Score'] = df_final['Score'].fillna(0.0)
    df_final['Gender'] = df_final['Gender'].fillna('Unisex')

    # Make sure both text columns exist and contain no missing values.
    for text_col in ('Description', 'Notes'):
        if text_col not in df_final.columns:
            df_final[text_col] = ""
        df_final[text_col] = df_final[text_col].fillna("")

    print(f"✅ Nowa Baza Danych: {len(df_final)} perfum.")
    print(f"   Zawiera kolumny: {list(df_final.columns)}")

    # --- 5. BUILDING THE AI MODEL ---
    print("\n--- TWORZENIE MODELU AI ---")

    # Sentence embeddings of the descriptions. Values are cast to str so that
    # both text columns are treated the same way before being vectorised.
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(
        df_final['Description'].astype(str).tolist(),
        show_progress_bar=True,
    )

    # TF-IDF representation of the notes (English stop words removed,
    # vocabulary capped at 2000 features).
    tfidf = TfidfVectorizer(stop_words='english', max_features=2000)
    matrix_notes = tfidf.fit_transform(df_final['Notes'].astype(str))

    # Pairwise cosine similarities, stored as float32
    # (presumably to halve memory and file size — confirm).
    sim_desc = cosine_similarity(embeddings, embeddings).astype(np.float32)
    sim_notes = cosine_similarity(matrix_notes, matrix_notes).astype(np.float32)

    # Hybrid score: 70% description similarity + 30% notes similarity.
    hybrid_sim = (0.7 * sim_desc) + (0.3 * sim_notes)

    # --- 6. SAVING FILES (NEW NAMES) ---
    # New file names are used so the outputs are not confused with older ones.
    output_csv = "final_perfume_data.csv"
    output_npy = "hybrid_similarity.npy"

    df_final.to_csv(output_csv, index=False)
    np.save(output_npy, hybrid_sim)

    print("\n🎉 SUKCES! Pobierz te dwa pliki:")
    print(f"1. {output_csv} (To jest Twój Złoty Plik)")
    print(f"2. {output_npy}")

--- LOADING ---
✅ Wczytano: Graal (2191) | Statystyki (70103)

--- ŁĄCZENIE DANYCH ---
✅ Nowa Baza Danych: 2191 perfum.
   Zawiera kolumny: ['Name', 'Brand', 'Description', 'Notes', 'Image URL', 'clean_notes', 'join_key', 'Score', 'Gender']

--- TWORZENIE MODELU AI ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/69 [00:00<?, ?it/s]


🎉 SUKCES! Pobierz te dwa pliki:
1. final_perfume_data.csv (To jest Twój Złoty Plik)
2. hybrid_similarity.npy
