In [5]:
# --- 1. SETUP ---
import os
import pandas as pd
import numpy as np

# Install AI library
# Import first and only install when the package is actually missing, so
# re-running the cell does not invoke pip every time.
try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    import subprocess
    import sys

    # `sys.executable -m pip` targets the interpreter running this cell; a bare
    # `pip` on PATH may belong to a different Python environment.
    # check_call raises CalledProcessError when pip fails instead of silently
    # continuing to an import that cannot succeed.
    subprocess.check_call(
        [sys.executable, '-m', 'pip', 'install', 'sentence-transformers']
    )
    from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# --- 2. LOAD DATA (USING YOUR EXACT FILENAMES) ---
file_fra = "fra_perfumes.csv"         # Has: Rating Value, Gender
file_img = "perfumes_cleaned.csv"     # Has: Image URL, Brand, clean_notes


def merge_perfume_tables(df_img, df_fra):
    """Attach rating and gender from the FRA table to the image table.

    Both frames are joined on a normalised copy of their 'Name' column
    (string-cast, lower-cased, surrounding whitespace stripped) so that
    e.g. "Chanel No 5" matches "chanel no 5 ".

    Args:
        df_img: Base table; must contain 'Name'. Every row of this table is
            kept by the left join (before de-duplication on 'Name').
        df_fra: Lookup table; must contain 'Name', 'Rating Value', 'Gender'.

    Returns:
        A new DataFrame with 'Rating Value' renamed to 'Score' (missing -> 0.0),
        'clean_notes' renamed to 'Notes_Clean', 'Gender' (missing -> 'Unisex'),
        a 'Notes' column (copied from 'Notes_Clean' when absent), one row per
        distinct 'Name', and a fresh 0..n-1 index.

    Side effects:
        Adds a 'join_key' column to both input frames (kept from the original
        script so the key is also present in the merged output).
    """
    # Create a normalization key to ensure names match
    df_fra['join_key'] = df_fra['Name'].astype(str).str.lower().str.strip()
    df_img['join_key'] = df_img['Name'].astype(str).str.lower().str.strip()

    # We pull only specific columns from the FRA dataset: Rating and Gender.
    # De-duplicate on the key BEFORE merging: a repeated name in FRA would
    # otherwise multiply the matching IMG rows in the left join. Keeping the
    # first occurrence yields the same survivor the later drop_duplicates
    # on 'Name' would have kept.
    ratings = df_fra[['join_key', 'Rating Value', 'Gender']].drop_duplicates(
        subset='join_key', keep='first'
    )

    # IMG is the base table (we need images); validate guards the
    # "at most one FRA row per key" assumption explicitly.
    merged = pd.merge(
        df_img, ratings, on='join_key', how='left', validate='many_to_one'
    )

    # Rename columns for the App
    merged = merged.rename(columns={
        'Rating Value': 'Score',
        'clean_notes': 'Notes_Clean',  # Standardizing for logic below
    })

    # Perfumes without a FRA match get a neutral score / gender.
    merged['Score'] = merged['Score'].fillna(0.0)
    merged['Gender'] = merged['Gender'].fillna('Unisex')

    # Ensure Notes column exists for display
    if 'Notes' not in merged.columns:
        merged['Notes'] = merged['Notes_Clean']

    # One row per perfume name. Reset the index so row labels equal row
    # positions, matching the positional rows/columns of the similarity matrix.
    return merged.drop_duplicates(subset=['Name']).reset_index(drop=True)


if not os.path.exists(file_fra) or not os.path.exists(file_img):
    print("‚ùå ERROR: Please upload BOTH 'fra_perfumes.csv' and 'perfumes_cleaned.csv'!")
else:
    # Load with exact columns known
    df_fra = pd.read_csv(file_fra)
    df_img = pd.read_csv(file_img)

    print(f"‚úÖ Loaded: FRA ({len(df_fra)} rows) | IMG ({len(df_img)} rows)")

    # --- 3. MERGING STRATEGY / 4. CLEANING THE SUPER DATASET ---
    print("Merging datasets based on 'Name'...")
    merged = merge_perfume_tables(df_img, df_fra)

    print(f"‚úÖ Super-Dataset Ready: {len(merged)} perfumes.")

    # --- 5. BUILDING AI MODEL ---
    print("Building AI Brain (This takes about 1 minute)...")

    # A. Semantic (Description)
    # Sentence embeddings of the 'Description' column; missing text becomes ""
    # and values are string-cast, mirroring the handling of 'Notes_Clean'.
    model = SentenceTransformer('all-MiniLM-L6-v2')
    merged['Description'] = merged['Description'].fillna("").astype(str)
    embeddings = model.encode(merged['Description'].tolist(), show_progress_bar=True)

    # B. Chemical (Notes)
    # TF-IDF over 'Notes_Clean', capped at 2000 features.
    tfidf = TfidfVectorizer(stop_words='english', max_features=2000)
    matrix_notes = tfidf.fit_transform(merged['Notes_Clean'].fillna("").astype(str))

    # C. Similarity Math
    # Pairwise cosine similarity between all perfumes; float32 halves the
    # size of the dense N x N matrices that get written to disk.
    sim_desc = cosine_similarity(embeddings, embeddings).astype(np.float32)
    sim_notes = cosine_similarity(matrix_notes, matrix_notes).astype(np.float32)

    # Hybrid Score (70% Vibe, 30% Ingredients)
    hybrid_sim = (0.7 * sim_desc) + (0.3 * sim_notes)

    # --- 6. SAVE FINAL FILES ---
    # We save the CSV as 'perfumes_dataset.csv' so the App knows what to load.
    # index=False: row i of the CSV corresponds to row/column i of the matrix.
    merged.to_csv('perfumes_dataset.csv', index=False)
    np.save('hybrid_similarity.npy', hybrid_sim)

    print("\nüéâ SUCCESS! Download these files from the left panel:")
    print("1. perfumes_dataset.csv (The merged data)")
    print("2. hybrid_similarity.npy (The model)")

‚úÖ Loaded: FRA (70103 rows) | IMG (2191 rows)
Merging datasets based on 'Name'...
‚úÖ Super-Dataset Ready: 2184 perfumes.
Building AI Brain (This takes about 1 minute)...


Batches:   0%|          | 0/69 [00:00<?, ?it/s]


üéâ SUCCESS! Download these files from the left panel:
1. perfumes_dataset.csv (The merged data)
2. hybrid_similarity.npy (The model)
