In [1]:
# --- 1. SETUP ---
import os
import importlib.util
import subprocess
import sys

# Install sentence-transformers only when it is missing, and do it through the
# interpreter that runs this kernel (`sys.executable -m pip`). A bare `pip`
# shell command may belong to a different Python environment, and os.system()
# discards the exit status, so a failed install used to go unnoticed until the
# import below blew up.
if importlib.util.find_spec("sentence_transformers") is None:
    _pip_result = subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", "sentence-transformers"],
        check=False,  # best-effort, like the original: report but do not abort here
    )
    if _pip_result.returncode != 0:
        print(f"WARNING: pip install sentence-transformers failed (exit code {_pip_result.returncode})")
    # Make the freshly installed package visible to the import system.
    importlib.invalidate_caches()

import pandas as pd
import numpy as np
import re
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# --- 2. LOAD DATA ---
print("Loading datasets...")
# The "Grail" (rich descriptions & images) is the primary table: every row is kept.
df_grail = pd.read_csv("perfumes_cleaned.csv")
# The stats table is only used as a lookup for rating and gender.
df_stats = pd.read_csv("fra_perfumes.csv")

print(f"Grail Rows: {len(df_grail)} | Stats Rows: {len(df_stats)}")

# --- 3. INTELLIGENT MERGE ---
print("Merging datasets...")

# Join key: lower-cased, whitespace-trimmed perfume name.
df_grail['match_key'] = df_grail['Name'].astype(str).str.lower().str.strip()
df_stats['match_key'] = df_stats['Name'].astype(str).str.lower().str.strip()

# Standardize the rating column name. rename() leaves the frame untouched when
# 'Rating Value' is absent, so no existence check is needed.
df_stats = df_stats.rename(columns={'Rating Value': 'Score'})

# Only bring over the stats columns that actually exist in this file.
cols_to_add = [c for c in ['match_key', 'Score', 'Gender'] if c in df_stats.columns]

# Build a one-row-per-key lookup before joining:
#  * rows without a Name are excluded, because astype(str) turned their key
#    into the literal 'nan', which would match nameless Grail rows;
#  * duplicate keys keep their first row, which is the row the original
#    merge + drop_duplicates(subset=['Name']) sequence ended up keeping, but
#    without multiplying Grail rows in the intermediate frame.
stats_lookup = (
    df_stats.loc[df_stats['Name'].notna(), cols_to_add]
    .drop_duplicates(subset='match_key', keep='first')
)

# LEFT JOIN on the GRAIL (we keep all rich descriptions). validate= makes pandas
# raise if the lookup side ever stops being unique per key.
df_final = pd.merge(df_grail, stats_lookup, on='match_key', how='left', validate='many_to_one')
assert len(df_final) == len(df_grail), "left join must not change the number of Grail rows"

# --- 4. FINAL CLEANING ---
# Perfumes without a stats match get 0.0, so a Score of 0.0 means "no rating found".
if 'Score' in df_final.columns:
    df_final['Score'] = df_final['Score'].fillna(0.0)
# Perfumes without a stats match default to 'Unisex'.
if 'Gender' in df_final.columns:
    df_final['Gender'] = df_final['Gender'].fillna('Unisex')

# Ensure the columns used by the later steps exist, defaulting to empty text.
for required_col in ('Description', 'Notes', 'Image URL'):
    if required_col not in df_final.columns:
        df_final[required_col] = ""

# Clean Text Function
def clean_text(text):
    """Return `text` as a trimmed string with list/quote punctuation removed.

    Square brackets, single quotes, double quotes and forward slashes are
    deleted (not replaced by spaces); surrounding whitespace is stripped.
    Values that `pd.isna` reports as missing yield an empty string.
    """
    if pd.isna(text):
        return ""
    as_string = str(text)
    without_punctuation = re.sub(r"[\[\]'\"/]", "", as_string)
    return without_punctuation.strip()

df_final['Notes_Clean'] = df_final['Notes'].apply(clean_text)

# Use the cleaned notes as the description when the description is missing OR
# blank. A plain fillna() only covers NaN, so empty / whitespace-only
# descriptions used to be embedded as empty text.
desc_text = df_final['Description'].astype(object)
desc_missing = desc_text.isna() | desc_text.astype(str).str.strip().eq("")
# astype(str) guarantees the encoder below receives only strings.
df_final['Description'] = desc_text.where(~desc_missing, df_final['Notes_Clean']).astype(str)

# Remove duplicates. The index is reset so that df_final's row labels equal the
# row/column positions of the similarity matrix built below (drop_duplicates
# alone leaves gaps in the index).
df_final = df_final.drop_duplicates(subset=['Name']).reset_index(drop=True)

print(f"✅ Super-Dataset Ready: {len(df_final)} perfumes.")

# --- 5. BUILD AI MODEL (USING RICH DESCRIPTIONS) ---
print("Building AI Model (This uses the rich 'Grail' descriptions)...")

model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df_final['Description'].tolist(), show_progress_bar=True)

tfidf = TfidfVectorizer(stop_words='english', max_features=2000)
try:
    matrix_notes = tfidf.fit_transform(df_final['Notes_Clean'])
except ValueError:
    # fit_transform raises when no usable term exists (e.g. the 'Notes' column
    # was absent and defaulted to empty strings). Notes then contribute nothing.
    matrix_notes = None
    print("WARNING: no usable notes vocabulary; notes similarity set to 0.")

# Similarity Calculation (float32 halves the memory of the N x N matrices)
sim_desc = cosine_similarity(embeddings, embeddings).astype(np.float32)
if matrix_notes is None:
    sim_notes = np.zeros_like(sim_desc)
else:
    sim_notes = cosine_similarity(matrix_notes, matrix_notes).astype(np.float32)

# Hybrid (70% Vibe from Description, 30% Notes)
hybrid_sim = (0.7 * sim_desc) + (0.3 * sim_notes)

# Row i of the matrix must describe row i of the exported CSV.
assert hybrid_sim.shape == (len(df_final), len(df_final))

# --- 6. EXPORT ---
# Save as standard names; index=False keeps the CSV rows positional, matching the matrix.
np.save('hybrid_similarity.npy', hybrid_sim)
df_final.to_csv('perfumes_dataset.csv', index=False)

print("\n🎉 SUCCESS! Download 'perfumes_dataset.csv' and 'hybrid_similarity.npy'")

Loading datasets...
Grail Rows: 2191 | Stats Rows: 70103
Merging datasets...
✅ Super-Dataset Ready: 2184 perfumes.
Building AI Model (This uses the rich 'Grail' descriptions)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/69 [00:00<?, ?it/s]


🎉 SUCCESS! Download 'perfumes_dataset.csv' and 'hybrid_similarity.npy'


In [2]:
# --- 1. SETUP ---
import os
import importlib.util
import subprocess
import sys

# Install sentence-transformers only when it is missing, and do it through the
# interpreter that runs this kernel (`sys.executable -m pip`). A bare `pip`
# shell command may belong to a different Python environment, and os.system()
# discards the exit status, so a failed install used to go unnoticed until the
# import below blew up.
if importlib.util.find_spec("sentence_transformers") is None:
    _pip_result = subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", "sentence-transformers"],
        check=False,  # best-effort, like the original: report but do not abort here
    )
    if _pip_result.returncode != 0:
        print(f"WARNING: pip install sentence-transformers failed (exit code {_pip_result.returncode})")
    # Make the freshly installed package visible to the import system.
    importlib.invalidate_caches()

import pandas as pd
import numpy as np
import re
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# --- 2. LOAD DATA ---
print("Loading datasets...")
# The "Grail" (rich descriptions & images) is the primary table: every row is kept.
df_grail = pd.read_csv("perfumes_cleaned.csv")
# The stats table is only used as a lookup for rating and gender.
df_stats = pd.read_csv("fra_perfumes.csv")

print(f"Grail Rows: {len(df_grail)} | Stats Rows: {len(df_stats)}")

# --- 3. INTELLIGENT MERGE ---
print("Merging datasets...")

# Join key: lower-cased, whitespace-trimmed perfume name.
df_grail['match_key'] = df_grail['Name'].astype(str).str.lower().str.strip()
df_stats['match_key'] = df_stats['Name'].astype(str).str.lower().str.strip()

# Standardize the rating column name. rename() leaves the frame untouched when
# 'Rating Value' is absent, so no existence check is needed.
df_stats = df_stats.rename(columns={'Rating Value': 'Score'})

# Only bring over the stats columns that actually exist in this file.
cols_to_add = [c for c in ['match_key', 'Score', 'Gender'] if c in df_stats.columns]

# Build a one-row-per-key lookup before joining:
#  * rows without a Name are excluded, because astype(str) turned their key
#    into the literal 'nan', which would match nameless Grail rows;
#  * duplicate keys keep their first row, which is the row the original
#    merge + drop_duplicates(subset=['Name']) sequence ended up keeping, but
#    without multiplying Grail rows in the intermediate frame.
stats_lookup = (
    df_stats.loc[df_stats['Name'].notna(), cols_to_add]
    .drop_duplicates(subset='match_key', keep='first')
)

# LEFT JOIN on the GRAIL (we keep all rich descriptions). validate= makes pandas
# raise if the lookup side ever stops being unique per key.
df_final = pd.merge(df_grail, stats_lookup, on='match_key', how='left', validate='many_to_one')
assert len(df_final) == len(df_grail), "left join must not change the number of Grail rows"

# --- 4. FINAL CLEANING ---
# Perfumes without a stats match get 0.0, so a Score of 0.0 means "no rating found".
if 'Score' in df_final.columns:
    df_final['Score'] = df_final['Score'].fillna(0.0)
# Perfumes without a stats match default to 'Unisex'.
if 'Gender' in df_final.columns:
    df_final['Gender'] = df_final['Gender'].fillna('Unisex')

# Ensure the columns used by the later steps exist, defaulting to empty text.
for required_col in ('Description', 'Notes', 'Image URL'):
    if required_col not in df_final.columns:
        df_final[required_col] = ""

# Clean Text Function
def clean_text(text):
    """Return `text` as a trimmed string with list/quote punctuation removed.

    Square brackets, single quotes, double quotes and forward slashes are
    deleted (not replaced by spaces); surrounding whitespace is stripped.
    Values that `pd.isna` reports as missing yield an empty string.
    """
    if pd.isna(text):
        return ""
    as_string = str(text)
    without_punctuation = re.sub(r"[\[\]'\"/]", "", as_string)
    return without_punctuation.strip()

df_final['Notes_Clean'] = df_final['Notes'].apply(clean_text)

# Use the cleaned notes as the description when the description is missing OR
# blank. A plain fillna() only covers NaN, so empty / whitespace-only
# descriptions used to be embedded as empty text.
desc_text = df_final['Description'].astype(object)
desc_missing = desc_text.isna() | desc_text.astype(str).str.strip().eq("")
# astype(str) guarantees the encoder below receives only strings.
df_final['Description'] = desc_text.where(~desc_missing, df_final['Notes_Clean']).astype(str)

# Remove duplicates. The index is reset so that df_final's row labels equal the
# row/column positions of the similarity matrix built below (drop_duplicates
# alone leaves gaps in the index).
df_final = df_final.drop_duplicates(subset=['Name']).reset_index(drop=True)

print(f"✅ Super-Dataset Ready: {len(df_final)} perfumes.")

# --- 5. BUILD AI MODEL (USING RICH DESCRIPTIONS) ---
print("Building AI Model (This uses the rich 'Grail' descriptions)...")

model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df_final['Description'].tolist(), show_progress_bar=True)

tfidf = TfidfVectorizer(stop_words='english', max_features=2000)
try:
    matrix_notes = tfidf.fit_transform(df_final['Notes_Clean'])
except ValueError:
    # fit_transform raises when no usable term exists (e.g. the 'Notes' column
    # was absent and defaulted to empty strings). Notes then contribute nothing.
    matrix_notes = None
    print("WARNING: no usable notes vocabulary; notes similarity set to 0.")

# Similarity Calculation (float32 halves the memory of the N x N matrices)
sim_desc = cosine_similarity(embeddings, embeddings).astype(np.float32)
if matrix_notes is None:
    sim_notes = np.zeros_like(sim_desc)
else:
    sim_notes = cosine_similarity(matrix_notes, matrix_notes).astype(np.float32)

# Hybrid (70% Vibe from Description, 30% Notes)
hybrid_sim = (0.7 * sim_desc) + (0.3 * sim_notes)

# Row i of the matrix must describe row i of the exported CSV.
assert hybrid_sim.shape == (len(df_final), len(df_final))

# --- 6. EXPORT ---
# Save as standard names; index=False keeps the CSV rows positional, matching the matrix.
np.save('hybrid_similarity.npy', hybrid_sim)
df_final.to_csv('perfumes_dataset.csv', index=False)

print("\n🎉 SUCCESS! Download 'perfumes_dataset.csv' and 'hybrid_similarity.npy'")

Loading datasets...
Grail Rows: 2191 | Stats Rows: 70103
Merging datasets...
✅ Super-Dataset Ready: 2184 perfumes.
Building AI Model (This uses the rich 'Grail' descriptions)...


Batches:   0%|          | 0/69 [00:00<?, ?it/s]


🎉 SUCCESS! Download 'perfumes_dataset.csv' and 'hybrid_similarity.npy'
