In [22]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
from rapidfuzz import fuzz, process
from collections import Counter

In [23]:
df = pd.read_csv("../data/fragrantica_cleaned.csv", sep=';')

In [24]:
print(df.head())

                                                 url  \
0  https://www.fragrantica.com/perfume/xerjoff/ac...   
1  https://www.fragrantica.com/perfume/jean-paul-...   
2  https://www.fragrantica.com/perfume/jean-paul-...   
3  https://www.fragrantica.com/perfume/bruno-bana...   
4  https://www.fragrantica.com/perfume/jean-paul-...   

                          Perfume               Brand  Country  Gender  \
0  accento-overdose-pride-edition             xerjoff    Italy  unisex   
1            classique-pride-2024  jean-paul-gaultier   France   women   
2            classique-pride-2023  jean-paul-gaultier   France  unisex   
3               pride-edition-man        bruno-banani  Germany     men   
4         le-male-pride-collector  jean-paul-gaultier   France     men   

  Rating Value  Rating Count    Year  \
0         1,42           201  2022.0   
1         1,86            70  2024.0   
2         1,91           285  2023.0   
3         1,92            59  2019.0   
4         1,93    

In [25]:
def normalize_text(text):
    return text.lower().replace("-", " ").strip()

In [26]:
def clean_and_split(text):
    if pd.isna(text):
        return []
    return [x.strip().lower() for x in text.split(',')]

In [27]:
df['Top'] = df['Top'].apply(clean_and_split)
df['Middle'] = df['Middle'].apply(clean_and_split)
df['Base'] = df['Base'].apply(clean_and_split)

In [28]:
def fuzzy_match_perfume(query, choices, limit=5):
    return process.extract(query, choices, scorer=fuzz.token_set_ratio, limit=limit)

In [29]:
note_synonyms = {
    "bourbon vanilla": "vanilla", "madagascar vanilla": "vanilla", "vanilla absolute": "vanilla",
    "ambergris": "amber", "amberwood": "amber", "amber resin": "amber",
    "rose de mai": "rose", "damask rose": "rose", "turkish rose": "rose", "bulgarian rose": "rose",
    "tonka bean": "tonka", "benzoin resin": "benzoin", "white musk": "musk",
    "musk ketone": "musk", "cacao": "chocolate", "cocoa": "chocolate",
    "oud wood": "oud", "agarwood": "oud", "patchouli leaf": "patchouli",
    "cashmeran": "cashmere wood", "sandalwood oil": "sandalwood", "vetiver oil": "vetiver",
    "green apple": "apple", "bergamot peel": "bergamot", "lemon zest": "lemon",
    "mandarin orange": "mandarin", "tangerine": "mandarin", "orange blossom absolute": "orange blossom",
    "pink peppercorn": "pink pepper", "pepper essence": "pepper"
}

def normalize_notes(note_list):
    normalized = []
    for note in note_list:
        note_clean = note.strip().lower()
        if note_clean in note_synonyms:
            normalized.append(note_synonyms[note_clean])
        else:
            normalized.append(note_clean)
    return normalized

# Apply normalization
df['Top'] = df['Top'].apply(normalize_notes)
df['Middle'] = df['Middle'].apply(normalize_notes)
df['Base'] = df['Base'].apply(normalize_notes)
df['All_Notes'] = df['Top'] + df['Middle'] + df['Base']


In [30]:
accord_cols = ['mainaccord1','mainaccord2','mainaccord3','mainaccord4','mainaccord5']
df['Accords'] = df[accord_cols].apply(lambda x: [str(i).lower() for i in x if pd.notna(i)], axis=1)

In [31]:
mlb_notes = MultiLabelBinarizer()
notes_matrix = mlb_notes.fit_transform(df['All_Notes'])

mlb_accords = MultiLabelBinarizer()
accords_matrix = mlb_accords.fit_transform(df['Accords'])

In [32]:
def jaccard_similarity(set1, set2):
    if not set1 or not set2:
        return 0
    intersection = len(set(set1).intersection(set(set2)))
    union = len(set(set1).union(set(set2)))
    return intersection / union

In [33]:
accords_sim_matrix = cosine_similarity(accords_matrix)

In [34]:
def compute_note_frequencies(df):
    """Compute note frequency across all perfumes."""
    all_notes = []
    for col in ['Top', 'Middle', 'Base']:
        df[col] = df[col].fillna('').apply(lambda x: [n.strip().lower() for n in str(x).split(',') if n.strip()])
        all_notes.extend([note for notes in df[col] for note in notes])
    freq = Counter(all_notes)
    max_freq = max(freq.values())
    # Inverse frequency weighting: rarer notes get higher weight
    rarity_weights = {note: max_freq / freq[note] for note in freq}
    return rarity_weights
rarity_weights = compute_note_frequencies(df)

In [35]:
def weighted_note_similarity(target_notes, candidate_notes, rarity_weights):
    """Compute weighted similarity between two sets of notes with rarity boost."""
    if not target_notes or not candidate_notes:
        return 0
    intersection = set(target_notes) & set(candidate_notes)
    if not intersection:
        return 0
    weighted_intersection = sum(rarity_weights.get(n, 1) for n in intersection)
    weighted_union = sum(rarity_weights.get(n, 1) for n in set(target_notes) | set(candidate_notes))
    return weighted_intersection / weighted_union

Similar frags finder functions

V1.1-og one: raw matching notes and accords. \
V1.2: improved V1 layer to layer notes + accounting for flankers \
V2-cross layer: also accounts for if the note is in some diff layer \
V3-weighing notes version: assigns higher value to rarer notes so generic notes just dont take over \
V4: Integrating everything with percentages (v2 x v3 integration) (current version)

In [36]:
#v1.1-og one: raw notes matching+accords
def get_similar_fragrances(perfume_query, top_n=10, weight_notes=0.4, weight_accords=0.6):
    # Normalize input
    perfume_query = normalize_text(perfume_query)
    
    # Combine Brand + Perfume for better search
    df['SearchKey'] = df['Brand'].str.lower() + " " + df['Perfume'].str.lower().str.replace("-", " ")
    
    # Fuzzy match
    matches = fuzzy_match_perfume(perfume_query, df['SearchKey'].tolist(), limit=5)
    
    if not matches:
        return f"No match found for '{perfume_query}'."
    
    # Pick best match
    best_match_name = matches[0][0]
    idx = df[df['SearchKey'] == best_match_name].index[0]
    
    print(f" Matched input to: {df.iloc[idx]['Perfume']} by {df.iloc[idx]['Brand']} (Score: {matches[0][1]}%)")
    
    # Get target perfume data
    target_notes = df.iloc[idx]['All_Notes']
    
    # Compute Jaccard similarity for notes
    note_similarities = [jaccard_similarity(target_notes, notes) for notes in df['All_Notes']]
    
    # Accord similarity from precomputed matrix
    accord_similarities = accords_sim_matrix[idx]
    
    # Combined score
    combined_score = weight_notes * np.array(note_similarities) + weight_accords * np.array(accord_similarities)
    
    # Normalize to 0–100
    combined_score = (combined_score / combined_score.max()) * 100
    
    df_scores = df.copy()
    df_scores['Similarity'] = combined_score
    
    # Sort & filter
    df_scores = df_scores.sort_values(by='Similarity', ascending=False)
    df_scores = df_scores[df_scores['Perfume'] != df.iloc[idx]['Perfume']]
    
    return df_scores[['Perfume','Brand','Year','Gender','Similarity','Rating Value','url']].head(top_n)


In [37]:
#v1.2-improved V1 accounting for flankers
def weighted_jaccard(list1, list2):
    """Compute Jaccard similarity between two lists."""
    set1, set2 = set(list1), set(list2)
    return len(set1 & set2) / len(set1 | set2) if set1 and set2 else 0

def get_similar_fragrances_v12(perfume_query, top_n=10,
                                    w_top=0.2, w_middle=0.3, w_base=0.5,
                                    w_accords=0.5, brand_penalty=0.95,
                                    dupe_brands_boost=1.05,
                                    dupe_brands=["lattafa", "armaf", "afnan"]):
    perfume_query = normalize_text(perfume_query)
    df['SearchKey'] = df['Brand'].str.lower() + " " + df['Perfume'].str.lower().str.replace("-", " ")
    
    matches = fuzzy_match_perfume(perfume_query, df['SearchKey'].tolist(), limit=5)
    if not matches:
        return f"No match found for '{perfume_query}'."
    
    best_match_name = matches[0][0]
    idx = df[df['SearchKey'] == best_match_name].index[0]
    
    print(f"✅ Matched input to: {df.iloc[idx]['Perfume']} by {df.iloc[idx]['Brand']} (Score: {matches[0][1]}%)")
    print("V1.2-improved V1 accounting for flankers")
    
    target = df.iloc[idx]
    
    # Compute weighted note similarity for all rows
    top_sim = df['Top'].apply(lambda x: weighted_jaccard(target['Top'], x))
    mid_sim = df['Middle'].apply(lambda x: weighted_jaccard(target['Middle'], x))
    base_sim = df['Base'].apply(lambda x: weighted_jaccard(target['Base'], x))
    
    weighted_notes = (w_top * top_sim + w_middle * mid_sim + w_base * base_sim)
    
    # Accord similarity
    accord_sim = accords_sim_matrix[idx]
    
    # Combine
    combined_score = weighted_notes + (w_accords * accord_sim)
    
    # Apply brand penalty
    same_brand_mask = (df['Brand'] == target['Brand'])
    combined_score[same_brand_mask] *= brand_penalty
    
    # Dupe brand boost
    dupe_brand_mask = df['Brand'].str.lower().isin(dupe_brands)
    combined_score[dupe_brand_mask] *= dupe_brands_boost
    
    # Normalize to 0-100
    combined_score = (combined_score / combined_score.max()) * 100
    
    df_scores = df.copy()
    df_scores['Similarity'] = combined_score
    df_scores = df_scores.sort_values(by='Similarity', ascending=False)
    df_scores = df_scores[df_scores['Perfume'] != target['Perfume']]
    
    return df_scores[['Perfume','Brand','Year','Gender','Similarity','Rating Value','url']].head(top_n)


In [38]:
#cross layer similarity function
def flexible_note_similarity(p1_notes, p2_notes):
    """
    Compare notes across layers with partial credit for cross-layer matches.
    """
    same_layer_score = len(set(p1_notes) & set(p2_notes))
    total = len(set(p1_notes) | set(p2_notes))
    return same_layer_score / total if total > 0 else 0


def cross_layer_similarity(top1, mid1, base1, top2, mid2, base2,
                           same_weight=1.0, cross_weight=0.5,
                           w_top=0.15, w_mid=0.25, w_base=0.6):
    """
    Compute weighted similarity considering same-layer and cross-layer matches.
    """
    layers1 = {"top": set(top1), "mid": set(mid1), "base": set(base1)}
    layers2 = {"top": set(top2), "mid": set(mid2), "base": set(base2)}
    
    score = 0
    max_score = 0
    
    layer_weights = {"top": w_top, "mid": w_mid, "base": w_base}
    
    for layer, notes_l1 in layers1.items():
        weight = layer_weights[layer]
        for note in notes_l1:
            max_score += weight * same_weight  # best case
            if note in layers2[layer]:
                score += weight * same_weight
            elif note in layers2["top"] or note in layers2["mid"] or note in layers2["base"]:
                score += weight * cross_weight
    
    return score / max_score if max_score > 0 else 0

In [39]:
#v2-cross layer: also accounts for if the note is in some diff layer
def cross_layer_similarity(top1, mid1, base1, top2, mid2, base2,
                           same_weight=1.0, cross_weight=0.5,
                           w_top=0.15, w_mid=0.25, w_base=0.6):
    """
    Compute weighted similarity considering same-layer and cross-layer matches.
    """
    layers1 = {"top": set(top1), "mid": set(mid1), "base": set(base1)}
    layers2 = {"top": set(top2), "mid": set(mid2), "base": set(base2)}
    
    score = 0
    max_score = 0
    
    layer_weights = {"top": w_top, "mid": w_mid, "base": w_base}
    
    for layer, notes_l1 in layers1.items():
        weight = layer_weights[layer]
        for note in notes_l1:
            max_score += weight  # assume best case match in same layer
            if note in layers2[layer]:
                score += weight * same_weight
            elif note in layers2["top"] or note in layers2["mid"] or note in layers2["base"]:
                score += weight * cross_weight
    
    return score / max_score if max_score > 0 else 0

def weighted_jaccard(list1, list2):
    set1, set2 = set(list1), set(list2)
    return len(set1 & set2) / len(set1 | set2) if set1 and set2 else 0

def get_similar_fragrances_v2(perfume_query, top_n=10,
                                    w_accords=0.6,
                                    same_weight=1.0, cross_weight=0.5,
                                    w_top=0.15, w_mid=0.25, w_base=0.6,
                                    brand_penalty=0.95, dupe_brands_boost=1.05,
                                    dupe_brands=["lattafa", "armaf", "afnan"]):
    """
    Finds similar fragrances using cross-layer note matching + accord similarity.
    """
    perfume_query = perfume_query.lower().replace("-", " ")
    df['SearchKey'] = df['Brand'].str.lower() + " " + df['Perfume'].str.lower().str.replace("-", " ")
    
    matches = process.extract(perfume_query, df['SearchKey'].tolist(), limit=5)
    if not matches:
        return f"No match found for '{perfume_query}'."
    
    best_match_name = matches[0][0]
    idx = df[df['SearchKey'] == best_match_name].index[0]
    
    print(f"✅ Matched input to: {df.iloc[idx]['Perfume']} by {df.iloc[idx]['Brand']} (Score: {matches[0][1]:.1f}%)")
    print("V2-cross layer: also accounts for if the note is in some diff layer")
    
    target = df.iloc[idx]
    
    # Compute similarity for all perfumes
    scores = []
    for i, row in df.iterrows():
        if row['Perfume'] == target['Perfume']:
            continue
        
        note_score = cross_layer_similarity(target['Top'], target['Middle'], target['Base'],
                                            row['Top'], row['Middle'], row['Base'],
                                            same_weight, cross_weight, w_top, w_mid, w_base)
        
        accords_1 = [target['mainaccord1'], target['mainaccord2'], target['mainaccord3']]
        accords_2 = [row['mainaccord1'], row['mainaccord2'], row['mainaccord3']]
        accord_score = weighted_jaccard(accords_1, accords_2)
        
        combined_score = (note_score + (w_accords * accord_score)) / (1 + w_accords)
        
        # Apply brand penalty/boost
        if row['Brand'] == target['Brand']:
            combined_score *= brand_penalty
        if row['Brand'].lower() in dupe_brands:
            combined_score *= dupe_brands_boost
        
        scores.append((i, combined_score))
    
    # Normalize to 0-100
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    max_score = scores[0][1] if scores else 1
    scores = [(i, (score / max_score) * 100) for i, score in scores]
    
    # Build result DataFrame
    top_indices = [i for i, _ in scores[:top_n]]
    sim_values = [round(s, 2) for _, s in scores[:top_n]]
    
    df_result = df.iloc[top_indices].copy()
    df_result['Similarity'] = sim_values
    
    return df_result[['Perfume', 'Brand', 'Year', 'Gender', 'Similarity', 'Rating Value', 'url']]


In [40]:
#v3-weighing notes version: assigns higher value to rarer notes so generic notes just dont take over
def get_similar_fragrances_v3(perfume_query, df=df, rarity_weights=rarity_weights,
                                    top_n=10,
                                    w_top=0.2, w_middle=0.3, w_base=0.5,
                                    w_accords=0.4, brand_penalty=0.95,
                                    gender_weight=True, dupe_boost=True,
                                    dupe_brands=["lattafa", "armaf", "afnan"]):

    # Build searchable key
    df['SearchKey'] = (df['Brand'].str.lower() + " " +
                       df['Perfume'].str.lower().str.replace("-", " "))

    matches = fuzzy_match_perfume(normalize_text(perfume_query), df['SearchKey'].tolist(), limit=1)
    if not matches:
        return f"No match found for '{perfume_query}'."

    best_match_name = matches[0][0]
    idx = df[df['SearchKey'] == best_match_name].index[0]
    print(f"✅ Matched input to: {df.iloc[idx]['Perfume']} by {df.iloc[idx]['Brand']} (Score: {matches[0][1]}%)")
    print("V3-weighing notes version: assigns higher value to rarer notes so generic notes just dont take over (current version)")

    target = df.iloc[idx]

    # Compute similarity for each perfume
    scores = []
    for i, row in df.iterrows():
        if i == idx:
            continue

        # Notes similarity (cross-layer)
        target_all_notes = target['Top'] + target['Middle'] + target['Base']
        candidate_all_notes = row['Top'] + row['Middle'] + row['Base']
        note_sim = weighted_note_similarity(target_all_notes, candidate_all_notes, rarity_weights)

        # Accord similarity (Jaccard)
        target_accords = {a for a in [target.get('mainaccord1'), target.get('mainaccord2'),
                                      target.get('mainaccord3'), target.get('mainaccord4'), target.get('mainaccord5')] if a}
        candidate_accords = {a for a in [row.get('mainaccord1'), row.get('mainaccord2'),
                                          row.get('mainaccord3'), row.get('mainaccord4'), row.get('mainaccord5')] if a}
        accord_sim = len(target_accords & candidate_accords) / len(target_accords | candidate_accords) if target_accords and candidate_accords else 0

        # Combine
        combined_score = note_sim + (accord_sim * w_accords)

        # Apply brand penalty
        if row['Brand'] == target['Brand']:
            combined_score *= brand_penalty

        # Dupe brand boost
        if dupe_boost and str(row['Brand']).lower() in dupe_brands:
            combined_score *= 1.05

        # Gender weighting
        if gender_weight:
            if row['Gender'] == target['Gender']:
                combined_score *= 1.1
            elif 'unisex' in [row['Gender'], target['Gender']]:
                combined_score *= 1.05
            else:
                combined_score *= 0.95

        scores.append((i, combined_score))

    # Sort results
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    top_results = []
    for idx_score in scores[:top_n]:
        row = df.iloc[idx_score[0]]
        top_results.append({
            'Perfume': row['Perfume'],
            'Brand': row['Brand'],
            'Year': row['Year'],
            'Gender': row['Gender'],
            'Similarity': round(idx_score[1] * 100, 2),
            'Rating Value': row['Rating Value'],
            'url': row['url']
        })

    return pd.DataFrame(top_results)

In [41]:
# v4.0: improved v3 with more robust accord matching and better handling of edge cases
def compute_rarity_weights(df):
    all_notes = df['All_Notes'].explode().dropna()
    note_counts = all_notes.value_counts()
    max_count = note_counts.max()
    rarity_weights = {note: 1 + np.log(max_count / count) for note, count in note_counts.items()}
    return rarity_weights
rarity_weights = compute_rarity_weights(df)
def calculate_note_similarity(notes_a, notes_b, rarity_weights):
    """
    notes_a, notes_b = dict with 'top', 'mid', 'base'
    rarity_weights = dict of note: weight
    Returns weighted cross-layer similarity score (0-100)
    """

    score = 0
    max_score = 0
    cross_layer_penalty = 0.7  # if in different layer

    all_notes_a = []
    all_notes_b = []

    for layer in ['top', 'mid', 'base']:
        all_notes_a.extend(notes_a.get(layer, []))
        all_notes_b.extend(notes_b.get(layer, []))

    # Unique sets for normalization
    all_notes_a = list(set(all_notes_a))
    all_notes_b = list(set(all_notes_b))

    for note in all_notes_a:
        weight = rarity_weights.get(note, 1)
        max_score += weight  # for normalization
        if note in all_notes_b:
            score += weight
        else:
            # cross-layer check
            for layer_b in ['top', 'mid', 'base']:
                if note in notes_b.get(layer_b, []):
                    score += weight * cross_layer_penalty
                    break

    for note in all_notes_b:
        if note not in all_notes_a:
            max_score += rarity_weights.get(note, 1)

    return (score / max_score) * 100 if max_score > 0 else 0

# -------------------------
# 3. Accords Similarity
# -------------------------
def calculate_accord_similarity(accords_a, accords_b):
    if not accords_a or not accords_b:
        return 0
    set_a = set(accords_a)
    set_b = set(accords_b)
    return (len(set_a & set_b) / len(set_a | set_b)) * 100

# -------------------------
# 4. Compare Two Perfumes (V4)
# -------------------------
def compare_two_perfumes_v4(name1, name2, df=df, rarity_weights=rarity_weights):
    # Fuzzy match names
    idx1 = df[df['Perfume'].str.contains(name1, case=False, na=False)].index
    idx2 = df[df['Perfume'].str.contains(name2, case=False, na=False)].index

    if len(idx1) == 0 or len(idx2) == 0:
        return {"error": "Perfume not found"}

    idx1, idx2 = idx1[0], idx2[0]
    p1 = df.iloc[idx1]
    p2 = df.iloc[idx2]

    notes_a = {'top': p1['Top'], 'mid': p1['Middle'], 'base': p1['Base']}
    notes_b = {'top': p2['Top'], 'mid': p2['Middle'], 'base': p2['Base']}

    note_sim = calculate_note_similarity(notes_a, notes_b, rarity_weights)
    accord_sim = calculate_accord_similarity(p1['Accords'], p2['Accords'])

    # Final weighted score
    overall = (note_sim * 0.8) + (accord_sim * 0.2)

    return {
        "Perfume 1": f"{p1['Perfume']} by {p1['Brand']}",
        "Perfume 2": f"{p2['Perfume']} by {p2['Brand']}",
        "Overall Similarity (%)": round(overall, 2),
        "Notes Similarity (%)": round(note_sim, 2),
        "Accords Similarity (%)": round(accord_sim, 2),
        "Unique Notes in Perfume 1": list(set(p1['All_Notes']) - set(p2['All_Notes'])),
        "Unique Notes in Perfume 2": list(set(p2['All_Notes']) - set(p1['All_Notes'])),
        "URL 1": p1['url'],
        "URL 2": p2['url']
    }
def get_similar_fragrances_v4(input_perfume, df=df, rarity_weights=rarity_weights, top_n=15):
    idx = df[df['Perfume'].str.contains(input_perfume, case=False, na=False)].index
    if len(idx) == 0:
        return {"error": "Perfume not found"}
    idx = idx[0]
    p_base = df.iloc[idx]
    notes_a = {'top': p_base['Top'], 'mid': p_base['Middle'], 'base': p_base['Base']}

    results = []
    for i, row in df.iterrows():
        if i == idx:
            continue
        notes_b = {'top': row['Top'], 'mid': row['Middle'], 'base': row['Base']}
        note_sim = calculate_note_similarity(notes_a, notes_b, rarity_weights)
        accord_sim = calculate_accord_similarity(p_base['Accords'], row['Accords'])
        overall = (note_sim * 0.8) + (accord_sim * 0.2)

        results.append({
            "Perfume": row['Perfume'],
            "Brand": row['Brand'],
            "Year": row['Year'],
            "Gender": row['Gender'],
            "Similarity": round(overall, 2),
            "Rating Value": row['Rating Value'],
            "url": row['url']
        })
    print(f"✅ Matched input to: {df.iloc[idx]['Perfume']} by {df.iloc[idx]['Brand']} (Score: {fuzz.token_set_ratio(input_perfume, df.iloc[idx]['SearchKey']):.1f}%)")
    print("V4-weighing notes version: assigns higher value to rarer notes so generic notes just dont take over (current version)")
    return pd.DataFrame(sorted(results, key=lambda x: x['Similarity'], reverse=True)).head(top_n)

In [None]:
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from rapidfuzz import fuzz
import re

# --------------------------
# 1. Normalize & Synonym Map
# --------------------------
SYNONYMS = {
    "ambergris": "amber",
    "vanilla absolute": "vanilla",
    "oud": "agarwood",
    "agar": "agarwood",
    "benzoin resin": "benzoin"
}

def normalize_notes(note_list):
    normalized = []
    for note in note_list:
        note = note.strip().lower()
        if note in SYNONYMS:
            note = SYNONYMS[note]
        normalized.append(note)
    return normalized

# --------------------------
# 2. Compute rarity weights
# --------------------------
def compute_rarity_weights(df):
    all_notes = []
    for col in ['Top', 'Middle', 'Base']:
        df[col] = df[col].fillna('').apply(lambda x: [n.strip().lower() for n in str(x).split(',') if n.strip()])
        all_notes.extend([note for notes in df[col] for note in notes])
    freq = Counter(all_notes)
    max_freq = max(freq.values())
    rarity_weights = {note: np.log(1 + (max_freq / freq[note])) for note in freq}  # log scaling
    return rarity_weights

# --------------------------
# 3. Build Note Vectors
# --------------------------
def build_note_vector(note_dict, all_notes, rarity_weights):
    vec = np.zeros(len(all_notes))
    for layer in note_dict.values():
        for note in layer:
            if note in all_notes:
                idx = all_notes.index(note)
                vec[idx] += rarity_weights.get(note, 1.0)
    return vec

# --------------------------
# 4. Compute similarity V5
# --------------------------
def calculate_similarity_v5(notes_a, notes_b, rarity_weights, all_notes, accords_a, accords_b, brand_match=False, year_match=False):
    # Create weighted vectors
    vec_a = build_note_vector(notes_a, all_notes, rarity_weights)
    vec_b = build_note_vector(notes_b, all_notes, rarity_weights)

    # Cosine similarity for notes
    note_sim = cosine_similarity([vec_a], [vec_b])[0][0]

    # Accord similarity
    accord_overlap = len(set(accords_a) & set(accords_b)) / max(len(set(accords_a) | set(accords_b)), 1)

    # Combine with weights
    score = (note_sim * 0.7) + (accord_overlap * 0.2)
    if brand_match: score += 0.05
    if year_match: score += 0.05
    return min(score * 100, 100)

# --------------------------
# 5. Main search function
# --------------------------
def get_similar_fragrances_v5(input_perfume, df, rarity_weights, top_n=15, gender_filter=None):
    # Normalize columns
    df['SearchKey'] = df['Perfume'].str.lower()
    idx = df[df['SearchKey'].str.contains(input_perfume.lower(), regex=False)].index
    if len(idx) == 0:
        raise ValueError("Perfume not found in dataset.")
    idx = idx[0]

    base = df.iloc[idx]
    notes_a = {'top': normalize_notes(base['Top']), 'mid': normalize_notes(base['Middle']), 'base': normalize_notes(base['Base'])}
    accords_a = [a.strip().lower() for a in str(base['Accords']).split(',') if a.strip()]

    all_notes = sorted(set(note for col in ['Top', 'Middle', 'Base'] for notes in df[col] for note in notes))

    results = []
    for i, row in df.iterrows():
        if i == idx: continue
        if gender_filter and str(row['Gender']).lower() != gender_filter.lower():
            continue
        notes_b = {'top': normalize_notes(row['Top']), 'mid': normalize_notes(row['Middle']), 'base': normalize_notes(row['Base'])}
        accords_b = [a.strip().lower() for a in str(row['Accords']).split(',') if a.strip()]
        brand_match = (row['Brand'].strip().lower() == base['Brand'].strip().lower())
        year_match = (row['Year'] == base['Year'])
        score = calculate_similarity_v5(notes_a, notes_b, rarity_weights, all_notes, accords_a, accords_b, brand_match, year_match)
        results.append((row['Perfume'], row['Brand'], row['Year'], row['Gender'], score, row['Rating Value'], row['url']))

    # Sort results
    results.sort(key=lambda x: x[4], reverse=True)
    print(f"✅ Matched input to: {df.iloc[idx]['Perfume']} by {df.iloc[idx]['Brand']} (Score: {fuzz.token_set_ratio(input_perfume, df.iloc[idx]['SearchKey']):.1f}%)")
    print("V5s")
    return pd.DataFrame(results[:top_n], columns=['Perfume', 'Brand', 'Year', 'Gender', 'Similarity', 'Rating Value', 'url'])


In [48]:
rarity_weights = compute_rarity_weights(df)
results = get_similar_fragrances_v5("althair", df, rarity_weights, top_n=15, gender_filter=None)
print(results)

                                Perfume                  Brand    Year  \
0                           liquid-brun        fragrance-world  2024.0   
1                      signorina-libera    salvatore-ferragamo  2023.0   
2    l-eau-d-issey-pour-homme-wood-wood           issey-miyake  2019.0   
3                                wujood                 zimaya  2023.0   
4   red-temptation-winter-eau-de-parfum                   zara  2023.0   
5                        altesse-mysore            alexandre-j  2017.0   
6                             rose-shot       olfactive-studio  2019.0   
7      black-xs-l-aphrodisiaque-for-men           paco-rabanne  2013.0   
8          guess-1981-los-angeles-women                  guess  2019.0   
9              botica-214-fiji-paradise            o-boticario  2023.0   
10                 neroli-supercritique  les-eaux-primordiales  2023.0   
11                        zimaya-wujood                  afnan     NaN   
12                        khamrah-qahw

In [None]:
results = get_similar_fragrances_v4("althair", top_n=15)
print(results)

✅ Matched input to: althair by parfums-de-marly (Score: 100.0%)
V4-weighing notes version: assigns higher value to rarer notes so generic notes just dont take over (current version)
                     Perfume                  Brand    Year  Gender  \
0                liquid-brun        fragrance-world  2024.0   women   
1            paris-sao-paulo                 carven  2017.0   women   
2         touch-grigio-perla               la-perla  2000.0     men   
3     the-best-bro-s-one-way               faberlic  2022.0     men   
4              khamrah-qahwa       lattafa-perfumes  2023.0  unisex   
5            style-pleasures               mahogany  2019.0   women   
6       boss-bottled-intense              hugo-boss  2015.0     men   
7       neroli-supercritique  les-eaux-primordiales  2023.0  unisex   
8       arabian-nights-black            arabian-oud     NaN  unisex   
9         lacoste-pour-homme     lacoste-fragrances  2002.0     men   
10  joop-homme-eau-de-parfum         

In [None]:
results = get_similar_fragrances_v3("althair", top_n=15)
print(results)

✅ Matched input to: althair by parfums-de-marly (Score: 100.0%)
V3-weighing notes version: assigns higher value to rarer notes so generic notes just dont take over (current version)
               Perfume                 Brand    Year  Gender  Similarity  \
0          liquid-brun       fragrance-world  2024.0   women      133.00   
1        zimaya-wujood                 afnan     NaN     men       46.09   
2               wujood                zimaya  2023.0     men       42.42   
3   l-aventure-intense  al-haramain-perfumes  2019.0     men       39.79   
4           l-aventure  al-haramain-perfumes  2016.0     men       39.55   
5             tuberosa             carthusia  2019.0  unisex       38.45   
6     signorina-libera   salvatore-ferragamo  2023.0   women       37.01   
7             udv-wild       ulric-de-varens  2020.0     men       36.57   
8        baldinini-man             baldinini  2008.0     men       36.37   
9           heroic-man               la-rive  2022.0     m

In [None]:
results = get_similar_fragrances_v2("althair", top_n=15)
print(results)

✅ Matched input to: althair by parfums-de-marly (Score: 90.0%)
V2-cross layer: also accounts for if the note is in some diff layer
                          Perfume                                Brand  \
24044                 liquid-brun                      fragrance-world   
16120     good-girl-velvet-fatale                     carolina-herrera   
23542               khamrah-qahwa                     lattafa-perfumes   
11200    good-girl-fantastic-pink                     carolina-herrera   
15528         gingembre-petillant                              versace   
20505                        1970  rosendo-mateu-olfactive-expressions   
22777                     khamrah                     lattafa-perfumes   
19822                    lucifero                      antonio-maretti   
15780               rouge-smoking                          bdk-parfums   
17657                  ii-praline                          nobile-1942   
11704              habano-vanilla                      

In [None]:
def compare_two_perfumes_v3(name1, name2, df=df, rarity_weights=rarity_weights):
    # Match first perfume
    df['SearchKey'] = (df['Brand'].str.lower() + " " +
                       df['Perfume'].str.lower().str.replace("-", " "))
    match1 = fuzzy_match_perfume(normalize_text(name1), df['SearchKey'].tolist(), limit=1)
    match2 = fuzzy_match_perfume(normalize_text(name2), df['SearchKey'].tolist(), limit=1)
    if not match1 or not match2:
        return "One or both perfumes not found."

    idx1 = df[df['SearchKey'] == match1[0][0]].index[0]
    idx2 = df[df['SearchKey'] == match2[0][0]].index[0]

    p1, p2 = df.iloc[idx1], df.iloc[idx2]

    # Notes similarity (cross-layer)
    p1_notes = p1['Top'] + p1['Middle'] + p1['Base']
    p2_notes = p2['Top'] + p2['Middle'] + p2['Base']
    note_sim = weighted_note_similarity(p1_notes, p2_notes, rarity_weights)

    # Accords similarity
    p1_acc = {a for a in [p1.get('mainaccord1'), p1.get('mainaccord2'), p1.get('mainaccord3'), p1.get('mainaccord4'), p1.get('mainaccord5')] if a}
    p2_acc = {a for a in [p2.get('mainaccord1'), p2.get('mainaccord2'), p2.get('mainaccord3'), p2.get('mainaccord4'), p2.get('mainaccord5')] if a}
    acc_sim = len(p1_acc & p2_acc) / len(p1_acc | p2_acc) if p1_acc and p2_acc else 0

    score = (note_sim + acc_sim * 0.4) * 100

    return {
        'Perfume 1': f"{p1['Perfume']} by {p1['Brand']}",
        'Perfume 2': f"{p2['Perfume']} by {p2['Brand']}",
        'Overall Similarity (%)': round(score, 2),
        'Unique Notes in Perfume 1': list(set(p1_notes) - set(p2_notes)),
        'Unique Notes in Perfume 2': list(set(p2_notes) - set(p1_notes)),
        'URL 1': p1['url'],
        'URL 2': p2['url']
    }

In [None]:
from rapidfuzz import process

def compare_two_perfumes(name1, name2,
                                   w_top=0.15, w_middle=0.25, w_base=0.6, w_accords=0.6):
    # Create search key for fuzzy matching
    df['SearchKey'] = df['Brand'].str.lower() + " " + df['Perfume'].str.lower().str.replace("-", " ")
    
    # Fuzzy match both inputs
    match1 = process.extractOne(name1.lower(), df['SearchKey'].tolist())
    match2 = process.extractOne(name2.lower(), df['SearchKey'].tolist())
    
    if not match1 or not match2:
        return f"Could not match one or both perfumes."
    
    idx1 = df[df['SearchKey'] == match1[0]].index[0]
    idx2 = df[df['SearchKey'] == match2[0]].index[0]
    
    p1 = df.iloc[idx1]
    p2 = df.iloc[idx2]
    
    print(f"✅ Matched {name1} → {p1['Perfume']} by {p1['Brand']} (Score: {match1[1]:.1f}%)")
    print(f"✅ Matched {name2} → {p2['Perfume']} by {p2['Brand']} (Score: {match2[1]:.1f}%)\n")
    
    # Compute note similarities
    top_sim = weighted_jaccard(p1['Top'], p2['Top'])
    mid_sim = weighted_jaccard(p1['Middle'], p2['Middle'])
    base_sim = weighted_jaccard(p1['Base'], p2['Base'])
    
    weighted_notes_score = (w_top * top_sim + w_middle * mid_sim + w_base * base_sim)
    
    # Accord similarity
    accords_1 = [p1['mainaccord1'], p1['mainaccord2'], p1['mainaccord3']]
    accords_2 = [p2['mainaccord1'], p2['mainaccord2'], p2['mainaccord3']]
    accords_sim = weighted_jaccard(accords_1, accords_2)
    
    # Final combined score
    final_score = (weighted_notes_score + (w_accords * accords_sim)) / (1 + w_accords)
    final_score_percent = round(final_score * 100, 2)
    
    # Differences in notes
    notes1 = set(p1['All_Notes'])
    notes2 = set(p2['All_Notes'])
    unique_to_1 = notes1 - notes2
    unique_to_2 = notes2 - notes1
    
    result = {
        "Perfume 1": f"{p1['Perfume']} by {p1['Brand']}",
        "Perfume 2": f"{p2['Perfume']} by {p2['Brand']}",
        "Overall Similarity (%)": final_score_percent,
        "Top Notes Similarity (%)": round(top_sim * 100, 2),
        "Middle Notes Similarity (%)": round(mid_sim * 100, 2),
        "Base Notes Similarity (%)": round(base_sim * 100, 2),
        "Accords Similarity (%)": round(accords_sim * 100, 2),
        "Unique Notes in Perfume 1": list(unique_to_1),
        "Unique Notes in Perfume 2": list(unique_to_2),
        "URL 1": p1['url'],
        "URL 2": p2['url']
    }
    
    return result

In [None]:
def compare_two_perfumes_advanced(name1, name2,
                                   same_weight=1.0, cross_weight=0.5,
                                   w_top=0.15, w_mid=0.25, w_base=0.6, w_accords=0.6):
    df['SearchKey'] = df['Brand'].str.lower() + " " + df['Perfume'].str.lower().str.replace("-", " ")
    
    # Fuzzy match both inputs
    match1 = process.extractOne(name1.lower(), df['SearchKey'].tolist())
    match2 = process.extractOne(name2.lower(), df['SearchKey'].tolist())
    
    if not match1 or not match2:
        return f"Could not match one or both perfumes."
    
    idx1 = df[df['SearchKey'] == match1[0]].index[0]
    idx2 = df[df['SearchKey'] == match2[0]].index[0]
    
    p1 = df.iloc[idx1]
    p2 = df.iloc[idx2]
    
    print(f"✅ Matched {name1} → {p1['Perfume']} by {p1['Brand']} (Score: {match1[1]:.1f}%)")
    print(f"✅ Matched {name2} → {p2['Perfume']} by {p2['Brand']} (Score: {match2[1]:.1f}%)\n")
    
    # Cross-layer similarity for notes
    note_score = cross_layer_similarity(p1['Top'], p1['Middle'], p1['Base'],
                                        p2['Top'], p2['Middle'], p2['Base'],
                                        same_weight, cross_weight, w_top, w_mid, w_base)
    
    # Accord similarity
    accords_1 = [p1['mainaccord1'], p1['mainaccord2'], p1['mainaccord3']]
    accords_2 = [p2['mainaccord1'], p2['mainaccord2'], p2['mainaccord3']]
    accords_sim = weighted_jaccard(accords_1, accords_2)
    
    # Final combined score
    final_score = (note_score + (w_accords * accords_sim)) / (1 + w_accords)
    final_score_percent = round(final_score * 100, 2)
    
    # Differences in notes (all layers combined)
    notes1 = set(p1['Top'] + p1['Middle'] + p1['Base'])
    notes2 = set(p2['Top'] + p2['Middle'] + p2['Base'])
    unique_to_1 = notes1 - notes2
    unique_to_2 = notes2 - notes1
    
    result = {
        "Perfume 1": f"{p1['Perfume']} by {p1['Brand']}",
        "Perfume 2": f"{p2['Perfume']} by {p2['Brand']}",
        "Overall Similarity (%)": final_score_percent,
        "Notes Similarity (Cross-Layer)": round(note_score * 100, 2),
        "Accords Similarity (%)": round(accords_sim * 100, 2),
        "Unique Notes in Perfume 1": list(unique_to_1),
        "Unique Notes in Perfume 2": list(unique_to_2),
        "URL 1": p1['url'],
        "URL 2": p2['url']
    }
    
    return result

In [None]:
results = compare_two_perfumes_v3("prada Candy Prada women", "scandal-le-parfum")
print(results)

{'Perfume 1': 'prada-candy by prada', 'Perfume 2': 'scandal-le-parfum by jean-paul-gaultier', 'Overall Similarity (%)': 17.14, 'Unique Notes in Perfume 1': ["'musk']", "['caramel']", "['powdery notes'", "['benzoin'", "'vanilla']"], 'Unique Notes in Perfume 2': ["'salt']", "['jasmine']", "['vanilla']", "['caramel'"], 'URL 1': 'https://www.fragrantica.com/perfume/prada/prada-candy-12426.html', 'URL 2': 'https://www.fragrantica.com/perfume/jean-paul-gaultier/scandal-le-parfum-74914.html'}


In [None]:
results = compare_two_perfumes_advanced("prada Candy Prada women", "scandal-le-parfum")
print(results)

✅ Matched prada Candy Prada women → prada-candy by prada (Score: 95.0%)
✅ Matched scandal-le-parfum → scandal-le-parfum by jean-paul-gaultier (Score: 79.4%)

{'Perfume 1': 'prada-candy by prada', 'Perfume 2': 'scandal-le-parfum by jean-paul-gaultier', 'Overall Similarity (%)': 7.5, 'Notes Similarity (Cross-Layer)': 0.0, 'Accords Similarity (%)': 20.0, 'Unique Notes in Perfume 1': ["'musk']", "['caramel']", "['powdery notes'", "['benzoin'", "'vanilla']"], 'Unique Notes in Perfume 2': ["'salt']", "['jasmine']", "['vanilla']", "['caramel'"], 'URL 1': 'https://www.fragrantica.com/perfume/prada/prada-candy-12426.html', 'URL 2': 'https://www.fragrantica.com/perfume/jean-paul-gaultier/scandal-le-parfum-74914.html'}
