In [118]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
from rapidfuzz import fuzz, process

In [119]:
# Load the cleaned Fragrantica export; the file is semicolon-delimited.
# NOTE(review): the preview below shows Rating Value with decimal commas (e.g. "1,42"),
# so that column is presumably read as text — confirm whether decimal=',' is wanted.
df = pd.read_csv("../data/fragrantica_cleaned.csv", sep=';')

In [120]:
# Quick look at the first rows to check that the columns parsed as expected.
print(df.head())

                                                 url  \
0  https://www.fragrantica.com/perfume/xerjoff/ac...   
1  https://www.fragrantica.com/perfume/jean-paul-...   
2  https://www.fragrantica.com/perfume/jean-paul-...   
3  https://www.fragrantica.com/perfume/bruno-bana...   
4  https://www.fragrantica.com/perfume/jean-paul-...   

                          Perfume               Brand  Country  Gender  \
0  accento-overdose-pride-edition             xerjoff    Italy  unisex   
1            classique-pride-2024  jean-paul-gaultier   France   women   
2            classique-pride-2023  jean-paul-gaultier   France  unisex   
3               pride-edition-man        bruno-banani  Germany     men   
4         le-male-pride-collector  jean-paul-gaultier   France     men   

  Rating Value  Rating Count    Year  \
0         1,42           201  2022.0   
1         1,86            70  2024.0   
2         1,91           285  2023.0   
3         1,92            59  2019.0   
4         1,93    

In [121]:
def clean_and_split(text):
    """Turn a comma-separated note string into a list of clean note names.

    Each fragment is stripped of surrounding whitespace and lowercased.
    Fragments that are empty after stripping (from "", "a,,b" or a trailing
    comma) are dropped so that no blank note name is produced.

    Args:
        text: A comma-separated string, a missing value (None / NaN), or an
            already-split list/tuple of notes. Accepting a list makes it safe
            to apply the function again to a column that was converted before.

    Returns:
        list[str]: The cleaned notes in their original order; [] for missing
        or blank input.
    """
    if isinstance(text, (list, tuple)):
        # Already split: only re-clean the individual entries.
        fragments = text
    elif text is None or (not isinstance(text, str) and pd.isna(text)):
        return []
    else:
        fragments = str(text).split(',')

    cleaned = (str(fragment).strip().lower() for fragment in fragments)
    return [note for note in cleaned if note]

In [122]:
# Replace each note-layer column (comma-separated text) with a list of notes.
df['Top'], df['Middle'], df['Base'] = (
    df[layer].apply(clean_and_split) for layer in ('Top', 'Middle', 'Base')
)

In [123]:
# ==========================
# Synonym Dictionary
# ==========================
# Maps a specific note name (key) to the canonical note it is counted as (value).
note_synonyms = {
    # vanilla
    "bourbon vanilla": "vanilla",
    "madagascar vanilla": "vanilla",
    "vanilla absolute": "vanilla",
    # amber
    "ambergris": "amber",
    "amberwood": "amber",
    "amber resin": "amber",
    # rose
    "rose de mai": "rose",
    "damask rose": "rose",
    "turkish rose": "rose",
    "bulgarian rose": "rose",
    # resins, musks, gourmand
    "tonka bean": "tonka",
    "benzoin resin": "benzoin",
    "white musk": "musk",
    "musk ketone": "musk",
    "cacao": "chocolate",
    "cocoa": "chocolate",
    # woods
    "oud wood": "oud",
    "agarwood": "oud",
    "patchouli leaf": "patchouli",
    "cashmeran": "cashmere wood",
    "sandalwood oil": "sandalwood",
    "vetiver oil": "vetiver",
    # fruit and citrus
    "green apple": "apple",
    "bergamot peel": "bergamot",
    "lemon zest": "lemon",
    "mandarin orange": "mandarin",
    "tangerine": "mandarin",
    "orange blossom absolute": "orange blossom",
    # spices
    "pink peppercorn": "pink pepper",
    "pepper essence": "pepper",
    # Add more as needed
}


def normalize_notes(note_list):
    """Return the notes of ``note_list`` with synonyms replaced by canonical names.

    Every note is stripped and lowercased first; if the result is a key of
    ``note_synonyms`` its mapped value is used, otherwise the cleaned note is
    kept. Order and length of the input are preserved.
    """
    cleaned_notes = (raw_note.strip().lower() for raw_note in note_list)
    return [note_synonyms.get(name, name) for name in cleaned_notes]

# Canonicalise the notes of every layer, then join the three layers into one
# combined note list per perfume.
df['Top'], df['Middle'], df['Base'] = (
    df[layer].apply(normalize_notes) for layer in ('Top', 'Middle', 'Base')
)
df['All_Notes'] = df['Top'] + df['Middle'] + df['Base']


In [124]:
# NOTE(review): repeats the last assignment of the previous cell; it rebuilds the
# same column from the same three layer columns, so this cell looks redundant.
df['All_Notes'] = df['Top'] + df['Middle'] + df['Base']

In [125]:
# Collect the five main-accord columns into one lowercase list per perfume,
# leaving out missing values.
accord_cols = [f'mainaccord{rank}' for rank in range(1, 6)]
df['Accords'] = df[accord_cols].apply(
    lambda row: [str(accord).lower() for accord in row if pd.notna(accord)],
    axis=1,
)

In [126]:
# One binarizer per vocabulary: each turns the per-perfume lists into a 0/1
# indicator matrix with one column per distinct label.
mlb_notes = MultiLabelBinarizer()
mlb_accords = MultiLabelBinarizer()

notes_matrix = mlb_notes.fit_transform(df['All_Notes'])
accords_matrix = mlb_accords.fit_transform(df['Accords'])

In [127]:
def normalize_text(text):
    """Lowercase ``text``, turn hyphens into spaces and trim the ends."""
    lowered = text.lower()
    without_hyphens = lowered.replace("-", " ")
    return without_hyphens.strip()

In [128]:
def jaccard_similarity(set1, set2):
    """Jaccard index of two collections: |intersection| / |union|.

    Both arguments are converted to sets, so duplicates are ignored.
    Returns 0 when either collection is empty.
    """
    if set1 and set2:
        left, right = set(set1), set(set2)
        return len(left & right) / len(left | right)
    return 0

In [129]:
def flexible_note_similarity(p1_notes, p2_notes):
    """
    Jaccard overlap of two note collections.

    Both inputs are treated as plain sets; no layer information is passed in,
    so every shared note counts the same. Returns 0 when both are empty.
    """
    notes_a, notes_b = set(p1_notes), set(p2_notes)
    all_notes = notes_a | notes_b
    if not all_notes:
        return 0
    return len(notes_a & notes_b) / len(all_notes)


def cross_layer_similarity(top1, mid1, base1, top2, mid2, base2,
                           same_weight=1.0, cross_weight=0.5,
                           w_top=0.15, w_mid=0.25, w_base=0.6):
    """
    Score how well the notes of perfume 1 are covered by perfume 2.

    Each note of perfume 1 can earn ``layer weight * same_weight`` when
    perfume 2 has it in the same layer, or ``layer weight * cross_weight``
    when perfume 2 has it in any other layer. The earned total is divided by
    the total that would be earned if every note matched in its own layer.

    Only the notes of perfume 1 are iterated, so swapping the two perfumes
    can give a different value. Returns 0 when perfume 1 has no notes (or the
    attainable total is not positive).
    """
    paired_layers = (
        (set(top1), set(top2), w_top),
        (set(mid1), set(mid2), w_mid),
        (set(base1), set(base2), w_base),
    )
    # A note found in any layer of perfume 2 qualifies for cross-layer credit.
    notes_anywhere_in_2 = set(top2) | set(mid2) | set(base2)

    earned = 0
    attainable = 0
    for own_notes, matching_layer, layer_weight in paired_layers:
        full_credit = layer_weight * same_weight
        partial_credit = layer_weight * cross_weight
        for note in own_notes:
            attainable += full_credit  # best case
            if note in matching_layer:
                earned += full_credit
            elif note in notes_anywhere_in_2:
                earned += partial_credit

    if attainable > 0:
        return earned / attainable
    return 0

In [130]:
def fuzzy_match_perfume(query, choices, limit=5):
    """Return up to ``limit`` rapidfuzz matches for ``query`` among ``choices``.

    Candidates are scored with ``fuzz.token_set_ratio``; the value returned by
    ``process.extract`` is passed through unchanged.
    """
    ranked_matches = process.extract(
        query,
        choices,
        scorer=fuzz.token_set_ratio,
        limit=limit,
    )
    return ranked_matches

In [131]:
# Cosine similarity between the accord indicator vectors of every pair of perfumes;
# row i holds the similarity of perfume i to all perfumes.
# NOTE(review): this is an n_perfumes x n_perfumes array. Later output shows row
# labels above 22,000, which would presumably mean several GB — confirm it fits in memory.
accords_sim_matrix = cosine_similarity(accords_matrix)

In [132]:
def get_similar_fragrances(perfume_query, top_n=10, weight_notes=0.4, weight_accords=0.6):
    """Rank the perfumes most similar to the one best matching ``perfume_query``.

    The query is normalised and fuzzy-matched against "<brand> <perfume>"
    search keys. Every perfume is then scored as
    ``weight_notes`` * Jaccard overlap of the combined note lists plus
    ``weight_accords`` * cosine similarity of the accord vectors, and the
    scores are rescaled so that the highest one is 100.

    Args:
        perfume_query: Free-text name of the reference perfume.
        top_n: Number of rows to return.
        weight_notes: Weight of the note overlap.
        weight_accords: Weight of the accord similarity.

    Returns:
        A DataFrame with the ``top_n`` most similar perfumes (columns Perfume,
        Brand, Year, Gender, Similarity, Rating Value, url), or a message
        string when the fuzzy search yields no match.

    Side effects:
        Rewrites ``df['SearchKey']`` and prints the matched perfume.
    """
    # Normalize input
    perfume_query = normalize_text(perfume_query)

    # Combine Brand + Perfume for better search
    df['SearchKey'] = df['Brand'].str.lower() + " " + df['Perfume'].str.lower().str.replace("-", " ")

    # Fuzzy match
    matches = fuzzy_match_perfume(perfume_query, df['SearchKey'].tolist(), limit=5)

    if not matches:
        return f"No match found for '{perfume_query}'."

    # rapidfuzz reports the position of the winning choice inside the list it
    # searched. Using that position (instead of looking the key up again by
    # label) keeps ``iloc`` and the similarity-matrix row in agreement even
    # when search keys repeat or the frame's index is not 0..n-1.
    _, match_score, idx = matches[0]
    target = df.iloc[idx]

    print(f" Matched input to: {target['Perfume']} by {target['Brand']} (Score: {match_score}%)")

    # Jaccard similarity of the target's notes against every perfume
    target_notes = target['All_Notes']
    note_similarities = np.array(
        [jaccard_similarity(target_notes, notes) for notes in df['All_Notes']],
        dtype=float,
    )

    # Accord similarity from precomputed matrix
    accord_similarities = np.asarray(accords_sim_matrix[idx], dtype=float)

    # Combined score
    combined_score = weight_notes * note_similarities + weight_accords * accord_similarities

    # Normalize to 0–100; leave an all-zero score vector untouched rather than
    # dividing by zero.
    best_score = combined_score.max()
    if best_score > 0:
        combined_score = (combined_score / best_score) * 100

    # Exclude the reference perfume itself (and rows with the same name AND
    # brand). Matching on the name alone would also discard unrelated perfumes
    # from other brands that merely share a name.
    is_target = np.array(
        (df['Perfume'] == target['Perfume']) & (df['Brand'] == target['Brand']),
        dtype=bool,
    )
    is_target[idx] = True
    keep = ~is_target

    df_scores = df.loc[keep, ['Perfume', 'Brand', 'Year', 'Gender', 'Rating Value', 'url']].copy()
    df_scores['Similarity'] = combined_score[keep]

    # Sort & select
    df_scores = df_scores.sort_values(by='Similarity', ascending=False)

    return df_scores[['Perfume','Brand','Year','Gender','Similarity','Rating Value','url']].head(top_n)


In [133]:
def weighted_jaccard(list1, list2):
    """Jaccard similarity of two lists, compared as sets.

    No weighting is applied here. Returns 0 if either set is empty.
    """
    first, second = set(list1), set(list2)
    if not first or not second:
        return 0
    shared = first & second
    combined = first | second
    return len(shared) / len(combined)

def get_similar_fragrances_advanced(perfume_query, top_n=10,
                                    w_top=0.2, w_middle=0.3, w_base=0.5,
                                    w_accords=0.5, brand_penalty=0.95,
                                    dupe_brands_boost=1.05,
                                    dupe_brands=["lattafa", "armaf", "afnan"]):
    """Rank similar perfumes using per-layer note overlap, accords and brand rules.

    The query is normalised and fuzzy-matched against "<brand> <perfume>"
    search keys. For every perfume the score is
    ``w_top``/``w_middle``/``w_base`` times the Jaccard overlap of the
    corresponding note layer, plus ``w_accords`` times the cosine similarity
    of the accord vectors. Perfumes of the reference brand are multiplied by
    ``brand_penalty``, perfumes of a brand listed in ``dupe_brands`` by
    ``dupe_brands_boost``, and the result is rescaled so the highest score
    is 100.

    Args:
        perfume_query: Free-text name of the reference perfume.
        top_n: Number of rows to return.
        w_top, w_middle, w_base: Weights of the three note layers.
        w_accords: Weight of the accord similarity.
        brand_penalty: Multiplier for perfumes of the reference brand.
        dupe_brands_boost: Multiplier for perfumes of a ``dupe_brands`` brand.
        dupe_brands: Brand names to boost; compared case-insensitively. The
            default list is only read, never modified.

    Returns:
        A DataFrame with the ``top_n`` most similar perfumes (columns Perfume,
        Brand, Year, Gender, Similarity, Rating Value, url), or a message
        string when the fuzzy search yields no match.

    Side effects:
        Rewrites ``df['SearchKey']`` and prints the matched perfume.
    """
    perfume_query = normalize_text(perfume_query)
    df['SearchKey'] = df['Brand'].str.lower() + " " + df['Perfume'].str.lower().str.replace("-", " ")

    matches = fuzzy_match_perfume(perfume_query, df['SearchKey'].tolist(), limit=5)
    if not matches:
        return f"No match found for '{perfume_query}'."

    # rapidfuzz reports the position of the winning choice inside the list it
    # searched. Using that position (instead of looking the key up again by
    # label) keeps ``iloc`` and the similarity-matrix row in agreement even
    # when search keys repeat or the frame's index is not 0..n-1.
    _, match_score, idx = matches[0]
    target = df.iloc[idx]

    print(f"✅ Matched input to: {target['Perfume']} by {target['Brand']} (Score: {match_score}%)")

    # Per-layer note overlap of the target against every row
    top_sim = df['Top'].apply(lambda notes: weighted_jaccard(target['Top'], notes))
    mid_sim = df['Middle'].apply(lambda notes: weighted_jaccard(target['Middle'], notes))
    base_sim = df['Base'].apply(lambda notes: weighted_jaccard(target['Base'], notes))

    weighted_notes = (w_top * top_sim + w_middle * mid_sim + w_base * base_sim)

    # Accord similarity
    accord_sim = np.asarray(accords_sim_matrix[idx], dtype=float)

    # Combine; work on a private positional array so the mask updates below
    # cannot be affected by the frame's index labels.
    combined_score = weighted_notes.to_numpy(dtype=float, copy=True) + (w_accords * accord_sim)

    # Apply brand penalty
    same_brand_mask = (df['Brand'] == target['Brand']).to_numpy()
    combined_score[same_brand_mask] *= brand_penalty

    # Dupe brand boost; brands are lowercased on both sides of the comparison
    dupe_brand_keys = {str(brand).lower() for brand in (dupe_brands or ())}
    dupe_brand_mask = df['Brand'].str.lower().isin(dupe_brand_keys).to_numpy()
    combined_score[dupe_brand_mask] *= dupe_brands_boost

    # Normalize to 0-100; leave an all-zero score vector untouched rather than
    # dividing by zero.
    best_score = combined_score.max()
    if best_score > 0:
        combined_score = (combined_score / best_score) * 100

    # Exclude the reference perfume itself (and rows with the same name AND
    # brand). Matching on the name alone would also discard unrelated perfumes
    # from other brands that merely share a name.
    is_target = np.array(
        (df['Perfume'] == target['Perfume']) & (df['Brand'] == target['Brand']),
        dtype=bool,
    )
    is_target[idx] = True
    keep = ~is_target

    df_scores = df.loc[keep, ['Perfume', 'Brand', 'Year', 'Gender', 'Rating Value', 'url']].copy()
    df_scores['Similarity'] = combined_score[keep]
    df_scores = df_scores.sort_values(by='Similarity', ascending=False)

    return df_scores[['Perfume','Brand','Year','Gender','Similarity','Rating Value','url']].head(top_n)


In [134]:
# Example: the 15 closest perfumes to the best match for this query, using the
# layer-weighted scorer.
results = get_similar_fragrances_advanced("prada Candy Prada women", top_n=15)
print(results)


✅ Matched input to: prada-candy by prada (Score: 100.0%)
                                                Perfume  \
22971                   prada-candy-collector-s-edition   
19504                                   vanille-vanille   
15534                                       la-capitale   
7970                                            vanille   
21729  illusions-noires-le-premier-parfum-eau-de-minuit   
3943                                       vanille-coco   
19717                                     new-look-1947   
19088                             nike-ultra-pink-woman   
11219                                  bijou-romantique   
18260                                  sensual-decadent   
10380                                     secret-genius   
2014                                   pacha-ibiza-glam   
15442       emporio-armani-diamonds-black-carat-for-her   
2820                                           silenzio   
9805                                 soleil-de-provence   

In [135]:
from rapidfuzz import process

def compare_two_perfumes(name1, name2,
                         w_top=0.15, w_middle=0.25, w_base=0.6, w_accords=0.6):
    """Compare two perfumes layer by layer and report their similarity.

    Both names are fuzzy-matched against "<brand> <perfume>" search keys.
    The overall score is the weighted average of the Jaccard overlap of the
    top, middle and base notes and of the first three main accords.

    Args:
        name1, name2: Free-text names of the perfumes to compare.
        w_top, w_middle, w_base: Weights of the three note layers.
        w_accords: Weight of the accord overlap.

    Returns:
        A dict with the matched names, the overall and per-component
        similarities in percent, the notes unique to each perfume (sorted) and
        both URLs; or a message string when a name cannot be matched.

    Side effects:
        Rewrites ``df['SearchKey']`` and prints the two matches.
    """
    # Create search key for fuzzy matching
    df['SearchKey'] = df['Brand'].str.lower() + " " + df['Perfume'].str.lower().str.replace("-", " ")
    search_keys = df['SearchKey'].tolist()

    # Fuzzy match both inputs
    match1 = process.extractOne(name1.lower(), search_keys)
    match2 = process.extractOne(name2.lower(), search_keys)

    if not match1 or not match2:
        return "Could not match one or both perfumes."

    # extractOne returns (choice, score, position in the searched list); the
    # position is what ``iloc`` needs, independent of the frame's index labels.
    p1 = df.iloc[match1[2]]
    p2 = df.iloc[match2[2]]

    print(f"✅ Matched {name1} → {p1['Perfume']} by {p1['Brand']} (Score: {match1[1]:.1f}%)")
    print(f"✅ Matched {name2} → {p2['Perfume']} by {p2['Brand']} (Score: {match2[1]:.1f}%)\n")

    # Compute note similarities
    top_sim = weighted_jaccard(p1['Top'], p2['Top'])
    mid_sim = weighted_jaccard(p1['Middle'], p2['Middle'])
    base_sim = weighted_jaccard(p1['Base'], p2['Base'])

    weighted_notes_score = (w_top * top_sim + w_middle * mid_sim + w_base * base_sim)

    # Accord similarity on the first three main accords. Missing values are
    # skipped so they do not enter the sets, and names are lowercased the same
    # way the 'Accords' column is built.
    accord_fields = ('mainaccord1', 'mainaccord2', 'mainaccord3')
    accords_1 = [str(p1[field]).lower() for field in accord_fields if pd.notna(p1[field])]
    accords_2 = [str(p2[field]).lower() for field in accord_fields if pd.notna(p2[field])]
    accords_sim = weighted_jaccard(accords_1, accords_2)

    # Final combined score: divide by the sum of all weights actually used so
    # the result stays a weighted average when the note weights do not add up
    # to 1 (with the defaults this sum equals 1 + w_accords).
    total_weight = w_top + w_middle + w_base + w_accords
    if total_weight:
        final_score = (weighted_notes_score + (w_accords * accords_sim)) / total_weight
    else:
        final_score = 0
    final_score_percent = round(final_score * 100, 2)

    # Differences in notes; sorted so repeated runs list them in the same order
    notes1 = set(p1['All_Notes'])
    notes2 = set(p2['All_Notes'])
    unique_to_1 = sorted(notes1 - notes2)
    unique_to_2 = sorted(notes2 - notes1)

    result = {
        "Perfume 1": f"{p1['Perfume']} by {p1['Brand']}",
        "Perfume 2": f"{p2['Perfume']} by {p2['Brand']}",
        "Overall Similarity (%)": final_score_percent,
        "Top Notes Similarity (%)": round(top_sim * 100, 2),
        "Middle Notes Similarity (%)": round(mid_sim * 100, 2),
        "Base Notes Similarity (%)": round(base_sim * 100, 2),
        "Accords Similarity (%)": round(accords_sim * 100, 2),
        "Unique Notes in Perfume 1": unique_to_1,
        "Unique Notes in Perfume 2": unique_to_2,
        "URL 1": p1['url'],
        "URL 2": p2['url']
    }

    return result

In [137]:
def compare_two_perfumes_advanced(name1, name2,
                                  same_weight=1.0, cross_weight=0.5,
                                  w_top=0.15, w_mid=0.25, w_base=0.6, w_accords=0.6):
    """Compare two perfumes, giving partial credit for notes shared across layers.

    Both names are fuzzy-matched against "<brand> <perfume>" search keys.
    The note score comes from ``cross_layer_similarity`` (called with perfume 1
    first); the accord score is the Jaccard overlap of the first three main
    accords. The overall score is
    ``(note_score + w_accords * accords_sim) / (1 + w_accords)``.

    Args:
        name1, name2: Free-text names of the perfumes to compare.
        same_weight: Credit factor for a note found in the same layer.
        cross_weight: Credit factor for a note found in a different layer.
        w_top, w_mid, w_base: Weights of the three note layers.
        w_accords: Weight of the accord overlap relative to the note score.

    Returns:
        A dict with the matched names, the overall, note and accord
        similarities in percent, the notes unique to each perfume (sorted) and
        both URLs; or a message string when a name cannot be matched.

    Side effects:
        Rewrites ``df['SearchKey']`` and prints the two matches.
    """
    df['SearchKey'] = df['Brand'].str.lower() + " " + df['Perfume'].str.lower().str.replace("-", " ")
    search_keys = df['SearchKey'].tolist()

    # Fuzzy match both inputs
    match1 = process.extractOne(name1.lower(), search_keys)
    match2 = process.extractOne(name2.lower(), search_keys)

    if not match1 or not match2:
        return "Could not match one or both perfumes."

    # extractOne returns (choice, score, position in the searched list); the
    # position is what ``iloc`` needs, independent of the frame's index labels.
    p1 = df.iloc[match1[2]]
    p2 = df.iloc[match2[2]]

    print(f"✅ Matched {name1} → {p1['Perfume']} by {p1['Brand']} (Score: {match1[1]:.1f}%)")
    print(f"✅ Matched {name2} → {p2['Perfume']} by {p2['Brand']} (Score: {match2[1]:.1f}%)\n")

    # Cross-layer similarity for notes
    note_score = cross_layer_similarity(p1['Top'], p1['Middle'], p1['Base'],
                                        p2['Top'], p2['Middle'], p2['Base'],
                                        same_weight, cross_weight, w_top, w_mid, w_base)

    # Accord similarity on the first three main accords. Missing values are
    # skipped so they do not enter the sets, and names are lowercased the same
    # way the 'Accords' column is built.
    accord_fields = ('mainaccord1', 'mainaccord2', 'mainaccord3')
    accords_1 = [str(p1[field]).lower() for field in accord_fields if pd.notna(p1[field])]
    accords_2 = [str(p2[field]).lower() for field in accord_fields if pd.notna(p2[field])]
    accords_sim = weighted_jaccard(accords_1, accords_2)

    # Final combined score
    final_score = (note_score + (w_accords * accords_sim)) / (1 + w_accords)
    final_score_percent = round(final_score * 100, 2)

    # Differences in notes (all layers combined); sorted so repeated runs list
    # them in the same order
    notes1 = set(p1['Top'] + p1['Middle'] + p1['Base'])
    notes2 = set(p2['Top'] + p2['Middle'] + p2['Base'])
    unique_to_1 = sorted(notes1 - notes2)
    unique_to_2 = sorted(notes2 - notes1)

    result = {
        "Perfume 1": f"{p1['Perfume']} by {p1['Brand']}",
        "Perfume 2": f"{p2['Perfume']} by {p2['Brand']}",
        "Overall Similarity (%)": final_score_percent,
        "Notes Similarity (Cross-Layer)": round(note_score * 100, 2),
        "Accords Similarity (%)": round(accords_sim * 100, 2),
        "Unique Notes in Perfume 1": unique_to_1,
        "Unique Notes in Perfume 2": unique_to_2,
        "URL 1": p1['url'],
        "URL 2": p2['url']
    }

    return result

In [136]:
# Example: layer-by-layer comparison of two perfumes.
# NOTE(review): this cell's execution count (136) is lower than that of the
# definition cell placed above it (137); re-run top to bottom to confirm the output.
results = compare_two_perfumes("prada Candy Prada women", "scandal-le-parfum")
print(results)

✅ Matched prada Candy Prada women → prada-candy by prada (Score: 95.0%)
✅ Matched scandal-le-parfum → scandal-le-parfum by jean-paul-gaultier (Score: 79.4%)

{'Perfume 1': 'prada-candy by prada', 'Perfume 2': 'scandal-le-parfum by jean-paul-gaultier', 'Overall Similarity (%)': 26.25, 'Top Notes Similarity (%)': 0.0, 'Middle Notes Similarity (%)': 0.0, 'Base Notes Similarity (%)': 50.0, 'Accords Similarity (%)': 20.0, 'Unique Notes in Perfume 1': ['powdery notes', 'musk', 'benzoin'], 'Unique Notes in Perfume 2': ['jasmine', 'salt'], 'URL 1': 'https://www.fragrantica.com/perfume/prada/prada-candy-12426.html', 'URL 2': 'https://www.fragrantica.com/perfume/jean-paul-gaultier/scandal-le-parfum-74914.html'}


In [138]:
# Example: the same pair scored with cross-layer partial credit for notes.
results = compare_two_perfumes_advanced("prada Candy Prada women", "scandal-le-parfum")
print(results)

✅ Matched prada Candy Prada women → prada-candy by prada (Score: 95.0%)
✅ Matched scandal-le-parfum → scandal-le-parfum by jean-paul-gaultier (Score: 79.4%)

{'Perfume 1': 'prada-candy by prada', 'Perfume 2': 'scandal-le-parfum by jean-paul-gaultier', 'Overall Similarity (%)': 30.3, 'Notes Similarity (Cross-Layer)': 36.49, 'Accords Similarity (%)': 20.0, 'Unique Notes in Perfume 1': ['powdery notes', 'musk', 'benzoin'], 'Unique Notes in Perfume 2': ['jasmine', 'salt'], 'URL 1': 'https://www.fragrantica.com/perfume/prada/prada-candy-12426.html', 'URL 2': 'https://www.fragrantica.com/perfume/jean-paul-gaultier/scandal-le-parfum-74914.html'}
