In [50]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
from rapidfuzz import fuzz, process

In [51]:
df = pd.read_csv("../data/fragrantica_cleaned.csv", sep=';')

In [52]:
print(df.head())

                                                 url  \
0  https://www.fragrantica.com/perfume/xerjoff/ac...   
1  https://www.fragrantica.com/perfume/jean-paul-...   
2  https://www.fragrantica.com/perfume/jean-paul-...   
3  https://www.fragrantica.com/perfume/bruno-bana...   
4  https://www.fragrantica.com/perfume/jean-paul-...   

                          Perfume               Brand  Country  Gender  \
0  accento-overdose-pride-edition             xerjoff    Italy  unisex   
1            classique-pride-2024  jean-paul-gaultier   France   women   
2            classique-pride-2023  jean-paul-gaultier   France  unisex   
3               pride-edition-man        bruno-banani  Germany     men   
4         le-male-pride-collector  jean-paul-gaultier   France     men   

  Rating Value  Rating Count    Year  \
0         1,42           201  2022.0   
1         1,86            70  2024.0   
2         1,91           285  2023.0   
3         1,92            59  2019.0   
4         1,93    

In [53]:
def clean_and_split(text):
    if pd.isna(text):
        return []
    return [x.strip().lower() for x in text.split(',')]

In [54]:
df['Top'] = df['Top'].apply(clean_and_split)
df['Middle'] = df['Middle'].apply(clean_and_split)
df['Base'] = df['Base'].apply(clean_and_split)

In [55]:
# ==========================
# Synonym Dictionary
# ==========================
note_synonyms = {
    "bourbon vanilla": "vanilla",
    "madagascar vanilla": "vanilla",
    "ambergris": "amber",
    "amberwood": "amber",
    "rose de mai": "rose",
    "bulgarian rose": "rose",
    "damask rose": "rose",
    "tonka bean": "tonka",
    "benzoin resin": "benzoin",
    "white musk": "musk",
    "musk ketone": "musk",
    "cacao": "chocolate",
    "cocoa": "chocolate",
    "oud wood": "oud",
    "agarwood": "oud",
    "patchouli leaf": "patchouli"
    # Add more as needed
}

def normalize_notes(note_list):
    normalized = []
    for note in note_list:
        note_clean = note.strip().lower()
        if note_clean in note_synonyms:
            normalized.append(note_synonyms[note_clean])
        else:
            normalized.append(note_clean)
    return normalized

# Apply normalization
df['Top'] = df['Top'].apply(normalize_notes)
df['Middle'] = df['Middle'].apply(normalize_notes)
df['Base'] = df['Base'].apply(normalize_notes)
df['All_Notes'] = df['Top'] + df['Middle'] + df['Base']


In [56]:
df['All_Notes'] = df['Top'] + df['Middle'] + df['Base']

In [57]:
accord_cols = ['mainaccord1','mainaccord2','mainaccord3','mainaccord4','mainaccord5']
df['Accords'] = df[accord_cols].apply(lambda x: [str(i).lower() for i in x if pd.notna(i)], axis=1)

In [58]:
mlb_notes = MultiLabelBinarizer()
notes_matrix = mlb_notes.fit_transform(df['All_Notes'])

mlb_accords = MultiLabelBinarizer()
accords_matrix = mlb_accords.fit_transform(df['Accords'])

In [59]:
def normalize_text(text):
    return text.lower().replace("-", " ").strip()

In [60]:
def jaccard_similarity(set1, set2):
    if not set1 or not set2:
        return 0
    intersection = len(set(set1).intersection(set(set2)))
    union = len(set(set1).union(set(set2)))
    return intersection / union

In [61]:
def fuzzy_match_perfume(query, choices, limit=5):
    return process.extract(query, choices, scorer=fuzz.token_set_ratio, limit=limit)

In [62]:
accords_sim_matrix = cosine_similarity(accords_matrix)

In [71]:
def get_similar_fragrances(perfume_query, top_n=10, weight_notes=0.6, weight_accords=0.4):
    # Normalize input
    perfume_query = normalize_text(perfume_query)
    
    # Combine Brand + Perfume for better search
    df['SearchKey'] = df['Brand'].str.lower() + " " + df['Perfume'].str.lower().str.replace("-", " ")
    
    # Fuzzy match
    matches = fuzzy_match_perfume(perfume_query, df['SearchKey'].tolist(), limit=5)
    
    if not matches:
        return f"No match found for '{perfume_query}'."
    
    # Pick best match
    best_match_name = matches[0][0]
    idx = df[df['SearchKey'] == best_match_name].index[0]
    
    print(f" Matched input to: {df.iloc[idx]['Perfume']} by {df.iloc[idx]['Brand']} (Score: {matches[0][1]}%)")
    
    # Get target perfume data
    target_notes = df.iloc[idx]['All_Notes']
    
    # Compute Jaccard similarity for notes
    note_similarities = [jaccard_similarity(target_notes, notes) for notes in df['All_Notes']]
    
    # Accord similarity from precomputed matrix
    accord_similarities = accords_sim_matrix[idx]
    
    # Combined score
    combined_score = weight_notes * np.array(note_similarities) + weight_accords * np.array(accord_similarities)
    
    # Normalize to 0–100
    combined_score = (combined_score / combined_score.max()) * 100
    
    df_scores = df.copy()
    df_scores['Similarity'] = combined_score
    
    # Sort & filter
    df_scores = df_scores.sort_values(by='Similarity', ascending=False)
    df_scores = df_scores[df_scores['Perfume'] != df.iloc[idx]['Perfume']]
    
    return df_scores[['Perfume','Brand','Year','Gender','Similarity','Rating Value','url']].head(top_n)


In [84]:
results = get_similar_fragrances("Khadlaj", top_n=5)
print(results)


 Matched input to: zulfa by khalis (Score: 52.63157894736842%)
                          Perfume             Brand    Year  Gender  \
3984        sakura-cherry-blossom  jo-malone-london  2011.0   women   
19337                        mira            korres  2015.0   women   
6000      eau-de-fleur-de-camelia             kenzo  2011.0   women   
10471  chloe-eau-de-toilette-2015             chloe  2015.0   women   
23777               desert-falcon       arabian-oud     NaN  unisex   

       Similarity Rating Value  \
3984    52.000000         3,71   
19337   52.000000         4,19   
6000    52.000000         3,79   
10471   50.909091         3,93   
23777   49.142857         4,54   

                                                     url  
3984   https://www.fragrantica.com/perfume/jo-malone-...  
19337  https://www.fragrantica.com/perfume/korres/mir...  
6000   https://www.fragrantica.com/perfume/kenzo/eau-...  
10471  https://www.fragrantica.com/perfume/chloe/chlo...  
23777  htt

In [65]:
def compare_two_perfumes(name1, name2):
    idx1 = df[df['Perfume'].str.contains(name1, case=False, na=False)].index[0]
    idx2 = df[df['Perfume'].str.contains(name2, case=False, na=False)].index[0]
    
    notes_sim = jaccard_similarity(df.iloc[idx1]['All_Notes'], df.iloc[idx2]['All_Notes'])
    accords_sim = cosine_similarity(accords_matrix[idx1].reshape(1, -1), accords_matrix[idx2].reshape(1, -1))[0][0]
    combined = 0.7 * notes_sim + 0.3 * accords_sim
    
    return {
        "Perfume 1": df.iloc[idx1]['Perfume'],
        "Perfume 2": df.iloc[idx2]['Perfume'],
        "Notes Similarity": round(notes_sim, 3),
        "Accords Similarity": round(accords_sim, 3),
        "Combined Score": round(combined, 3)
    }

In [67]:
results = compare_two_perfumes("althair", "qahwa")
print(results)

{'Perfume 1': 'althair', 'Perfume 2': 'khamrah-qahwa', 'Notes Similarity': 0.312, 'Accords Similarity': 0.8, 'Combined Score': 0.459}
