# Import Libraries


In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
import numpy as np
import pickle

# Load Dataset

In [2]:
# Load the combined perfume dataset into a DataFrame and display it.
# NOTE(review): the path is an absolute Colab location; the CSV must exist there for this cell to run.
df = pd.read_csv('/content/dataset_parfum_gabungan.csv')
df

Unnamed: 0,No,ID_Perfume,perfume,brand,top notes,mid notes,base notes,is_lokal
0,1,HRMN-0001,glitch,mykonos,"sicilian bergamot, apple, grapefruit, charred ...","Lavender, artemesia, lily of the valley, pink ...","white musk, cedarwood, moss, amber, patchouli,...",True
1,2,HRMN-0002,invade,mykonos,"pink pepper, lavender absolute, juniper.","cashmeran, cinnamon bark, caramel.","amber, madagascar vanilla, drywoods.",True
2,3,HRMN-0003,Cafe Drops,mykonos,"Orchid, Jasmine, coffee","Vanilla, Caramel, Tonka Bean, Milk","Amber, Musk, Wood, Patchouli",True
3,4,HRMN-0004,Pink Drops,mykonos,"Strawberry Preserve, Almond, Caramel","Milk, Heliotrope","Vanilla, White musk",True
4,5,HRMN-0005,Pandan Sticky Rice,mykonos,"Rice, Almond, Lactonic","Jasmine, Ylang ylang, Iily of the valley, Pandan","Ambery, vanilla, sandalwood",True
...,...,...,...,...,...,...,...,...
25122,25123,FRGN-25123,floratta,o-boticario,"ozonic notes, peach","lily-of-the-valley, jasmine","musk, amber",False
25123,25124,FRGN-25124,cheval-d-arabie,sultan-pasha-attars,"taif rose, indian oud, white rose, olibanum, e...","bulgarian rose, indian oud, hay, leather, cive...","white amber, leather, hay, elemi resin, kyara ...",False
25124,25125,FRGN-25125,khaox,darkbeat-parfums,"mint, lime, rum","lily-of-the-valley, fig leaf, cardamom","musk, oakmoss, amber, patchouli",False
25125,25126,FRGN-25126,aoud-no-1,parfumerie-bruckner,"apple, peach, saffron","plum, orange blossom, jasmine","agarwood (oud), sandalwood, vanilla",False


# Cleaning & Preprocessing

In [3]:
# Drop the 'No' column
df = df.drop(columns='No')

In [4]:
# Combine the three note columns into a single comma-separated 'Notes' column;
# missing values are treated as empty strings.
note_columns = ['top notes', 'mid notes', 'base notes']
top, mid, base = (df[col].fillna('') for col in note_columns)
df['Notes'] = top + ', ' + mid + ', ' + base
# The individual note columns are no longer needed once merged.
df = df.drop(columns=note_columns)
df.head()

Unnamed: 0,ID_Perfume,perfume,brand,is_lokal,Notes
0,HRMN-0001,glitch,mykonos,True,"sicilian bergamot, apple, grapefruit, charred ..."
1,HRMN-0002,invade,mykonos,True,"pink pepper, lavender absolute, juniper., cash..."
2,HRMN-0003,Cafe Drops,mykonos,True,"Orchid, Jasmine, coffee, Vanilla, Caramel, Ton..."
3,HRMN-0004,Pink Drops,mykonos,True,"Strawberry Preserve, Almond, Caramel, Milk, He..."
4,HRMN-0005,Pandan Sticky Rice,mykonos,True,"Rice, Almond, Lactonic, Jasmine, Ylang ylang, ..."


In [5]:
# Print column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25127 entries, 0 to 25126
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID_Perfume  25127 non-null  object
 1   perfume     25127 non-null  object
 2   brand       25127 non-null  object
 3   is_lokal    25127 non-null  bool  
 4   Notes       25127 non-null  object
dtypes: bool(1), object(4)
memory usage: 809.9+ KB


In [6]:
# Count missing values per column
df.isna().sum()

Unnamed: 0,0
ID_Perfume,0
perfume,0
brand,0
is_lokal,0
Notes,0


In [7]:
# Count fully duplicated rows
df.duplicated().sum()

np.int64(0)

In [8]:
# Count how many distinct brands there are
df['brand'].nunique()

1194

In [9]:
# Lower-case the combined notes so the displayed/saved text is case-consistent.
df['Notes'] = df['Notes'].str.lower()

In [10]:
# Display the frame after lower-casing 'Notes'.
df

Unnamed: 0,ID_Perfume,perfume,brand,is_lokal,Notes
0,HRMN-0001,glitch,mykonos,True,"sicilian bergamot, apple, grapefruit, charred ..."
1,HRMN-0002,invade,mykonos,True,"pink pepper, lavender absolute, juniper., cash..."
2,HRMN-0003,Cafe Drops,mykonos,True,"orchid, jasmine, coffee, vanilla, caramel, ton..."
3,HRMN-0004,Pink Drops,mykonos,True,"strawberry preserve, almond, caramel, milk, he..."
4,HRMN-0005,Pandan Sticky Rice,mykonos,True,"rice, almond, lactonic, jasmine, ylang ylang, ..."
...,...,...,...,...,...
25122,FRGN-25123,floratta,o-boticario,False,"ozonic notes, peach, lily-of-the-valley, jasmi..."
25123,FRGN-25124,cheval-d-arabie,sultan-pasha-attars,False,"taif rose, indian oud, white rose, olibanum, e..."
25124,FRGN-25125,khaox,darkbeat-parfums,False,"mint, lime, rum, lily-of-the-valley, fig leaf,..."
25125,FRGN-25126,aoud-no-1,parfumerie-bruckner,False,"apple, peach, saffron, plum, orange blossom, j..."


# Model

In [11]:
# TF-IDF Vectorization: fit the vectorizer on the combined notes and build
# the matrix with one row per perfume.
# NOTE(review): with default settings the vectorizer presumably splits
# multi-word notes (e.g. "white musk") into separate word tokens — confirm
# this is the intended granularity.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['Notes'])

In [12]:
# Convert the TF-IDF matrix to a dense TensorFlow tensor.
# Cast to float32 *before* densifying so the full dense matrix is never
# materialized in float64 (halves the peak memory of this step; the resulting
# float32 values are the same).
tfidf_tensor = tf.convert_to_tensor(tfidf_matrix.astype(np.float32).toarray(), dtype=tf.float32)

In [13]:
# Compute cosine similarity with TensorFlow
def cosine_similarity_tf(a, b):
    """Return the cosine similarity between the rows of ``a`` and the rows of ``b``.

    Both inputs are L2-normalized along axis 1 and then multiplied as
    ``a_unit @ b_unit^T``, so entry ``[i, j]`` of the result is the similarity
    between row ``i`` of ``a`` and row ``j`` of ``b``.
    Assumes both inputs are 2-D with the same number of columns — TODO confirm
    against callers.
    """
    unit_a, unit_b = (tf.nn.l2_normalize(matrix, axis=1) for matrix in (a, b))
    return tf.matmul(unit_a, unit_b, transpose_b=True)


In [14]:
# Lookup table: lower-cased perfume name -> row label in df.
# Series.drop_duplicates() would deduplicate the *values* (the row labels,
# which are already unique), leaving repeated names in the index; a lookup on
# such a name returns a Series instead of a single row label. Deduplicate on
# the index instead, keeping the first row for each name.
indices = pd.Series(df.index, index=df['perfume'].str.lower())
indices = indices[~indices.index.duplicated(keep='first')]


In [15]:
def rekomendasi_lokal_tf(nama_parfum, top_n=10):
    """Recommend the local perfumes most similar to a given perfume.

    Similarity is the cosine similarity between TF-IDF note vectors, computed
    by ``cosine_similarity_tf`` against every row of ``tfidf_tensor``.

    Parameters
    ----------
    nama_parfum : str
        Perfume name to look up; matching is case-insensitive.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        A frame with columns ``ID_Perfume``, ``perfume``, ``brand``, ``Notes``
        and ``similarity`` (rounded to 3 decimals), ordered from most to least
        similar and restricted to rows where ``is_lokal`` is true. If the name
        is not present in ``indices``, a "not found" message string is
        returned instead.
    """
    nama_parfum = nama_parfum.lower()
    if nama_parfum not in indices:
        return f"Parfum '{nama_parfum}' tidak ditemukan."

    idx = indices[nama_parfum]
    # If several perfumes share this name the lookup yields a Series rather
    # than a scalar; fall back to the first match so the slice below works.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Vector of the query perfume, kept 2-D (one row) for the matmul.
    input_vec = tfidf_tensor[idx:idx + 1]

    # Similarity of the query against every perfume.
    sim_scores = cosine_similarity_tf(input_vec, tfidf_tensor).numpy().flatten()

    # Candidates are local perfumes other than the query itself. Excluding the
    # query through the mask (instead of a -1 sentinel score) guarantees it can
    # never show up in the output, even when top_n exceeds the number of
    # local candidates.
    kandidat = df['is_lokal'].to_numpy(dtype=bool, copy=True)
    kandidat[idx] = False

    # Positions ordered from highest to lowest score, filtered with one
    # vectorized mask lookup rather than a per-row DataFrame access.
    sorted_indices = np.argsort(sim_scores)[::-1]
    top_lokal = sorted_indices[kandidat[sorted_indices]][:top_n]

    # Assemble the output frame.
    hasil = df.iloc[top_lokal][['ID_Perfume', 'perfume', 'brand', 'Notes']].copy()
    hasil['similarity'] = np.round(sim_scores[top_lokal], 3)

    return hasil.reset_index(drop=True)


In [16]:
# Example: the five local perfumes most similar to "dior-homme-2005".
rekomendasi_lokal_tf("dior-homme-2005", top_n=5)


Unnamed: 0,ID_Perfume,perfume,brand,Notes,similarity
0,HRMN-1047,St. Tropez,Saint Hars,"bergamot, casablanca lily, tahitian vanilla, a...",0.353
1,HRMN-0813,24 Hours,Mandalika,"cardamom, lemon, pear, iris, cinnamon, lavende...",0.334
2,HRMN-0280,Explorer,Bonjour,"bergamot, pink pepper, clary sage, haitian vet...",0.332
3,HRMN-0084,Petruk,Project 1945,"lemon, apple, lavender, pepper, leather, carda...",0.329
4,HRMN-0353,Mindfulness,MOEN,"verbena, iris, lavender, mint, cardamom, viole...",0.325


# Save Model

In [18]:
# Persist the fitted TF-IDF vectorizer with pickle.
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)

# Save the dense TF-IDF matrix as a NumPy array and the perfume table (without its index) as CSV.
np.save("tfidf_tensor.npy", tfidf_tensor.numpy())
df.to_csv("parfum_metadata.csv", index=False)
