In [15]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool

# Show Bokeh plots in notebook
output_notebook()

# Load data
df = pd.read_csv("C:/Users/nisaf/Documents/datasets/cosmetics.csv")

# Display sample rows (✅ Required by Task 1)
from IPython.display import display
display(df.sample(5))

# Explore dataset
print(df.columns)
print("All Labels:", df['Label'].unique())
print("Dry Skin Values:", df['Dry'].unique())



Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive
230,Moisturizer,DR. DENNIS GROSS SKINCARE,Hyaluronic Marine Oil-Free Moisture Cushion,60,4.2,"Water, Cyclopentasiloxane, Cetearyl Alcohol, D...",1,1,1,1,1
559,Cleanser,ESTÉE LAUDER,Perfectly Clean Multi-Action Toning Lotion/Ref...,26,4.5,Perfectly Clean Mlt-Act Tng Ltn/Rf Division: E...,0,0,0,0,0
326,Cleanser,YOUTH TO THE PEOPLE,Superfood Antioxidant Cleanser,36,4.4,"Water, Sodium Cocoyl Glutamate, Cocamidopropyl...",0,0,0,0,0
1115,Eye cream,KIEHL'S SINCE 1851,Midnight Recovery Eye,37,3.9,"Water, Helianthus Annuus Seed Oil, Sunflower S...",1,1,1,1,1
1357,Sun protect,FIRST AID BEAUTY,Ultra Repair® Pure Mineral Sunscreen Moisturiz...,30,3.6,"Water, Butyloctyl Salicylate, Cetearyl Alcohol...",1,1,1,1,1


Index(['Label', 'Brand', 'Name', 'Price', 'Rank', 'Ingredients', 'Combination',
       'Dry', 'Normal', 'Oily', 'Sensitive'],
      dtype='object')
All Labels: ['Moisturizer' 'Cleanser' 'Treatment' 'Face Mask' 'Eye cream'
 'Sun protect']
Dry Skin Values: [1 0]


In [16]:
moisturizers = df[df['Label'].str.lower() == 'moisturizer']
moisturizers_dry = moisturizers[moisturizers['Dry'] == 1].reset_index(drop=True)

print("Filtered shape:", moisturizers_dry.shape)
moisturizers_dry.head()


Filtered shape: (190, 11)


Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,1
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1


In [17]:
ingredient_idx = {}
corpus = []
idx = 0

for i in range(len(moisturizers_dry)):
    ingredients = moisturizers_dry['Ingredients'][i]
    if pd.isnull(ingredients):
        tokens = []
    else:
        ingredients_lower = ingredients.lower()
        tokens = ingredients_lower.split(', ')
    corpus.append(tokens)
    for ingredient in tokens:
        if ingredient not in ingredient_idx:
            ingredient_idx[ingredient] = idx
            idx += 1


In [18]:
M = len(corpus)
N = len(ingredient_idx)
A = np.zeros((M, N))

def oh_encoder(tokens):
    x = np.zeros(N)
    for ingredient in tokens:
        if ingredient in ingredient_idx:
            x[ingredient_idx[ingredient]] = 1
    return x

for i, tokens in enumerate(corpus):
    A[i, :] = oh_encoder(tokens)


In [19]:
if M < 2:
    print("Not enough samples to run t-SNE.")
else:
    model = TSNE(n_components=2, perplexity=min(30, M - 1), random_state=42)
    tsne_features = model.fit_transform(A)
    moisturizers_dry['X'] = tsne_features[:, 0]
    moisturizers_dry['Y'] = tsne_features[:, 1]


In [24]:
model = TSNE(n_components=2, learning_rate=200, random_state=42)
tsne_features = model.fit_transform(A)
moisturizers_dry['X'] = tsne_features[:, 0]
moisturizers_dry['Y'] = tsne_features[:, 1]
display(moisturizers_dry.head())


Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive,X,Y
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1,-411.486328,185.858353
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1,-132.647842,-56.937164
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0,-314.180389,-219.254761
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,1,-304.903503,314.024048
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1,246.277191,-351.825836


In [None]:
source = ColumnDataSource(moisturizers_dry)

plot = figure(title="Dry Skin Moisturizers by Ingredient Similarity",
              x_axis_label='t-SNE X',
              y_axis_label='t-SNE Y',
              width=800, height=600)

plot.scatter(x='X', y='Y', source=source, marker='circle', size=10, color='#FF7373', alpha=0.7)

hover = HoverTool(tooltips=[
    ("Name", "@Name"),
    ("Brand", "@Brand"),
    ("Price", "@Price"),
    ("Rank", "@Rank")
])
plot.add_tools(hover)

show(plot)


NameError: name 'ColumnDataSource' is not defined

In [22]:
cosmetic_1 = moisturizers_dry[moisturizers_dry['Name'] == "Color Control Cushion Compact Broad Spectrum SPF 50+"]
cosmetic_2 = moisturizers_dry[moisturizers_dry['Name'] == "BB Cushion Hydra Radiance SPF 50"]

display(cosmetic_1)
print(cosmetic_1.Ingredients.values)
display(cosmetic_2)
print(cosmetic_2.Ingredients.values)


Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive,X,Y
45,Moisturizer,AMOREPACIFIC,Color Control Cushion Compact Broad Spectrum S...,60,4.0,"Phyllostachis Bambusoides Juice, Cyclopentasil...",1,1,1,1,1,-9.419198,-335.968231


['Phyllostachis Bambusoides Juice, Cyclopentasiloxane, Cyclohexasiloxane, Peg-10 Dimethicone, Phenyl Trimethicone, Butylene Glycol, Butylene Glycol Dicaprylate/Dicaprate, Alcohol, Arbutin, Lauryl Peg-9 Polydimethylsiloxyethyl Dimethicone, Acrylates/Ethylhexyl Acrylate/Dimethicone Methacrylate Copolymer, Polyhydroxystearic Acid, Sodium Chloride, Polymethyl Methacrylate, Aluminium Hydroxide, Stearic Acid, Disteardimonium Hectorite, Triethoxycaprylylsilane, Ethylhexyl Palmitate, Lecithin, Isostearic Acid, Isopropyl Palmitate, Phenoxyethanol, Polyglyceryl-3 Polyricinoleate, Acrylates/Stearyl Acrylate/Dimethicone Methacrylate Copolymer, Dimethicone, Disodium Edta, Trimethylsiloxysilicate, Ethylhexyglycerin, Dimethicone/Vinyl Dimethicone Crosspolymer, Water, Silica, Camellia Japonica Seed Oil, Camillia Sinensis Leaf Extract, Caprylyl Glycol, 1,2-Hexanediol, Fragrance, Titanium Dioxide, Iron Oxides (Ci 77492, Ci 77491, Ci77499).']


Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive,X,Y
55,Moisturizer,LANEIGE,BB Cushion Hydra Radiance SPF 50,38,4.3,"Water, Cyclopentasiloxane, Zinc Oxide (CI 7794...",1,1,1,1,1,-31.170496,-359.229309


['Water, Cyclopentasiloxane, Zinc Oxide (CI 77947), Ethylhexyl Methoxycinnamate, PEG-10 Dimethicone, Cyclohexasiloxane, Phenyl Trimethicone, Iron Oxides (CI 77492), Butylene Glycol Dicaprylate/Dicaprate, Niacinamide, Lauryl PEG-9 Polydimethylsiloxyethyl Dimethicone, Acrylates/Ethylhexyl Acrylate/Dimethicone Methacrylate Copolymer, Titanium Dioxide (CI 77891 , Iron Oxides (CI 77491), Butylene Glycol, Sodium Chloride, Iron Oxides (CI 77499), Aluminum Hydroxide, HDI/Trimethylol Hexyllactone Crosspolymer, Stearic Acid, Methyl Methacrylate Crosspolymer, Triethoxycaprylylsilane, Phenoxyethanol, Fragrance, Disteardimonium Hectorite, Caprylyl Glycol, Yeast Extract, Acrylates/Stearyl Acrylate/Dimethicone Methacrylate Copolymer, Dimethicone, Trimethylsiloxysilicate, Polysorbate 80, Disodium EDTA, Hydrogenated Lecithin, Dimethicone/Vinyl Dimethicone Crosspolymer, Mica (CI 77019), Silica, 1,2-Hexanediol, Polypropylsilsesquioxane, Chenopodium Quinoa Seed Extract, Magnesium Sulfate, Calcium Chloride

In [23]:
from scipy.spatial.distance import cdist

def recommend_similar_products(index, top_n=5):
    product = tsne_features[index].reshape(1, -1)
    distances = cdist(product, tsne_features, 'euclidean').flatten()
    nearest = distances.argsort()[1:top_n+1]
    return moisturizers_dry.iloc[nearest][['Name', 'Brand', 'Price', 'Rank']]

# Example usage
recommend_similar_products(0)


Unnamed: 0,Name,Brand,Price,Rank
15,Crème de la Mer Mini,LA MER,85,4.1
122,CC Cream Daily Correct Broad Spectrum SPF 35+ ...,SUPERGOOP!,34,4.4
87,Peat Miracle Revital Cream,BELIF,58,4.7
111,The Moisturizing Soft Lotion,LA MER,270,3.6
138,Vine[activ] Overnight Detox Oil,CAUDALIE,50,4.5
