# Project Exploration

In this notebook, we load the cleaned Sephora product data, apply basic preprocessing steps, and save the result to a CSV file. We then build a product graph from that file and visualize it interactively.

## Load and Preprocess the data

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [26]:
# Read the cleaned product table and preview its first rows
cleaned_csv_path = '../data/product_info_cleaned.csv'
product_df = pd.read_csv(cleaned_csv_path)
product_df.head()

Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,rating,reviews,size,variation_type,variation_value,...,contains_Citric Acid,contains_Mica,contains_Potassium Sorbate,contains_Water,contains_Caprylic/Capric Triglyceride,contains_Sodium Benzoate,contains_Citronellol,contains_Tocopheryl Acetate,contains_Propanediol,contains_Geraniol
0,P473671,Fragrance Discovery Set,6342,19-69,6320,3.6364,11.0,Unknown size,Unknown,Unknown,...,False,False,False,False,False,False,True,False,False,True
1,P473668,La Habana Eau de Parfum,6342,19-69,3827,4.1538,13.0,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL,...,False,False,False,False,False,False,False,False,False,False
2,P473662,Rainbow Bar Eau de Parfum,6342,19-69,3253,4.25,16.0,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL,...,False,False,False,False,False,False,False,False,False,False
3,P473660,Kasbah Eau de Parfum,6342,19-69,3018,4.4762,21.0,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL,...,False,False,False,False,False,False,True,False,False,True
4,P473658,Purple Haze Eau de Parfum,6342,19-69,2691,3.2308,13.0,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL,...,False,False,False,False,False,False,False,False,False,True


In [27]:
# Drop the columns that are not needed
# A column is discarded when its name contains any of these fragments
# (for example the boolean contains_<ingredient> flag columns).
columns = product_df.columns
excluded_fragments = ('contains_', 'child_', 'price_per_', 'variation_', 'size_')
filtered_columns = [
    col for col in columns
    if not any(fragment in col for fragment in excluded_fragments)
]
print(filtered_columns)
# .copy() makes the selection an independent frame, so columns can be added or
# overwritten on it afterwards without pandas raising SettingWithCopyWarning.
product_df = product_df[filtered_columns].copy()
product_df.head()

['product_id', 'product_name', 'brand_id', 'brand_name', 'loves_count', 'rating', 'reviews', 'size', 'ingredients', 'price_usd', 'value_price_usd', 'sale_price_usd', 'limited_edition', 'new', 'online_only', 'out_of_stock', 'sephora_exclusive', 'highlights', 'primary_category', 'secondary_category', 'tertiary_category']


Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,rating,reviews,size,ingredients,price_usd,...,sale_price_usd,limited_edition,new,online_only,out_of_stock,sephora_exclusive,highlights,primary_category,secondary_category,tertiary_category
0,P473671,Fragrance Discovery Set,6342,19-69,6320,3.6364,11.0,Unknown size,"['Capri Eau de Parfum:', 'Alcohol Denat. (SD A...",35.0,...,,0,0,1,0,0,"['Unisex/ Genderless Scent', 'Warm &Spicy Scen...",Fragrance,Value & Gift Sets,Perfume Gift Sets
1,P473668,La Habana Eau de Parfum,6342,19-69,3827,4.1538,13.0,3.4 oz/ 100 mL,"['Alcohol Denat. (SD Alcohol 39C)', 'Parfum (F...",195.0,...,,0,0,1,0,0,"['Unisex/ Genderless Scent', 'Layerable Scent'...",Fragrance,Women,Perfume
2,P473662,Rainbow Bar Eau de Parfum,6342,19-69,3253,4.25,16.0,3.4 oz/ 100 mL,"['Alcohol Denat. (SD Alcohol 39C)', 'Parfum (F...",195.0,...,,0,0,1,0,0,"['Unisex/ Genderless Scent', 'Layerable Scent'...",Fragrance,Women,Perfume
3,P473660,Kasbah Eau de Parfum,6342,19-69,3018,4.4762,21.0,3.4 oz/ 100 mL,"['Alcohol Denat. (SD Alcohol 39C)', 'Parfum (F...",195.0,...,,0,0,1,0,0,"['Unisex/ Genderless Scent', 'Layerable Scent'...",Fragrance,Women,Perfume
4,P473658,Purple Haze Eau de Parfum,6342,19-69,2691,3.2308,13.0,3.4 oz/ 100 mL,"['Alcohol Denat. (SD Alcohol 39C)', 'Parfum (F...",195.0,...,,0,0,1,0,0,"['Unisex/ Genderless Scent', 'Layerable Scent'...",Fragrance,Women,Perfume


In [28]:
# Show every row that carries this product name
product_df.loc[product_df['product_name'].eq('Fragrance Discovery Set')]

Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,rating,reviews,size,ingredients,price_usd,...,sale_price_usd,limited_edition,new,online_only,out_of_stock,sephora_exclusive,highlights,primary_category,secondary_category,tertiary_category
0,P473671,Fragrance Discovery Set,6342,19-69,6320,3.6364,11.0,Unknown size,"['Capri Eau de Parfum:', 'Alcohol Denat. (SD A...",35.0,...,,0,0,1,0,0,"['Unisex/ Genderless Scent', 'Warm &Spicy Scen...",Fragrance,Value & Gift Sets,Perfume Gift Sets
749,P501710,Fragrance Discovery Set,7101,Boy Smells,9084,4.537,54.0,Unknown size,"['Woodphoria Eau de Parfum:', 'Alcohol Denat.'...",25.0,...,,0,0,0,0,1,"['Vegan', 'Unisex/ Genderless Scent', 'Without...",Fragrance,Value & Gift Sets,Perfume Gift Sets
999,P503667,Fragrance Discovery Set,8007,By Rosie Jane,4335,4.7635,148.0,Unknown size,"['Dylan Eau de Parfum:', 'Cane Sugar Alcohol',...",25.0,...,,0,0,1,0,0,"['Vegan', 'Layerable Scent', 'Clean + Planet P...",Fragrance,Value & Gift Sets,Perfume Gift Sets


In [29]:
# Summarize the remaining columns: dtype and non-null count for each
product_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8494 entries, 0 to 8493
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   product_id          8494 non-null   object 
 1   product_name        8494 non-null   object 
 2   brand_id            8494 non-null   int64  
 3   brand_name          8494 non-null   object 
 4   loves_count         8494 non-null   int64  
 5   rating              8216 non-null   float64
 6   reviews             8216 non-null   float64
 7   size                8494 non-null   object 
 8   ingredients         8494 non-null   object 
 9   price_usd           8494 non-null   float64
 10  value_price_usd     8494 non-null   float64
 11  sale_price_usd      270 non-null    float64
 12  limited_edition     8494 non-null   int64  
 13  new                 8494 non-null   int64  
 14  online_only         8494 non-null   int64  
 15  out_of_stock        8494 non-null   int64  
 16  sephor

In [30]:
# Count how many products fall into each primary category
product_df['primary_category'].value_counts()

primary_category
Skincare           2420
Makeup             2369
Hair               1464
Fragrance          1432
Bath & Body         405
Mini Size           288
Men                  60
Tools & Brushes      52
Gifts                 4
Name: count, dtype: int64

In [31]:
# Simplify the naming of the price column: expose price_usd under the shorter
# name 'price'. The original price_usd column is kept alongside it.
# assign() returns a new frame instead of writing into a selection of another
# frame, so the cell can be re-run safely and raises no SettingWithCopyWarning.
product_df = product_df.assign(price=product_df['price_usd'])

In [32]:
# Turn the product_id into an integer by dropping its first character
# (e.g. 'P473671' -> 473671).
# The dtype guard makes the cell idempotent: once the column is already an
# integer, re-running the cell is a no-op instead of failing on the .str accessor.
if not pd.api.types.is_integer_dtype(product_df['product_id']):
    product_df = product_df.assign(
        product_id=product_df['product_id'].str[1:].astype(np.int64)
    )

In [33]:
import os

# Export the preprocessed table for the graph-building step.
# An existing file is never overwritten: delete data/sephora.csv manually to
# regenerate it after changing the preprocessing above.
output_path = 'data/sephora.csv'
if not os.path.exists(output_path):
    # to_csv does not create missing parent directories, so make sure it exists.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    product_df.to_csv(output_path, index=False)

## Visualization

In [54]:
import plotly.graph_objects as go
import networkx as nx

def visualize_graph_interactive(graph, sample_size=100):
    """
    Visualize the product graph interactively using Plotly.

    Node positions come from a spring layout with a fixed seed (42). Hovering a
    node shows its ``name``, ``rating`` and ``primary_category`` attributes; an
    attribute that is missing on a node is displayed as ``N/A`` instead of
    raising ``KeyError``.

    Parameters:
    - graph (networkx.Graph): The graph to visualize.
    - sample_size (int): Maximum number of nodes to draw. A larger graph is cut
      down to its first ``sample_size`` nodes in iteration order (this is not a
      random sample), keeping only the edges between those nodes.

    Returns:
    - None. The figure is rendered with ``fig.show()``.
    """
    # Truncate the graph if it's too large
    if graph.number_of_nodes() > sample_size:
        sampled_nodes = list(graph.nodes)[:sample_size]
        graph = graph.subgraph(sampled_nodes)

    # Generate positions for nodes
    pos = nx.spring_layout(graph, seed=42)

    # All edges share one line trace; the None after each pair of endpoints
    # breaks the line so consecutive edges are not joined together.
    edge_x = []
    edge_y = []
    for source, target in graph.edges():
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        edge_x.extend((x0, x1, None))
        edge_y.extend((y0, y1, None))

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=2, color='#888'),
        hoverinfo='none',
        mode='lines'
    )

    # Node coordinates plus the hover label for each node
    node_x = []
    node_y = []
    node_text = []
    for node in graph.nodes():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        attrs = graph.nodes[node]
        node_text.append(
            f"Name: {attrs.get('name', 'N/A')}<br>"
            f"Rating: {attrs.get('rating', 'N/A')}<br>"
            f"Category: {attrs.get('primary_category', 'N/A')}"
        )

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers',
        hoverinfo='text',
        marker=dict(
            size=10,
            color='#1f78b4',
            line_width=2
        ),
        text=node_text
    )

    # The title font is set through layout.title.font; the older
    # `titlefont_size` shortcut is deprecated and rejected by newer Plotly.
    layout = go.Layout(
        title=dict(text='Interactive Product Graph', font=dict(size=16)),
        showlegend=False,
        hovermode='closest',
        margin=dict(b=0, l=0, r=0, t=40),
        xaxis=dict(showgrid=False, zeroline=False),
        yaxis=dict(showgrid=False, zeroline=False),
    )
    fig = go.Figure(data=[edge_trace, node_trace], layout=layout)
    fig.show()


In [57]:
# DataStructure is a module defined outside this notebook.
import DataStructure as ds

# Load the CSV written by the preprocessing section and build the product graph from it.
# NOTE(review): how features and edges are derived lives in DataStructure and is not visible here.
data = ds.load_and_preprocess_data('data/sephora.csv')
product_graph = ds.build_graph(data)
# Draw at most 1000 nodes of the graph.
visualize_graph_interactive(product_graph, sample_size=1000)

TF-IDF vector shape: (8494, 400)
Normalized numerical features shape: (8494, 3)
