## Install Necessary Packages

In [None]:
!pip install colorthief pandas pillow matplotlib

## Import Dataset

In [None]:
import pandas as pd

df = pd.read_csv("Sephora_Lip_Products_Image_Data.csv")

# Quick look at the schema, a handful of key columns, and the image URLs.
preview_columns = ['brand', 'price', 'name', 'image']
print(df.columns)
print(df.loc[:, preview_columns].head())

print(df.loc[:, 'image'].head())

## EDA - Create visualizations to explore the trends and distributions within the dataset

## Pre-processing Image Data 

In [None]:
import pandas as pd
import requests
import os
from urllib.parse import urlparse

# Load your CSV
df = pd.read_csv("Sephora_Lip_Products_Image_Data.csv")

# Choose the right column name
image_column = "image"  # adjust if it's named differently
SAVE_DIR = "working_lip_images"
os.makedirs(SAVE_DIR, exist_ok=True)

# Browser-like headers sent with every image request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0 Safari/537.36",
    "Referer": "https://www.sephora.com/"
}

# Create the column up front so it exists (all missing) even if every download
# fails; later cells read df['local_path'] unconditionally.
df['local_path'] = None

for i, row in df.iterrows():
    url = str(row[image_column]).strip()
    try:
        if not url.startswith("http"):
            print(f"Skipping invalid URL: {url}")
            continue

        # Build the local filename from the row index and the URL's extension
        # (query strings are ignored because only the parsed path is used).
        parsed = urlparse(url)
        ext = os.path.splitext(parsed.path)[-1]
        filename = f"lip_{i}{ext if ext else '.jpg'}"
        filepath = os.path.join(SAVE_DIR, filename)

        # Re-running the cell should not download everything again: reuse a
        # non-empty file left by a previous run.
        if os.path.isfile(filepath) and os.path.getsize(filepath) > 0:
            print(f"[✓] Already downloaded: {filename}")
            df.at[i, 'local_path'] = filepath
            continue

        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            with open(filepath, 'wb') as f:
                f.write(response.content)
            print(f"[✓] Saved: {filename}")
            df.at[i, 'local_path'] = filepath
        else:
            print(f"Status {response.status_code}: {url}")

    except Exception as e:
        # Best effort: report the failure and move on to the next product.
        print(f"Error: {e} | URL: {url}")

print(f"Images available locally: {df['local_path'].notna().sum()} of {len(df)}")

## Lip Product Shade/Color Analysis and Visualizations

In [None]:
from colorthief import ColorThief
import colorsys

def rgb_to_hsl(r, g, b):
    """Convert 0-255 RGB channels to (hue, saturation, lightness).

    Hue is returned as a whole number of degrees (truncated); saturation and
    lightness are the fractional values produced by ``colorsys``.
    """
    # colorsys works on 0-1 floats and returns the components in H, L, S order.
    hue, lightness, saturation = colorsys.rgb_to_hls(r / 255, g / 255, b / 255)
    return int(hue * 360), saturation, lightness

def classify_shade(h, s, l):
    """Map an HSL colour to a lip-shade family name.

    h is the hue in degrees, s the saturation and l the lightness.
    Pale, washed-out colours are "Nude"; everything else is decided by hue
    alone, and hues outside every listed band are "Other".
    """
    # Low saturation combined with high lightness wins regardless of hue.
    if s < 0.2 and l > 0.7:
        return "Nude"

    # NOTE(review): any other colour with hue 0 (e.g. a mid grey) reaches the
    # hue bands below and is reported as "Red".
    hue_bands = (
        (lambda hue: 0 <= hue <= 10 or 350 <= hue <= 360, "Red"),
        (lambda hue: 10 < hue <= 40, "Coral"),
        (lambda hue: 240 <= hue <= 280, "Purple"),
        (lambda hue: 280 < hue <= 330, "Berry"),
        (lambda hue: 330 < hue < 350, "Pink"),
    )
    for in_band, family in hue_bands:
        if in_band(h):
            return family
    return "Other"

# One entry per row of df, in row order; rows without a usable image get the
# placeholder colour (0, 0, 0) and the shade "Unknown".
dominant_colors = []
shade_classes = []
failed_paths = []

# The column is only present if the download cell has run; treat its absence
# as "no image for any row" instead of crashing with a KeyError.
local_paths = df['local_path'] if 'local_path' in df.columns else [None] * len(df)

for path in local_paths:
    if not isinstance(path, str):
        # Missing value: nothing was downloaded for this product.
        dominant_colors.append((0, 0, 0))
        shade_classes.append("Unknown")
        continue
    try:
        ct = ColorThief(path)
        rgb = ct.get_color(quality=1)
        h, s, l = rgb_to_hsl(*rgb)
        shade = classify_shade(h, s, l)
        dominant_colors.append(rgb)
        shade_classes.append(shade)
    except Exception as e:
        # Best effort: keep going, but say which image failed and why.
        print(f"Could not extract color from {path}: {e}")
        failed_paths.append(path)
        dominant_colors.append((0, 0, 0))
        shade_classes.append("Unknown")

df['dominant_rgb'] = dominant_colors
df['shade'] = shade_classes

print(f"Shade assigned for {sum(s != 'Unknown' for s in shade_classes)} of {len(shade_classes)} products "
      f"({len(failed_paths)} image(s) could not be read)")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Rows whose image could not be analysed carry the 'Unknown' placeholder;
# leave them out so the chart only shows real shade families.
is_known_shade = df['shade'].ne('Unknown')
df_filtered = df[is_known_shade]

# Bars are ordered from the most to the least frequent shade.
shade_order = df_filtered['shade'].value_counts().index

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df_filtered, x='shade', order=shade_order, palette='pastel', ax=ax)

ax.set_title('Distribution of Lip Product Shades')
ax.set_xlabel('Lip Shade')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Count products per (brand, shade) pair, then spread shades into columns so
# each brand becomes one stacked bar.
brand_shade_counts = df.groupby(['brand', 'shade']).size()
pivot = brand_shade_counts.unstack(fill_value=0)

ax = pivot.plot(kind='bar', stacked=True, figsize=(12, 6), colormap='tab10')
ax.set_title("Lip Color Families by Brand")
ax.set_ylabel("Number of Products")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## Logistic Regression to Predict Product Popularity by Shade

In [None]:
import os
import pandas as pd
import numpy as np
from colorthief import ColorThief
import colorsys
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Keep only products that have both a rating and a rating count.
df = df.dropna(subset=['rating', 'quantity_rating'])

# Popularity = rating weighted by log(1 + number of ratings); a product is
# labelled popular (1) when its score is strictly above the median.
df['popularity_score'] = df['rating'].mul(np.log1p(df['quantity_rating']))
threshold = df['popularity_score'].median()
df['label'] = df['popularity_score'].gt(threshold).astype(int)

# Features: one indicator column per shade family. Target: the popularity label.
X = pd.get_dummies(df['shade'])
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Score the held-out 20%.
y_pred = model.predict(X_test)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Share of products labelled popular within each shade, highest first.
popular_share = df.groupby('shade')['label'].mean()
shade_popularity = popular_share.sort_values(ascending=False).reset_index()

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=shade_popularity, x='shade', y='label', ax=ax)
ax.set_title("Average Popularity by Shade")
ax.set_xlabel("Shade")
ax.set_ylabel("Proportion Labeled Popular")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## CNN Model to Predict Product Popularity by Lip Product Image

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

# Step 1: Clean data - keep rows that have both a rating and a rating count.
# .copy() makes the column assignments below act on an independent frame.
df = df.dropna(subset=['rating', 'quantity_rating']).copy()

# Step 2: Define Popularity Score (rating weighted by log(1 + number of ratings))
df['popularity_score'] = df['rating'] * np.log1p(df['quantity_rating'])

# Step 3: Label as popular if above median
threshold = df['popularity_score'].median()
df['label'] = (df['popularity_score'] > threshold).astype(int)

# Step 4: Save labels
# The download cell names each file after the ORIGINAL row index
# (lip_<index><ext>), and dropna() keeps those index labels. Numbering by
# position (range(len(df))) would therefore pair labels with the wrong images
# as soon as any row has been dropped, so names are derived from df.index.
def _label_image_name(idx, local_path):
    """Return the image filename for row `idx`, preferring the downloaded file's real name."""
    if isinstance(local_path, str) and local_path:
        return os.path.basename(local_path)  # keeps the true extension (.png, .webp, ...)
    return f"lip_{idx}.jpg"

_local_paths = df['local_path'] if 'local_path' in df.columns else [None] * len(df)

# Note: this overwrites the image-URL column with local filenames, as before.
df['image'] = [_label_image_name(idx, path) for idx, path in zip(df.index, _local_paths)]
df[['image', 'label']].to_csv('labels.csv', index=False)

# --------- CNN Model Below ---------
!pip install torch torchvision
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision import transforms
from PIL import Image

# Custom Dataset
class LipDataset(Dataset):
    """Image/label dataset backed by a CSV file and a directory of images.

    The CSV's first column is the image filename (relative to ``img_dir``) and
    its second column is the numeric label. Every image is converted to RGB,
    resized to ``img_size`` x ``img_size`` and turned into a tensor.

    Args:
        csv_file: Path of the CSV listing filenames and labels.
        img_dir: Directory that contains the image files.
        img_size: Side length, in pixels, that images are resized to.
    """

    def __init__(self, csv_file, img_dir, img_size=128):
        self.data = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.transform = transforms.Compose([
            transforms.Resize((img_size, img_size)),
            transforms.ToTensor(),
        ])

    def __len__(self):
        """Number of rows in the label CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image_tensor, label)`` for row ``idx``; the label has shape (1,)."""
        img_name = os.path.join(self.img_dir, self.data.iloc[idx, 0])
        # Close the file as soon as the pixels are copied by convert();
        # otherwise one file handle per sample stays open until garbage collection.
        with Image.open(img_name) as img:
            image = img.convert('RGB')
        image = self.transform(image)
        label = torch.tensor(self.data.iloc[idx, 1], dtype=torch.float32)
        # unsqueeze so the label matches the model's single-column output
        return image, label.unsqueeze(0)

# Load dataset
# Image directory resolution, most specific first:
#   1. the LIP_IMG_DIR environment variable, if set;
#   2. the original author's local folder, if it exists on this machine;
#   3. the folder the download cell writes to.
_LEGACY_IMG_DIR = "/Users/lisabu/Documents/UVA/Courses/DS 4002/Sephora_Lip"
IMG_DIR = os.environ.get("LIP_IMG_DIR") or (
    _LEGACY_IMG_DIR if os.path.isdir(_LEGACY_IMG_DIR) else "working_lip_images"
)
dataset = LipDataset("labels.csv", IMG_DIR)

# 80/20 train/test split; the seeded generator makes the split reproducible
# across kernel restarts.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
split_generator = torch.Generator().manual_seed(42)
train_ds, test_ds = random_split(dataset, [train_size, test_size], generator=split_generator)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=32)

# Define Model
class LipNet(nn.Module):
    """Small CNN that outputs one probability per RGB image.

    Two conv/ReLU/max-pool stages (each halving height and width) feed a
    two-layer classifier that ends in a sigmoid, so the output has shape
    (batch, 1) with values in [0, 1].

    Args:
        img_size: Side length of the square input images. Defaults to 128,
            which reproduces the original fixed layer sizes. It must match the
            size the dataset resizes images to.
    """

    def __init__(self, img_size=128):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
        )
        # Two 2x2 poolings shrink each side by a factor of 4; 64 channels remain.
        pooled_side = img_size // 4
        self.fc = nn.Sequential(
            nn.Linear(64 * pooled_side * pooled_side, 128), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(128, 1), nn.Sigmoid()
        )

    def forward(self, x):
        """Return probabilities of shape (batch, 1) for a batch of images."""
        x = self.cnn(x)
        x = x.view(x.size(0), -1)  # flatten everything except the batch dimension
        return self.fc(x)

# Compute class weights ("balanced": inversely proportional to class frequency)
labels_df = pd.read_csv("labels.csv")
label_classes = np.unique(labels_df['label'])
class_weights = compute_class_weight("balanced", classes=label_classes, y=labels_df['label'])
class_weights = torch.tensor(class_weights, dtype=torch.float32)

# Train
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LipNet().to(device)

# Look weights up by class value so a label file containing a single class
# does not raise IndexError; an absent class falls back to weight 1.
_weight_by_class = {int(c): w for c, w in zip(label_classes, class_weights)}
_neg_weight = _weight_by_class.get(0, torch.tensor(1.0)).to(device)
_pos_weight = _weight_by_class.get(1, torch.tensor(1.0)).to(device)

def loss_fn(preds, targets):
    """Class-weighted binary cross-entropy on probabilities.

    A single scalar passed as BCELoss(weight=...) rescales every sample by the
    same factor and so balances nothing. Here each sample gets the weight of
    its own class instead.
    """
    sample_weights = torch.where(targets > 0.5, _pos_weight, _neg_weight)
    return nn.functional.binary_cross_entropy(preds, targets, weight=sample_weights)

optimizer = optim.Adam(model.parameters(), lr=0.001)

NUM_EPOCHS = 458  # number of full passes over the training set

# Mean training loss of each epoch, in order.
train_losses = []
for epoch in range(NUM_EPOCHS):
    model.train()
    batch_losses = []
    for images, targets in train_loader:
        images = images.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        loss = loss_fn(model(images), targets)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # Average over the number of batches in the epoch.
    train_loss = sum(batch_losses) / len(train_loader)
    train_losses.append(train_loss)
    print(f"Epoch {epoch+1} Loss: {train_loss:.4f}")

# Evaluate on the held-out split: collect true labels and thresholded predictions.
model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for batch_images, batch_labels in test_loader:
        probabilities = model(batch_images.to(device)).cpu().numpy()
        y_true.extend(batch_labels.numpy().flatten().tolist())
        # A probability above 0.5 counts as a positive prediction.
        y_pred.extend((probabilities > 0.5).astype(int).flatten().tolist())

print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))
print(classification_report(y_true, y_pred))

# Training-loss curve: one point per epoch.
ax = plt.gca()
ax.plot(train_losses, label='Training Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training Loss over Epochs')
ax.legend()
plt.show()