In [47]:
# Import libraries
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher
from collections import Counter
from itertools import chain
import plotly.graph_objects as go
from tqdm import tqdm

# Load csv file with comments
df = pd.read_csv("../data/comments_post.csv")
comments = df['text_comment'].dropna().tolist()

#SAMPLE_SIZE = 500  # Speed up for test
#comments = comments[:SAMPLE_SIZE]


Columns (12,34) have mixed types. Specify dtype option on import or set low_memory=False.



Define product and ingredient synonyms

In [48]:
# Ingredient synonym groups (EN/DE) mapped to unified names
ingredient_map = {
    "hyaluronic acid": ["hyaluronic acid", "hyaluronsäure"],
    "glycerin": ["glycerin"],
    "panthenol": ["panthenol"],
    "squalane": ["squalane", "squalan"],
    "aloe vera": ["aloe vera"],
    "niacinamide": ["niacinamide", "niacinamid"],
    "vitamin c": ["vitamin c"],
    "alpha arbutin": ["alpha arbutin", "alpha-arbutin"],
    "licorice root": ["licorice root", "lakritzextrakt"],
    "kojic acid": ["kojic acid", "kojicsäure"],
    "retinol": ["retinol"],
    "peptides": ["peptides", "peptide"],
    "bakuchiol": ["bakuchiol"],
    "coenzyme q10": ["coenzyme q10", "coenzym q10"],
    "ceramides": ["ceramides", "ceramide"],
    "salicylic acid": ["salicylic acid", "salicylsäure"],
    "glycolic acid": ["glycolic acid", "glycolsäure"],
    "lactic acid": ["lactic acid", "milchsäure"],
    "pha": ["pha"],
    "benzoyl peroxide": ["benzoyl peroxide", "benzoylperoxid"],
    "centella asiatica": ["centella asiatica", "tigergras"],
    "green tea": ["green tea", "grüner tee"],
    "allantoin": ["allantoin"],
    "calendula": ["calendula", "ringelblume"],
    "chamomile": ["chamomile", "kamille"]
}

# Product synonym groups (EN/DE) mapped to unified names
product_map = {
    "cleanser": ["cleanser", "reiniger", "gesichtsreinigung", "face wash"],
    "toner": ["toner", "gesichtswasser"],
    "serum": ["serum"],
    "essence": ["essence", "essenz"],
    "moisturizer": ["moisturizer", "feuchtigkeitscreme"],
    "sunscreen": ["sunscreen", "sonnencreme"],
    "eye cream": ["eye cream", "augencreme"],
    "exfoliator": ["exfoliator", "peeling"],
    "spot treatment": ["spot treatment", "punktbehandlung"],
    "micellar water": ["micellar water", "mizellenwasser"],
    "night cream": ["night cream", "nachtcreme"],
    "face oil": ["face oil", "gesichtsöl"],
    "mask": ["mask", "maske"]
}

In [49]:
# build a list of all terms for matching
all_ingredient_terms = [term for group in ingredient_map.values() for term in group]
all_product_terms = [term for group in product_map.values() for term in group]
all_keywords = list(set(all_ingredient_terms + all_product_terms))

nlp = spacy.blank("xx")  # multilingual pipeline
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = [nlp.make_doc(term) for term in all_keywords]
matcher.add("SKINCARE", patterns)

# Normalize Matched Terms to Unified Names
def normalize_term(term):
    term_lower = term.lower()
    for name, synonyms in {**ingredient_map, **product_map}.items():
        if term_lower in synonyms:
            return name
    return term_lower

# process all comments in batch (pipe)
results = []

for doc in tqdm(nlp.pipe(comments, batch_size=50), total=len(comments)):
    matches = matcher(doc)
    matched = [doc[start:end].text for _, start, end in matches]
    normalized = [normalize_term(t) for t in matched]

    sequence = [token.text for token in doc if token.text.lower() in all_keywords]
    normalized_sequence = [normalize_term(t) for t in sequence]

    results.append({
        "normalized_items": normalized,
        "routine_sequence": normalized_sequence
    })

results_df = pd.DataFrame(results)

# Frequency analysis
all_normalized = list(chain.from_iterable(results_df['normalized_items']))
freq = Counter(all_normalized)
top_items_df = pd.DataFrame(freq.most_common(30), columns=['Item', 'Frequency'])
top_items_df.head(30)

100%|██████████| 54237/54237 [00:17<00:00, 3097.69it/s]


Unnamed: 0,Item,Frequency
0,cleanser,1151
1,serum,980
2,toner,943
3,moisturizer,626
4,sunscreen,594
5,mask,579
6,vitamin c,379
7,retinol,321
8,exfoliator,152
9,niacinamide,140


Sankey diagramm (common routines)

In [50]:
# Extract 2-step transitions
routine_sequences = results_df['routine_sequence'].dropna().tolist()
routine_sequences = [tuple(seq) for seq in routine_sequences if len(seq) > 1]

edges = []
for routine in routine_sequences:
    for i in range(len(routine) - 1):
        edges.append((routine[i], routine[i + 1]))

edge_counts = Counter(edges)

# Sankey input
labels = list(set([e[0] for e in edge_counts] + [e[1] for e in edge_counts]))
label_to_index = {label: i for i, label in enumerate(labels)}
source_indices = [label_to_index[src] for (src, tgt) in edge_counts]
target_indices = [label_to_index[tgt] for (src, tgt) in edge_counts]
values = list(edge_counts.values())

# Plot Sankey
fig = go.Figure(data=[go.Sankey(
    node=dict(pad=15, thickness=20, line=dict(color="gray", width=0.5), label=labels),
    link=dict(source=source_indices, target=target_indices, value=values)
)])
fig.update_layout(title_text="Skincare Routine Flow (All Comments)", font_size=12)
fig.show()


Common routines and example comment

In [51]:
# Helper function to remove consecutive duplicates from a routine
def remove_consecutive_duplicates(seq):
    return [x for i, x in enumerate(seq) if i == 0 or x != seq[i - 1]]

# Extract routine sequences from the DataFrame
routine_sequences = results_df['routine_sequence'].dropna().tolist()

# Clean each sequence and filter out short ones
routine_sequences = [
    tuple(remove_consecutive_duplicates(seq))
    for seq in routine_sequences
    if len(remove_consecutive_duplicates(seq)) > 1
]

# Count how often each cleaned routine occurs
from collections import Counter
routine_counts = Counter(routine_sequences)

# Create a DataFrame from the routine counts
routine_df = pd.DataFrame([
    {"Routine": " → ".join(routine), "Frequency": count}
    for routine, count in routine_counts.items()
])

# Filter routines by minimum frequency (e.g. at least 10)
routine_df = routine_df[routine_df['Frequency'] >= 10].sort_values(by='Frequency', ascending=False)

# Find an example comment for each routine
examples = []

for i, seq in enumerate(results_df['routine_sequence']):
    if not isinstance(seq, list) or len(seq) < 2:
        continue

    cleaned_seq = tuple(remove_consecutive_duplicates(seq))
    
    if cleaned_seq in routine_counts and routine_counts[cleaned_seq] >= 10:
        examples.append({
            "Routine": " → ".join(cleaned_seq),
            "ExampleComment": comments[i]
        })

# Create example DataFrame and remove duplicates per routine
example_df = pd.DataFrame(examples).drop_duplicates(subset="Routine")

# Merge example comments with the routine frequency table
routine_df = routine_df.merge(example_df, on="Routine", how="left")

# Display routine with frequency and example comment
print(routine_df[['Routine', 'Frequency', 'ExampleComment']])


                               Routine  Frequency  \
0                        toner → serum         37   
1               cleanser → moisturizer         27   
2              moisturizer → sunscreen         25   
3                  serum → moisturizer         24   
4                  niacinamide → serum         21   
5                     cleanser → toner         19   
6                      retinol → serum         19   
7   cleanser → moisturizer → sunscreen         15   
8                     toner → cleanser         14   
9              sunscreen → moisturizer         13   
10                       serum → toner         12   
11            cleanser → toner → serum         11   
12                     serum → retinol         11   
13                cleanser → sunscreen         11   
14                    peptides → serum         10   
15                 toner → moisturizer         10   
16               exfoliator → cleanser         10   

                                       Exampl