In [10]:
import os
import json
import re
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, MiniBatchKMeans
from sklearn.mixture import GaussianMixture

# -----------------------------
# Configuration
# -----------------------------
STEPS_FILE = 'RecipeDB2_general_final.csv'   # CSV whose 'steps' column is the Word2Vec training text
PROCESS_FILE = 'Recipedb_general.csv'       # CSV that carries the 'Processes' column
OUTPUT_FOLDER = 'Process_Clustering_Results'
ROW_LIMIT = 50000                            # cap on step rows used for training; None disables the cap
W2V_DIM = 100                                # Word2Vec vector size
NUM_CLUSTERS = 8                             # cluster count for KMeans / Agglomerative / MiniBatchKMeans / GMM
DBSCAN_EPS = 0.7
DBSCAN_MIN_SAMPLES = 2
AGGLO_THRESHOLD = 2000                       # above this many unique processes Agglomerative is skipped
ANNOTATE = True                              # draw each process name next to its point in the plots

# Tokens dropped outright during filtering. The last row lists preparation
# words; delete an entry from this set if it should count as a process.
STOPWORDS = {
    'and', 'or', 'the', 'a', 'an', 'in', 'on', 'of', 'with', 'to', 'for', 'by', 'at', 'from', 'into',
    'grated', 'chopped', 'sliced',
}

# A process label must start with a lowercase letter and continue with at
# least one more lowercase letter, digit, hyphen or whitespace character.
VALID_PROCESS_RE = re.compile(r'^[a-z][a-z0-9\-\s]+$')

os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# -----------------------------
# 1) Load processes file and extract unique process tokens
# -----------------------------
print("[INFO] Loading process file:", PROCESS_FILE)
df_proc = pd.read_csv(PROCESS_FILE)

# Pick the first header that reads 'processes' once trimmed and lower-cased.
proc_col = next(
    (header for header in df_proc.columns if header.strip().lower() == 'processes'),
    None,
)
if proc_col is None:
    raise ValueError("No 'Processes' column found in the processes file.")

# Every non-null cell is a '||'-separated list; flatten all cells into one
# list of trimmed entries and drop the empty ones.
raw_processes = [
    piece.strip()
    for cell in df_proc[proc_col].dropna().astype(str)
    for piece in cell.split('||')
    if piece.strip()
]
# Normalize and filter processes
def clean_process_token(tok):
    """Normalise one raw process label.

    The text is trimmed and lower-cased, then any run of non-word characters
    at the very start or very end is removed. Characters inside the label
    are left untouched.
    """
    lowered = tok.strip().lower()
    return re.sub(r'^[^\w]+|[^\w]+$', '', lowered)

cleaned = [clean_process_token(p) for p in raw_processes]


def _normalize_process_label(label):
    """Return the label to keep for a cleaned token, or None to discard it.

    A token is discarded when it is shorter than 2 or longer than 40
    characters, consists only of digits, is listed in STOPWORDS, cannot be
    made to match VALID_PROCESS_RE, or contains a one-character word.
    """
    if len(label) < 2:
        return None
    if label.isdigit() or len(label) > 40:
        return None
    if label in STOPWORDS:
        return None
    if not VALID_PROCESS_RE.match(label):
        # Second chance: drop every character the pattern does not allow and
        # retry, so labels with stray inner punctuation can still pass.
        label = re.sub(r'[^a-z0-9\-\s]', '', label)
        # The stripped form is checked against STOPWORDS again; otherwise a
        # token such as "a.nd" would slip through as "and".
        if not VALID_PROCESS_RE.match(label) or label in STOPWORDS:
            return None
    # Reject labels with any single-character word (e.g. 'x fry').
    if any(len(word) <= 1 for word in label.split()):
        return None
    return label


# Keep only tokens that survive normalisation.
filtered = [kept for kept in map(_normalize_process_label, cleaned) if kept is not None]

unique_processes = sorted(set(filtered))
print(f"[INFO] Found {len(unique_processes)} unique processes after cleaning.")

# Save list for inspection, one process per line.
with open(os.path.join(OUTPUT_FOLDER, "unique_processes_raw.txt"), "w", encoding="utf-8") as f:
    f.writelines(p + "\n" for p in unique_processes)

# -----------------------------
# 2) Prepare Word2Vec training data:
#    train on 'steps' + also include each process as its own sentence so W2V learns process tokens
# -----------------------------
print("[INFO] Preparing corpus for Word2Vec...")

# Only the first ROW_LIMIT rows are used, so let the parser stop there rather
# than reading the whole file and discarding the remainder. A falsy limit
# (None or 0) becomes nrows=None, which reads every row.
df_steps = pd.read_csv(STEPS_FILE, nrows=ROW_LIMIT or None)

# Tokenize steps (basic)
def tokenize_step_text(s):
    """Split step text into lowercase word tokens.

    Missing values (as judged by ``pd.isna``) give an empty list. Anything
    else is converted with ``str``, lower-cased, and scanned for runs of
    word characters.
    """
    return [] if pd.isna(s) else re.findall(r'\b\w+\b', str(s).lower())

# One token list per row of the 'steps' column; missing text yields an empty list.
sentences = [tokenize_step_text(step) for step in df_steps['steps'].fillna('')]

# Each unique process is appended as its own short sentence so that all of its
# words end up in the Word2Vec vocabulary.
process_sentences = [re.findall(r'\b\w+\b', name) for name in unique_processes]
sentences_for_w2v = sentences + process_sentences

print(f"[INFO] Sentences for W2V: steps={len(sentences)}, processes={len(process_sentences)}")

# Train Word2Vec: sg=1, window of 8, and min_count=1 so no word is dropped.
print("[INFO] Training Word2Vec (this can take a minute)...")
w2v_model = Word2Vec(
    sentences=sentences_for_w2v,
    vector_size=W2V_DIM,
    sg=1,
    window=8,
    min_count=1,
    epochs=10,
    workers=4,
)
print("[INFO] Word2Vec trained. Vocab size:", len(w2v_model.wv))

# -----------------------------
# 3) Convert each unique process into an embedding
# -----------------------------
def embed_process_text(text, model, dim):
    """Embed a process label as the mean of its known word vectors.

    The label is lower-cased and split into word tokens; tokens missing from
    ``model.wv`` are ignored. If no token is known, a zero vector of length
    ``dim`` is returned instead.
    """
    wv = model.wv
    hits = [wv[word] for word in re.findall(r'\b\w+\b', text.lower()) if word in wv]
    return np.mean(hits, axis=0) if hits else np.zeros(dim)

# Stack one embedding per unique process into a 2-D array (row order follows unique_processes).
embedding_rows = [embed_process_text(name, w2v_model, W2V_DIM) for name in unique_processes]
process_embeddings = np.vstack(embedding_rows)
print("[INFO] Process embeddings shape:", process_embeddings.shape)

# Count rows that are entirely zero, i.e. processes with no word in the vocabulary.
zero_counts = (process_embeddings == 0).all(axis=1).sum()
if zero_counts > 0:
    print(f"[WARN] {zero_counts} process(es) produced zero embeddings (no tokens in W2V vocab).")

# -----------------------------
# 4) Clustering the unique processes
# -----------------------------
print("[INFO] Clustering unique processes...")

# K-means with a fixed seed and 20 initialisations.
kmeans = KMeans(n_clusters=NUM_CLUSTERS, n_init=20, random_state=42)
labels_kmeans = kmeans.fit_predict(process_embeddings)

# Ward-linkage agglomerative clustering runs only while the number of unique
# processes stays within AGGLO_THRESHOLD. Above that, MiniBatchKMeans takes
# its place and its labels are stored under the same 'labels_agglo' name.
run_agglo = len(unique_processes) <= AGGLO_THRESHOLD
if not run_agglo:
    print(f"[INFO] Number of processes ({len(unique_processes)}) > {AGGLO_THRESHOLD}; skipping Agglomerative.")
    mbk = MiniBatchKMeans(n_clusters=NUM_CLUSTERS, batch_size=256, random_state=42)
    labels_agglo = mbk.fit_predict(process_embeddings)
else:
    agglo = AgglomerativeClustering(linkage='ward', n_clusters=NUM_CLUSTERS)
    labels_agglo = agglo.fit_predict(process_embeddings)

# DBSCAN on Euclidean distance, using the configured eps / min_samples.
dbscan = DBSCAN(metric='euclidean', eps=DBSCAN_EPS, min_samples=DBSCAN_MIN_SAMPLES)
labels_dbscan = dbscan.fit_predict(process_embeddings)

# Gaussian mixture with one component per requested cluster.
gmm = GaussianMixture(n_components=NUM_CLUSTERS, random_state=42)
labels_gmm = gmm.fit_predict(process_embeddings)

# Build result DataFrame
# One row per process: its name, its embedding serialised as a JSON list,
# and the label assigned by each clustering method.
embedding_json = [json.dumps(row) for row in process_embeddings.tolist()]
df_procs = pd.DataFrame({
    'Process': unique_processes,
    'Embedding': embedding_json,
    'KMeans': labels_kmeans,
    'Agglo_or_MBK': labels_agglo,
    'DBSCAN': labels_dbscan,
    'GMM': labels_gmm
})

clusters_csv_path = os.path.join(OUTPUT_FOLDER, "unique_processes_clusters.csv")
df_procs.to_csv(clusters_csv_path, index=False)
print("[INFO] Saved cluster assignments to CSV.")

# -----------------------------
# 5) Human-readable cluster summaries
# -----------------------------
def summary_text_for_col(df, col):
    """Render the clusters stored in ``df[col]`` as plain text.

    The result starts with a ``=== col ===`` header. Each distinct value of
    ``df[col]``, in sorted order, then gets a line with the cluster id and
    member count followed by the comma-separated 'Process' names. Only the
    first 200 names are listed; longer clusters end with " ...".
    """
    pieces = [f"\n=== {col} ===\n"]
    for cluster_id in sorted(df[col].unique()):
        members = df.loc[df[col] == cluster_id, 'Process'].tolist()
        ending = "\n" if len(members) <= 200 else " ...\n"
        pieces.append(f"\nCluster {cluster_id} ({len(members)}):\n")
        pieces.append(", ".join(members[:200]) + ending)
    return "".join(pieces)

# Title line followed by one section per clustering method, in a fixed order.
method_columns = ['KMeans', 'Agglo_or_MBK', 'DBSCAN', 'GMM']
summary = "PROCESS CLUSTER SUMMARY\n" + "".join(
    summary_text_for_col(df_procs, col) for col in method_columns
)

summary_path = os.path.join(OUTPUT_FOLDER, "process_cluster_summary.txt")
with open(summary_path, "w", encoding="utf-8") as f:
    f.write(summary)
print("[INFO] Wrote human-readable summary.")

# -----------------------------
# 6) PCA + plotting for every method
# -----------------------------
print("[INFO] Creating PCA plots for each clustering method...")

# Project the embeddings onto two principal components; every plot reuses this projection.
pca2 = PCA(n_components=2)
proc_2d = pca2.fit_transform(process_embeddings)

# Method name -> label array, in the order the plots are produced.
plot_methods = dict([
    ('KMeans', labels_kmeans),
    ('Agglo_or_MBK', labels_agglo),
    ('DBSCAN', labels_dbscan),
    ('GMM', labels_gmm),
])

# function for nice scatter
def plot_process_clusters(xy, labels, title, outpath, annotate=ANNOTATE, names=None):
    """Save a 2-D scatter plot of the processes, coloured by cluster label.

    Args:
        xy: 2-D array; column 0 is plotted on the x axis, column 1 on the y axis.
        labels: one cluster label per row of ``xy``.
        title: figure title.
        outpath: path the PNG is written to.
        annotate: when true, and there are at most 300 names, write each
            name next to its point.
        names: text for each point, in row order. Defaults to the
            module-level ``unique_processes``.
    """
    if names is None:
        names = unique_processes

    # Give each distinct label its own palette entry so clusters are coloured
    # categorically. Passing raw labels to ``c=`` would map them through a
    # continuous colormap instead.
    distinct = sorted(set(labels))
    if len(distinct) <= 10:
        palette = sns.color_palette('tab10')
    else:
        palette = sns.color_palette('husl', n_colors=len(distinct))
    color_for = dict(zip(distinct, palette))
    point_colors = [color_for[label] for label in labels]

    plt.figure(figsize=(10, 8))
    plt.scatter(xy[:, 0], xy[:, 1], c=point_colors, s=120)
    plt.title(title)
    plt.xlabel("PCA 1")
    plt.ylabel("PCA 2")
    # Per-point text is only drawn for small sets; beyond 300 it is skipped.
    if annotate and len(names) <= 300:
        for i, txt in enumerate(names):
            plt.annotate(txt, (xy[i, 0], xy[i, 1]), textcoords="offset points", xytext=(3, 3), fontsize=8)
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(outpath, dpi=150)
    plt.close()
    print(f"[INFO] Saved plot: {outpath}")

# One PNG per clustering method, all drawn over the same 2-D projection.
for method_name in plot_methods:
    labels = plot_methods[method_name]
    out = os.path.join(OUTPUT_FOLDER, f"processes_pca_{method_name}.png")
    plot_process_clusters(
        proc_2d,
        labels,
        f"Processes clustered by {method_name}",
        out,
        annotate=ANNOTATE,
    )

# Closing recap of where the outputs were written.
print("\n[ALL DONE]")
print("Results folder:", OUTPUT_FOLDER)
print(" - CSV: unique_processes_clusters.csv")
print(" - Summary: process_cluster_summary.txt")
print(" - PCA plots: processes_pca_<method>.png")


[INFO] Loading process file: Recipedb_general.csv


  df_proc = pd.read_csv(PROCESS_FILE)


[INFO] Found 270 unique processes after cleaning.
[INFO] Preparing corpus for Word2Vec...


  df_steps = pd.read_csv(STEPS_FILE)


[INFO] Sentences for W2V: steps=50000, processes=270
[INFO] Training Word2Vec (this can take a minute)...
[INFO] Word2Vec trained. Vocab size: 20775
[INFO] Process embeddings shape: (270, 100)
[INFO] Clustering unique processes...
[INFO] Saved cluster assignments to CSV.
[INFO] Wrote human-readable summary.
[INFO] Creating PCA plots for each clustering method...
[INFO] Saved plot: Process_Clustering_Results\processes_pca_KMeans.png
[INFO] Saved plot: Process_Clustering_Results\processes_pca_Agglo_or_MBK.png
[INFO] Saved plot: Process_Clustering_Results\processes_pca_DBSCAN.png
[INFO] Saved plot: Process_Clustering_Results\processes_pca_GMM.png

[ALL DONE]
Results folder: Process_Clustering_Results
 - CSV: unique_processes_clusters.csv
 - Summary: process_cluster_summary.txt
 - PCA plots: processes_pca_<method>.png
