In [3]:
import ast
import os
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# =====================================================
# LOAD INPUT FILES
# =====================================================

# 1. Load process names from the text file: one name per line, surrounding
#    whitespace removed, blank lines skipped.
#    The encoding is pinned to UTF-8 so the names decode the same way as the
#    "Process" column read by pandas below (read_csv decodes as UTF-8 unless
#    told otherwise), independent of the platform's locale encoding.
with open("Process_Clustering_Results/unique_processes_raw.txt", "r", encoding="utf-8") as f:
    stripped_lines = (line.strip() for line in f)
    processes = [name for name in stripped_lines if name]

# 2. Load embeddings + cluster labels
df_embed = pd.read_csv("Process_Clustering_Results/unique_processes_clusters.csv")

# Reorder the rows so that row i of df_embed belongs to processes[i]; every
# later array (embeddings, labels, scores) relies on this shared order.
# NOTE(review): repeated "Process" values in the CSV would produce extra rows
# here and break the alignment - confirm the column is unique.
df_embed = df_embed.set_index("Process").loc[processes].reset_index()

# Convert embedding string -> numpy vector
def parse_embedding(s):
    """Turn the textual form of an embedding into a NumPy array.

    `s` is evaluated as a Python literal (via ``ast.literal_eval``, which
    does not execute code) and the resulting value is wrapped in an array.
    """
    literal_value = ast.literal_eval(s)
    return np.array(literal_value)

# Parse each row's embedding string and stack the vectors row by row
emb = np.vstack([parse_embedding(raw) for raw in df_embed["Embedding"]])

# Cluster assignment per process. The "KMeans" column is used here; another
# label column from the CSV can be substituted if a different method is wanted.
labels = df_embed["KMeans"].to_numpy().astype(int)

# =====================================================
# 1. WORD COMPLEXITY SCORE
# =====================================================

def word_complexity(process):
    """Score the surface complexity of a process name.

    The name is split on whitespace and the result is a weighted sum:
    0.4 * total characters across tokens, 0.3 * number of tokens, and
    0.3 * (per-token "rarity" term + number of tokens ending in one of a
    fixed set of suffixes).

    NOTE(review): the "rarity" term is 1 / (1 + count of the token's most
    repeated character); it does not look at how rare the word itself is -
    confirm this is the intended definition.
    """
    endings = ("ize", "ise", "ify", "ation", "ated", "izing")
    words = process.split()

    total_chars = sum(map(len, words))
    ending_hits = sum(word.endswith(endings) for word in words)

    # split() never yields empty tokens, so every Counter has at least one
    # entry and max() is always defined.
    repeat_term = sum(
        1.0 / (1 + max(Counter(word).values())) for word in words
    )

    return 0.4 * total_chars + 0.3 * len(words) + 0.3 * (repeat_term + ending_hits)

# Word-level score for every process, in the same order as `processes`
WCS = np.array(list(map(word_complexity, processes)))

# =====================================================
# 2. EMBEDDING DISPERSION SCORE
# =====================================================

# Per-row Euclidean norm of the embedding plus the per-row variance of its
# components.
vector_norm = np.linalg.norm(emb, axis=1)
vector_var = emb.var(axis=1)
EDS = vector_norm + vector_var

# =====================================================
# 3. CLUSTER TRANSITION DIFFICULTY
# =====================================================

# Euclidean distance of each embedding from the mean of all embeddings.
# NOTE(review): this uses one global centroid and never reads `labels`, so
# it is not cluster-specific despite the section title - confirm intent.
global_centroid = emb.mean(axis=0)
CTDS = np.linalg.norm(emb - global_centroid, axis=1)

# =====================================================
# NORMALIZE EVERYTHING
# =====================================================

scaler = MinMaxScaler()

# Rescale each raw score independently. The single scaler is re-fitted on
# every call to fit_transform, in the order WCS, EDS, CTDS; each score is
# passed as a one-column matrix and flattened back to 1-D afterwards.
WCS_n, EDS_n, CTDS_n = (
    scaler.fit_transform(raw.reshape(-1, 1)).flatten()
    for raw in (WCS, EDS, CTDS)
)

# =====================================================
# FINAL PROCESSING INTENSITY SCORE
# =====================================================

# Weighted sum of the three normalized components (weights add up to 1.0)
PIS = (
    0.33 * WCS_n
    + 0.33 * EDS_n
    + 0.34 * CTDS_n
)

# =====================================================
# SAVE + PRINT RESULTS
# =====================================================

# One row per process: its cluster label, the three normalized component
# scores and the combined intensity score.
df_scores = pd.DataFrame({
    "Process": processes,
    "Cluster_KMeans": labels,
    "WCS": WCS_n,
    "EDS": EDS_n,
    "CTDS": CTDS_n,
    "Processing_Intensity": PIS
})

# to_csv does not create missing parent directories, so make sure the output
# folder exists before either file below is written.
os.makedirs("Intensity", exist_ok=True)

# Highest-intensity processes first
df_scores_sorted = df_scores.sort_values("Processing_Intensity", ascending=False)
df_scores_sorted.to_csv("Intensity/processing_intensity_scores.csv", index=False)

print("Saved file: processing_intensity_scores.csv")
print(df_scores_sorted.head())

# =====================================================
# CLUSTER-LEVEL AVERAGE INTENSITY
# =====================================================

# Mean intensity per cluster, ranked from highest to lowest. The cluster id
# is the index of the resulting Series and is written as the first CSV column.
cluster_scores = (
    df_scores.groupby("Cluster_KMeans")["Processing_Intensity"]
    .mean()
    .sort_values(ascending=False)
)
cluster_scores.to_csv("Intensity/cluster_intensity_ranking.csv")

print("\n=== CLUSTER INTENSITY RANKING ===")
print(cluster_scores)


Saved file: processing_intensity_scores.csv
      Process  Cluster_KMeans       WCS       EDS      CTDS  \
202  splutter               1  0.512821  0.945197  0.972472   
45    deflate               3  0.410256  0.902869  0.921771   
223    stream               7  0.320513  0.910002  0.988492   
37      crimp               6  0.217949  1.000000  1.000000   
73    floured               3  0.423077  0.846296  0.883234   

     Processing_Intensity  
202              0.811786  
45               0.746734  
223              0.742157  
37               0.741923  
73               0.719193  

=== CLUSTER INTENSITY RANKING ===
Cluster_KMeans
3    0.457452
4    0.444612
6    0.435876
7    0.404735
0    0.404410
1    0.385378
2    0.339221
5    0.327647
Name: Processing_Intensity, dtype: float64
