In [1]:
import pandas as pd
print(pd.__version__)


2.3.3


Manual spotting of unusual preprocessing is hard because workflows are heterogeneous and sparse. 
Workflow flags -
Each flag answers: “Did this preprocessing step actually happen for this sample?”

| Flag              | Meaning                                  |
| ----------------- | ---------------------------------------- |
| `did_dry`         | Sample was dried (`dryWeightNet` exists) |
| `did_ash`         | Sample was ashed (`ashWeightNet` exists) |
| `did_vaporize`    | Vaporization step performed              |
| `did_sieve`       | Sieving performed                        |
| `did_add_carrier` | Sr carrier added                         |
| `vacuumed`        | Sample was vacuumed                      |


In [9]:
import pandas as pd

# Load dataset 1
df = pd.read_csv("dataset1_280126.csv")

# Define workflow flags
flags = pd.DataFrame({
    "pseudoid": df["pseudoid"],
    "did_dry": df["dryWeightNet"].notna(),
    "did_ash": df["ashWeightNet"].notna(),
    "did_vaporize": df["totalVaporizationAmount"].notna() | df["totalVaporizationVolume"].notna(),
    "did_sieve": df["sievedDryWeight"].notna() | df["sievedOrganicWeight"].notna() | df["sievedRocksWeight"].notna(),
    "did_add_carrier": df["SrCarrierVolume"].notna(),
    "vacuumed": df["vacuumed"] == 1
})

# Summarize counts
summary = flags.drop(columns="pseudoid").sum().to_frame(name="number_of_samples")
summary["percentage"] = (summary["number_of_samples"] / len(df) * 100).round(2)

summary


Unnamed: 0,number_of_samples,percentage
did_dry,5290,23.88
did_ash,1141,5.15
did_vaporize,3332,15.04
did_sieve,214,0.97
did_add_carrier,1524,6.88
vacuumed,2072,9.35


Dataset 1 shows that preprocessing procedures are highly heterogeneous and that missing values systematically encode different laboratory workflows rather than random data quality issues. Only about one quarter of samples (≈24%) underwent drying, while more advanced steps such as ashing (≈5%), carrier addition (≈7%), and sieving (≈1%) were applied to relatively small and specific subsets of samples. Vaporization was performed for approximately 15% of samples, and vacuuming for about 9%, indicating additional but non-standard preprocessing variations. These results demonstrate that most samples follow minimal preprocessing pipelines, while a smaller number are processed using specialized or complex workflows. Consequently, preprocessing steps must be explicitly modeled (e.g., via workflow flags) before applying clustering or classification methods, as treating missing values as noise or imputing them indiscriminately would obscure meaningful structure in the data.

By defining workflow flags, we’ve learned:

- Which preprocessing steps were performed

- That missing values are intentional

- That samples follow distinct preprocessing pipelines

So now we stop treating Dataset 1 as “one table” and start treating it as multiple workflows.

In [None]:
# --- Recreate workflow flags ---
df["did_dry"] = df["dryWeightNet"].notna()
df["did_ash"] = df["ashWeightNet"].notna()
df["did_vaporize"] = df["totalVaporizationAmount"].notna() | df["totalVaporizationVolume"].notna()
df["did_sieve"] = (
    df["sievedDryWeight"].notna()
    | df["sievedOrganicWeight"].notna()
    | df["sievedRocksWeight"].notna()
)
df["did_add_carrier"] = df["SrCarrierVolume"].notna()

# --- Consistency checks ---
checks = {}

# 1. Ashing without drying
#checks["ash_without_dry"] = df["did_ash"] & (~df["did_dry"])

# 2. Gross mass smaller than tare mass
checks["gross_less_than_tare"] = (
    df["massInJarGross"].notna()
    & df["massInJarTare"].notna()
    & (df["massInJarGross"] < df["massInJarTare"])
)

# 3. Dry weight larger than net_wet
checks["dry_gt_netwet"] = (
    df["dryWeightNet"].notna()
    & df["weightNet"].notna()
    & (df["dryWeightNet"] > df["weightNet"])
)

# 4. Ash weight larger than dry weight
checks["ash_gt_dry"] = (
    df["ashWeightNet"].notna()
    & df["dryWeightNet"].notna()
    & (df["ashWeightNet"] > df["dryWeightNet"])
)

# 5. Dry weight percent > 100
checks["dry_percent_gt_100"] = (
    df["dryWeightPercent"].notna()
    & (df["dryWeightPercent"] > 100)
)

# 6. Non-positive density
checks["density_nonpositive"] = (
    df["density"].notna()
    & (df["density"] <= 0)
)

# 7. Tare greater than gross
checks["tare_gt_gross"] = (
    df["massInJarGross"].notna()
    & df["massInJarTare"].notna()
    & (df["massInJarTare"] > df["massInJarGross"])
)


# --- Combine checks ---
checks_df = pd.DataFrame(checks)
df["is_consistent"] = ~checks_df.any(axis=1)

# Count violations
violation_summary = checks_df.sum().to_frame(name="number_of_violations")
violation_summary["percentage_of_samples"] = (
    violation_summary["number_of_violations"] / len(df) * 100
).round(2)

violation_summary


Unnamed: 0,number_of_violations,percentage_of_samples
gross_less_than_tare,10,0.05
dry_gt_netwet,8,0.04
ash_gt_dry,0,0.0
dry_percent_gt_100,10,0.05
density_nonpositive,10,0.05
tare_gt_gross,10,0.05
dry_flag_no_mass,0,0.0
