In [8]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=pd.errors.ParserWarning)

print("pandas:", pd.__version__)

pandas: 2.3.2


In [9]:
# Load Dataset 1 and canonicalise the header: trimmed, lower-cased, spaces removed
# (e.g. " Pseudo ID " -> "pseudoid").
d1 = pd.read_csv("dataset1_280126.csv")
d1.columns = d1.columns.astype(str).str.strip().str.lower().str.replace(" ", "", regex=False)

# ensure pseudoid exists: otherwise adopt the first column whose name contains
# both "pseudo" and "id"
if "pseudoid" not in d1.columns:
    guess = next((c for c in d1.columns if "pseudo" in c and "id" in c), None)
    if guess is None:
        # A bare next() on an exhausted generator raises an uninformative
        # StopIteration; fail with the list of columns that were actually found.
        raise KeyError(
            "dataset1_280126.csv has no 'pseudoid'-like column; found: "
            + ", ".join(map(str, d1.columns))
        )
    d1 = d1.rename(columns={guess: "pseudoid"})

# Ids are handled as trimmed text.
# NOTE: astype(str) turns a missing id into the literal text "nan".
d1["pseudoid"] = d1["pseudoid"].astype(str).str.strip()

print("D1 BEFORE:", d1.shape)
d1.head()

D1 BEFORE: (22154, 23)


Unnamed: 0,pseudoid,weightnet,vacuumed,quantum,quantumunit,geometry,height,density,massinjar,massinjartare,...,totalvaporizationamount,totalvaporizationvolume,ashweightnet,amounttoashing,ashconcentration,srcarrierconcentration,srcarriervolume,sievedorganicweight,sievedrocksweight,sieveddryweight
0,51055,,0,29093.1,m^3,g-3,3.0,0.7,,,...,,,,,,,,,,
1,52642,,0,5441.8,m^3,g-7,0.0,1.0,,,...,,,,,,,,,,
2,60026,,0,28689.6,m^3,g-3,3.0,0.7,,,...,,,,,,,,,,
3,61469,,0,5569.4,m^3,g-7,0.0,1.0,,,...,,,,,,,,,,
4,77085,,0,1.0,m^2,g-3,1.0,0.12,0.17,11.3096,...,33320.0,,,,,1.0,2.5,,,


In [10]:
# Clean Dataset 1 by dropping rows that carry no usable data.
# Work on a copy so the raw frame `d1` stays untouched and this cell can be re-run.
d1p = d1.copy()

# ensure required columns exist (so the filters below never fail on a missing column)
for c in ["vacuumed", "quantum", "quantumunit", "geometry", "height", "density"]:
    if c not in d1p.columns:
        d1p[c] = np.nan

# normalize types used in filters
# vacuumed: unparseable or missing values count as 0
d1p["vacuumed"] = pd.to_numeric(d1p["vacuumed"], errors="coerce").fillna(0).astype(int)
# numeric fields: unparseable values become NaN
d1p["quantum"] = pd.to_numeric(d1p["quantum"], errors="coerce")
d1p["height"] = pd.to_numeric(d1p["height"], errors="coerce")
d1p["density"] = pd.to_numeric(d1p["density"], errors="coerce")
# quantumunit: astype(str) renders missing values as text, so map the
# placeholder strings back to NaN after trimming and lower-casing
d1p["quantumunit"] = d1p["quantumunit"].astype(str).str.strip().str.lower()
d1p.loc[d1p["quantumunit"].isin(["", "nan", "none"]), "quantumunit"] = np.nan

before = len(d1p)

# Rule 1: remove rows where only pseudoid & vacuumed exist
# (every other column is NaN)
cols_other_than_id_vac = [c for c in d1p.columns if c not in ["pseudoid", "vacuumed"]]
rule_only_id_vac = d1p[cols_other_than_id_vac].isna().all(axis=1)

# Rule 2: remove "laaduton" placeholder rows, i.e. rows with
# quantumunit == "laaduton", density == 1 and quantum == 1 that hold
# no value in any column outside `allowed`
allowed = ["pseudoid", "vacuumed", "quantum", "quantumunit", "geometry", "height", "density"]
outside_allowed = [c for c in d1p.columns if c not in allowed]
nothing_outside = d1p[outside_allowed].isna().all(axis=1)

rule_laaduton_placeholder = (
    (d1p["quantumunit"] == "laaduton") &
    (d1p["density"] == 1) &
    (d1p["quantum"] == 1) &
    nothing_outside
)

# Apply: a row is dropped when either rule matches
drop_mask = rule_only_id_vac | rule_laaduton_placeholder
d1p = d1p.loc[~drop_mask].copy()

after = len(d1p)

# Report: per-rule counts plus the overall number of rows removed
report_d1 = pd.DataFrame({
    "removed_reason": [
        "only pseudoid+vacuumed present",
        "laaduton placeholder pattern",
        "TOTAL removed"
    ],
    "rows_removed": [
        int(rule_only_id_vac.sum()),
        int(rule_laaduton_placeholder.sum()),
        int(before - after)
    ]
})

# Guard the percentage: an empty input frame would otherwise raise ZeroDivisionError.
removed_pct = round(100*(before-after)/before, 2) if before else 0.0

print("D1 AFTER: ", d1p.shape)
print("Removed %:", removed_pct, "%")
display(report_d1)

D1 AFTER:  (20095, 23)
Removed %: 9.29 %


Unnamed: 0,removed_reason,rows_removed
0,only pseudoid+vacuumed present,1323
1,laaduton placeholder pattern,736
2,TOTAL removed,2059


In [11]:
# Persist the cleaned Dataset 1 frame; index=False keeps the row index out of the file.
d1p.to_csv(path_or_buf="dataset1_preprocessed.csv", index=False)
print("Saved dataset1_preprocessed.csv")

Saved dataset1_preprocessed.csv


In [12]:
# Load Dataset 3. The file is read without a header row and the delimiter is
# sniffed (sep=None requires the python engine).
#
# on_bad_lines="warn" skips malformed lines and reports each one through a
# pandas ParserWarning. This notebook ignores ParserWarning globally in its
# first cell, so the warnings are captured here and summarised; otherwise rows
# would be dropped with no trace at all.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always", category=pd.errors.ParserWarning)
    d3 = pd.read_csv(
        "dataset3.csv",
        engine="python",
        sep=None,
        header=None,      # keep first row as data
        on_bad_lines="warn",
        encoding="utf-8"
    )

# Count the parser warnings; pass any unrelated warning back to the normal machinery.
bad_line_warnings = []
for w in caught:
    if issubclass(w.category, pd.errors.ParserWarning):
        bad_line_warnings.append(w)
    else:
        warnings.warn_explicit(w.message, w.category, w.filename, w.lineno)
if bad_line_warnings:
    print("D3 ParserWarnings while reading (malformed lines skipped):", len(bad_line_warnings))

# Only the first six columns are used; fail clearly if the file is narrower.
if d3.shape[1] < 6:
    raise ValueError(f"dataset3.csv: expected at least 6 columns, found {d3.shape[1]}")

d3 = d3.iloc[:, :6].copy()
d3.columns = ["pseudoid", "nuclide", "result", "quantum_unit", "uncertainty", "method"]

# Ids are handled as trimmed text.
# NOTE: astype(str) turns a missing id into the literal text "nan".
d3["pseudoid"] = d3["pseudoid"].astype(str).str.strip()

print("D3 BEFORE:", d3.shape)
d3.head()


D3 BEFORE: (98096, 6)


Unnamed: 0,pseudoid,nuclide,result,quantum_unit,uncertainty,method
0,50001,K-40,1121.5664549703706,Bq/kg kp,4.1 %,gamma
1,50001,Pb-210,88.54319706216569,Bq/kg kp,14.0 %,gamma
2,50001,Bi-214,50.78390397259847,Bq/kg kp,5.1 %,gamma
3,50001,Pb-214,51.06821236564313,Bq/kg kp,5.0 %,gamma
4,50001,Cs-137,0.409496306079783,Bq/kg kp,19.0 %,gamma


In [13]:
# Clean Dataset 3: coerce numeric fields and drop rows without an id or a nuclide.
# Work on a copy so the raw frame `d3` stays untouched and this cell can be re-run.
d3p = d3.copy()

# Text values that stand for "missing" once a column has been cast with astype(str).
# Matching is case-insensitive because str(None) is "None", not "none".
placeholders = ["", "nan", "none"]

# numeric conversion (unparseable values become NaN)
d3p["result"] = pd.to_numeric(d3p["result"], errors="coerce")

# uncertainty arrives as text such as "4.1 %": drop the percent sign, then parse
d3p["uncertainty"] = (
    d3p["uncertainty"].astype(str)
        .str.replace("%", "", regex=False)
        .str.strip()
)
d3p.loc[d3p["uncertainty"].str.lower().isin(placeholders), "uncertainty"] = np.nan
d3p["uncertainty"] = pd.to_numeric(d3p["uncertainty"], errors="coerce")

# normalize text fields lightly
d3p["nuclide"] = d3p["nuclide"].astype(str).str.strip()
d3p.loc[d3p["nuclide"].str.lower().isin(placeholders), "nuclide"] = np.nan

# pseudoid was cast to str when the file was loaded, so a missing id is the text
# "nan" and dropna() would never remove it. Map the placeholders back to NaN first.
d3p["pseudoid"] = d3p["pseudoid"].astype(str).str.strip()
d3p.loc[d3p["pseudoid"].str.lower().isin(placeholders), "pseudoid"] = np.nan

before = len(d3p)
d3p = d3p.dropna(subset=["pseudoid", "nuclide"]).copy()
after = len(d3p)

# Guard the percentage: an empty input frame would otherwise raise ZeroDivisionError.
percent_removed = round(100*(before-after)/before, 2) if before else 0.0

report_d3 = pd.DataFrame({
    "metric": ["rows_before", "rows_after", "rows_removed", "percent_removed"],
    "value": [before, after, before-after, percent_removed]
})

print("D3 AFTER:", d3p.shape)
display(report_d3)

D3 AFTER: (98059, 6)


Unnamed: 0,metric,value
0,rows_before,98096.0
1,rows_after,98059.0
2,rows_removed,37.0
3,percent_removed,0.04


In [14]:
# Persist the cleaned Dataset 3 frame; index=False keeps the row index out of the file.
d3p.to_csv(path_or_buf="dataset3_preprocessed.csv", index=False)
print("Saved dataset3_preprocessed.csv")

Saved dataset3_preprocessed.csv
