In [None]:

# # ENSDF Data Processing
# 
# Process ENSDF text files: split datasets, parse nuclear data, filter records.


# 1. Basic functions and data loading
from pathlib import Path

def parse_identification_line(line):
    """Parse ID record (first line of dataset)"""
    nucid = line[0:5].strip()     # cols 1-5
    dsid  = line[9:39].strip()    # cols 10-39
    date  = line[66:72].strip()   # cols 67-72
    return nucid, dsid, date

def split_datasets(text_lines):
    """Split by blank lines"""
    datasets = []
    current_block = []
    for line in text_lines:
        if line.strip() == "":
            if current_block:
                datasets.append(current_block)
                current_block = []
        else:
            current_block.append(line.rstrip("\n"))
    if current_block:
        datasets.append(current_block)
    return datasets


# ## Load ENSDF text


file_path = Path("D:/VS CODE W/ENSDF-2005")

lines = file_path.read_text(encoding="utf-8", errors="ignore").splitlines()


datasets = split_datasets(lines)

print(f"Total datasets: {len(datasets)}")


for i, block in enumerate(datasets[:10], start=1):
    nucid, dsid, date = parse_identification_line(block[0])
    print(f"[{i:04d}] NUCID={nucid:<5} DSID={dsid:<30} Date={date}")


comments_only = [
    b for b in datasets if parse_identification_line(b[0])[1].upper() == "COMMENTS"
]

print(f"COMMENTS datasets: {len(comments_only)}")


datasets_no_comments = [
    b for b in datasets if parse_identification_line(b[0])[1].upper() != "COMMENTS"
]
print(f"After removing COMMENTS: {len(datasets_no_comments)}")


from collections import defaultdict

grouped_by_nucid = defaultdict(list)
for block in datasets_no_comments:
    nucid, dsid, date = parse_identification_line(block[0])
    grouped_by_nucid[nucid].append((dsid, date))

sample_nucid = list(grouped_by_nucid.keys())[0]
print(f"Nuclide {sample_nucid} dataset types:")
for dsid, date in grouped_by_nucid[sample_nucid]:
    print(f" - {dsid} ({date})")


Total datasets: 15420
[0001] NUCID=1     DSID=COMMENTS                       Date=
[0002] NUCID=1     DSID=REFERENCES                     Date=
[0003] NUCID=1NN   DSID=ADOPTED LEVELS                 Date=
[0004] NUCID=1H    DSID=ADOPTED LEVELS                 Date=
[0005] NUCID=1H    DSID=1NN B- DECAY                   Date=
[0006] NUCID=2     DSID=COMMENTS                       Date=NSDF
[0007] NUCID=2     DSID=REFERENCES                     Date=NSDF
[0008] NUCID=2H    DSID=ADOPTED LEVELS                 Date=NSDF
[0009] NUCID=2H    DSID=1H(N,G) E=THERMAL              Date=NSDF
[0010] NUCID=3     DSID=COMMENTS                       Date=7NP
COMMENTS datasets: 362
After removing COMMENTS: 15058
Nuclide 1 dataset types:
 - REFERENCES ()


In [2]:
# 2. State machine parsing
WAIT_ID = 0
IN_BLOCK = 1

datasets = []
skipped = []
current_block = []
state = WAIT_ID

SKIP_KEYWORDS = ("COMMENTS", "REFERENCES")

def norm(s: str) -> str:
    s = s.strip()
    s = s.upper()
    parts = s.split()
    return " ".join(parts)

for raw in lines:
    if state == WAIT_ID:
        if raw.strip():
            skip_block = any(keyword in norm(raw) for keyword in SKIP_KEYWORDS)
            current_block = [raw]
            state = IN_BLOCK

    elif state == IN_BLOCK:
        if raw.strip() == "":
            if skip_block:
                skipped.append(current_block)
            else:
                datasets.append(current_block)
            current_block = []
            state = WAIT_ID
        else:
            current_block.append(raw)

if current_block:
    datasets.append(current_block)

import random
print(datasets[:5])
for i, block in enumerate(datasets[:5], start=1):
    print(f"=== Dataset {i}, {len(block)} lines ===")
    for line in block:
        print(line)
    print()

sample_datasets = random.sample(datasets, min(5, len(datasets)))
print(f"Random {len(sample_datasets)} datasets (kept):")
for i, block in enumerate(sample_datasets, start=1):
    print(f"=== Dataset {i}, {len(block)} lines ===")
    for line in block:
        print(line)
    print()

sample_skipped = random.sample(skipped, min(5, len(skipped)))
print(f"Random {len(sample_skipped)} datasets (skipped):")
for i, block in enumerate(sample_skipped, start=1):
    print(f"=== Dataset {i}, {len(block)} lines ===")
    for line in block:
        print(line)
    print()


[['  1NN    ADOPTED LEVELS                                                   200002', '  1NN  H TYP=FUL$AUT=B. Singh$CUT=1-Oct-1999$                                   ', '  1NN C  Neutron discovered by 1932CH02.                                        ', '  1NN C  Instability of neutron first proposed by 1935CH01                      ', '  1NN  Q 782.353   2  0.0                              1995AU04                 ', '  1NN  L 0.0         1/2+              614.6 S   13                             ', '  1NN2 L %B-=100$                                                               ', '  1NN2 L MOMM1=-1.91304275 45 (1998CA52,1987CO39)$                              ', '  1NN CL           Electric dipole moment LT 0.97E-25 ecm (1998CA52,1996AL25).  ', '  1NN CL J$measurements: 1954ST90, 1950HA67, 1937SC09                           ', '  1NN CL T$from evaluation by 1998CA52. This value represents the               ', '  1NN2CL average of the following measured mean lifetimes:               

In [None]:

# 3. Extract L records (flat)

def col(line: str, i: int, j: int) -> str:  # 1-based, inclusive
    return line[i-1:j] if len(line) >= j else line[i-1:]

def is_continuation(line: str, rtype_col: int = 8) -> bool:
    c6 = col(line, 6, 6)
    rt = col(line, rtype_col, rtype_col).strip().upper()
    return bool(c6.strip() and c6 != "1" and rt != "")

RTYPE_COL = 8 

# Flatten all L records (exclude continuations)
L_lines = [
    ln
    for block in datasets
    for ln in block
    if len(ln) >= RTYPE_COL and col(ln, RTYPE_COL, RTYPE_COL).strip().upper() == "L"
       and not is_continuation(ln, RTYPE_COL)
]

# Keep dataset structure
L_in_datasets = [
    [ln for ln in block
     if len(ln) >= RTYPE_COL and col(ln, RTYPE_COL, RTYPE_COL).strip().upper() == "L"
        and not is_continuation(ln, RTYPE_COL)]
    for block in datasets
]


L_lines[:55]

['  1NN  L 0.0         1/2+              614.6 S   13                             ',
 '  1NN CL           Electric dipole moment LT 0.97E-25 ecm (1998CA52,1996AL25).  ',
 '  1NN CL J$measurements: 1954ST90, 1950HA67, 1937SC09                           ',
 '  1NN CL T$from evaluation by 1998CA52. This value represents the               ',
 '  1NN CL           Earlier T1/2 measurements: 1978KU17, 1959DA06, 1959SO15,     ',
 '  1NN CL           Other references relevant to discussion of T1/2 measurements:',
 '  1NN CL           See 1999MC01 and 1989PH02 and also dataset for proton        ',
 '  1NN CL MOMM1$NMR method.                                                      ',
 '  1NN DL $1982GR27: GREENE ET AL, METROLOGIA 18, 93 (1982).                     ',
 '  1NN CL           Electric dipole moment measurements:                         ',
 '  1H   L 0.0         1/2+               STABLE                                  ',
 '  1H  CL           Electric dipole moment=-4E-23 6 ecm (1998CA5

In [None]:

# 4. Extract L records (structured)

def col(line: str, i: int, j: int) -> str:  # 1-based, inclusive
    return line[i-1:j] if len(line) >= j else line[i-1:]

def is_continuation(line: str, rtype_col: int = 8) -> bool:
    c6 = col(line, 6, 6)
    rt = col(line, rtype_col, rtype_col).strip().upper()
    return bool(c6.strip() and c6 != "1" and rt != "")

RTYPE_COL = 8  

# Keep dataset structure
L_in_datasets = []
for block in datasets:
    current_block_L = []
    for ln in block:
        if len(ln) >= RTYPE_COL:
            if col(ln, RTYPE_COL, RTYPE_COL).strip().upper() == "L":
                if not is_continuation(ln, RTYPE_COL):
                    current_block_L.append(ln)
    L_in_datasets.append(current_block_L)


L_lines[:5]
print(len(L_in_datasets))
cnt = 0
for block in L_in_datasets:
    cnt += len(block)
print(cnt)
L_in_datasets[:5]

14791
562146


[['  1NN  L 0.0         1/2+              614.6 S   13                             ',
  '  1NN CL           Electric dipole moment LT 0.97E-25 ecm (1998CA52,1996AL25).  ',
  '  1NN CL J$measurements: 1954ST90, 1950HA67, 1937SC09                           ',
  '  1NN CL T$from evaluation by 1998CA52. This value represents the               ',
  '  1NN CL           Earlier T1/2 measurements: 1978KU17, 1959DA06, 1959SO15,     ',
  '  1NN CL           Other references relevant to discussion of T1/2 measurements:',
  '  1NN CL           See 1999MC01 and 1989PH02 and also dataset for proton        ',
  '  1NN CL MOMM1$NMR method.                                                      ',
  '  1NN DL $1982GR27: GREENE ET AL, METROLOGIA 18, 93 (1982).                     ',
  '  1NN CL           Electric dipole moment measurements:                         '],
 ['  1H   L 0.0         1/2+               STABLE                                  ',
  '  1H  CL           Electric dipole moment=-4E-23 6

In [5]:
# 5. Filter by column 7

col7_vals = set()
for ln in (ln for block in L_in_datasets for ln in block):
    col7_vals.add(col(ln, 7, 7).strip())
print(col7_vals)

RTYPE_COL1 = 7  # Adjust if column 7 is the target

L_in_datasets1 = []
for block in L_in_datasets:
    current_block_L1 = []
    for ln in block:
        if len(ln) >= RTYPE_COL1:
            if col(ln, RTYPE_COL1, RTYPE_COL1).strip() == "L":
                if not is_continuation(ln):
                    current_block_L1.append(ln)
    L_in_datasets1.append(current_block_L1)

print(len(L_in_datasets1))
cnt1 = 0
for block in L_in_datasets1:
    cnt1 += len(block)
print(cnt1)


{'', 'd', 't', 'P', 'C', 'c', 'D', 'T'}
14791
0


In [None]:
# 6. Load and merge CSV files from cleandataf/{year}/
from pathlib import Path
import pandas as pd
import logging
from typing import Iterable, Iterator, List, Optional

YEARS: List[int] = list(range(2005, 2021))
BASE_DIR = Path("D:/VS CODE W/cleandataf")
CHUNKSIZE: Optional[int] = 200_000
MAX_ROWS: Optional[int] = None
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

def iter_year_csv(years: Iterable[int],
                  base_dir: Path,
                  chunksize: Optional[int]) -> Iterator[pd.DataFrame]:
    """
    Stream read CSV files for multiple years (year,dataset_index,line).
    When chunksize is None, read entire file at once.
    """
    dtypes = {"year": "Int32", "dataset_index": "Int32", "line": "string"}
    usecols = ["year", "dataset_index", "line"]
    for y in years:
        p = base_dir / str(y) / "L_in_datasets1.csv"
        if not p.exists():
            logging.warning("missing csv for year=%s: %s", y, p)
            continue
        logging.info("reading %s", p)
        if chunksize is None:
            df = pd.read_csv(
                p, dtype=dtypes, usecols=usecols, encoding="utf-8",
                na_filter=False, on_bad_lines="skip"
            )
            yield df
        else:
            for chunk in pd.read_csv(
                p, dtype=dtypes, usecols=usecols, encoding="utf-8",
                na_filter=False, on_bad_lines="skip", chunksize=chunksize
            ):
                yield chunk


rows_read = 0
chunks: List[pd.DataFrame] = []
for chunk in iter_year_csv(YEARS, BASE_DIR, CHUNKSIZE):
    if MAX_ROWS is not None and rows_read >= MAX_ROWS:
        break
    if MAX_ROWS is not None and rows_read + len(chunk) > MAX_ROWS:
        chunk = chunk.iloc[:(MAX_ROWS - rows_read)]
    chunks.append(chunk)
    rows_read += len(chunk)

df_all = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame(
    columns=["year", "dataset_index", "line"]
).astype({"year": "Int32", "dataset_index": "Int32", "line": "string"})

logging.info("loaded rows=%d", len(df_all))
display(df_all.head(13))
df_all.info()

2025-08-24 22:34:17,415 INFO reading D:\VS CODE W\cleandataf\2005\L_in_datasets1.csv
2025-08-24 22:34:17,945 INFO reading D:\VS CODE W\cleandataf\2006\L_in_datasets1.csv
2025-08-24 22:34:18,446 INFO reading D:\VS CODE W\cleandataf\2007\L_in_datasets1.csv
2025-08-24 22:34:18,986 INFO reading D:\VS CODE W\cleandataf\2008\L_in_datasets1.csv
2025-08-24 22:34:19,519 INFO reading D:\VS CODE W\cleandataf\2009\L_in_datasets1.csv
2025-08-24 22:34:20,095 INFO reading D:\VS CODE W\cleandataf\2010\L_in_datasets1.csv
2025-08-24 22:34:20,660 INFO reading D:\VS CODE W\cleandataf\2011\L_in_datasets1.csv
2025-08-24 22:34:21,240 INFO reading D:\VS CODE W\cleandataf\2012\L_in_datasets1.csv
2025-08-24 22:34:21,833 INFO reading D:\VS CODE W\cleandataf\2013\L_in_datasets1.csv
2025-08-24 22:34:22,423 INFO reading D:\VS CODE W\cleandataf\2014\L_in_datasets1.csv
2025-08-24 22:34:23,038 INFO reading D:\VS CODE W\cleandataf\2015\L_in_datasets1.csv
2025-08-24 22:34:23,676 INFO reading D:\VS CODE W\cleandataf\2016

Unnamed: 0,year,dataset_index,line
0,2005,0,1NN L 0.0 1/2+ 614.6 S...
1,2005,1,1H L 0.0 1/2+ STABLE...
2,2005,2,1H L 0.0 1/2+ STABL...
3,2005,3,2H L 0 1+ STABLE ...
4,2005,4,2H L 0.0 1+ STABLE ...
5,2005,4,"2H L 2224.5725220+,1+ ..."
6,2005,6,3H L 0.0 1/2+ 12.32 Y...
7,2005,7,3H L 0.0 1/2+ 12.33 Y...
8,2005,7,"3H L 6257.2482 241/2+,3/2+ ..."
9,2005,8,3HE L 0.0 1/2+ STABLE ...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7724271 entries, 0 to 7724270
Data columns (total 3 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   year           Int32 
 1   dataset_index  Int32 
 2   line           string
dtypes: Int32(2), string(1)
memory usage: 132.6 MB


In [7]:
# 7. Check loaded data

print(df_all["line"].head(13))

0       1NN  L 0.0         1/2+              614.6 S...
1       1H   L 0.0         1/2+               STABLE...
2       1H   L 0.0           1/2+              STABL...
3       2H   L 0           1+                STABLE ...
4       2H   L 0.0         1+                STABLE ...
5       2H   L  2224.5725220+,1+                    ...
6       3H   L 0.0         1/2+              12.32 Y...
7       3H   L 0.0         1/2+              12.33 Y...
8       3H   L 6257.2482 241/2+,3/2+                ...
9       3HE  L 0.0         1/2+              STABLE ...
10      3HE  L 0.0         1/2+              STABLE ...
11      4H   L 0.0          2-                      ...
12      4H   L  310         1-               6.73 ME...
Name: line, dtype: string


In [8]:
# 8. Parse ENSDF fields

# A (mass number) cols 1-3
df_A = df_all["line"].str.slice(0, 3).str.strip()
df_all["A"] = df_A

# Element symbol cols 3-5
df_elem = df_all["line"].str.slice(3, 5).str.strip()
df_all["elem"] = df_elem

# Record type cols 5-8
df_rtype = df_all["line"].str.slice(5, 8).str.strip()
df_all["rtype"] = df_rtype

# Energy E cols 10-19
df_energy = df_all["line"].str.slice(9, 19).str.strip()
df_all["energy"] = df_energy

# Energy uncertainty DE cols 20-21
df_de = df_all["line"].str.slice(19, 21).str.strip()
df_all["DE"] = df_de

# Spin/parity J cols 22-39
df_spin = df_all["line"].str.slice(21, 39).str.strip()
df_all["spin"] = df_spin

# Half-life/width T cols 40-49
df_life = df_all["line"].str.slice(39, 49).str.strip()
df_all["life"] = df_life

# Half-life uncertainty DT cols 50-55
df_DT = df_all["line"].str.slice(49, 55).str.strip()
df_all["DT"] = df_DT

# Angular momentum transfer L cols 56-64
df_L = df_all["line"].str.slice(55, 64).str.strip()
df_all["L"] = df_L

# Intensity S cols 65-74
df_S = df_all["line"].str.slice(64, 74).str.strip()
df_all["S"] = df_S

# Intensity uncertainty DS cols 75-76
df_DS = df_all["line"].str.slice(74, 76).str.strip()
df_all["DS"] = df_DS

# Comment flag C col 77
df_C = df_all["line"].str.slice(76, 77).str.strip()
df_all["C"] = df_C

# Isomer flag MS cols 78-79
df_MS = df_all["line"].str.slice(77, 79).str.strip()
df_all["MS"] = df_MS

print(df_all.head(13))


    year  dataset_index                                               line  A  \
0   2005              0    1NN  L 0.0         1/2+              614.6 S...  1   
1   2005              1    1H   L 0.0         1/2+               STABLE...  1   
2   2005              2    1H   L 0.0           1/2+              STABL...  1   
3   2005              3    2H   L 0           1+                STABLE ...  2   
4   2005              4    2H   L 0.0         1+                STABLE ...  2   
5   2005              4    2H   L  2224.5725220+,1+                    ...  2   
6   2005              6    3H   L 0.0         1/2+              12.32 Y...  3   
7   2005              7    3H   L 0.0         1/2+              12.33 Y...  3   
8   2005              7    3H   L 6257.2482 241/2+,3/2+                ...  3   
9   2005              8    3HE  L 0.0         1/2+              STABLE ...  3   
10  2005              9    3HE  L 0.0         1/2+              STABLE ...  3   
11  2005             12    4

In [9]:
import re

# 9. Parse nuclide information

# Extract nuclide string
df_nucl = df_all["line"].str.slice(0, 5).str.strip()
df_all["nuclide_raw"] = df_nucl

# Split A and element symbol
nuclide_split = df_nucl.str.extract(r"^\s*(\d+)\s*([A-Z]{1,3})$")
df_all["A_int"] = pd.to_numeric(nuclide_split[0], errors="coerce").astype("Int64")
df_all["elem_sym"] = nuclide_split[1]

# Check A continuity
all_A = sorted(df_all["A_int"].dropna().unique())
gaps = [a for i,a in enumerate(all_A[1:], start=1) if a != all_A[i-1]+1]

print("A range:", min(all_A), "→", max(all_A))
print("Continuous range:", f"{min(all_A)}–{max(all_A)}")
print("Gaps:", gaps)

# Element symbols
all_elems = sorted(df_all["elem_sym"].dropna().unique())
print("Element symbols:")
print(", ".join(all_elems))


A range: 1 → 294
Continuous range: 1–294
Gaps: [288, 290, 292, 294]
Element symbols:
AC, AG, AL, AM, AR, AS, AT, AU, B, BA, BE, BH, BI, BK, BR, C, CA, CD, CE, CF, CL, CM, CN, CO, CR, CS, CU, DB, DS, DY, ER, ES, EU, F, FE, FL, FM, FR, GA, GD, GE, H, HE, HF, HG, HO, HS, I, IN, IR, K, KR, LA, LI, LR, LU, LV, MC, MD, MG, MN, MO, MT, N, NA, NB, ND, NE, NH, NI, NN, NO, NP, O, OG, OS, P, PA, PB, PD, PM, PO, PR, PT, PU, RA, RB, RE, RF, RG, RH, RN, RU, S, SB, SC, SE, SG, SI, SM, SN, SR, TA, TB, TC, TE, TH, TI, TL, TM, TS, U, V, W, XE, Y, YB, ZN, ZR


In [10]:
# 10. Nuclear physics calculations

# Periodic table mapping (symbol → Z)
SYMS = [
    "H","HE","LI","BE","B","C","N","O","F","NE","NA","MG","AL","SI","P","S","CL","AR",
    "K","CA","SC","TI","V","CR","MN","FE","CO","NI","CU","ZN","GA","GE","AS","SE","BR","KR",
    "RB","SR","Y","ZR","NB","MO","TC","RU","RH","PD","AG","CD","IN","SN","SB","TE","I","XE",
    "CS","BA","LA","CE","PR","ND","PM","SM","EU","GD","TB","DY","HO","ER","TM","YB","LU",
    "HF","TA","W","RE","OS","IR","PT","AU","HG","TL","PB","BI","PO","AT","RN",
    "FR","RA","AC","TH","PA","U","NP","PU","AM","CM","BK","CF","ES","FM","MD","NO","LR",
    "RF","DB","SG","BH","HS","MT","DS","RG","CN","NH","FL","MC","LV","TS","OG"
]
SYM2Z = {s:i+1 for i,s in enumerate(SYMS)}
MAGIC = [2, 8, 20, 28, 50, 82, 126]

import numpy as np, pandas as pd, re, math

# Mark special element symbols (not in periodic table, like NN)
df_is_special = ~df_all["elem_sym"].isin(SYM2Z.keys())
df_all["is_special_symbol"] = df_is_special

# Z: map normal elements; keep special ones as NaN
df_Z = df_all["elem_sym"].map(SYM2Z)
df_Z = df_Z.astype("Int64")
df_all["Z_int"] = df_Z

# N = A - Z (only when both A and Z available)
df_N = (df_all["A_int"] - df_all["Z_int"]).where(df_all["A_int"].notna() & df_all["Z_int"].notna())
df_N = df_N.astype("Int64")
df_all["N_int"] = df_N

# Asymmetry (N - Z) / A
df_asym = (df_all["N_int"] - df_all["Z_int"]) / df_all["A_int"]
df_all["asym"] = df_asym

# Parity (odd/even)
df_oddZ = (df_all["Z_int"] % 2).where(df_all["Z_int"].notna()).astype("Int64")
df_oddN = (df_all["N_int"] % 2).where(df_all["N_int"].notna()).astype("Int64")
df_all["is_odd_Z"] = df_oddZ
df_all["is_odd_N"] = df_oddN

# Pairing type (EE/OE/EO/OO)
tmp_pair = df_oddZ.astype("string") + df_oddN.astype("string")
pair_map = {"00":"EE","10":"OE","01":"EO","11":"OO"}
df_pair = tmp_pair.map(pair_map)
df_all["pairing_type"] = df_pair

# Distance to nearest magic numbers ΔZ / ΔN
def _delta_to_nearest_magic(x):
    if pd.isna(x): return np.nan
    m = min(MAGIC, key=lambda k: abs(int(x)-k))
    return int(x) - m

df_dZ = df_all["Z_int"].map(_delta_to_nearest_magic)
df_dN = df_all["N_int"].map(_delta_to_nearest_magic)
df_all["delta_magic_z"] = df_dZ
df_all["delta_magic_n"] = df_dN

# Valence nucleons: absolute distance to nearest magic numbers
df_vp = df_dZ.abs()
df_vn = df_dN.abs()
df_all["valence_p"] = df_vp
df_all["valence_n"] = df_vn

frac_pat = re.compile(r"(?P<num>\d+)(?:/(?P<den>\d+))?(?P<par>[+-])?")

def _j_from_text(t):
    if not isinstance(t, str) or not t.strip(): return np.nan
    s = t.replace("J","").replace("=","")
    m = frac_pat.search(s)
    if not m: return np.nan
    num = int(m.group("num")); den = int(m.group("den")) if m.group("den") else 1
    return num/den

def _par_from_text(t):
    if not isinstance(t, str) or not t.strip(): return 0
    s = t.replace("J","").replace("=","")
    m = frac_pat.search(s)
    if not m: return 0
    p = m.group("par")
    return +1 if p=="+" else -1 if p=="-" else 0

# Parse spin and parity
df_J = df_all["spin"].map(_j_from_text)
df_par = df_all["spin"].map(_par_from_text)
df_all["J_value"] = df_J
df_all["parity_pm"] = df_par  # +1/-1/0(unknown)


In [11]:
# 11. Parse half-life data

UNIT2S = {
    "Y": 365.25*24*3600, "D": 24*3600, "H": 3600, "MIN": 60, "S": 1.0,
    "MS": 1e-3, "US": 1e-6, "NS": 1e-9, "PS": 1e-12, "FS": 1e-15,
}
unit_pat = re.compile(r"(?P<val>[+-]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)\s*(?P<u>[A-Za-z]+)")

# Stability check
df_stable = df_all["life"].str.contains("STABLE", case=False, na=False)
df_all["is_stable"] = df_stable

def _life_to_seconds(t):
    if not isinstance(t, str) or not t.strip(): return np.nan
    if "STABLE" in t.upper(): return np.nan
    m = unit_pat.search(t.upper())
    if not m: return np.nan
    try: v = float(m.group("val"))
    except: return np.nan
    u = (m.group("u")
         .replace("MSEC","MS").replace("USEC","US").replace("NSEC","NS")
         .replace("PSEC","PS").replace("FSEC","FS"))
    return v*UNIT2S[u] if u in UNIT2S else np.nan

df_tau = df_all["life"].map(_life_to_seconds)
df_all["lifetime_s"] = df_tau
df_all["log10_lifetime_s"] = df_tau.map(lambda x: np.log10(x) if np.isfinite(x) else np.nan)


In [12]:
# 12. Summary statistics

# A continuity and gaps
A_vals = sorted(df_all["A_int"].dropna().unique().tolist())
if A_vals:
    # Continuous segments
    segs = []
    start = prev = A_vals[0]
    for a in A_vals[1:]:
        if a == prev + 1:
            prev = a
            continue
        segs.append((start, prev))
        start = prev = a
    segs.append((start, prev))
    # Gap points
    gaps = [A_vals[i] for i in range(1, len(A_vals)) if A_vals[i] != A_vals[i-1] + 1]
    print(f"A range: {A_vals[0]} → {A_vals[-1]}")
    print("Continuous intervals:", ", ".join([f"{s}–{e}" if s!=e else f"{s}" for s,e in segs]))
    print("Gaps:", gaps)
else:
    print("No valid A values found.")

# Element symbols (standard / special)
elems_ok  = sorted(df_all.loc[~df_all["is_special_symbol"], "elem_sym"].dropna().unique().tolist())
elems_bad = sorted(df_all.loc[ df_all["is_special_symbol"], "elem_sym"].dropna().unique().tolist())
print("\nStandard element symbols:")
print(", ".join(elems_ok))
print("\nSpecial/non-standard element symbols:")
print(", ".join(elems_bad) if elems_bad else "(none)")


A range: 1 → 294
Continuous intervals: 1–286, 288, 290, 292, 294
Gaps: [288, 290, 292, 294]

Standard element symbols:
AC, AG, AL, AM, AR, AS, AT, AU, B, BA, BE, BH, BI, BK, BR, C, CA, CD, CE, CF, CL, CM, CN, CO, CR, CS, CU, DB, DS, DY, ER, ES, EU, F, FE, FL, FM, FR, GA, GD, GE, H, HE, HF, HG, HO, HS, I, IN, IR, K, KR, LA, LI, LR, LU, LV, MC, MD, MG, MN, MO, MT, N, NA, NB, ND, NE, NH, NI, NO, NP, O, OG, OS, P, PA, PB, PD, PM, PO, PR, PT, PU, RA, RB, RE, RF, RG, RH, RN, RU, S, SB, SC, SE, SG, SI, SM, SN, SR, TA, TB, TC, TE, TH, TI, TL, TM, TS, U, V, W, XE, Y, YB, ZN, ZR

Special/non-standard element symbols:
NN


In [13]:
# 13. Sample data inspection

print(df_all.sample(30, random_state=421))


         year  dataset_index  \
4947407  2015          12967   
3469287  2012          12905   
7723950  2020          18066   
1997545  2009          10927   
6198352  2018           3530   
1564192  2008          11144   
670330   2006           9857   
6158325  2018           2090   
2934889  2011          11381   
4840027  2015          10113   
5518554  2016          14357   
4658807  2015           4110   
1789998  2009           4147   
5029576  2015          15251   
4712429  2015           5997   
1078798  2007           9639   
5387335  2016          11180   
3068273  2011          15425   
4904549  2015          11966   
7018712  2019          12728   
3149162  2012           2925   
2330237  2010           7308   
3602198  2013           1702   
1218316  2007          13676   
1354109  2008           4392   
6529195  2018          13561   
921405   2007           4190   
6825539  2019           6952   
2957514  2011          11913   
2193365  2010           2351   

       

In [None]:
# 14. Energy field analysis

import re
import pandas as pd
import numpy as np

# Convert energy to string view
energy_s = df_all['energy'].astype(str).str.strip()

# rule: what is "pure number energy" (support scientific notation)
num_pat = re.compile(r"^[+-]?(\d+(\.\d+)?)([eE][+-]?\d+)?$")

is_plain_number = energy_s.map(lambda s: bool(num_pat.fullmatch(s)))
is_empty        = energy_s.eq("") | energy_s.str.lower().isin(["nan", "none", "na", "null"])
is_sp_prefix    = energy_s.str.startswith(("SP+","Sp+","sp+"))         # IAR common form
has_letters     = energy_s.str.contains(r"[A-Za-z]", regex=True)       # Any letters (like +X, AP, SP etc.)
has_weird_plus  = energy_s.str.contains(r"\+\D", regex=True)           # “数值后跟非数字”的 + 标记

# mask used for deleting non-numeric energy rows
mask_non_numeric = (~is_plain_number) & (~is_empty)

# —— Statistical Summary —— #
total = len(df_all)
nn_count = mask_non_numeric.sum()
sp_count = (is_sp_prefix & mask_non_numeric).sum()
letter_count = (has_letters & mask_non_numeric).sum()
weird_plus_count = (has_weird_plus & mask_non_numeric).sum()

print(f"Total rows: {total:,}")
print(f"Non-numeric energy rows: {nn_count:,}  ({nn_count/total:.2%})")
print(f"  Starting with 'SP+' (IAR typical): {sp_count:,}  ({sp_count/max(nn_count,1):.2%} of non-numeric)")
print(f"  Containing letters (+X/AP/labels): {letter_count:,}  ({letter_count/max(nn_count,1):.2%} of non-numeric)")
print(f"  With '+non-digit' pattern: {weird_plus_count:,}  ({weird_plus_count/max(nn_count,1):.2%} of non-numeric)")

# Common prefixes analysis
print("\nCommon non-numeric energy prefixes (Top 15):")
print(energy_s[mask_non_numeric].str.slice(0, 8).value_counts().head(15))

# Random sample inspection
print("\nSample 20 rows:")
cols = ['year','dataset_index','nuclide_raw','elem','rtype','energy','spin','life','line']
print(df_all.loc[mask_non_numeric, cols].sample(20, random_state=42))

# SP+ IAR samples
print("\nSample 10 SP+ IAR rows:")
print(df_all.loc[is_sp_prefix & mask_non_numeric, cols].sample(min(10, sp_count), random_state=0))

# 7) If confirmed to be extremely small proportion, delete directly (two deletion methods):
df_drop_all_nonnumeric = df_all.loc[~mask_non_numeric].copy()
# 2nd option:
df_drop_only_sp = df_all.loc[~(is_sp_prefix & mask_non_numeric)].copy()

print(f"\nAfter removing all non-numeric energies: {len(df_drop_all_nonnumeric):,} rows (removed {nn_count:,} rows)")
print(f"After removing only SP+ IAR: {len(df_drop_only_sp):,} rows (removed {sp_count:,} rows)")


Total rows: 7,724,271
Non-numeric energy rows: 689,704  (8.93%)
  Starting with 'SP+' (IAR typical): 36,568  (5.30% of non-numeric)
  Containing letters (+X/AP/labels): 667,798  (96.82% of non-numeric)
  With '+non-digit' pattern: 500,469  (72.56% of non-numeric)

Common non-numeric energy prefixes (Top 15):
energy
0.0+X    6190
X        5658
0+X      4871
Y        3059
0.0+Y    2168
Z        2027
0+Y      1610
U        1375
0.       1194
V         878
0.0+Z     628
W         607
0+Z       539
S         294
0+U       266
Name: count, dtype: int64

Sample 20 rows:
         year  dataset_index nuclide_raw elem rtype     energy     spin life  \
4927203  2015          12521       161LU   LU     L   1833.1+X  (19/2+)        
2032060  2009          11738       168YB   YB     L   3341.7+X     (23)        
2487547  2010          11805       166TM   TM     L  1599.63+X      14+        
4368425  2014          11036       145TB   TB     L   3732.5+Z     J+10        
6398743  2018          10205  

In [15]:

# 15. Clean dataset after removing non-numeric energies

print(df_drop_all_nonnumeric.sample(30, random_state=421))
print(df_drop_all_nonnumeric.columns)
print(df_drop_all_nonnumeric.head(13))

         year  dataset_index  \
2006538  2009          11136   
4757576  2015           7635   
616718   2006           8168   
4444798  2014          12884   
5042117  2015          15785   
5950198  2017          12275   
7603010  2020          13883   
4797154  2015           8900   
5105458  2016           1748   
1572880  2008          11341   
1961744  2009           9958   
4009139  2013          14376   
5130114  2016           2637   
4111067  2014           2365   
3554556  2012          15834   
2300384  2010           6256   
5318534  2016           9239   
6826508  2019           6998   
7299329  2020           4785   
7583873  2020          13471   
497066   2006           3801   
7082779  2019          14224   
2762532  2011           6080   
1956197  2009           9793   
5631720  2017           1984   
778674   2006          12773   
2040064  2009          11950   
6635582  2018          17028   
3513386  2012          14048   
5420391  2016          12050   

       

In [None]:
# 16. Clean spin uncertainties

# Convert spin column to string
s_raw = df_drop_all_nonnumeric['spin'].astype(str)

# Remove parentheses version
s_clean = s_raw.str.replace(r'[()]', '', regex=True).str.strip()

# Mark as uncertain if before/after cleaning differs
df_drop_all_nonnumeric['spin_is_uncertain'] = (s_raw != s_clean).astype('int8')

# Update spin column to cleaned version (parentheses removed)
df_drop_all_nonnumeric['spin'] = s_clean

# Random sample check
df_drop_all_nonnumeric[['spin', 'spin_is_uncertain']].sample(30, random_state=421)


Unnamed: 0,spin,spin_is_uncertain
2006538,17/2+,1
4757576,5/2+,1
616718,35/2+,1
4444798,"7/2,9/2+",1
5042117,1-,0
5950198,"3/2+,5/2+,7/2+",0
7603010,51/2+,1
4797154,,0
5105458,2+,0
1572880,27/2-,0


In [None]:
# 17. Analyze uncertainty field patterns

import re

dt_s = df_drop_all_nonnumeric['DT'].astype(str).str.strip()

# Regex patterns
num_pat   = re.compile(r"^[+-]?(\d+(\.\d+)?)([eE][+-]?\d+)?$")  # pure numbers
asym_pat  = re.compile(r"^\+\d+-\d+$")                         # +a-b asymmetric error
label_pat = re.compile(r"^[A-Za-z]+")                          # Labels starting with letters

is_number = dt_s.map(lambda s: bool(num_pat.fullmatch(s)))
is_asym   = dt_s.map(lambda s: bool(asym_pat.fullmatch(s)))
is_label  = dt_s.map(lambda s: bool(label_pat.match(s)))


is_blank  = df_drop_all_nonnumeric['DT'].astype(str).str.fullmatch(r"\s*")

total = len(dt_s)
print(f"Total rows: {total:,}")
print(f"Numeric: {is_number.sum():,} ({is_number.mean():.2%})")
print(f"Asymmetric error (+a-b): {is_asym.sum():,} ({is_asym.mean():.2%})")
print(f"Label type (LT/GE/AP etc): {is_label.sum():,} ({is_label.mean():.2%})")
print(f"Blank (whitespace only): {is_blank.sum():,} ({is_blank.mean():.2%})")

# Other (truly unmatched)
mask_other = ~(is_number | is_asym | is_label | is_blank)
print(f"Other (unmatched non-blank): {mask_other.sum():,} ({mask_other.mean():.2%})")


Total rows: 7,034,567
Numeric: 553,403 (7.87%)
Asymmetric error (+a-b): 76,818 (1.09%)
Label type (LT/GE/AP etc): 159,227 (2.26%)
Blank (whitespace only): 6,244,960 (88.78%)
Other (unmatched non-blank): 159 (0.00%)


In [None]:
# 18. Clean uncertainty values

import re
import numpy as np

# Regex for pure numbers
num_pat = re.compile(r"^[+-]?(\d+(\.\d+)?)([eE][+-]?\d+)?$")

# Convert to string view
dt_s = df_drop_all_nonnumeric['DT'].astype(str).str.strip()

# Mask: which are pure numbers
mask_numeric = dt_s.map(lambda s: bool(num_pat.fullmatch(s)))

# Keep rows, only change column: non-numeric DT -> NaN
df_drop_all_nonnumeric.loc[~mask_numeric, 'DT'] = np.nan

# Convert numeric column to float type (NaN compatible)
df_drop_all_nonnumeric['DT'] = df_drop_all_nonnumeric['DT'].astype(float)

print("Conversion complete: numeric kept as float, non-numeric changed to NaN, all rows preserved.")
print(df_drop_all_nonnumeric['DT'].describe())
import re

dt_s = df_drop_all_nonnumeric['DT'].astype(str).str.strip()

# Regex patterns
num_pat   = re.compile(r"^[+-]?(\d+(\.\d+)?)([eE][+-]?\d+)?$")  # pure numbers
asym_pat  = re.compile(r"^\+\d+-\d+$")                         # +a-b asymmetric error
label_pat = re.compile(r"^[A-Za-z]+")                          # Labels starting with letters

is_number = dt_s.map(lambda s: bool(num_pat.fullmatch(s)))
is_asym   = dt_s.map(lambda s: bool(asym_pat.fullmatch(s)))
is_label  = dt_s.map(lambda s: bool(label_pat.match(s)))

is_blank  = df_drop_all_nonnumeric['DT'].astype(str).str.fullmatch(r"\s*")

total = len(dt_s)
print(f"Total rows: {total:,}")
print(f"Numeric: {is_number.sum():,} ({is_number.mean():.2%})")
print(f"Asymmetric error (+a-b): {is_asym.sum():,} ({is_asym.mean():.2%})")
print(f"Label type (LT/GE/AP etc): {is_label.sum():,} ({is_label.mean():.2%})")
print(f"Blank (whitespace only): {is_blank.sum():,} ({is_blank.mean():.2%})")

# Other (truly unmatched)
mask_other = ~(is_number | is_asym | is_label | is_blank)
print(f"Other (unmatched non-blank): {mask_other.sum():,} ({mask_other.mean():.2%})")



Conversion complete: numeric kept as float, non-numeric changed to NaN, all rows preserved.
count    553403.000000
mean         11.652053
std          18.392009
min           1.000000
25%           4.000000
50%           7.000000
75%          14.000000
max        1000.000000
Name: DT, dtype: float64
Total rows: 7,034,567
Numeric: 553,403 (7.87%)
Asymmetric error (+a-b): 0 (0.00%)
Label type (LT/GE/AP etc): 6,481,164 (92.13%)
Blank (whitespace only): 0 (0.00%)
Other (unmatched non-blank): 0 (0.00%)


In [None]:

# 19. Final cleaned dataset

print(df_drop_all_nonnumeric.sample(30, random_state=421))
print(df_drop_all_nonnumeric.columns)
print(df_drop_all_nonnumeric.head(13))

         year  dataset_index  \
2006538  2009          11136   
4757576  2015           7635   
616718   2006           8168   
4444798  2014          12884   
5042117  2015          15785   
5950198  2017          12275   
7603010  2020          13883   
4797154  2015           8900   
5105458  2016           1748   
1572880  2008          11341   
1961744  2009           9958   
4009139  2013          14376   
5130114  2016           2637   
4111067  2014           2365   
3554556  2012          15834   
2300384  2010           6256   
5318534  2016           9239   
6826508  2019           6998   
7299329  2020           4785   
7583873  2020          13471   
497066   2006           3801   
7082779  2019          14224   
2762532  2011           6080   
1956197  2009           9793   
5631720  2017           1984   
778674   2006          12773   
2040064  2009          11950   
6635582  2018          17028   
3513386  2012          14048   
5420391  2016          12050   

       

In [None]:
# 20. Export final dataset

# notebook cell
from pathlib import Path

# Target output prefix (absolute path)
out_prefix = Path("D:/VS CODE W/outputs/clean_levels")
out_prefix.parent.mkdir(parents=True, exist_ok=True)

# Save the cleaned dataframe from notebook
df_drop_all_nonnumeric.to_csv(out_prefix.with_suffix(".csv"), index=False)
df_drop_all_nonnumeric.to_feather(out_prefix.with_suffix(".feather"))  # Requires pyarrow installation