In [3]:
import polars as pl
import os
import glob

In [None]:
# Prediction tables written at the 1e-08 fp-cut, globbed one level below each
# damage-type directory (FFPE first, then OxoG).
ffpe_pred_path, oxog_pred_path = map(
	glob.glob,
	(
		"../ffpe-snvf/*/*.pred_fp-cut_1e-08.tsv",
		"../oxog-snvf/*/*.pred_fp-cut_1e-08.tsv",
	),
)

In [13]:
def calc_arti_prop(artifact_pred_paths: list) -> pl.DataFrame:
	"""Summarise, per prediction file, how many SNVs were predicted to be artifacts.

	Each path is read as a tab-separated table that must contain a ``pred``
	column. Rows where ``pred`` is False are counted as artifacts; the
	remaining rows are counted as real.

	Args:
		artifact_pred_paths: Paths to per-sample prediction TSV files. The
			sample name is taken as the file's base name up to its first ".".

	Returns:
		A frame with one row per input path and the columns ``sample_name``,
		``n_snv``, ``n_real``, ``n_artifacts`` and ``proportion``
		(``n_artifacts / n_snv``), sorted by ``proportion`` in descending
		order. ``proportion`` is null for a table with no rows, and those
		samples are sorted last. An empty ``artifact_pred_paths`` yields an
		empty frame with the same columns.
	"""
	# Fixed schema: keeps column names and dtypes stable even when there are
	# no rows at all or every proportion is null, so the sort below cannot
	# fail on a missing "proportion" column.
	schema = {
		"sample_name": pl.Utf8,
		"n_snv": pl.Int64,
		"n_real": pl.Int64,
		"n_artifacts": pl.Int64,
		"proportion": pl.Float64,
	}

	arti_prop = []

	for path in artifact_pred_paths:

		artifact_pred = pl.read_csv(path, separator="\t")

		snvs = artifact_pred.shape[0]
		# Explicit comparison with False (rather than negation) so rows with a
		# null `pred` are not counted as artifacts.
		artifacts = artifact_pred.filter(pl.col("pred") == False).shape[0]

		sample_stats = {
			"sample_name" : os.path.basename(path).split(".")[0],
			"n_snv" : snvs,
			"n_real" : (snvs - artifacts),
			"n_artifacts" : artifacts,
			# Undefined (null) rather than a ZeroDivisionError when the table has no SNVs.
			"proportion" : (artifacts / snvs) if snvs else None
		}

		arti_prop.append(sample_stats)

	arti_prop = pl.DataFrame(arti_prop, schema=schema).sort(
		"proportion", descending=True, nulls_last=True
	)
	return arti_prop

In [19]:
def get_stats(arti_prop: pl.DataFrame) -> None:
	"""Print cohort-level summary statistics of per-sample artifact proportions.

	Args:
		arti_prop: Frame with one row per sample and a numeric ``proportion``
			column holding the fraction of SNVs predicted to be artifacts.

	Prints the number of samples, the number (and percentage) of samples whose
	proportion is greater than zero, and the mean and maximum proportion as
	percentages. When the frame is empty, or holds no non-null proportion,
	only the sample count and a short notice are printed.
	"""
	n_samples = arti_prop.shape[0]
	print(f"Samples analyzed: {n_samples}")

	# mean()/max() return None when there is no non-null value to aggregate;
	# multiplying that by 100 (or dividing by zero samples) would raise.
	mean_prop = arti_prop["proportion"].mean()
	max_prop = arti_prop["proportion"].max()
	if n_samples == 0 or mean_prop is None or max_prop is None:
		print("No artifact proportions available; summary statistics skipped.")
		return

	mean_pct = mean_prop * 100
	max_pct = max_prop * 100
	n_with_artifacts = arti_prop.filter(pl.col("proportion") > 0).shape[0]

	print(f"Samples with ≥1 predicted artifact: {n_with_artifacts} ({n_with_artifacts / n_samples * 100:.1f}%)")
	print(f"Mean proportion of artifactual SNVs: {mean_pct:.2f}%")
	print(f"Max proportion of artifactual SNVs: {max_pct:.2f}%")

In [62]:
def get_res(damage_type: str, fp_cut: float) -> pl.DataFrame:
	"""Compute, report and save per-sample artifact proportions for one damage type.

	Prediction files are looked up with the glob pattern
	``../<damage_type lower-cased>-snvf/*/*.pred_fp-cut_<fp_cut>.tsv``, where
	``fp_cut`` is formatted as ``.0e`` (e.g. ``1e-08``, ``5e-01``).

	Args:
		damage_type: Damage type used in the input directory name (lower-cased)
			and, as given, in the output file name.
		fp_cut: Cut-off value used to select the prediction files and to tag
			the output file.

	Returns:
		The per-sample proportion table produced by ``calc_arti_prop``. The
		same table is written to
		``<damage_type>_proportions_per_sample.fp-cut_<fp_cut>.tsv`` in the
		current working directory.

	Raises:
		FileNotFoundError: If no prediction file matches the glob pattern.
	"""
	# Format the cut-off once so the input pattern and output name always agree.
	fp_cut_tag = f"{fp_cut:.0e}"
	pattern = f"../{damage_type.lower()}-snvf/*/*.pred_fp-cut_{fp_cut_tag}.tsv"

	arti_pred_paths = glob.glob(pattern)
	if not arti_pred_paths:
		# Fail early with the offending pattern instead of an opaque error
		# from building and sorting an empty table further down.
		raise FileNotFoundError(f"No prediction files match pattern: {pattern}")

	proportions = calc_arti_prop(arti_pred_paths)

	print(f"fp-cut: {fp_cut}")
	get_stats(proportions)

	proportions.write_csv(f"{damage_type}_proportions_per_sample.fp-cut_{fp_cut_tag}.tsv", separator="\t")
	return proportions

## FFPE

In [58]:
# FFPE artifacts at the stringent cut-off.
get_res(damage_type="ffpe", fp_cut=1e-08)

fp-cut: 1e-08
Samples analyzed: 425
Samples with ≥1 predicted artifact: 81 (19.1%)
Mean proportion of artifactual SNVs: 0.93%
Max proportion of artifactual SNVs: 25.00%


sample_name,n_snv,n_real,n_artifacts,proportion
str,i64,i64,i64,f64
"""ORD-1010565-01""",12,9,3,0.25
"""ORD-1587905-01""",41,32,9,0.219512
"""ORD-1597650-01""",43,36,7,0.162791
"""ORD-1222219-01""",35,30,5,0.142857
"""ORD-1472408-01""",36,31,5,0.138889
…,…,…,…,…
"""ORD-1042000-01""",35,35,0,0.0
"""ORD-1555240-01""",11,11,0,0.0
"""ORD-1472386-01""",45,45,0,0.0
"""ORD-1458564-01""",20,20,0,0.0


In [59]:
# FFPE artifacts at the relaxed cut-off.
get_res(damage_type="ffpe", fp_cut=5e-01)

fp-cut: 0.5
Samples analyzed: 425
Samples with ≥1 predicted artifact: 24 (5.6%)
Mean proportion of artifactual SNVs: 0.26%
Max proportion of artifactual SNVs: 25.00%


sample_name,n_snv,n_real,n_artifacts,proportion
str,i64,i64,i64,f64
"""ORD-1010565-01""",12,9,3,0.25
"""ORD-1587905-01""",41,36,5,0.121951
"""ORD-1595187-01""",66,60,6,0.090909
"""ORD-1472408-01""",36,33,3,0.083333
"""ORD-1474214-01""",36,34,2,0.055556
…,…,…,…,…
"""ORD-1042000-01""",35,35,0,0.0
"""ORD-1555240-01""",11,11,0,0.0
"""ORD-1472386-01""",45,45,0,0.0
"""ORD-1458564-01""",20,20,0,0.0


#### Observations - FFPE

**2025/10/24**

204 samples analyzed so far.

- Using a stringent fp-cut of 1e-08:
	- 39/204 samples (19.1%) were predicted to contain at least one artifact.
	- The per-sample proportion of artifactual SNVs had a maximum of 25% and a mean of 0.94%.

- Using a relaxed fp-cut of 5e-01:
	- 9/204 samples (4.4%) were predicted to contain at least one artifact.
	- The per-sample proportion of artifactual SNVs had a maximum of 25% and a mean of 0.24%.


**2025/10/27**

425 samples analyzed so far.

- Using a stringent fp-cut of 1e-08:
	- 81/425 samples (19.1%) were predicted to contain at least one artifact.
	- The per-sample proportion of artifactual SNVs had a maximum of 25% and a mean of 0.93%.

- Using a relaxed fp-cut of 5e-01:
	- 24/425 samples (5.6%) were predicted to contain at least one artifact.
	- The per-sample proportion of artifactual SNVs had a maximum of 25% and a mean of 0.26%.

## OxoG

In [60]:
# OxoG artifacts at the stringent cut-off.
get_res(damage_type="oxog", fp_cut=1e-08)

fp-cut: 1e-08
Samples analyzed: 425
Samples with ≥1 predicted artifact: 36 (8.5%)
Mean proportion of artifactual SNVs: 0.34%
Max proportion of artifactual SNVs: 25.00%


sample_name,n_snv,n_real,n_artifacts,proportion
str,i64,i64,i64,f64
"""ORD-1010565-01""",12,9,3,0.25
"""ORD-1050682-01""",8,7,1,0.125
"""ORD-1487773-01""",41,37,4,0.097561
"""ORD-1521390-01""",40,37,3,0.075
"""ORD-1127305-01""",31,29,2,0.064516
…,…,…,…,…
"""ORD-1601357-01""",21,21,0,0.0
"""ORD-1042000-01""",35,35,0,0.0
"""ORD-1555240-01""",11,11,0,0.0
"""ORD-1472386-01""",45,45,0,0.0


In [61]:
# OxoG artifacts at the relaxed cut-off.
get_res(damage_type="oxog", fp_cut=5e-01)

fp-cut: 0.5
Samples analyzed: 425
Samples with ≥1 predicted artifact: 13 (3.1%)
Mean proportion of artifactual SNVs: 0.16%
Max proportion of artifactual SNVs: 25.00%


sample_name,n_snv,n_real,n_artifacts,proportion
str,i64,i64,i64,f64
"""ORD-1010565-01""",12,9,3,0.25
"""ORD-1487773-01""",41,38,3,0.073171
"""ORD-1127305-01""",31,29,2,0.064516
"""ORD-1539713-01""",20,19,1,0.05
"""ORD-1521390-01""",40,38,2,0.05
…,…,…,…,…
"""ORD-1042000-01""",35,35,0,0.0
"""ORD-1555240-01""",11,11,0,0.0
"""ORD-1472386-01""",45,45,0,0.0
"""ORD-1458564-01""",20,20,0,0.0


#### Observations - OxoG

**2025/10/24**

204 samples analyzed so far.

- Using a stringent fp-cut of 1e-08:
	- 15/204 samples (7.4%) were predicted to contain at least one artifact.
	- The proportion of artifactual SNVs within a sample had a maximum of 25% and a mean of 0.37%.

- Using a relaxed fp-cut of 5e-01:
	- 5/204 samples (2.5%) were predicted to contain at least one artifact.
	- The proportion of artifactual SNVs within a sample had a maximum of 25% and a mean of 0.20%.

**2025/10/27**

425 samples analyzed so far.

- Using a stringent fp-cut of 1e-08:
	- 36/425 samples (8.5%) were predicted to contain at least one artifact.
	- The proportion of artifactual SNVs within a sample had a maximum of 25% and a mean of 0.34%.

- Using a relaxed fp-cut of 5e-01:
	- 13/425 samples (3.1%) were predicted to contain at least one artifact.
	- The proportion of artifactual SNVs within a sample had a maximum of 25% and a mean of 0.16%.