In [1]:
import pandas as pd
import numpy as np

In [2]:
def is_measurement_valid(measurement: float) -> bool:
	"""Tell whether a measurement lies strictly inside the open interval (0, 15).

	NaN is never reported as valid, because every comparison against NaN
	evaluates to False.
	"""
	above_lower_bound = measurement > 0.0
	below_upper_bound = measurement < 15.0
	return above_lower_bound and below_upper_bound

def is_measurement_invalid(measurement: float) -> bool:
	"""Tell whether a measurement falls outside the open interval (0, 15).

	This is deliberately not written as the negation of is_measurement_valid:
	for NaN both comparisons below are False, so a missing value is reported
	as neither valid nor invalid.
	"""
	at_or_below_lower_bound = measurement <= 0.0
	at_or_above_upper_bound = measurement >= 15.0
	return at_or_below_lower_bound or at_or_above_upper_bound



In [3]:
# Names of the four numeric measurement columns; reused by the cells below.
measurements_columns_names: list[str] = [
	"sepal.length",
	"sepal.width",
	"petal.length",
	"petal.width",
]

# Load the raw data set. The listed placeholder strings are read as NaN.
# Every measurement column is parsed as float64, while "variety" is kept as a
# plain string (a pd.CategoricalDtype over Setosa/Versicolor/Virginica would be
# a stricter alternative).
rawIrisData: pd.DataFrame = pd.read_csv(
	"iris_with_errors.csv",
	na_values={"NA", "-", "n/a", "na"},
	dtype={
		**dict.fromkeys(measurements_columns_names, np.float64),
		"variety": str,
	},
)
rawIrisData

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,15.6,2.3,Virginica


In [4]:
# List, per measurement column, the raw values that fall outside (0, 15).
for columnName in measurements_columns_names:
	print(
		columnName,
		[value for value in rawIrisData[columnName] if is_measurement_invalid(value)],
	)

sepal.length [30.9, 0.0]
sepal.width [-1.0, -2.7]
petal.length [-6.1, 15.6]
petal.width []


In [5]:
def calculate_mean_of_valid_measurements(data: pd.Series) -> float:
	"""Return the arithmetic mean of the valid entries of a column.

	Missing values are dropped first, then only the entries accepted by
	is_measurement_valid contribute to the mean.
	"""
	valid_values = [
		value
		for value in data.dropna()
		if is_measurement_valid(value)
	]
	return np.mean(valid_values)
# Mean of the valid values of each measurement column, keyed by column name.
valid_measurements_means: dict[str, float] = {
	column_name: calculate_mean_of_valid_measurements(rawIrisData[column_name])
	for column_name in measurements_columns_names
}

In [6]:
# Work on a copy so the raw frame stays untouched.
irisData: pd.DataFrame = rawIrisData.copy()
# Replace every out-of-range value with the mean of its column's valid values.
# NaN entries are not replaced: is_measurement_invalid returns False for NaN.
for columnName in measurements_columns_names:
	irisData[columnName] = [
		valid_measurements_means[columnName] if is_measurement_invalid(value) else value
		for value in irisData[columnName]
	]
irisData

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.400000,0.2,Setosa
1,4.9,3.0,1.400000,0.2,Setosa
2,4.7,3.2,1.300000,0.2,Setosa
3,4.6,3.1,1.500000,0.2,Setosa
4,5.0,3.6,1.400000,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.200000,2.3,Virginica
146,6.3,2.5,5.000000,1.9,Virginica
147,6.5,3.0,5.200000,2.0,Virginica
148,6.2,3.4,3.777551,2.3,Virginica


In [7]:
# Re-run the range check on the cleaned frame; every list should now be empty.
for columnName in measurements_columns_names:
	print(
		columnName,
		[value for value in irisData[columnName] if is_measurement_invalid(value)],
	)

sepal.length []
sepal.width []
petal.length []
petal.width []


In [8]:
# Print the summary of the cleaned frame: column dtypes and non-null counts.
irisData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal.length  148 non-null    float64
 1   sepal.width   149 non-null    float64
 2   petal.length  149 non-null    float64
 3   petal.width   149 non-null    float64
 4   variety       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
