In [1]:
# Title: Data Cleaning using Pandas
# Description: Check for missing values and handle them by imputing the median.
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

# Build the working DataFrame from the Iris bunch: one column per feature
# name, plus a categorical 'species' column decoded from the target codes.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    species=pd.Categorical.from_codes(iris.target, categories=iris.target_names)
)

# Demo only: punch holes into 'sepal length (cm)' so there is something to
# impute. Skip this step when working with a real dataset.
# The global NumPy RNG is seeded so the same 5 row labels are picked each run.
np.random.seed(42)
missing_indices = np.random.choice(df.index, 5, replace=False)
# Blank out (NaN) the sampled rows; every other row keeps its value.
df['sepal length (cm)'] = df['sepal length (cm)'].mask(
    df.index.isin(missing_indices)
)

# Per-column NaN counts before any imputation is applied.
print("Missing values before imputation:", df.isna().sum(), sep="\n")

# Median imputation, restricted to numeric columns: each NaN is replaced by
# the median of its own column; non-numeric columns are left as they are.
numeric_cols = df.select_dtypes(include="number").columns
df[numeric_cols] = df[numeric_cols].fillna(
    value=df[numeric_cols].median(),
)

# Re-count NaNs per column to verify the imputation step.
print("\nMissing values after imputation:", df.isna().sum(), sep="\n")


Missing values before imputation:
sepal length (cm)    5
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
species              0
dtype: int64

Missing values after imputation:
sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
species              0
dtype: int64
