# Data Cleaning: Identify and Remove Invalid Entries

This notebook creates a small sample dataset with columns `Name`, `Gender`, and `Age`, intentionally containing invalid entries, then cleans it.

Rules used:
- Name: required, not "invalid data", not empty/whitespace, and should contain letters (allowing spaces, hyphens, apostrophes).
- Gender: normalized to `Male`/`Female`; anything else becomes invalid.
- Age: numeric, between 1 and 120 inclusive.

We'll show the dataset before and after cleaning and save the cleaned result to `datasets/cleaned_people.csv`. 

In [1]:
# Imports and sample data
import pandas as pd
import numpy as np
from pathlib import Path

# Create sample dataset with invalid entries
raw_data = [
    {"Name": "Alice", "Gender": "F", "Age": 29},
    {"Name": "Bob", "Gender": "Male", "Age": 34},
    {"Name": " invalid data ", "Gender": "M", "Age": 22},
    {"Name": None, "Gender": "Female", "Age": 44},
    {"Name": "   ", "Gender": "Unknown", "Age": 19},
    {"Name": "Charlie-Ann", "Gender": "FEMALE", "Age": "27"},
    {"Name": "D'Angelo", "Gender": "m", "Age": 17},
    {"Name": "Eve", "Gender": "", "Age": 25},
    {"Name": "Frank", "Gender": "Other", "Age": 136},
    {"Name": "Grace", "Gender": "F", "Age": -5},
    {"Name": "Heidi", "Gender": "F", "Age": "twenty"},
]

df_raw = pd.DataFrame(raw_data)
df_raw

Unnamed: 0,Name,Gender,Age
0,Alice,F,29
1,Bob,Male,34
2,invalid data,M,22
3,,Female,44
4,,Unknown,19
5,Charlie-Ann,FEMALE,27
6,D'Angelo,m,17
7,Eve,,25
8,Frank,Other,136
9,Grace,F,-5


In [2]:
# Validation helpers and cleaning
import re

VALID_NAME_RE = re.compile(r"^[A-Za-z][A-Za-z\s\-']*[A-Za-z]$|^[A-Za-z]$")

# Normalize gender to Male/Female or return None if unrecognized
def normalize_gender(val):
    if val is None:
        return None
    s = str(val).strip().lower()
    if s in {"m", "male"}:
        return "Male"
    if s in {"f", "female"}:
        return "Female"
    return None

# Return True if name looks valid
def is_valid_name(name):
    if name is None:
        return False
    s = str(name).strip()
    if not s or s.lower() == "invalid data":
        return False
    return bool(VALID_NAME_RE.match(s))

# Convert age to numeric, coerce errors to NaN, and validate range

def to_valid_age(age):
    try:
        n = pd.to_numeric(age, errors="coerce")
    except Exception:
        return np.nan
    if pd.isna(n) or n < 1 or n > 120:
        return np.nan
    return int(n)


def clean_people(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    out["Name_valid"] = out["Name"].apply(is_valid_name)
    out["Gender_norm"] = out["Gender"].apply(normalize_gender)
    out["Age_num"] = out["Age"].apply(to_valid_age)

    cleaned = out.loc[
        (out["Name_valid"]) & (~out["Gender_norm"].isna()) & (~out["Age_num"].isna()),
        ["Name", "Gender_norm", "Age_num"],
    ].rename(columns={"Gender_norm": "Gender", "Age_num": "Age"})

    return cleaned.reset_index(drop=True)

cleaned_preview = clean_people(df_raw)
cleaned_preview

Unnamed: 0,Name,Gender,Age
0,Alice,Female,29.0
1,Bob,Male,34.0
2,Charlie-Ann,Female,27.0
3,D'Angelo,Male,17.0


In [3]:
# Diagnostics: what was removed and why
out = df_raw.copy()
out["Name_valid"] = out["Name"].apply(is_valid_name)
out["Gender_norm"] = out["Gender"].apply(normalize_gender)
out["Age_num"] = out["Age"].apply(to_valid_age)

invalid_mask = (~out["Name_valid"]) | (out["Gender_norm"].isna()) | (out["Age_num"].isna())

# Show invalid rows with reasons
reasons = []
for idx, row in out[invalid_mask].iterrows():
    r = []
    if not row["Name_valid"]:
        r.append("bad name")
    if pd.isna(row["Gender_norm"]):
        r.append("bad gender")
    if pd.isna(row["Age_num"]):
        r.append("bad age")
    reasons.append(", ".join(r))

invalid_rows = out[invalid_mask][["Name", "Gender", "Age"]].copy()
invalid_rows["reason"] = reasons
invalid_rows

Unnamed: 0,Name,Gender,Age,reason
2,invalid data,M,22,bad name
3,,Female,44,bad name
4,,Unknown,19,"bad name, bad gender"
7,Eve,,25,bad gender
8,Frank,Other,136,"bad gender, bad age"
9,Grace,F,-5,bad age
10,Heidi,F,twenty,bad age


In [4]:
# Save cleaned dataset
cleaned = clean_people(df_raw)

# Ensure output directory exists
out_path = Path("datasets/cleaned_people.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)

cleaned.to_csv(out_path, index=False)

summary = {
    "raw_rows": len(df_raw),
    "clean_rows": len(cleaned),
    "removed_rows": len(df_raw) - len(cleaned),
    "output": str(out_path)
}
summary, cleaned.head()

({'raw_rows': 11,
  'clean_rows': 4,
  'removed_rows': 7,
  'output': 'datasets/cleaned_people.csv'},
           Name  Gender   Age
 0        Alice  Female  29.0
 1          Bob    Male  34.0
 2  Charlie-Ann  Female  27.0
 3     D'Angelo    Male  17.0)