In [157]:
import numpy as np
import pandas as pd
from datetime import datetime

import warnings
# Install a blanket "ignore" filter: every warning category is suppressed from
# this point on, including any deprecation / FutureWarning messages that
# pandas may emit in the cells below.
# NOTE(review): consider narrowing this to a specific category or removing it,
# so that library deprecation notices stay visible.
warnings.filterwarnings('ignore')

In [158]:
# Inconsistent, hand-generated dataset: mixed casing, stray whitespace,
# several spellings of the same category, placeholder strings, missing
# entries, and out-of-range (negative) numbers.
data = dict(
    Name=['Alice', 'bob ', 'CHARLIE', 'dana', 'Eve'],
    Age=[25, None, 30, -5, 27],
    Gender=['F', 'male', 'M', 'Female', 'f'],
    Country=['usa', 'U.K.', 'United States', 'canada', 'UK'],
    Purchase_Amount=[120.5, 'N/A', -50, 300, None],
)

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Gender,Country,Purchase_Amount
0,Alice,25.0,F,usa,120.5
1,bob,,male,U.K.,
2,CHARLIE,30.0,M,United States,-50.0
3,dana,-5.0,Female,canada,300.0
4,Eve,27.0,f,UK,


In [159]:
# Turn textual stand-ins for "no value" into real NaN markers so that the
# later numeric conversion and imputation steps treat them as missing.
df = df.replace(to_replace=['N/A', 'None', 'missing'], value=np.nan)
df

Unnamed: 0,Name,Age,Gender,Country,Purchase_Amount
0,Alice,25.0,F,usa,120.5
1,bob,,male,U.K.,
2,CHARLIE,30.0,M,United States,-50.0
3,dana,-5.0,Female,canada,300.0
4,Eve,27.0,f,UK,


In [160]:
# Fix inconsistent categorical labels
# Gender: trim surrounding whitespace and lower-case before the lookup, the
# same normalisation applied to Country below, so padded values such as
# 'male ' resolve to a canonical label. Values that are not keys of the
# mapping come out of .map() as NaN.
df['Gender'] = df['Gender'].str.strip().str.lower().map({'f': 'Female', 'female': 'Female', 'm': 'Male', 'male': 'Male'})

# Country: normalise whitespace/case, expand the known abbreviations, then
# title-case everything so all values share one capitalisation style.
df['Country'] = df['Country'].str.strip().str.lower().replace({
    'usa': 'United States',
    'u.k.': 'United Kingdom',
    'uk': 'United Kingdom'
}).str.title()
df

Unnamed: 0,Name,Age,Gender,Country,Purchase_Amount
0,Alice,25.0,Female,United States,120.5
1,bob,,Male,United Kingdom,
2,CHARLIE,30.0,Male,United States,-50.0
3,dana,-5.0,Female,Canada,300.0
4,Eve,27.0,Female,United Kingdom,


In [161]:
# Fix numeric issues (negative, wrong types)
# Both columns get the same treatment: coerce to a numeric dtype (anything
# unparseable becomes NaN), then blank out negative values, which are not
# valid for an amount or an age. Comparisons against NaN are False, so rows
# that are already missing are left untouched.
df['Purchase_Amount'] = pd.to_numeric(df['Purchase_Amount'], errors='coerce')
df.loc[df['Purchase_Amount'] < 0, 'Purchase_Amount'] = np.nan

# Vectorised replacement for the earlier row-wise apply(): no per-element
# lambda, and non-numeric entries are coerced instead of raising TypeError.
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
df.loc[df['Age'] < 0, 'Age'] = np.nan

df


Unnamed: 0,Name,Age,Gender,Country,Purchase_Amount
0,Alice,25.0,Female,United States,120.5
1,bob,,Male,United Kingdom,
2,CHARLIE,30.0,Male,United States,
3,dana,,Female,Canada,300.0
4,Eve,27.0,Female,United Kingdom,


In [162]:
# Impute missing values with each column's median.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on df[col]: that form is chained assignment through an
# inplace method, which pandas 2.x flags with a FutureWarning and which does
# not update the DataFrame once Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Purchase_Amount'] = df['Purchase_Amount'].fillna(df['Purchase_Amount'].median())

df

Unnamed: 0,Name,Age,Gender,Country,Purchase_Amount
0,Alice,25.0,Female,United States,120.5
1,bob,27.0,Male,United Kingdom,210.25
2,CHARLIE,30.0,Male,United States,210.25
3,dana,27.0,Female,Canada,300.0
4,Eve,27.0,Female,United Kingdom,210.25


In [163]:
# Standardize the Name column in two passes:
# first drop leading/trailing whitespace, then capitalise each word.
df['Name'] = df['Name'].str.strip()
df['Name'] = df['Name'].str.title()

df

Unnamed: 0,Name,Age,Gender,Country,Purchase_Amount
0,Alice,25.0,Female,United States,120.5
1,Bob,27.0,Male,United Kingdom,210.25
2,Charlie,30.0,Male,United States,210.25
3,Dana,27.0,Female,Canada,300.0
4,Eve,27.0,Female,United Kingdom,210.25
