In [4]:
# Advanced Data Cleaning with Multiple Issues

import pandas as pd
import numpy as np
import re
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Step 1: Load the dataset
def load_data(filepath):
    try:
        df = pd.read_csv(filepath)
    except Exception as e:
        print(f"Error loading data: {e}")
        return None
    return df

# Step 2: Standardize column names
def standardize_column_names(df):
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '').str.replace('[^a-z0-9]', '', regex=True)
    return df

# Step 3: Remove duplicate rows
def remove_duplicates(df):
    initial_shape = df.shape
    df = df.drop_duplicates()
    final_shape = df.shape
    print(f"Removed {initial_shape[0] - final_shape[0]} duplicate rows.")
    return df

# Step 4: Handle missing values
def handle_missing_values(df):
    for column in df.columns:
        if df[column].dtype in ['float64', 'int64']:
            imputer = SimpleImputer(strategy='median')
            df[column] = imputer.fit_transform(df[[column]])
        else:
            df[column] = df[column].fillna(df[column].mode()[0])
    return df

# Step 5: Standardize text data
def standardize_text_data(df):
    for column in df.select_dtypes(include='object').columns:
        df[column] = df[column].str.strip().str.lower()
    return df

# Step 6: Handle inconsistent categorical entries
def fix_categorical_inconsistencies(df, column, mapping_dict):
    df[column] = df[column].replace(mapping_dict)
    return df

# Step 7: Detect and handle outliers using IQR
def handle_outliers(df):
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    for col in numeric_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df[col] = np.where(df[col] < lower_bound, lower_bound,
                           np.where(df[col] > upper_bound, upper_bound, df[col]))
    return df

# Step 8: Encode categorical variables
def encode_categorical(df):
    label_encoders = {}
    for column in df.select_dtypes(include='object').columns:
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le
    return df, label_encoders

# Step 9: Final validation
def validate_data(df):
    print("Data types:\n", df.dtypes)
    print("\nMissing values:\n", df.isnull().sum())
    print("\nData preview:\n", df.head())
    return

# Main execution
if _name_ == "_main_":
    # Replace 'your_dataset.csv' with your actual dataset path
    filepath = 'your_dataset.csv'
    df = load_data(filepath)
    if df is not None:
        df = standardize_column_names(df)
        df = remove_duplicates(df)
        df = handle_missing_values(df)
        df = standardize_text_data(df)
        # Example: Fix inconsistencies in 'gender' column
        gender_mapping = {'m': 'male', 'f': 'female', 'male': 'male', 'female': 'female'}
        if 'gender' in df.columns:
            df = fix_categorical_inconsistencies(df, 'gender', gender_mapping)
        df = handle_outliers(df)
        df, encoders = encode_categorical(df)
        validate_data(df)
        # Optionally, save the cleaned data
        df.to_csv('cleaned_data.csv', index=False)

NameError: name '_name_' is not defined