# In [14]:
import pandas as pd
import numpy as np
def standardize_column_names(df):
    """
    Normalize the column labels of ``df`` in place and return ``df``.

    Each label has surrounding whitespace stripped, is lowercased, and has
    every remaining space replaced by an underscore. The passed-in frame
    itself is relabelled; no copy is made.
    """
    labels = df.columns.str.strip()
    labels = labels.str.lower()
    # Only the space character is substituted; other whitespace is kept.
    df.columns = labels.str.replace(' ', '_')
    return df

def handle_missing_values(df):
    """
    Fill missing values in place: mean for numeric columns, mode for text.

    Columns of dtype ``float64``/``int64`` have missing entries replaced by
    the column mean. Columns of dtype ``object`` have missing entries
    replaced by the first value returned by ``Series.mode()``. An ``object``
    column with no non-missing values has no mode and is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill. Its columns are reassigned, so the caller's frame is
        modified.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the function can be used with
        ``DataFrame.pipe``.
    """
    for col in df.select_dtypes(include=['float64', 'int64']).columns:
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on the df[col] selection: the in-place call
        # operates on an intermediate object and does not update df when
        # pandas Copy-on-Write is active.
        df[col] = df[col].fillna(df[col].mean())
    for col in df.select_dtypes(include=['object']).columns:
        modes = df[col].mode()
        if modes.empty:
            # Every value is missing, so there is nothing to fill with.
            continue
        df[col] = df[col].fillna(modes.iloc[0])
    return df

def remove_duplicates(df):
    """
    Return ``df`` without repeated rows, keeping the first occurrence of each.

    The input frame is not modified; the surviving rows keep their original
    index labels.
    """
    is_repeat = df.duplicated()
    return df[~is_repeat]

def remove_outliers(df, columns):
    """
    Drop rows whose values fall outside the IQR fences of the given columns.

    For each column in turn, rows are kept only when the value lies within
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` (both ends inclusive). Columns are
    processed sequentially, so the quartiles of a later column are computed
    on the rows that survived the earlier ones. Rows with a missing value in
    a processed column fail the range test and are dropped as well.
    """
    kept = df
    for name in columns:
        values = kept[name]
        q_low, q_high = values.quantile(0.25), values.quantile(0.75)
        fence = 1.5 * (q_high - q_low)
        # between() is inclusive on both ends by default.
        kept = kept[values.between(q_low - fence, q_high + fence)]
    return kept

def convert_data_types(df):
    """
    Placeholder step for dtype conversions; currently returns ``df`` unchanged.

    Add project-specific conversions here, for instance parsing a date
    column::

        df['date_column'] = pd.to_datetime(df['date_column'])
    """
    # No conversions are configured yet, so the frame passes straight through.
    return df

def data_cleaning_pipeline(df):
    """
    Run the full cleaning sequence on a copy of ``df`` and return the result.

    Steps, in order: ``standardize_column_names``, ``handle_missing_values``,
    ``remove_duplicates``, ``remove_outliers`` on every ``float64``/``int64``
    column, and ``convert_data_types``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame. It is copied first, so the caller's frame is not
        modified by the in-place steps.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    def _drop_numeric_outliers(frame):
        # Choose the numeric columns from the frame at this stage of the
        # pipeline, so the names are the standardized ones actually present
        # in it rather than those of the caller's original frame.
        numeric_cols = frame.select_dtypes(include=['float64', 'int64']).columns
        return remove_outliers(frame, columns=numeric_cols)

    df_cleaned = (df
                  .copy()
                  .pipe(standardize_column_names)
                  .pipe(handle_missing_values)
                  .pipe(remove_duplicates)
                  .pipe(_drop_numeric_outliers)
                  .pipe(convert_data_types)
                 )
    return df_cleaned
# Load the raw data; replace 'your_dataset.csv' with your actual dataset path
df = pd.read_csv(filepath_or_buffer='your_dataset.csv')

# Run every cleaning step over the loaded frame
df_cleaned = df.pipe(data_cleaning_pipeline)

# Preview the first five cleaned rows
print(df_cleaned.head(n=5))
# Write the cleaned rows, without the index column, to a new CSV file
df_cleaned.to_csv(path_or_buf='cleaned_dataset.csv', index=False)

# Output: FileNotFoundError: [Errno 2] No such file or directory: 'your_dataset.csv'
# (raised because the placeholder path above was not replaced with a real file)