# In [18]:
import pandas as pd
import numpy as np

def standardize_column_names(df):
    """
    Return a frame whose string column labels are stripped, lowercased and
    have spaces replaced with underscores.

    Labels that are not strings (for example integer column positions) are
    left untouched; the `.str` accessor would raise on an all-numeric Index
    and would turn such labels into NaN in a mixed Index.

    The input DataFrame is not modified; a renamed DataFrame is returned.
    """
    def _normalize(label):
        # Only text labels can be normalised; pass everything else through.
        if not isinstance(label, str):
            return label
        return label.strip().lower().replace(' ', '_')

    return df.rename(columns=_normalize)

def handle_missing_values(df):
    """
    Fill missing numerical values with the column mean and missing
    categorical (object) values with the column mode.

    Columns of dtype float64/int64 are treated as numerical and columns of
    dtype object as categorical. An object column with no non-missing values
    has no mode and is left unchanged.

    The input DataFrame is not modified; a filled copy is returned.
    """
    filled = df.copy()

    for col in filled.select_dtypes(include=['float64', 'int64']).columns:
        # Assign the result back instead of calling fillna(inplace=True) on
        # df[col]: the chained in-place form acts on a temporary object and
        # does not update the frame under pandas Copy-on-Write.
        filled[col] = filled[col].fillna(filled[col].mean())

    for col in filled.select_dtypes(include='object').columns:
        modes = filled[col].mode()
        if modes.empty:
            # Every value is missing, so there is nothing to fill with.
            continue
        filled[col] = filled[col].fillna(modes.iloc[0])

    return filled

def remove_duplicates(df):
    """
    Drop rows that repeat an earlier row across all columns.

    The first occurrence of each row is kept and the result is returned as a
    new DataFrame.
    """
    unique_rows = df.drop_duplicates(subset=None, keep='first')
    return unique_rows

def remove_outliers(df, factor=1.5):
    """
    Remove outliers from numerical columns using the IQR method.

    For each float64/int64 column in turn, rows whose value lies outside
    [Q1 - factor * IQR, Q3 + factor * IQR] are dropped. Columns are processed
    sequentially, so the quartiles of later columns are computed on the rows
    that survived the earlier ones.

    Missing values are not treated as outliers: rows where the column is NaN
    are kept. Comparing NaN against the bounds is always False, so without
    this a single all-NaN numeric column would discard every row.

    Parameters:
        df: DataFrame to filter.
        factor: Multiplier applied to the IQR to build the fences
            (defaults to the conventional 1.5).

    Returns a filtered DataFrame; the input is not modified.
    """
    num_cols = df.select_dtypes(include=['float64', 'int64']).columns
    for col in num_cols:
        values = df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        spread = q3 - q1
        lower_bound = q1 - factor * spread
        upper_bound = q3 + factor * spread
        in_range = (values >= lower_bound) & (values <= upper_bound)
        df = df[in_range | values.isna()]
    return df

def convert_data_types(df):
    """
    Convert object columns to a more specific data type where possible.

    Each object column is first tried as numeric and then as datetime; the
    first conversion that succeeds for the whole column is used. Numeric is
    tried first so that numeric-looking values are not interpreted as
    timestamps. Columns that fit neither type are left unchanged.

    The input DataFrame is not modified; a converted copy is returned.
    """
    # Failures that mean "this column is not of that type". Anything else
    # (e.g. KeyboardInterrupt) is allowed to propagate.
    parse_errors = (ValueError, TypeError, OverflowError)

    converted = df.copy()
    for col in converted.columns:
        if converted[col].dtype != 'object':
            continue
        try:
            converted[col] = pd.to_numeric(converted[col])
            continue
        except parse_errors:
            pass
        try:
            converted[col] = pd.to_datetime(converted[col])
        except parse_errors:
            # Neither numeric nor datetime: keep the column as it is.
            pass
    return converted

def clean_text_columns(df):
    """
    Clean textual data by stripping surrounding whitespace and converting to
    lowercase.

    Only object columns are processed, and within them only values that are
    actual strings are changed. Other values stored in an object column
    (numbers, missing values, ...) are passed through as they are; the `.str`
    accessor would replace them with NaN.

    The input DataFrame is not modified; a cleaned copy is returned.
    """
    def _normalize(value):
        if isinstance(value, str):
            return value.strip().lower()
        return value

    cleaned = df.copy()
    for col in cleaned.select_dtypes(include='object').columns:
        cleaned[col] = cleaned[col].map(_normalize)
    return cleaned
def data_cleaning_pipeline(df):
    """
    Run the full cleaning sequence over a DataFrame and return the result.

    The stages run in this order, each receiving the previous stage's output:
    column-name standardisation, missing-value filling, duplicate removal,
    outlier removal, data type conversion and text clean-up.
    """
    stages = (
        standardize_column_names,
        handle_missing_values,
        remove_duplicates,
        remove_outliers,
        convert_data_types,
        clean_text_columns,
    )
    result = df
    for stage in stages:
        result = stage(result)
    return result

# Load the raw data. Replace 'your_dataset.csv' with the actual dataset path;
# pd.read_csv raises FileNotFoundError when the path does not exist.
df = pd.read_csv('your_dataset.csv')

# Run every cleaning stage over the loaded frame
df_cleaned = df.pipe(data_cleaning_pipeline)

# Preview the first rows of the cleaned data
print(df_cleaned.head())

# Write the cleaned dataset to a new CSV file, leaving out the index column
df_cleaned.to_csv(path_or_buf='cleaned_dataset.csv', index=False)

# Output of the cell above when the placeholder path has not been replaced:
# FileNotFoundError: [Errno 2] No such file or directory: 'your_dataset.csv'