# In [20]:
# Import required libraries
import pandas as pd
import numpy as np

# Define cleaning functions
def standardize_column_names(df):
    """Normalize the column labels of ``df`` in place and return ``df``.

    Every label has its surrounding whitespace stripped, is lower-cased, and
    has each space replaced by an underscore. The returned object is the same
    frame that was passed in.
    """
    trimmed = df.columns.str.strip()
    lowered = trimmed.str.lower()
    df.columns = lowered.str.replace(' ', '_')
    return df

def handle_missing_values(df):
    """Fill missing values in ``df`` in place and return ``df``.

    ``float64``/``int64`` columns are filled with the column mean. ``object``
    columns are filled with their most frequent value (the first entry
    returned by ``Series.mode`` when several values tie). An object column
    with no non-missing values has no mode and is left untouched instead of
    raising.
    """
    for col in df.select_dtypes(include=['float64', 'int64']).columns:
        # Assign the filled column back rather than calling
        # df[col].fillna(..., inplace=True): that chained form operates on a
        # temporary under pandas copy-on-write and leaves df unchanged.
        df[col] = df[col].fillna(df[col].mean())
    for col in df.select_dtypes(include='object').columns:
        modes = df[col].mode()
        if modes.empty:
            # Every value is missing, so there is nothing to fill with.
            continue
        df[col] = df[col].fillna(modes.iloc[0])
    return df

def remove_duplicates(df):
    """Return a frame that keeps only the first occurrence of each row.

    Rows are compared across all columns. The input frame is not modified,
    and the surviving rows keep their original index labels.
    """
    deduplicated = df.drop_duplicates()
    return deduplicated

def remove_outliers(df, factor=1.5):
    """Drop rows that hold an IQR outlier in any ``float64``/``int64`` column.

    For each numeric column the accepted range is
    ``[Q1 - factor * IQR, Q3 + factor * IQR]``. All ranges are computed from
    the frame as it was passed in, so the result does not depend on column
    order (filtering column by column would derive later columns' quartiles
    from already-trimmed data).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the accepted range.

    Returns
    -------
    pandas.DataFrame
        The rows whose numeric values all lie inside their column's range,
        with their original index labels. When ``df`` has no ``float64`` or
        ``int64`` column, ``df`` itself is returned.

    Notes
    -----
    A missing value in a numeric column compares as outside the range, so
    rows containing one are dropped.
    """
    num_cols = df.select_dtypes(include=['float64', 'int64']).columns
    if len(num_cols) == 0:
        return df

    numeric = df[num_cols]
    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Compare every column against its own bounds (aligned on column labels),
    # then keep only rows that are inside the range for all columns at once.
    within = numeric.ge(lower_bound, axis=1) & numeric.le(upper_bound, axis=1)
    return df[within.all(axis=1)]

def convert_data_types(df):
    """Convert ``object`` columns of ``df`` to numeric or datetime in place.

    Each object-dtype column is first passed to ``pd.to_numeric``; if that
    fails, ``pd.to_datetime`` is tried; if both fail the column is left as it
    is. Numeric conversion goes first because ``pd.to_datetime`` accepts
    numeric-looking strings such as ``"2001"`` and would turn a column of
    numbers into timestamps.

    Returns the same frame that was passed in.
    """
    for col in df.columns:
        if df[col].dtype != 'object':
            continue

        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            pass  # Not numeric; fall through to the datetime attempt.
        else:
            continue

        try:
            df[col] = pd.to_datetime(df[col])
        except (ValueError, TypeError, OverflowError):
            # Neither numeric nor date-like: keep the column unchanged.
            pass
    return df

def clean_text_columns(df):
    """Trim and lower-case every ``object``-dtype column of ``df`` in place.

    Each selected column is replaced by its values with surrounding
    whitespace stripped and letters lower-cased. Returns the same frame that
    was passed in.
    """
    for name in df.select_dtypes(include='object').columns:
        stripped = df[name].str.strip()
        df[name] = stripped.str.lower()
    return df

# Pipeline function
def data_cleaning_pipeline(df):
    """Run every cleaning step on a copy of ``df`` and return the result.

    Steps, in order: standardize column names, fill missing values, drop
    duplicate rows, drop outlier rows, convert object columns to better
    dtypes, and normalize text columns.

    The steps are applied to a copy because the first ones rename columns and
    fill values in place; without the copy the caller's raw frame would be
    altered as a side effect of cleaning.
    """
    steps = (
        standardize_column_names,
        handle_missing_values,
        remove_duplicates,
        remove_outliers,
        convert_data_types,
        clean_text_columns,
    )
    cleaned = df.copy()
    for step in steps:
        cleaned = cleaned.pipe(step)
    return cleaned

# Load your dataset
# Point INPUT_CSV at the actual file name or path
INPUT_CSV = 'your_dataset.csv'
OUTPUT_CSV = 'cleaned_output.csv'

df = pd.read_csv(INPUT_CSV)

# Apply the pipeline
cleaned_df = data_cleaning_pipeline(df)

# Display the cleaned data (heading line followed by the first rows)
print("Cleaned Data Sample:", cleaned_df.head(), sep="\n")

# Save the cleaned data (optional); the index is not written to the file
cleaned_df.to_csv(OUTPUT_CSV, index=False)

# Output of the run above (the placeholder path had not been replaced):
# FileNotFoundError: [Errno 2] No such file or directory: 'your_dataset.csv'