In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def preprocess_data(df, target_column=None):
    """
    Applies a comprehensive set of data preprocessing steps to the input DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame to preprocess.
        target_column (str, optional): The name of the target variable column.
                                       If provided, it will be separated before preprocessing
                                       and rejoined at the end. Defaults to None.

    Returns:
        tuple: A tuple containing the preprocessed features (pd.DataFrame) and
               the target variable (pd.Series) if target_column is provided,
               otherwise just the preprocessed features (pd.DataFrame).
    """
    X = df.copy()
    y = None
    if target_column in X.columns:
        y = X[target_column]
        X = X.drop(columns=[target_column])

    # Identify numerical and categorical columns
    numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
    categorical_cols = X.select_dtypes(include='object').columns.tolist()

    # Create preprocessing pipelines for numerical and categorical features
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),  # Handle missing values with median
        ('scaler', StandardScaler())                    # Scale numerical features
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')), # Handle missing values with most frequent
        ('onehot', OneHotEncoder(handle_unknown='ignore'))    # One-hot encode categorical features
    ])

    # Create a preprocessor using ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)])

    # Fit and transform the data
    X_processed = preprocessor.fit_transform(X)

    # Get feature names after one-hot encoding
    feature_names = preprocessor.get_feature_names_out(input_features=X.columns)
    X_processed_df = pd.DataFrame(X_processed, columns=feature_names, index=X.index)

    if y is not None:
        return X_processed_df, y
    else:
        return X_processed_df

def handle_outliers_iqr(df, columns, threshold=1.5):
    """
    Handles outliers in specified columns of a DataFrame using the IQR method.

    Args:
        df (pd.DataFrame): The input DataFrame.
        columns (list): A list of column names to handle outliers in.
        threshold (float): The multiplier for the IQR to define outlier boundaries.

    Returns:
        pd.DataFrame: The DataFrame with outliers capped at the IQR boundaries.
    """
    df_cleaned = df.copy()
    for col in columns:
        if col in df_cleaned.columns and pd.api.types.is_numeric_dtype(df_cleaned[col]):
            Q1 = df_cleaned[col].quantile(0.25)
            Q3 = df_cleaned[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - threshold * IQR
            upper_bound = Q3 + threshold * IQR
            df_cleaned[col] = np.where(df_cleaned[col] < lower_bound, lower_bound, df_cleaned[col])
            df_cleaned[col] = np.where(df_cleaned[col] > upper_bound, upper_bound, df_cleaned[col])
    return df_cleaned

def remove_duplicate_rows(df):
    """
    Removes duplicate rows from a DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        pd.DataFrame: The DataFrame with duplicate rows removed.
    """
    df_no_duplicates = df.drop_duplicates().reset_index(drop=True)
    return df_no_duplicates

def handle_imbalanced_data(X, y, method='oversampling', random_state=None):
    """
    Handles imbalanced datasets using either oversampling or undersampling.

    Args:
        X (pd.DataFrame): The feature matrix.
        y (pd.Series): The target variable.
        method (str): 'oversampling' or 'undersampling'. Defaults to 'oversampling'.
        random_state (int, optional): Random seed for reproducibility. Defaults to None.

    Returns:
        tuple: The resampled feature matrix (pd.DataFrame) and target variable (pd.Series).
               Returns the original data if the method is not recognized.
    """
    if method == 'oversampling':
        from imblearn.over_sampling import SMOTE
        smote = SMOTE(random_state=random_state)
        X_resampled, y_resampled = smote.fit_resample(X, y)
        return pd.DataFrame(X_resampled, columns=X.columns), pd.Series(y_resampled)
    elif method == 'undersampling':
        from imblearn.under_sampling import RandomUnderSampler
        rus = RandomUnderSampler(random_state=random_state)
        X_resampled, y_resampled = rus.fit_resample(X, y)
        return pd.DataFrame(X_resampled, columns=X.columns), pd.Series(y_resampled)
    else:
        print(f"Warning: Method '{method}' not recognized. Returning original data.")
        return X, y

if _name_ == "_main_":
    # Sample DataFrame (replace with your actual data loading)
    data = {
        'numerical_col1': [1, 5, 2, np.nan, 7, 100, 3],
        'numerical_col2': [0.1, 0.5, 0.2, 0.8, np.nan, 0.3, 0.6],
        'categorical_col1': ['A', 'B', 'A', 'C', 'B', 'A', np.nan],
        'categorical_col2': ['X', 'Y', 'X', 'Z', 'Y', 'X', 'Y'],
        'target_variable': [0, 1, 0, 1, 0, 1, 0]
    }
    df = pd.DataFrame(data)

    print("Original DataFrame:")
    print(df)
    print("\n" + "="*50 + "\n")

    # Separate target variable if it exists
    TARGET_COLUMN = 'target_variable'
    if TARGET_COLUMN in df.columns:
        X, y = df.drop(columns=[TARGET_COLUMN]), df[TARGET_COLUMN]
    else:
        X, y = df, None

    # Handle missing values and scale/encode
    X_processed = preprocess_data(X)
    print("DataFrame after missing value handling, scaling, and encoding:")
    print(X_processed)
    print("\n" + "="*50 + "\n")

    # Handle outliers in numerical columns
    numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
    X_processed_no_outliers = handle_outliers_iqr(X_processed, numerical_cols)
    print("DataFrame after outlier handling (IQR method):")
    print(X_processed_no_outliers)
    print("\n" + "="*50 + "\n")

    # Remove duplicate rows
    X_processed_unique = remove_duplicate_rows(X_processed_no_outliers)
    print("DataFrame after removing duplicate rows:")
    print(X_processed_unique)
    print("\n" + "="*50 + "\n")

    # Handle imbalanced data if a target variable exists
    if y is not None:
        X_balanced, y_balanced = handle_imbalanced_data(X_processed_unique, y, method='oversampling', random_state=42)
        print("DataFrame after handling imbalanced data (oversampling):")
        print("Resampled Features:")
        print(X_balanced)
        print("\nResampled Target:")
        print(y_balanced.value_counts())
        print("\n" + "="*50 + "\n")

NameError: name '_name_' is not defined