In [12]:
cleaning_config = {
    "missing_values": {
        "strategy": "mean",
        "columns": ["Flavor Rating", "Texture Rating", "Total Rating"]
    },
    "outlier_handling": {
        "method": "iqr",
        "columns": ["Flavor Rating", "Texture Rating", "Total Rating"],
        "remove": True
    },
    "string_cleaning": {
        "columns": ["Flavor", "Base Flavor"],
        "operations": ["strip", "lower"]
    },
    "category_corrections": {
        "Flavor": {
            "chocolte fudge brownie": "chocolate fudge brownie",
            "mint chocolate chip": "mint chocolate chip"
        },
        "Base Flavor": {
            "vanila": "vanilla",
            "chocolatee": "chocolate"
        }
    }
}



from enum import Enum

class DataCleaningErrorMessages(Enum):
    """
      Enumeration class for standardized data cleaning error messages.

      This Enum defines a set of constant error messages to be used throughout
      the data cleaning pipeline. Utilizing an Enum for error handling ensures
      consistent messaging, improved maintainability, and reduces the risk of
      hardcoded string discrepancies in the codebase.

      Attributes:
      -----------
      INVALID_CONFIG : str
          Raised when the data cleaning configuration dictionary is invalid or unreadable.

      MISSING_COLUMN : str
          Raised when a required column specified in the configuration is not found
          in the provided pandas DataFrame.

      INVALID_STRATEGY : str
          Raised when an unsupported missing value imputation strategy is specified.

      INVALID_OUTLIER_METHOD : str
          Raised when an unsupported outlier detection or handling method is provided.

      Usage:
      ------
      These constants are typically used when raising exceptions within the
      DataCleaner class during data validation and preprocessing.

      Example:
      --------
      if col not in df.columns:
          raise ValueError(DataCleaningErrorMessages.MISSING_COLUMN.value.format(column=col))
      """
    INVALID_CONFIG = "Invalid config structure: {error}"
    MISSING_COLUMN = "Column {column} not found in DataFrame."
    INVALID_STRATEGY = "Invalid missing value strategy: {strategy}"
    INVALID_OUTLIER_METHOD = "Invalid outlier handling method: {method}"

In [13]:
import pandas as pd
import numpy as np

class DataCleaner:
    def __init__(self, config):
        self.config = config

    def clean(self, df):
        df_cleaned = df.copy()

        if "missing_values" in self.config:
            df_cleaned = self.handle_missing_values(df_cleaned)

        if "outlier_handling" in self.config:
            df_cleaned = self.handle_outliers(df_cleaned)

        if "string_cleaning" in self.config:
            df_cleaned = self.clean_strings(df_cleaned)

        if "category_corrections" in self.config:
            df_cleaned = self.correct_categories(df_cleaned)

        return df_cleaned

    def handle_missing_values(self, df):
        strategy = self.config["missing_values"]["strategy"]
        columns = self.config["missing_values"]["columns"]

        for col in columns:
            if col not in df.columns:
                raise ValueError(DataCleaningErrorMessages.MISSING_COLUMN.value.format(column=col))

            if strategy == "mean":
                df[col].fillna(df[col].mean(), inplace=True)
            elif strategy == "median":
                df[col].fillna(df[col].median(), inplace=True)
            elif strategy == "mode":
                df[col].fillna(df[col].mode()[0], inplace=True)
            else:
                raise ValueError(DataCleaningErrorMessages.INVALID_STRATEGY.value.format(strategy=strategy))
        return df

    def handle_outliers(self, df):
        method = self.config["outlier_handling"]["method"]
        columns = self.config["outlier_handling"]["columns"]
        remove = self.config["outlier_handling"].get("remove", False)

        for col in columns:
            if col not in df.columns:
                raise ValueError(DataCleaningErrorMessages.MISSING_COLUMN.value.format(column=col))

            if method == "iqr":
                q1 = df[col].quantile(0.25)
                q3 = df[col].quantile(0.75)
                iqr = q3 - q1
                lower = q1 - 1.5 * iqr
                upper = q3 + 1.5 * iqr

                if remove:
                    df = df[(df[col] >= lower) & (df[col] <= upper)]
                else:
                    df[col] = np.where(df[col] < lower, lower, np.where(df[col] > upper, upper, df[col]))
            else:
                raise ValueError(DataCleaningErrorMessages.INVALID_OUTLIER_METHOD.value.format(method=method))
        return df

    def clean_strings(self, df):
        operations = self.config["string_cleaning"]["operations"]
        columns = self.config["string_cleaning"]["columns"]

        for col in columns:
            if col not in df.columns:
                raise ValueError(DataCleaningErrorMessages.MISSING_COLUMN.value.format(column=col))

            for op in operations:
                if op == "strip":
                    df[col] = df[col].astype(str).str.strip()
                elif op == "lower":
                    df[col] = df[col].astype(str).str.lower()
                elif op == "upper":
                    df[col] = df[col].astype(str).str.upper()
        return df

    def correct_categories(self, df):
        corrections = self.config["category_corrections"]

        for col, mapping in corrections.items():
            if col not in df.columns:
                raise ValueError(DataCleaningErrorMessages.MISSING_COLUMN.value.format(column=col))
            df[col] = df[col].replace(mapping)
        return df

In [14]:
# Örnek veri seti
df = pd.read_csv('/content/Flavors.csv')
# Cleaner çalıştır
cleaner = DataCleaner(cleaning_config)
cleaned_df = cleaner.clean(df)

# Sonuç
print(cleaned_df)


                    Flavor Base Flavor Liked  Flavor Rating  Texture Rating  \
0      mint chocolate chip     vanilla   Yes           10.0             8.0   
1                chocolate   chocolate   Yes            8.8             7.6   
2                  vanilla     vanilla    No            4.7             5.0   
3             cookie dough     vanilla   Yes            6.9             6.5   
4               rocky road   chocolate   Yes            8.2             7.0   
5                pistachio     vanilla    No            2.3             3.4   
6              cake batter     vanilla   Yes            6.5             6.0   
7               neapolitan     vanilla    No            3.8             5.0   
8  chocolate fudge brownie   chocolate   Yes            8.2             7.1   

   Total Rating  
0          18.0  
1          16.6  
2           9.7  
3          13.4  
4          15.2  
5           5.7  
6          12.5  
7           8.8  
8          15.3  


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
