In [12]:
import pandas as pd
import numpy as np
import logging
from typing import List, Dict, Tuple

class DataBiasDetection:
    def __init__(self, data: pd.DataFrame):
        """Initialize with dataset."""
        self.data = data
        self.bias_report = {}

    def data_slicing(self, slice_cols: List[str]) -> Dict[str, pd.DataFrame]:
        """Slice data based on unique values in specified columns."""
        sliced_data = {}
        for col in slice_cols:
            unique_vals = self.data[col].unique()
            for val in unique_vals:
                slice_name = f"{col}_{val}"
                sliced_data[slice_name] = self.data[self.data[col] == val]
                logging.info(f"Data slice created: {slice_name} with {len(sliced_data[slice_name])} rows.")
        return sliced_data

    def calculate_statistics(self, sliced_data: Dict[str, pd.DataFrame], feature_col: str) -> Dict[str, float]:
        """Calculate mean statistics for each data slice."""
        slice_statistics = {}
        for slice_name, df_slice in sliced_data.items():
            mean_value = df_slice[feature_col].mean()
            slice_statistics[slice_name] = mean_value
            logging.info(f"Mean {feature_col} for slice {slice_name}: {mean_value:.2f}")
        return slice_statistics

    def detect_bias(self, slice_statistics: Dict[str, float], threshold_ratio: float = 0.2) -> List[Tuple[str, float]]:
        """Detect bias by identifying slices with significant mean deviation.
        Returns the biased slices and their mean values if bias is detected.
        """
        overall_mean = np.mean(list(slice_statistics.values()))
        biased_slices = [
            (slice_name, mean_value) for slice_name, mean_value in slice_statistics.items()
            if abs(mean_value - overall_mean) > threshold_ratio * overall_mean
        ]
        
        # Log bias detection
        if biased_slices:
            logging.warning(f"Bias detected in slices: {[slice[0] for slice in biased_slices]}")
            self.bias_report['biased_slices'] = biased_slices
        else:
            logging.info("No significant bias detected.")
            self.bias_report['biased_slices'] = []

        return biased_slices

    def document_bias_report(self) -> None:
        """Log and document bias detection results."""
        logging.info("Bias Report:")
        for key, value in self.bias_report.items():
            logging.info(f"{key}: {value}")

    def mitigate_bias_resample_with_imputation(self, slice_cols: List[str], date_col: str, feature_cols: List[str]) -> pd.DataFrame:
        """Mitigate bias by re-sampling underrepresented slices with rolling average imputation for missing dates."""
        max_count = max(len(self.data[self.data[col] == val])
                        for col in slice_cols for val in self.data[col].unique())
        
        print(self.data.columns)
        
        resampled_data = pd.DataFrame()
        date_range = pd.date_range(start=self.data[date_col].min(), end=self.data[date_col].max(), freq='D')

        for col in slice_cols:
            for val in self.data[col].unique():
                # Create a subset for each unique value in the slice column
                subset = self.data[self.data[col] == val].set_index(date_col)
                
                # Reindex to include all dates in the range and fill missing feature columns using rolling average

                subset = subset.reindex(date_range).sort_index()
                subset[slice_cols] = val  # Ensure the slice column retains its value
                for feature in feature_cols:
                    subset[feature] = subset[feature].fillna(subset[feature].rolling(window=3, min_periods=1).mean())

                # Ensure we have max_count rows by resampling with replacement if needed
                resampled_subset = subset.sample(n=max_count, replace=True, random_state=42)
                
                # Reset index and append to resampled data

                resampled_data = pd.concat([resampled_data, resampled_subset.reset_index()], axis=0)
                logging.info(f"Resampled data slice {col}_{val} to {max_count} rows with imputation.")

        resampled_data = resampled_data.rename(columns={'index': date_col})
        return resampled_data.reset_index(drop=True)

    

In [13]:
import pandas as pd
import logging

# Set up logging to see warnings and info
logging.basicConfig(level=logging.INFO)

# Load the dataset from a local CSV file
data = pd.read_csv('/Users/akm/Desktop/mlops-project/preprocessed_data.csv')

# Instantiate the detector with the loaded data
detector = DataBiasDetection(data)

# Define columns to slice the data and the metric columns for bias detection
slice_columns = ['zone']
exclude_columns = ['datetime'] + slice_columns 
feature_columns = data.columns.difference(exclude_columns)

sliced_data = None
slice_statistics = None
biased_slices = None
resampled_data = None

# Perform bias detection for each metric column
for feature_column in feature_columns:
    print(f"\nDetecting bias for metric column: {feature_column}")
    
    # Slice the data based on the slice columns
    sliced_data = detector.data_slicing(slice_cols=slice_columns)
    
    # Calculate statistics (mean) for each slice based on the metric column
    slice_statistics = detector.calculate_statistics(sliced_data, feature_col=feature_column)
    
    # Detect bias in the slices
    biased_slices = detector.detect_bias(slice_statistics)
    
    # Check and print results
    if biased_slices:
        print("Bias detected in slices:", biased_slices)
    else:
        print("No significant bias detected.")
    
    resampled_data = detector.mitigate_bias_resample_with_imputation(slice_cols=slice_columns, date_col = 'datetime', feature_cols = feature_columns)
    # Document the bias report for this metric column
    detector.document_bias_report()


INFO:root:Data slice created: zone_1 with 30634 rows.
INFO:root:Data slice created: zone_7 with 30634 rows.
INFO:root:Data slice created: zone_3 with 30633 rows.
INFO:root:Data slice created: zone_4 with 30633 rows.
INFO:root:Data slice created: zone_5 with 30633 rows.
INFO:root:Data slice created: zone_2 with 30633 rows.
INFO:root:Data slice created: zone_6 with 30633 rows.
INFO:root:Data slice created: zone_0 with 30633 rows.
INFO:root:Data slice created: zone_16 with 34979 rows.
INFO:root:Data slice created: zone_19 with 34979 rows.
INFO:root:Data slice created: zone_24 with 34979 rows.
INFO:root:Data slice created: zone_26 with 34979 rows.
INFO:root:Data slice created: zone_20 with 34979 rows.
INFO:root:Data slice created: zone_22 with 34979 rows.
INFO:root:Data slice created: zone_17 with 34979 rows.
INFO:root:Data slice created: zone_18 with 34979 rows.
INFO:root:Data slice created: zone_25 with 34979 rows.
INFO:root:Data slice created: zone_21 with 34979 rows.
INFO:root:Data sli


Detecting bias for metric column: FeelsLikeF
Bias detected in slices: [('zone_8', 0.7086940969340797), ('zone_14', 0.7441896180043099)]
Index(['Unnamed: 0', 'precipMM', 'weatherCode', 'visibility', 'HeatIndexF',
       'WindChillF', 'windspeedMiles', 'FeelsLikeF', 'tempF_rolling_mean',
       'windspeedMiles_rolling_mean', 'humidity_rolling_mean', 'value',
       'pressure', 'pressureInches', 'cloudcover', 'uvIndex', 'subba-name',
       'zone', 'tempF_rolling_std', 'windspeedMiles_rolling_std',
       'humidity_rolling_std', 'tempF_lag_2', 'windspeedMiles_lag_2',
       'humidity_lag_2', 'tempF_lag_4', 'windspeedMiles_lag_4',
       'humidity_lag_4', 'tempF_lag_6', 'windspeedMiles_lag_6',
       'humidity_lag_6', 'month_sin', 'month_cos'],
      dtype='object')


KeyError: 'datetime'

In [4]:
data.shape

(880291, 32)

In [5]:
resampled_data.shape

(944433, 32)