In [1]:
import pandas as pd
import numpy as np
import logging
from typing import List, Dict, Tuple

class DataBiasDetection:
    """Detect slices of a dataset whose mean for a metric deviates from the rest.

    Typical flow: ``data_slicing`` -> ``calculate_statistics`` -> ``detect_bias``
    -> ``document_bias_report``. The most recent ``detect_bias`` result is kept in
    ``self.bias_report['biased_slices']`` (each call overwrites the previous one).
    """

    def __init__(self, data: pd.DataFrame):
        """Initialize with dataset."""
        self.data = data
        self.bias_report = {}

    def data_slicing(self, slice_cols: List[str]) -> Dict[str, pd.DataFrame]:
        """Slice data based on unique values in specified columns.

        Args:
            slice_cols: Column names of ``self.data`` to slice on.

        Returns:
            Mapping of ``"<column>_<value>"`` to the rows of ``self.data`` where
            the column equals that value. Rows whose value is missing are
            collected in their own slice (e.g. ``"<column>_nan"``).

        Raises:
            KeyError: If a column in ``slice_cols`` is not present in the data.
        """
        sliced_data = {}
        for col in slice_cols:
            column = self.data[col]
            for val in column.unique():
                slice_name = f"{col}_{val}"
                # `unique()` reports missing values, but `column == NaN` never
                # matches, so missing values need an explicit isna() mask;
                # otherwise the slice would always be empty.
                if pd.api.types.is_scalar(val) and pd.isna(val):
                    mask = column.isna()
                else:
                    mask = column == val
                sliced_data[slice_name] = self.data[mask]
                logging.info(f"Data slice created: {slice_name} with {len(sliced_data[slice_name])} rows.")
        return sliced_data

    def calculate_statistics(self, sliced_data: Dict[str, pd.DataFrame], metric_col: str) -> Dict[str, float]:
        """Calculate mean statistics for each data slice.

        Args:
            sliced_data: Mapping of slice name to DataFrame, as returned by
                ``data_slicing``.
            metric_col: Column whose mean is computed for every slice.

        Returns:
            Mapping of slice name to the mean of ``metric_col`` in that slice.
        """
        slice_statistics = {}
        for slice_name, df_slice in sliced_data.items():
            mean_value = df_slice[metric_col].mean()
            slice_statistics[slice_name] = mean_value
            logging.info(f"Mean {metric_col} for slice {slice_name}: {mean_value:.2f}")
        return slice_statistics

    def detect_bias(self, slice_statistics: Dict[str, float], threshold_ratio: float = 0.2) -> List[Tuple[str, float]]:
        """Detect bias by identifying slices with significant mean deviation.

        A slice is flagged when its mean differs from the unweighted mean of all
        slice means by more than ``threshold_ratio`` times the magnitude of that
        overall mean. Slices whose mean is NaN are ignored when computing the
        overall mean and are never flagged.

        Args:
            slice_statistics: Mapping of slice name to mean value.
            threshold_ratio: Allowed relative deviation from the overall mean.

        Returns:
            The biased slices as ``(slice_name, mean_value)`` tuples, in the
            iteration order of ``slice_statistics``; empty if none are detected.
            The result is also stored in ``self.bias_report['biased_slices']``.
        """
        means = np.asarray(list(slice_statistics.values()), dtype=float)
        valid_means = means[~np.isnan(means)]

        if valid_means.size == 0:
            # Nothing to compare against (no slices, or every mean is NaN).
            biased_slices = []
        else:
            overall_mean = valid_means.mean()
            # Use the magnitude of the mean: with a negative overall mean the
            # tolerance would otherwise be negative and flag every slice.
            tolerance = threshold_ratio * abs(overall_mean)
            biased_slices = [
                (slice_name, mean_value)
                for slice_name, mean_value in slice_statistics.items()
                if abs(mean_value - overall_mean) > tolerance
            ]

        # Log bias detection
        if biased_slices:
            logging.warning(f"Bias detected in slices: {[name for name, _ in biased_slices]}")
            self.bias_report['biased_slices'] = biased_slices
        else:
            logging.info("No significant bias detected.")
            self.bias_report['biased_slices'] = []

        return biased_slices

    def document_bias_report(self) -> None:
        """Log and document bias detection results."""
        logging.info("Bias Report:")
        for key, value in self.bias_report.items():
            logging.info(f"{key}: {value}")


In [7]:
import pandas as pd
import logging

# Set up logging to see warnings and info
logging.basicConfig(level=logging.INFO)

# Load the dataset from a local CSV file
data = pd.read_csv('data_preprocess.csv')

# Instantiate the detector with the loaded data
detector = DataBiasDetection(data)

# Define columns to slice the data and the metric columns for bias detection.
# Only numeric/bool columns are kept as metrics: the per-slice mean (and its
# ":.2f" log formatting) is not defined for text columns.
slice_columns = ['zone']
exclude_columns = ['datetime'] + slice_columns
metric_columns = (
    data.select_dtypes(include=['number', 'bool'])
    .columns.difference(exclude_columns)
)
# NOTE(review): 'subba-name' appears to be an encoded identifier rather than a
# measurement, so its "bias" result is probably not meaningful -- confirm and
# consider adding it to exclude_columns.

# Slice once: the slices depend only on slice_columns, not on the metric, so
# recomputing them for every metric column is wasted work and repeats the logs.
sliced_data = detector.data_slicing(slice_cols=slice_columns)

# Perform bias detection for each metric column
for metric_column in metric_columns:
    print(f"\nDetecting bias for metric column: {metric_column}")

    # Calculate statistics (mean) for each slice based on the metric column
    slice_statistics = detector.calculate_statistics(sliced_data, metric_col=metric_column)

    # Detect bias in the slices
    biased_slices = detector.detect_bias(slice_statistics)

    # Check and print results
    if biased_slices:
        print("Bias detected in slices:", biased_slices)
    else:
        print("No significant bias detected.")

    # Document the bias report for this metric column
    detector.document_bias_report()



Detecting bias for metric column: FeelsLikeF




Bias detected in slices: [('zone_8', 0.7207796383706702), ('zone_14', 0.7534238343198222)]

Detecting bias for metric column: HeatIndexF




Bias detected in slices: [('zone_8', 0.6834848596720436), ('zone_14', 0.7193004867831224)]

Detecting bias for metric column: WindChillF




Bias detected in slices: [('zone_14', 0.7648856001061237)]

Detecting bias for metric column: cloudcover




Bias detected in slices: [('zone_2', 0.5763552304018698), ('zone_16', 0.539478837794555), ('zone_19', 0.5573463738274994), ('zone_20', 0.5994930221917182), ('zone_17', 0.5557666437886069), ('zone_18', 0.5725323724548158), ('zone_10', 0.21583426987337978), ('zone_11', 0.3405769086686146), ('zone_12', 0.2923428485802053), ('zone_15', 0.26036512574611026)]

Detecting bias for metric column: humidity_lag_2
No significant bias detected.

Detecting bias for metric column: humidity_lag_4
No significant bias detected.

Detecting bias for metric column: humidity_lag_6
No significant bias detected.

Detecting bias for metric column: humidity_rolling_mean
No significant bias detected.

Detecting bias for metric column: humidity_rolling_std




Bias detected in slices: [('zone_1', 0.15952401438426603), ('zone_7', 0.1621496474648208), ('zone_3', 0.159018396039557), ('zone_4', 0.15907449446845756), ('zone_5', 0.16000513749733783), ('zone_2', 0.17418718707537725), ('zone_6', 0.16002816924545293), ('zone_0', 0.16294595209629628), ('zone_8', 0.28763585565346517), ('zone_10', 0.30347998451107616), ('zone_11', 0.28105155861394066), ('zone_12', 0.28227978498205347), ('zone_13', 0.2817571343147099), ('zone_14', 0.29175880581354857), ('zone_15', 0.288442135671694), ('zone_9', 0.29184007180881594)]

Detecting bias for metric column: month_cos
No significant bias detected.

Detecting bias for metric column: month_sin
No significant bias detected.

Detecting bias for metric column: precipMM




Bias detected in slices: [('zone_20', 0.0033823337432441094), ('zone_10', 0.0007086427561424539), ('zone_12', 0.0015912597310406833), ('zone_13', 0.0020137551101770687), ('zone_14', 0.001464365315683924), ('zone_15', 0.0011829682485926924)]

Detecting bias for metric column: pressure
No significant bias detected.

Detecting bias for metric column: pressureInches
No significant bias detected.

Detecting bias for metric column: subba-name




Bias detected in slices: [('zone_7', 16.0), ('zone_4', 17.0), ('zone_5', 18.0), ('zone_2', 19.0), ('zone_6', 20.0), ('zone_16', 26.0), ('zone_19', 25.0), ('zone_24', 2.0), ('zone_26', 21.0), ('zone_20', 23.0), ('zone_18', 1.0), ('zone_25', 24.0), ('zone_21', 0.0), ('zone_23', 22.0), ('zone_8', 3.0), ('zone_10', 5.0), ('zone_11', 7.0), ('zone_12', 6.0), ('zone_13', 9.0), ('zone_14', 8.0), ('zone_15', 10.0), ('zone_9', 4.0)]

Detecting bias for metric column: tempF_lag_2
No significant bias detected.

Detecting bias for metric column: tempF_lag_4
No significant bias detected.

Detecting bias for metric column: tempF_lag_6
No significant bias detected.

Detecting bias for metric column: tempF_rolling_mean




Bias detected in slices: [('zone_8', 0.6752100714624336), ('zone_13', 0.673609619875143), ('zone_14', 0.6783720326835019)]

Detecting bias for metric column: tempF_rolling_std




Bias detected in slices: [('zone_1', 0.10663600168924225), ('zone_7', 0.1058273444316204), ('zone_3', 0.10621065909615914), ('zone_4', 0.10726958162811816), ('zone_5', 0.10897088397098512), ('zone_6', 0.10590823454639793), ('zone_0', 0.10956518374763755), ('zone_14', 0.17745097042001448)]

Detecting bias for metric column: uvIndex




Bias detected in slices: [('zone_2', 0.11051922979220646), ('zone_0', 0.11970990194584159), ('zone_16', 0.11975291695264242), ('zone_19', 0.10711736444749487), ('zone_20', 0.10916723861816517), ('zone_17', 0.11773735987188284), ('zone_18', 0.11595515900251657), ('zone_21', 0.12517501715854495), ('zone_8', 0.24178716815264353), ('zone_10', 0.2265128243550361), ('zone_11', 0.22962463474938188), ('zone_12', 0.22287655153467698), ('zone_13', 0.23517894158487548), ('zone_14', 0.273701955495617), ('zone_15', 0.23018655877725336), ('zone_9', 0.21485227641667295)]

Detecting bias for metric column: value




Bias detected in slices: [('zone_1', 0.045558584429235154), ('zone_4', 0.031187445098752733), ('zone_5', 0.0558161210128893), ('zone_2', 0.02006980167660926), ('zone_6', 0.06369854352510289), ('zone_0', 0.04589126005461058), ('zone_16', 0.0592506078579547), ('zone_19', 0.02166424488155113), ('zone_24', 0.023132937989418893), ('zone_26', 0.08110682886981925), ('zone_20', 0.03025468082317547), ('zone_22', 0.03743081337303363), ('zone_17', 0.03870003300901167), ('zone_18', 0.06203685433787463), ('zone_25', 0.19934097283630972), ('zone_21', 0.04721588399840998), ('zone_23', 0.011395284957847177), ('zone_8', 0.4560170425406258), ('zone_10', 0.15639818084530605), ('zone_11', 0.4961742285541071), ('zone_12', 0.04070820776002098), ('zone_13', 0.2709538563645963), ('zone_14', 0.13335626324199945), ('zone_15', 0.05039134826669913), ('zone_9', 0.0614073588472216)]

Detecting bias for metric column: visibility
No significant bias detected.

Detecting bias for metric column: weatherCode




Bias detected in slices: [('zone_2', 0.12884944454280853), ('zone_16', 0.11075998967683369), ('zone_19', 0.11766989392928849), ('zone_20', 0.1489062340238847), ('zone_17', 0.11377064501088309), ('zone_18', 0.1374645268340471), ('zone_21', 0.1076651236120453), ('zone_10', 0.028739425067945357), ('zone_11', 0.06142028888360929), ('zone_12', 0.04814586325546065), ('zone_13', 0.060564251427396916), ('zone_14', 0.046858884536699875), ('zone_15', 0.03703252038937339), ('zone_9', 0.07115791419981769)]

Detecting bias for metric column: windspeedMiles




Bias detected in slices: [('zone_1', 0.1564863972362639), ('zone_3', 0.1524354773154499), ('zone_2', 0.16060115327947977), ('zone_6', 0.13886348625717626), ('zone_22', 0.13726950354609926), ('zone_21', 0.1519154655685198), ('zone_23', 0.1653225806451613), ('zone_10', 0.3065876226867461), ('zone_12', 0.27064571314402736), ('zone_14', 0.2800429559701306), ('zone_15', 0.29079506006343503)]

Detecting bias for metric column: windspeedMiles_lag_2




Bias detected in slices: [('zone_8', 0.2562523413501161), ('zone_11', 0.25203166754077067), ('zone_13', 0.2544729152618566), ('zone_9', 0.25389725531330387)]

Detecting bias for metric column: windspeedMiles_lag_4




Bias detected in slices: [('zone_8', 0.25353949701555906), ('zone_11', 0.2519348917359706), ('zone_12', 0.2486882195749357), ('zone_13', 0.2525068055243376), ('zone_9', 0.25244811568142655)]

Detecting bias for metric column: windspeedMiles_lag_6




Bias detected in slices: [('zone_8', 0.2513417497065508), ('zone_10', 0.2495117504557828), ('zone_11', 0.25037274293848805), ('zone_12', 0.24983142279163856), ('zone_13', 0.25075110511725485), ('zone_14', 0.2492407782023426), ('zone_15', 0.2495629479783222), ('zone_9', 0.25045203666242105)]

Detecting bias for metric column: windspeedMiles_rolling_mean




Bias detected in slices: [('zone_8', 0.34501035809351915), ('zone_10', 0.35994599246438147), ('zone_11', 0.3495924538981095), ('zone_12', 0.3552105946748957), ('zone_13', 0.347900477481996), ('zone_14', 0.35687838204794337), ('zone_15', 0.3578241107893035), ('zone_9', 0.3468740138836967)]

Detecting bias for metric column: windspeedMiles_rolling_std




Bias detected in slices: [('zone_7', 0.15008775933597687), ('zone_3', 0.15169359822082254), ('zone_4', 0.1498716755370345), ('zone_8', 0.22909281752363828), ('zone_10', 0.23544840838450834), ('zone_14', 0.23334525237208112), ('zone_15', 0.22863126867647662)]
