In [12]:
import pandas as pd
import numpy as np
import logging
from typing import List, Dict, Tuple

class DataBiasDetection:
    def __init__(self, data: pd.DataFrame):
        """Initialize with dataset."""
        self.data = data
        self.bias_report = {}

    def data_slicing(self, slice_cols: List[str]) -> Dict[str, pd.DataFrame]:
        """Slice data based on unique values in specified columns."""
        sliced_data = {}
        for col in slice_cols:
            unique_vals = self.data[col].unique()
            for val in unique_vals:
                slice_name = f"{col}_{val}"
                sliced_data[slice_name] = self.data[self.data[col] == val]
                logging.info(f"Data slice created: {slice_name} with {len(sliced_data[slice_name])} rows.")
        return sliced_data

    def calculate_statistics(self, sliced_data: Dict[str, pd.DataFrame], feature_col: str) -> Dict[str, float]:
        """Calculate mean statistics for each data slice."""
        slice_statistics = {}
        for slice_name, df_slice in sliced_data.items():
            mean_value = df_slice[feature_col].mean()
            slice_statistics[slice_name] = mean_value
            logging.info(f"Mean {feature_col} for slice {slice_name}: {mean_value:.2f}")
        return slice_statistics

    def detect_bias(self, slice_statistics: Dict[str, float], threshold_ratio: float = 0.2) -> List[Tuple[str, float]]:
        """Detect bias by identifying slices with significant mean deviation.
        Returns the biased slices and their mean values if bias is detected.
        """
        overall_mean = np.mean(list(slice_statistics.values()))
        biased_slices = [
            (slice_name, mean_value) for slice_name, mean_value in slice_statistics.items()
            if abs(mean_value - overall_mean) > threshold_ratio * overall_mean
        ]
        
        # Log bias detection
        if biased_slices:
            logging.warning(f"Bias detected in slices: {[slice[0] for slice in biased_slices]}")
            self.bias_report['biased_slices'] = biased_slices
        else:
            logging.info("No significant bias detected.")
            self.bias_report['biased_slices'] = []

        return biased_slices

    def document_bias_report(self) -> None:
        """Log and document bias detection results."""
        logging.info("Bias Report:")
        for key, value in self.bias_report.items():
            logging.info(f"{key}: {value}")

    def mitigate_bias_resample_with_imputation(self, slice_cols: List[str], date_col: str, feature_cols: List[str]) -> pd.DataFrame:
        """Mitigate bias by re-sampling underrepresented slices with rolling average imputation for missing dates."""
        max_count = max(len(self.data[self.data[col] == val])
                        for col in slice_cols for val in self.data[col].unique())
        
        print(self.data.columns)
        
        resampled_data = pd.DataFrame()
        date_range = pd.date_range(start=self.data[date_col].min(), end=self.data[date_col].max(), freq='D')

        for col in slice_cols:
            for val in self.data[col].unique():
                # Create a subset for each unique value in the slice column
                subset = self.data[self.data[col] == val].set_index(date_col)
                
                # Reindex to include all dates in the range and fill missing feature columns using rolling average

                subset = subset.reindex(date_range).sort_index()
                subset[slice_cols] = val  # Ensure the slice column retains its value
                for feature in feature_cols:
                    subset[feature] = subset[feature].fillna(subset[feature].rolling(window=3, min_periods=1).mean())

                # Ensure we have max_count rows by resampling with replacement if needed
                resampled_subset = subset.sample(n=max_count, replace=True, random_state=42)
                
                # Reset index and append to resampled data

                resampled_data = pd.concat([resampled_data, resampled_subset.reset_index()], axis=0)
                logging.info(f"Resampled data slice {col}_{val} to {max_count} rows with imputation.")

        resampled_data = resampled_data.rename(columns={'index': date_col})
        return resampled_data.reset_index(drop=True)

    

In [13]:
import pandas as pd
import logging

# Set up logging to see warnings and info
logging.basicConfig(level=logging.INFO)

# Load the dataset from a local CSV file
data = pd.read_csv('/Users/akm/Desktop/mlops-project/preprocessed_data.csv')

# Instantiate the detector with the loaded data
detector = DataBiasDetection(data)

# Define columns to slice the data and the metric columns for bias detection
slice_columns = ['zone']
exclude_columns = ['datetime'] + slice_columns 
feature_columns = data.columns.difference(exclude_columns)

sliced_data = None
slice_statistics = None
biased_slices = None
resampled_data = None

# Perform bias detection for each metric column
for feature_column in feature_columns:
    print(f"\nDetecting bias for metric column: {feature_column}")
    
    # Slice the data based on the slice columns
    sliced_data = detector.data_slicing(slice_cols=slice_columns)
    
    # Calculate statistics (mean) for each slice based on the metric column
    slice_statistics = detector.calculate_statistics(sliced_data, feature_col=feature_column)
    
    # Detect bias in the slices
    biased_slices = detector.detect_bias(slice_statistics)
    
    # Check and print results
    if biased_slices:
        print("Bias detected in slices:", biased_slices)
    else:
        print("No significant bias detected.")
    
    resampled_data = detector.mitigate_bias_resample_with_imputation(slice_cols=slice_columns, date_col = 'datetime', feature_cols = feature_columns)
    # Document the bias report for this metric column
    detector.document_bias_report()


INFO:root:Data slice created: zone_1 with 30634 rows.
INFO:root:Data slice created: zone_7 with 30634 rows.
INFO:root:Data slice created: zone_3 with 30633 rows.
INFO:root:Data slice created: zone_4 with 30633 rows.
INFO:root:Data slice created: zone_5 with 30633 rows.
INFO:root:Data slice created: zone_2 with 30633 rows.
INFO:root:Data slice created: zone_6 with 30633 rows.
INFO:root:Data slice created: zone_0 with 30633 rows.
INFO:root:Data slice created: zone_16 with 34979 rows.
INFO:root:Data slice created: zone_19 with 34979 rows.
INFO:root:Data slice created: zone_24 with 34979 rows.
INFO:root:Data slice created: zone_26 with 34979 rows.
INFO:root:Data slice created: zone_20 with 34979 rows.
INFO:root:Data slice created: zone_22 with 34979 rows.
INFO:root:Data slice created: zone_17 with 34979 rows.
INFO:root:Data slice created: zone_18 with 34979 rows.
INFO:root:Data slice created: zone_25 with 34979 rows.
INFO:root:Data slice created: zone_21 with 34979 rows.
INFO:root:Data sli


Detecting bias for metric column: FeelsLikeF
Bias detected in slices: [('zone_8', 0.7086940969340797), ('zone_14', 0.7441896180043099)]
Index(['Unnamed: 0', 'precipMM', 'weatherCode', 'visibility', 'HeatIndexF',
       'WindChillF', 'windspeedMiles', 'FeelsLikeF', 'tempF_rolling_mean',
       'windspeedMiles_rolling_mean', 'humidity_rolling_mean', 'value',
       'pressure', 'pressureInches', 'cloudcover', 'uvIndex', 'subba-name',
       'zone', 'tempF_rolling_std', 'windspeedMiles_rolling_std',
       'humidity_rolling_std', 'tempF_lag_2', 'windspeedMiles_lag_2',
       'humidity_lag_2', 'tempF_lag_4', 'windspeedMiles_lag_4',
       'humidity_lag_4', 'tempF_lag_6', 'windspeedMiles_lag_6',
       'humidity_lag_6', 'month_sin', 'month_cos'],
      dtype='object')


KeyError: 'datetime'

In [4]:
data.shape

(880291, 32)

In [5]:
resampled_data.shape

(944433, 32)

In [5]:
import pandas as pd

df = pd.read_csv('/Users/akm/Desktop/mlops-project/data_preprocess.csv')
df['subba-name'] = df['subba-name'].astype(str)
df['zone'] = df['zone'].astype(str)
df

  df = pd.read_csv('/Users/akm/Desktop/mlops-project/data_preprocess.csv')


Unnamed: 0,datetime,precipMM,weatherCode,visibility,HeatIndexF,WindChillF,windspeedMiles,FeelsLikeF,tempF_rolling_mean,windspeedMiles_rolling_mean,...,tempF_lag_4,windspeedMiles_lag_4,humidity_lag_4,tempF_lag_6,windspeedMiles_lag_6,humidity_lag_6,month_sin,month_cos,subba-name,zone
0,2019-01-01 00:00:00,0.052731,0.861702,1.0,0.402878,0.457516,0.200,0.43750,0.422594,0.327044,...,0.462121,0.250,0.916667,0.416667,0.275,0.885417,0.75,0.933013,ISNE - New Hampshire,4002
1,2019-01-01 00:00:00,0.058380,0.861702,0.2,0.424460,0.477124,0.300,0.45625,0.422594,0.345912,...,0.484848,0.250,0.937500,0.446970,0.225,0.968750,0.75,0.933013,ISNE - Northeast Mass.,4008
2,2019-01-01 01:00:00,0.050847,0.861702,0.6,0.460432,0.522876,0.225,0.50000,0.426778,0.339623,...,0.477273,0.200,0.958333,0.462121,0.250,0.916667,0.75,0.933013,ISNE - Connecticut,4004
3,2019-01-01 01:00:00,0.064030,0.861702,0.6,0.467626,0.529412,0.275,0.50625,0.428173,0.345912,...,0.401515,0.275,0.833333,0.484848,0.250,0.937500,0.75,0.933013,ISNE - Rhode Island,4005
4,2019-01-01 01:00:00,0.054614,0.861702,0.9,0.402878,0.464052,0.200,0.44375,0.418410,0.345912,...,0.424242,0.200,0.958333,0.477273,0.200,0.958333,0.75,0.933013,ISNE - New Hampshire,4002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1116266,2024-01-01 00:00:00,0.000000,0.000000,1.0,0.460432,0.516340,0.300,0.49375,0.520223,0.320755,...,0.613636,0.275,0.677083,0.530303,0.200,0.447917,0.75,0.933013,ERCO - Far West,FWES
1116267,2024-01-01 00:00:00,0.000000,0.010638,1.0,0.503597,0.581699,0.150,0.55625,0.517434,0.327044,...,0.545455,0.225,0.614583,0.545455,0.125,0.458333,0.75,0.933013,ERCO - East,EAST
1116268,2024-01-01 00:00:00,0.001883,0.223404,1.0,0.597122,0.679739,0.150,0.65000,0.520223,0.295597,...,0.583333,0.125,0.843750,0.613636,0.275,0.677083,0.75,0.933013,ERCO - Coast,COAS
1116269,2024-01-01 00:00:00,0.000000,0.000000,1.0,0.625899,0.705882,0.300,0.67500,0.541144,0.314465,...,0.454545,0.325,0.645833,0.545455,0.225,0.614583,0.75,0.933013,ERCO - South,SOUT


In [7]:
df.describe()

Unnamed: 0,precipMM,weatherCode,visibility,HeatIndexF,WindChillF,windspeedMiles,FeelsLikeF,tempF_rolling_mean,windspeedMiles_rolling_mean,humidity_rolling_mean,...,windspeedMiles_lag_2,humidity_lag_2,tempF_lag_4,windspeedMiles_lag_4,humidity_lag_4,tempF_lag_6,windspeedMiles_lag_6,humidity_lag_6,month_sin,month_cos
count,1116271.0,1116271.0,1116271.0,1116271.0,1116271.0,1116271.0,1116271.0,1116271.0,1116271.0,1116271.0,...,1116271.0,1116271.0,1116271.0,1116271.0,1116271.0,1116271.0,1116271.0,1116271.0,1116271.0,1116271.0
mean,0.002569103,0.08996441,0.9147898,0.5518041,0.6157885,0.206449,0.5938949,0.5597729,0.2864638,0.6468494,...,0.2064489,0.6738261,0.5736832,0.206449,0.6738264,0.5736831,0.2064489,0.6738271,0.4782626,0.5004583
std,0.01181891,0.2121369,0.211196,0.149052,0.1464348,0.1084086,0.1464234,0.159433,0.1313615,0.1903274,...,0.1084085,0.2068161,0.1480246,0.1084085,0.2068164,0.1480247,0.1084084,0.2068165,0.3518997,0.3545335
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,0.4316547,0.503268,0.125,0.48125,0.432357,0.1886792,0.5150376,...,0.125,0.53125,0.4545455,0.125,0.53125,0.4545455,0.125,0.53125,0.0669875,0.0669875
50%,0.0,0.0106383,1.0,0.5539568,0.6339869,0.2,0.60625,0.5718271,0.2641509,0.6654135,...,0.2,0.6979167,0.5833333,0.2,0.6979167,0.5833333,0.2,0.6979167,0.5,0.5
75%,0.0,0.03191489,1.0,0.6834532,0.7320261,0.275,0.7125,0.6820084,0.3710692,0.7969925,...,0.275,0.84375,0.6893939,0.275,0.84375,0.6893939,0.275,0.84375,0.75,0.9330125
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
df.columns



Index(['datetime', 'precipMM', 'weatherCode', 'visibility', 'HeatIndexF',
       'WindChillF', 'windspeedMiles', 'FeelsLikeF', 'tempF_rolling_mean',
       'windspeedMiles_rolling_mean', 'humidity_rolling_mean', 'value',
       'pressure', 'pressureInches', 'cloudcover', 'uvIndex',
       'tempF_rolling_std', 'windspeedMiles_rolling_std',
       'humidity_rolling_std', 'tempF_lag_2', 'windspeedMiles_lag_2',
       'humidity_lag_2', 'tempF_lag_4', 'windspeedMiles_lag_4',
       'humidity_lag_4', 'tempF_lag_6', 'windspeedMiles_lag_6',
       'humidity_lag_6', 'month_sin', 'month_cos', 'subba-name', 'zone'],
      dtype='object')

In [8]:
df['zone'].value_counts()

zone
ZONG    43719
ZONH    43719
ZONJ    43719
ZONC    43719
ZONB    43719
ZONE    43719
ZONK    43719
ZONI    43719
ZOND    43719
ZONA    43719
ZONF    43719
FWES    40047
NCEN    40047
NRTH    40047
COAS    40047
SCEN    40047
SOUT    40047
WEST    40047
EAST    40047
4008    39374
4002    39374
4001    39373
4007    39373
4003    39373
4006    39373
4005    39373
4004    39373
Name: count, dtype: int64

In [13]:
df['subba-name'].value_counts()

subba-name
Hudson Valley - NYIS            43719
Millwood - NYIS                 43719
New York City - NYIS            43719
Central - NYIS                  43719
Genesee - NYIS                  43719
Mohawk Valley - NYIS            43719
Long Island - NYIS              43719
Dunwoodie - NYIS                43719
North - NYIS                    43719
West - NYIS                     43719
Capital - NYIS                  43719
ERCO - Far West                 40047
ERCO - North Central            40047
ERCO - North                    40047
ERCO - Coast                    40047
ERCO - South Central            40047
ERCO - South                    40047
ERCO - West                     40047
ERCO - East                     40047
ISNE - Northeast Mass.          39374
ISNE - New Hampshire            39374
ISNE - Maine                    39373
ISNE - Western/Central Mass.    39373
ISNE - Vermont                  39373
ISNE - Southeast Mass.          39373
ISNE - Rhode Island             39373
I

In [18]:
!pip install fairlearn

Collecting fairlearn
  Downloading fairlearn-0.11.0-py3-none-any.whl.metadata (7.0 kB)
Downloading fairlearn-0.11.0-py3-none-any.whl (232 kB)
Installing collected packages: fairlearn
Successfully installed fairlearn-0.11.0


In [25]:
# Find records where both 'subba-name' and 'zone' are not null
records_with_both = df[(df['subba-name'].notnull()) & (df['zone'].notnull())]

In [26]:
records_with_both

Unnamed: 0,datetime,precipMM,weatherCode,visibility,HeatIndexF,WindChillF,windspeedMiles,FeelsLikeF,tempF_rolling_mean,windspeedMiles_rolling_mean,...,tempF_lag_4,windspeedMiles_lag_4,humidity_lag_4,tempF_lag_6,windspeedMiles_lag_6,humidity_lag_6,month_sin,month_cos,subba-name,zone
0,2019-01-01 00:00:00,0.052731,0.861702,1.0,0.402878,0.457516,0.200000,0.437500,0.422594,0.327044,...,0.462121,0.250000,0.916667,0.416667,0.275000,0.885417,0.750000,0.933013,15,1
1,2019-01-01 01:00:00,0.050847,0.861702,0.6,0.460432,0.522876,0.225000,0.500000,0.426778,0.339623,...,0.477273,0.200000,0.958333,0.462121,0.250000,0.916667,0.750000,0.933013,13,3
2,2019-01-01 02:00:00,0.050847,0.861702,0.8,0.410072,0.470588,0.200000,0.450000,0.415621,0.377358,...,0.462121,0.300000,0.947917,0.424242,0.200000,0.968750,0.750000,0.933013,15,1
3,2019-01-01 03:00:00,0.039548,0.861702,0.7,0.424460,0.470588,0.375000,0.450000,0.426778,0.440252,...,0.492424,0.275000,0.937500,0.492424,0.250000,0.947917,0.750000,0.933013,14,0
4,2019-01-01 04:00:00,0.011299,0.659574,0.8,0.417266,0.464052,0.350000,0.443750,0.450488,0.345912,...,0.469697,0.100000,0.958333,0.424242,0.350000,0.885417,0.750000,0.933013,19,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48917,2024-04-05 00:00:00,0.002933,0.771739,1.0,0.107527,0.075269,0.297297,0.071429,0.072340,0.458333,...,0.147727,0.351351,0.864583,0.113636,0.189189,0.895833,0.933013,0.250000,16,7
48918,2024-04-05 01:00:00,0.014663,0.815217,0.2,0.096774,0.064516,0.297297,0.061224,0.036170,0.319444,...,0.102273,0.270270,0.979167,0.090909,0.081081,0.937500,0.933013,0.250000,15,1
48919,2024-04-05 02:00:00,0.000000,0.771739,1.0,0.086022,0.096774,0.081081,0.091837,0.031915,0.215278,...,0.079545,0.270270,0.989583,0.159091,0.378378,0.989583,0.933013,0.250000,20,6
48920,2024-04-05 03:00:00,0.000000,0.000000,1.0,0.107527,0.075269,0.243243,0.071429,0.055319,0.333333,...,0.102273,0.189189,0.895833,0.102273,0.297297,0.968750,0.933013,0.250000,18,5


In [11]:
df[['zone', 'subba-name']].drop_duplicates().shape[0]

27

In [15]:
df_without_zone = df.drop(columns=['zone'])

In [16]:
df_without_zone

Unnamed: 0,datetime,precipMM,weatherCode,visibility,HeatIndexF,WindChillF,windspeedMiles,FeelsLikeF,tempF_rolling_mean,windspeedMiles_rolling_mean,...,humidity_lag_2,tempF_lag_4,windspeedMiles_lag_4,humidity_lag_4,tempF_lag_6,windspeedMiles_lag_6,humidity_lag_6,month_sin,month_cos,subba-name
0,2019-01-01 00:00:00,0.052731,0.861702,1.0,0.402878,0.457516,0.200,0.43750,0.422594,0.327044,...,0.958333,0.462121,0.250,0.916667,0.416667,0.275,0.885417,0.75,0.933013,ISNE - New Hampshire
1,2019-01-01 00:00:00,0.058380,0.861702,0.2,0.424460,0.477124,0.300,0.45625,0.422594,0.345912,...,0.833333,0.484848,0.250,0.937500,0.446970,0.225,0.968750,0.75,0.933013,ISNE - Northeast Mass.
2,2019-01-01 01:00:00,0.050847,0.861702,0.6,0.460432,0.522876,0.225,0.50000,0.426778,0.339623,...,0.958333,0.477273,0.200,0.958333,0.462121,0.250,0.916667,0.75,0.933013,ISNE - Connecticut
3,2019-01-01 01:00:00,0.064030,0.861702,0.6,0.467626,0.529412,0.275,0.50625,0.428173,0.345912,...,0.947917,0.401515,0.275,0.833333,0.484848,0.250,0.937500,0.75,0.933013,ISNE - Rhode Island
4,2019-01-01 01:00:00,0.054614,0.861702,0.9,0.402878,0.464052,0.200,0.44375,0.418410,0.345912,...,0.958333,0.424242,0.200,0.958333,0.477273,0.200,0.958333,0.75,0.933013,ISNE - New Hampshire
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1116266,2024-01-01 00:00:00,0.000000,0.000000,1.0,0.460432,0.516340,0.300,0.49375,0.520223,0.320755,...,0.843750,0.613636,0.275,0.677083,0.530303,0.200,0.447917,0.75,0.933013,ERCO - Far West
1116267,2024-01-01 00:00:00,0.000000,0.010638,1.0,0.503597,0.581699,0.150,0.55625,0.517434,0.327044,...,0.645833,0.545455,0.225,0.614583,0.545455,0.125,0.458333,0.75,0.933013,ERCO - East
1116268,2024-01-01 00:00:00,0.001883,0.223404,1.0,0.597122,0.679739,0.150,0.65000,0.520223,0.295597,...,0.447917,0.583333,0.125,0.843750,0.613636,0.275,0.677083,0.75,0.933013,ERCO - Coast
1116269,2024-01-01 00:00:00,0.000000,0.000000,1.0,0.625899,0.705882,0.300,0.67500,0.541144,0.314465,...,0.645833,0.454545,0.325,0.645833,0.545455,0.225,0.614583,0.75,0.933013,ERCO - South


In [20]:
# Re-import necessary libraries and re-create the data and logic after state reset
import pandas as pd
import numpy as np
from fairlearn.metrics import MetricFrame, selection_rate
from datetime import timedelta

# Sample data structure (mock data for demonstration purposes)
data = df

# Impute missing dates for each 'zone'
zones = data['zone'].unique()
min_date = data['datetime'].min()
max_date = data['datetime'].max()
full_date_range = pd.date_range(min_date, max_date, freq='H')

# imputed_data = pd.DataFrame()

# for zone in zones:
#     zone_data = data[data['zone'] == zone]
#     zone_data.set_index('datetime', inplace=True)
#     zone_data = zone_data.reindex(full_date_range, fill_value=np.nan)
#     zone_data['zone'] = zone
#     zone_data['subba-name'] = zone_data['subba-name'].fillna(method='ffill')  # Forward fill subba-name
#     zone_data['value'] = zone_data['value'].fillna(zone_data['value'].rolling(window=3, min_periods=1).mean())
#     zone_data.reset_index(inplace=True)
#     zone_data.rename(columns={'index': 'datetime'}, inplace=True)
#     imputed_data = pd.concat([imputed_data, zone_data])

# Perform bias detection using FairLearn's MetricFrame on 'subba-name'
metric_frame = MetricFrame(
    metrics=selection_rate,
    y_true=imputed_data['value'].notnull().astype(int),  # Considering non-null values as positive outcomes
    y_pred=imputed_data['value'].notnull().astype(int),
    sensitive_features=imputed_data['subba-name']
)

# Display metric frame results
import ace_tools as tools; tools.display_dataframe_to_user(name="Bias Detection Metric by 'subba-name'", dataframe=metric_frame.by_group)

# imputed_data,
metric_frame.by_group


  full_date_range = pd.date_range(min_date, max_date, freq='H')


NameError: name 'imputed_data' is not defined

In [21]:
import pandas as pd
import numpy as np
from fairlearn.metrics import MetricFrame, selection_rate
from sklearn.metrics import accuracy_score

# Assuming `data` is your DataFrame already loaded
# Replace the placeholder with your actual DataFrame
data = df

# Evaluate data bias using 'subba-name' as the sensitive feature
metric_frame = MetricFrame(
    metrics={
        'Selection Rate': selection_rate,
        'Accuracy': accuracy_score
    },
    y_true=data['value'].notnull().astype(int),  # Replace with your target variable
    y_pred=data['value'].notnull().astype(int),  # Assuming binary outcome for bias detection
    sensitive_features=data['subba-name']
)

# Print bias metrics for each subgroup
print("Metrics by Group:\n", metric_frame.by_group)
print("\nOverall Metrics:\n", metric_frame.overall)

# Check if bias mitigation is needed by analyzing the difference between groups
print("\nDemographic Parity Difference:")
print(metric_frame.difference(method='between_groups'))


Metrics by Group:
                               Selection Rate  Accuracy
subba-name                                            
Capital - NYIS                           1.0       1.0
Central - NYIS                           1.0       1.0
Dunwoodie - NYIS                         1.0       1.0
ERCO - Coast                             1.0       1.0
ERCO - East                              1.0       1.0
ERCO - Far West                          1.0       1.0
ERCO - North                             1.0       1.0
ERCO - North Central                     1.0       1.0
ERCO - South                             1.0       1.0
ERCO - South Central                     1.0       1.0
ERCO - West                              1.0       1.0
Genesee - NYIS                           1.0       1.0
Hudson Valley - NYIS                     1.0       1.0
ISNE - Connecticut                       1.0       1.0
ISNE - Maine                             1.0       1.0
ISNE - New Hampshire                     1.0  