In [28]:
from backpack_predictor import prepare_data, target_encoding
from backpack_predictor.features import target, baseline_features, feature_list, cat_cols

%load_ext autoreload
%autoreload 2

from datetime import datetime
import time
import numpy as np
import pandas as pd
import scipy.stats as stats

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import KBinsDiscretizer, TargetEncoder
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import root_mean_squared_error

import xgboost as xgb
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

# test_df = pd.read_csv(r'..//data//test.csv')
# Load the base training file and remember its row count; `n` is reused by the
# summary cells below as the denominator for the outlier-row share.
train_df = pd.read_csv(r'..//data//train.csv')
n = len(train_df)
# Append the extra training file, then keep a random sample of n rows from the
# combined frame. No random_state is passed, so every execution of this cell
# draws a different sample and downstream percentages vary from run to run.
train_extra_df = pd.read_csv(r'..//data//training_extra.csv')
train_df = pd.concat([train_df, train_extra_df], ignore_index=True)
train_df = train_df.sample(n)

# Run the project's prepare_data on the training sample only (the test-set load
# above is commented out).
train_df = prepare_data(train_df, is_train=True)
# NOTE(review): 'wc_decimal_count' is presumably a column added by prepare_data — confirm.
train_df = train_df.drop(columns='wc_decimal_count')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
import itertools

# ----------------------------------------------------
# 1. Make a function that, given the population prices
#    and a sample size n, returns the bootstrap CI.
# ----------------------------------------------------
def population_bootstrap_ci(population_prices, n, num_bootstrap_samples=1000, confidence_level=0.95):
    """
    Bootstrap a confidence interval for the mean of `n` prices drawn at random
    (with replacement) from the whole population.

    The interval describes where the mean of an arbitrary group of size `n`
    would typically fall if the group were just a random draw from the
    population.

    Parameters
    ----------
    population_prices : array-like
        One-dimensional collection of all prices.
    n : int
        Number of items drawn per bootstrap sample.
    num_bootstrap_samples : int, default 1000
        How many bootstrap means to simulate.
    confidence_level : float, default 0.95
        Central mass of the bootstrap distribution covered by the interval.

    Returns
    -------
    (lower_bound, upper_bound) : tuple of float
        Percentile bounds of the simulated means, or (nan, nan) when `n < 2`
        or the population is empty.
    """
    population_prices = np.asarray(population_prices)

    # A mean of fewer than two draws, or of draws from an empty population,
    # has no meaningful interval; np.random.choice would raise on the latter.
    if n < 2 or population_prices.size == 0:
        return np.nan, np.nan

    # One draw per bootstrap replicate: keeps memory at O(n) instead of
    # materialising a (num_bootstrap_samples x n) matrix for large groups.
    boot_means = [
        np.random.choice(population_prices, size=n, replace=True).mean()
        for _ in range(num_bootstrap_samples)
    ]

    lower_bound = np.percentile(boot_means, (1 - confidence_level) / 2 * 100)
    upper_bound = np.percentile(boot_means, (1 + confidence_level) / 2 * 100)
    return lower_bound, upper_bound

# ----------------------------------------------------
# 2. Cache to avoid recomputing the above for each group
#    of the same size.
# ----------------------------------------------------
def get_population_ci_for_size(population_prices, group_size, cache_dict, 
                               num_bootstrap_samples=1000, confidence_level=0.95):
    """
    Memoised lookup of the population bootstrap CI for one group size.

    `cache_dict` maps group size -> (low, high). On a miss the interval is
    computed with `population_bootstrap_ci` and stored in `cache_dict` (the
    caller's dict is mutated); on a hit the stored tuple is returned as-is.
    """
    if group_size not in cache_dict:
        # First time this size is seen: simulate once and remember the result.
        cache_dict[group_size] = population_bootstrap_ci(
            population_prices,
            n=group_size,
            num_bootstrap_samples=num_bootstrap_samples,
            confidence_level=confidence_level,
        )
    return cache_dict[group_size]

# ----------------------------------------------------
# 3. Main logic: For each pair of columns, group and test.
# ----------------------------------------------------
def find_significant_deviations(train_df, cols,
                                num_bootstrap_samples=1000,
                                confidence_level=0.95,
                                combo_size=2):
    """
    Flag category combinations whose mean price is unlikely under random sampling.

    For every combination of `combo_size` columns from `cols`, rows are grouped
    by the concatenated string values of those columns. Each group's mean price
    is compared against a bootstrap confidence interval for the mean of a
    random population sample of (approximately) the same size.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain a 'price' column and every column named in `cols`.
        The frame is only read; it is never modified.
    cols : sequence of str
        Candidate columns to combine.
    num_bootstrap_samples : int, default 1000
        Bootstrap replicates per distinct group size.
    confidence_level : float, default 0.95
        Confidence level of the population interval.
    combo_size : int, default 2
        Number of columns per combination (2 reproduces the pairwise analysis).

    Returns
    -------
    dict[str, pandas.DataFrame]
        Maps "<col1>_<col2>..." to the groups whose mean lies outside the
        interval, with columns: the combined key, 'count', 'mean', 'low_ci',
        'high_ci', 'is_outside_ci'.
    """
    # Entire population of prices (as a NumPy array) for the bootstrap draws.
    population_prices = train_df['price'].values

    results_outside_ci = {}

    # Population CIs memoised by group size, shared across all combinations.
    ci_cache = {}

    for combo in itertools.combinations(cols, combo_size):
        combined_col_name = "_".join(str(col) for col in combo)

        # Build the grouping key as a standalone Series instead of writing a
        # temporary column into train_df: the caller's frame stays untouched
        # (no clobbering of an existing same-named column, nothing left behind
        # if an exception interrupts the loop).
        group_key = train_df[combo[0]].astype(str)
        for col in combo[1:]:
            group_key = group_key + "_" + train_df[col].astype(str)
        group_key.name = combined_col_name

        grouped = (
            train_df['price']
            .groupby(group_key)
            .agg(['count', 'mean'])
            .reset_index()
        )

        # Group sizes are rounded DOWN to a multiple of 100 so that CIs can be
        # shared through the cache. Groups with fewer than 100 rows get size 0,
        # which yields a (nan, nan) interval and is therefore never flagged.
        ci_bounds = [
            get_population_ci_for_size(
                population_prices,
                count // 100 * 100,
                ci_cache,
                num_bootstrap_samples=num_bootstrap_samples,
                confidence_level=confidence_level
            )
            for count in grouped['count']
        ]
        bounds = np.array(ci_bounds, dtype=float).reshape(-1, 2)
        grouped['low_ci'] = bounds[:, 0]
        grouped['high_ci'] = bounds[:, 1]

        # Comparisons against NaN are False, so groups without a CI are kept out.
        grouped['is_outside_ci'] = (
            (grouped['mean'] < grouped['low_ci']) | (grouped['mean'] > grouped['high_ci'])
        )

        # Keep only the groups that fall outside the confidence interval.
        results_outside_ci[combined_col_name] = grouped[grouped['is_outside_ci']]

    return results_outside_ci

cols = ['brand', 'material', 'size', 'laptop_compartment', 'is_waterproof', 'style', 'color']

results = find_significant_deviations(train_df, cols, num_bootstrap_samples=1000, confidence_level=0.95)

# For each column pair: rows belonging to out-of-CI groups, as a fraction of n
# (the row count of the base training file, which is also the sample size drawn).
summary = sorted(
    ((v['count'].sum() / n, k) for k, v in results.items()),
    reverse=True,
)
# Slice instead of indexing 0..9 so fewer than ten combinations cannot raise IndexError.
for share, name in summary[:10]:
    print(f"{share:.0%}: {name}")

85%: material_is_waterproof
84%: material_laptop_compartment
74%: material_size
74%: material_style
71%: is_waterproof_color
58%: brand_is_waterproof
57%: brand_color
57%: style_color
55%: laptop_compartment_color
49%: brand_laptop_compartment


85%: material_is_waterproof
84%: material_laptop_compartment
74%: material_size
74%: material_style
71%: is_waterproof_color
58%: brand_is_waterproof
57%: brand_color
57%: style_color
55%: laptop_compartment_color
49%: brand_laptop_compartment

85%: material_is_waterproof
73%: material_laptop_compartment
70%: size_color
57%: brand_laptop_compartment
56%: is_waterproof_color
55%: material_style
54%: laptop_compartment_color
51%: material_color
50%: brand_is_waterproof
49%: brand_color

73%: material_style
67%: material_size
62%: material_is_waterproof
61%: material_laptop_compartment
55%: is_waterproof_color
50%: brand_is_waterproof
48%: brand_laptop_compartment
48%: laptop_compartment_color
45%: material_color
43%: size_color

85%: material_is_waterproof
76%: material_laptop_compartment
64%: material_style
63%: style_color
63%: is_waterproof_color
57%: material_size
57%: brand_material
55%: laptop_compartment_color
54%: size_color
50%: brand_is_waterproof

In [53]:
import pandas as pd

# Top-10 outlier-row percentages per column pair, copied from four printed runs
# of the pairwise analysis cell above (each run used a different random sample).
# A pair is present in a run only if it made that run's top 10.
data = {
    "Test1": {
        "material_is_waterproof": 85,
        "material_laptop_compartment": 84,
        "material_size": 74,
        "material_style": 74,
        "is_waterproof_color": 71,
        "brand_is_waterproof": 58,
        "brand_color": 57,
        "style_color": 57,
        "laptop_compartment_color": 55,
        "brand_laptop_compartment": 49
    },
    "Test2": {
        "material_is_waterproof": 85,
        "material_laptop_compartment": 73,
        "size_color": 70,
        "brand_laptop_compartment": 57,
        "is_waterproof_color": 56,
        "material_style": 55,
        "laptop_compartment_color": 54,
        "material_color": 51,
        "brand_is_waterproof": 50,
        "brand_color": 49
    },
    "Test3": {
        "material_style": 73,
        "material_size": 67,
        "material_is_waterproof": 62,
        "material_laptop_compartment": 61,
        "is_waterproof_color": 55,
        "brand_is_waterproof": 50,
        "brand_laptop_compartment": 48,
        "laptop_compartment_color": 48,
        "material_color": 45,
        "size_color": 43
    },
    "Test4": {
        "material_is_waterproof": 85,
        "material_laptop_compartment": 76,
        "material_style": 64,
        "style_color": 63,
        "is_waterproof_color": 63,
        "material_size": 57,
        "brand_material": 57,
        "laptop_compartment_color": 55,
        "size_color": 54,
        "brand_is_waterproof": 50
    }
}

# One row per column pair, one column per run; pairs missing from a run are NaN.
df = pd.DataFrame(data)
# Row-wise statistics over the runs. NaNs are skipped, so the average and
# variance cover only the runs in which the pair reached the top 10 — 'Count'
# records how many runs that was (variance is NaN when Count is 1).
average = df.mean(axis=1).round(0)
variance = df.var(axis=1).round(0)
count = df.count(axis=1)
# `df` is rebound here from the raw per-run table to the summary table.
df = pd.DataFrame({'Average (%)': average, 'Variance': variance, 'Count': count}).sort_values(by='Average (%)', ascending=False)
df

Unnamed: 0,Average (%),Variance,Count
material_is_waterproof,79.0,132.0,4
material_laptop_compartment,74.0,91.0,4
material_size,66.0,73.0,3
material_style,66.0,79.0,4
is_waterproof_color,61.0,55.0,4
style_color,60.0,18.0,2
brand_material,57.0,,1
size_color,56.0,184.0,3
brand_color,53.0,32.0,2
laptop_compartment_color,53.0,11.0,4


In [30]:
# results['material_is_waterproof']

In [31]:
# 76%
# results['material_laptop_compartment']

In [32]:
def find_significant_deviations_of_three(
        train_df, cols,
        num_bootstrap_samples=1000,
        confidence_level=0.95
    ):
    """
    Flag three-column category combinations with an unusual mean price.

    For each combination of three columns in `cols`, rows are grouped by the
    concatenated string values of those columns. Each group's mean price is
    compared against a population-based bootstrap confidence interval for a
    random sample of (approximately) the same size.

    `train_df` must contain 'price' and every column in `cols`; it is only
    read, never modified.

    Returns a dict mapping "<col1>_<col2>_<col3>" to a DataFrame of the groups
    whose mean lies outside the interval (columns: the combined key, 'count',
    'mean', 'low_ci', 'high_ci', 'is_outside_ci').
    """
    # Entire population of prices (as a NumPy array) for the bootstrap draws.
    population_prices = train_df['price'].values

    results_outside_ci = {}

    # Population CIs memoised by group size, shared across all combinations.
    ci_cache = {}

    for col1, col2, col3 in itertools.combinations(cols, 3):
        combined_col_name = f"{col1}_{col2}_{col3}"

        # Standalone grouping key rather than a temporary column on train_df:
        # the caller's frame is never mutated, so an existing column with the
        # same name cannot be overwritten and then dropped.
        group_key = (
            train_df[col1].astype(str) + "_" +
            train_df[col2].astype(str) + "_" +
            train_df[col3].astype(str)
        )
        group_key.name = combined_col_name

        grouped = (
            train_df['price']
            .groupby(group_key)
            .agg(['count', 'mean'])
            .reset_index()
        )

        # Sizes are rounded DOWN to a multiple of 100 so CIs can be cached.
        # Groups under 100 rows map to size 0 -> (nan, nan) -> never flagged.
        ci_bounds = [
            get_population_ci_for_size(
                population_prices,
                count // 100 * 100,
                ci_cache,
                num_bootstrap_samples=num_bootstrap_samples,
                confidence_level=confidence_level
            )
            for count in grouped['count']
        ]
        bounds = np.array(ci_bounds, dtype=float).reshape(-1, 2)
        grouped['low_ci'] = bounds[:, 0]
        grouped['high_ci'] = bounds[:, 1]

        # Comparisons against NaN are False, so groups without a CI are kept out.
        grouped['is_outside_ci'] = (
            (grouped['mean'] < grouped['low_ci']) | (grouped['mean'] > grouped['high_ci'])
        )

        # Keep only the groups that fall outside the confidence interval.
        results_outside_ci[combined_col_name] = grouped[grouped['is_outside_ci']]

    return results_outside_ci

# List of columns to consider.
cols = ['brand', 'material', 'size', 'laptop_compartment', 'is_waterproof', 'style', 'color']

# Run the three-column analysis on the sampled training frame.
results = find_significant_deviations_of_three(train_df, cols, num_bootstrap_samples=1000, confidence_level=0.95)

# For each column triple: rows belonging to out-of-CI groups, as a fraction of n
# (the row count of the base training file, which is also the sample size drawn).
summary = sorted(
    ((v['count'].sum() / n, k) for k, v in results.items()),
    reverse=True,
)
# Slice instead of indexing 0..9 so fewer than ten combinations cannot raise IndexError.
for share, name in summary[:10]:
    print(f"{share:.0%}: {name}")


66%: material_laptop_compartment_is_waterproof
64%: material_laptop_compartment_style
52%: material_is_waterproof_style
52%: material_size_laptop_compartment
51%: material_size_is_waterproof
46%: laptop_compartment_is_waterproof_color
43%: brand_laptop_compartment_is_waterproof
40%: material_is_waterproof_color
39%: brand_size_is_waterproof
39%: size_is_waterproof_color


66%: material_laptop_compartment_is_waterproof
64%: material_laptop_compartment_style
52%: material_is_waterproof_style
52%: material_size_laptop_compartment
51%: material_size_is_waterproof
46%: laptop_compartment_is_waterproof_color
43%: brand_laptop_compartment_is_waterproof
40%: material_is_waterproof_color
39%: brand_size_is_waterproof
39%: size_is_waterproof_color


52%: material_is_waterproof_style
48%: material_laptop_compartment_is_waterproof
47%: material_laptop_compartment_style
46%: material_size_laptop_compartment
39%: material_laptop_compartment_color
39%: material_size_is_waterproof
38%: material_size_style
36%: size_is_waterproof_color
35%: laptop_compartment_is_waterproof_color
34%: material_is_waterproof_color


66%: material_laptop_compartment_is_waterproof
47%: laptop_compartment_is_waterproof_color
45%: size_is_waterproof_color
44%: size_laptop_compartment_color
43%: material_is_waterproof_style
42%: material_laptop_compartment_style
41%: material_is_waterproof_color
41%: material_size_laptop_compartment
41%: material_laptop_compartment_color
40%: brand_material_is_waterproof


62%: material_laptop_compartment_is_waterproof
48%: material_laptop_compartment_style
47%: material_is_waterproof_style
46%: material_size_is_waterproof
43%: laptop_compartment_is_waterproof_color
41%: material_size_laptop_compartment
40%: brand_material_is_waterproof
37%: material_is_waterproof_color
35%: brand_laptop_compartment_is_waterproof
35%: material_size_style

In [54]:
# Top-10 outlier-row percentages per three-column combination, copied from four
# printed runs of the three-column analysis cell above. A combination is present
# in a run only if it made that run's top 10.
data_new = {
    "Test1": {
        "material_laptop_compartment_is_waterproof": 66,
        "material_laptop_compartment_style": 64,
        "material_is_waterproof_style": 52,
        "material_size_laptop_compartment": 52,
        "material_size_is_waterproof": 51,
        "laptop_compartment_is_waterproof_color": 46,
        "brand_laptop_compartment_is_waterproof": 43,
        "material_is_waterproof_color": 40,
        "brand_size_is_waterproof": 39,
        "size_is_waterproof_color": 39
    },
    "Test2": {
        "material_is_waterproof_style": 52,
        "material_laptop_compartment_is_waterproof": 48,
        "material_laptop_compartment_style": 47,
        "material_size_laptop_compartment": 46,
        "material_laptop_compartment_color": 39,
        "material_size_is_waterproof": 39,
        "material_size_style": 38,
        "size_is_waterproof_color": 36,
        "laptop_compartment_is_waterproof_color": 35,
        "material_is_waterproof_color": 34
    },
    "Test3": {
        "material_laptop_compartment_is_waterproof": 66,
        "laptop_compartment_is_waterproof_color": 47,
        "size_is_waterproof_color": 45,
        "size_laptop_compartment_color": 44,
        "material_is_waterproof_style": 43,
        "material_laptop_compartment_style": 42,
        "material_is_waterproof_color": 41,
        "material_size_laptop_compartment": 41,
        "material_laptop_compartment_color": 41,
        "brand_material_is_waterproof": 40
    },
    "Test4": {
        "material_laptop_compartment_is_waterproof": 62,
        "material_laptop_compartment_style": 48,
        "material_is_waterproof_style": 47,
        "material_size_is_waterproof": 46,
        "laptop_compartment_is_waterproof_color": 43,
        "material_size_laptop_compartment": 41,
        "brand_material_is_waterproof": 40,
        "material_is_waterproof_color": 37,
        "brand_laptop_compartment_is_waterproof": 35,
        "material_size_style": 35
    }
}

# One row per combination, one column per run; combinations missing from a run are NaN.
df = pd.DataFrame(data_new)
# Row-wise statistics over the runs. NaNs are skipped, so the average and
# variance cover only the runs in which the combination reached the top 10 —
# 'Count' records how many runs that was (variance is NaN when Count is 1).
average = df.mean(axis=1).round(0)
variance = df.var(axis=1).round(0)
count = df.count(axis=1)
# `df` is rebound here from the raw per-run table to the summary table.
df = pd.DataFrame({'Average (%)': average, 'Variance': variance, 'Count': count}).sort_values(by='Average (%)', ascending=False)
df

Unnamed: 0,Average (%),Variance,Count
material_laptop_compartment_is_waterproof,60.0,73.0,4
material_laptop_compartment_style,50.0,91.0,4
material_is_waterproof_style,48.0,19.0,4
material_size_laptop_compartment,45.0,27.0,4
material_size_is_waterproof,45.0,36.0,3
size_laptop_compartment_color,44.0,,1
laptop_compartment_is_waterproof_color,43.0,30.0,4
size_is_waterproof_color,40.0,21.0,3
material_laptop_compartment_color,40.0,2.0,2
brand_material_is_waterproof,40.0,0.0,2


In [33]:
# results['material_laptop_compartment_is_waterproof']

In [34]:
# results['material_is_waterproof_style']

In [35]:
def find_significant_deviations_of_four(
        train_df, cols,
        num_bootstrap_samples=1000,
        confidence_level=0.95
    ):
    """
    Flag four-column category combinations with an unusual mean price.

    For each combination of four columns in `cols`, rows are grouped by the
    concatenated string values of those columns. Each group's mean price is
    compared against a population-based bootstrap confidence interval for a
    random sample of (approximately) the same size.

    `train_df` must contain 'price' and every column in `cols`; it is only
    read, never modified.

    Returns a dict mapping "<col1>_<col2>_<col3>_<col4>" to a DataFrame of the
    groups whose mean lies outside the interval (columns: the combined key,
    'count', 'mean', 'low_ci', 'high_ci', 'is_outside_ci').
    """
    # Entire population of prices (as a NumPy array) for the bootstrap draws.
    population_prices = train_df['price'].values

    # Results keyed by the combined four-column name.
    results_outside_ci = {}

    # Population CIs memoised by group size, shared across all combinations.
    ci_cache = {}

    # Iterate over all combinations of four columns.
    for col1, col2, col3, col4 in itertools.combinations(cols, 4):
        combined_col_name = f"{col1}_{col2}_{col3}_{col4}"

        # Standalone grouping key rather than a temporary column on train_df:
        # the caller's frame is never mutated, so an existing column with the
        # same name cannot be overwritten and then dropped.
        group_key = (
            train_df[col1].astype(str) + "_" +
            train_df[col2].astype(str) + "_" +
            train_df[col3].astype(str) + "_" +
            train_df[col4].astype(str)
        )
        group_key.name = combined_col_name

        grouped = (
            train_df['price']
            .groupby(group_key)
            .agg(['count', 'mean'])
            .reset_index()
        )

        # Sizes are rounded DOWN to a multiple of 100 so CIs can be cached.
        # Groups under 100 rows map to size 0 -> (nan, nan) -> never flagged.
        ci_bounds = [
            get_population_ci_for_size(
                population_prices,
                count // 100 * 100,
                ci_cache,
                num_bootstrap_samples=num_bootstrap_samples,
                confidence_level=confidence_level
            )
            for count in grouped['count']
        ]
        bounds = np.array(ci_bounds, dtype=float).reshape(-1, 2)
        grouped['low_ci'] = bounds[:, 0]
        grouped['high_ci'] = bounds[:, 1]

        # Comparisons against NaN are False, so groups without a CI are kept out.
        grouped['is_outside_ci'] = (
            (grouped['mean'] < grouped['low_ci']) | (grouped['mean'] > grouped['high_ci'])
        )

        # Keep only the groups that fall outside the confidence interval.
        results_outside_ci[combined_col_name] = grouped[grouped['is_outside_ci']]

    return results_outside_ci

# List of columns to consider.
cols = ['brand', 'material', 'size', 'laptop_compartment', 'is_waterproof', 'style', 'color']

# Run the four-column analysis on the sampled training frame.
results = find_significant_deviations_of_four(train_df, cols, num_bootstrap_samples=1000, confidence_level=0.95)

# For each column quadruple: rows belonging to out-of-CI groups, as a fraction of n
# (the row count of the base training file, which is also the sample size drawn).
summary = sorted(
    ((v['count'].sum() / n, k) for k, v in results.items()),
    reverse=True,
)
# Slice instead of indexing 0..9 so fewer than ten combinations cannot raise IndexError.
for share, name in summary[:10]:
    print(f"{share:.0%}: {name}")


31%: material_size_laptop_compartment_is_waterproof
26%: brand_material_laptop_compartment_is_waterproof
26%: material_laptop_compartment_is_waterproof_style
23%: brand_laptop_compartment_is_waterproof_style
22%: brand_size_laptop_compartment_is_waterproof
22%: brand_material_size_is_waterproof
21%: brand_laptop_compartment_is_waterproof_color
20%: brand_material_size_laptop_compartment
20%: material_laptop_compartment_is_waterproof_color
20%: brand_material_is_waterproof_style


31%: material_size_laptop_compartment_is_waterproof
26%: brand_material_laptop_compartment_is_waterproof
26%: material_laptop_compartment_is_waterproof_style
23%: brand_laptop_compartment_is_waterproof_style
22%: brand_size_laptop_compartment_is_waterproof
22%: brand_material_size_is_waterproof
21%: brand_laptop_compartment_is_waterproof_color
20%: brand_material_size_laptop_compartment
20%: material_laptop_compartment_is_waterproof_color
20%: brand_material_is_waterproof_style

29%: material_laptop_compartment_is_waterproof_style
27%: material_size_laptop_compartment_is_waterproof
23%: material_laptop_compartment_is_waterproof_color
23%: brand_material_laptop_compartment_is_waterproof
22%: brand_material_size_is_waterproof
22%: material_laptop_compartment_style_color
21%: material_size_laptop_compartment_style
21%: laptop_compartment_is_waterproof_style_color
19%: brand_material_size_laptop_compartment
19%: material_size_is_waterproof_style

26%: material_size_laptop_compartment_is_waterproof
25%: material_laptop_compartment_is_waterproof_color
24%: size_laptop_compartment_is_waterproof_color
21%: brand_material_laptop_compartment_is_waterproof
21%: brand_material_size_is_waterproof
21%: material_laptop_compartment_is_waterproof_style
20%: material_size_is_waterproof_style
20%: material_size_laptop_compartment_style
19%: material_size_laptop_compartment_color
19%: size_is_waterproof_style_color

23%: brand_material_laptop_compartment_is_waterproof
23%: material_laptop_compartment_is_waterproof_color
22%: laptop_compartment_is_waterproof_style_color
22%: material_size_laptop_compartment_is_waterproof
21%: material_size_laptop_compartment_color
19%: size_laptop_compartment_is_waterproof_color
19%: material_laptop_compartment_is_waterproof_style
18%: material_size_is_waterproof_color
17%: material_size_is_waterproof_style
16%: brand_laptop_compartment_is_waterproof_color


In [55]:
# Top-10 outlier-row percentages per four-column combination, copied from four
# printed runs of the four-column analysis cell above. A combination is present
# in a run only if it made that run's top 10.
data_third = {
    "Test1": {
        "material_size_laptop_compartment_is_waterproof": 31,
        "brand_material_laptop_compartment_is_waterproof": 26,
        "material_laptop_compartment_is_waterproof_style": 26,
        "brand_laptop_compartment_is_waterproof_style": 23,
        "brand_size_laptop_compartment_is_waterproof": 22,
        "brand_material_size_is_waterproof": 22,
        "brand_laptop_compartment_is_waterproof_color": 21,
        "brand_material_size_laptop_compartment": 20,
        "material_laptop_compartment_is_waterproof_color": 20,
        "brand_material_is_waterproof_style": 20
    },
    "Test2": {
        "material_laptop_compartment_is_waterproof_style": 29,
        "material_size_laptop_compartment_is_waterproof": 27,
        "material_laptop_compartment_is_waterproof_color": 23,
        "brand_material_laptop_compartment_is_waterproof": 23,
        "brand_material_size_is_waterproof": 22,
        "material_laptop_compartment_style_color": 22,
        "material_size_laptop_compartment_style": 21,
        "laptop_compartment_is_waterproof_style_color": 21,
        "brand_material_size_laptop_compartment": 19,
        "material_size_is_waterproof_style": 19
    },
    "Test3": {
        "material_size_laptop_compartment_is_waterproof": 26,
        "material_laptop_compartment_is_waterproof_color": 25,
        "size_laptop_compartment_is_waterproof_color": 24,
        "brand_material_laptop_compartment_is_waterproof": 21,
        "brand_material_size_is_waterproof": 21,
        "material_laptop_compartment_is_waterproof_style": 21,
        "material_size_is_waterproof_style": 20,
        "material_size_laptop_compartment_style": 20,
        "material_size_laptop_compartment_color": 19,
        "size_is_waterproof_style_color": 19
    },
    "Test4": {
        "brand_material_laptop_compartment_is_waterproof": 23,
        "material_laptop_compartment_is_waterproof_color": 23,
        "laptop_compartment_is_waterproof_style_color": 22,
        "material_size_laptop_compartment_is_waterproof": 22,
        "material_size_laptop_compartment_color": 21,
        "size_laptop_compartment_is_waterproof_color": 19,
        "material_laptop_compartment_is_waterproof_style": 19,
        "material_size_is_waterproof_color": 18,
        "material_size_is_waterproof_style": 17,
        "brand_laptop_compartment_is_waterproof_color": 16
    }
}

# One row per combination, one column per run; combinations missing from a run are NaN.
df = pd.DataFrame(data_third)
# Row-wise statistics over the runs. NaNs are skipped, so the average and
# variance cover only the runs in which the combination reached the top 10 —
# 'Count' records how many runs that was (variance is NaN when Count is 1).
average = df.mean(axis=1).round(0)
variance = df.var(axis=1).round(0)
count = df.count(axis=1)
# `df` is rebound here from the raw per-run table to the summary table.
df = pd.DataFrame({'Average (%)': average, 'Variance': variance, 'Count': count}).sort_values(by='Average (%)', ascending=False)
df

Unnamed: 0,Average (%),Variance,Count
material_size_laptop_compartment_is_waterproof,26.0,14.0,4
material_laptop_compartment_is_waterproof_style,24.0,21.0,4
brand_laptop_compartment_is_waterproof_style,23.0,,1
material_laptop_compartment_is_waterproof_color,23.0,4.0,4
brand_material_laptop_compartment_is_waterproof,23.0,4.0,4
laptop_compartment_is_waterproof_style_color,22.0,0.0,2
brand_size_laptop_compartment_is_waterproof,22.0,,1
brand_material_size_is_waterproof,22.0,0.0,3
size_laptop_compartment_is_waterproof_color,22.0,12.0,2
material_laptop_compartment_style_color,22.0,,1


In [36]:
def find_significant_deviations_of_five(
        train_df, cols,
        num_bootstrap_samples=1000,
        confidence_level=0.95
    ):
    """
    Flag five-column category groups whose mean price is atypical.

    For each combination of five columns in `cols`, rows are grouped by the
    underscore-joined string values of those columns. Each group's mean price
    is compared against the population-based bootstrap confidence interval
    requested for a sample size equal to the group's row count floored to a
    multiple of 100.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain a 'price' column and every column named in `cols`.
        The frame is only read: no temporary column is added to it.
    cols : sequence of str
        Candidate columns; every 5-element combination is evaluated.
    num_bootstrap_samples : int, default 1000
        Forwarded unchanged to `get_population_ci_for_size`.
    confidence_level : float, default 0.95
        Forwarded unchanged to `get_population_ci_for_size`.

    Returns
    -------
    dict
        Maps "<col1>_<col2>_<col3>_<col4>_<col5>" to a DataFrame holding only
        the groups whose mean lies outside the interval. Its columns are the
        combined key (named like the dict key), 'count', 'mean', 'low_ci',
        'high_ci' and 'is_outside_ci'.

    Notes
    -----
    Groups with fewer than 100 priced rows get a floored size of 0. If the CI
    helper returns NaN bounds for such a size (as `population_bootstrap_ci`
    does for n < 2), those groups are never flagged, because every comparison
    against NaN is False.
    """
    # Entire population of prices (as a NumPy array) that the CIs are drawn from.
    population_prices = train_df['price'].values

    # Outlier groups for each five-column combination, keyed by combined name.
    results_outside_ci = {}

    # Memoizes population CIs by (floored) group size across all combinations.
    ci_cache = {}

    # Iterate over all combinations of five columns.
    for combo in itertools.combinations(cols, 5):
        combined_col_name = "_".join(map(str, combo))

        # Build the grouping key as a standalone Series rather than writing a
        # temporary column into the caller's frame: train_df is left untouched
        # even if an exception is raised, and an existing column that happens
        # to share the combined name cannot be overwritten and dropped.
        group_key = train_df[combo[0]].astype(str)
        for col in combo[1:]:
            group_key = group_key + "_" + train_df[col].astype(str)
        group_key = group_key.rename(combined_col_name)

        # Per-group number of non-null prices and mean price; the key Series'
        # name becomes the first column after reset_index().
        grouped = (
            train_df.groupby(group_key)['price']
            .agg(['count', 'mean'])
            .reset_index()
        )

        # Look up the population-based CI for every group, in group order.
        low_bounds = []
        high_bounds = []
        for group_count in grouped['count']:
            # Floor to a multiple of 100 so groups of similar size share one
            # cached CI (this truncates; it does not round to the nearest 100).
            group_size = int(group_count) // 100 * 100

            low_ci, high_ci = get_population_ci_for_size(
                population_prices,
                group_size,
                ci_cache,
                num_bootstrap_samples=num_bootstrap_samples,
                confidence_level=confidence_level
            )
            low_bounds.append(low_ci)
            high_bounds.append(high_ci)

        grouped['low_ci'] = low_bounds
        grouped['high_ci'] = high_bounds
        # A mean is an outlier when it lies strictly below or above the CI;
        # NaN bounds compare False on both sides, so such groups are kept out.
        grouped['is_outside_ci'] = (
            (grouped['mean'] < grouped['low_ci'])
            | (grouped['mean'] > grouped['high_ci'])
        )

        # Keep only the groups that fall outside the confidence interval.
        results_outside_ci[combined_col_name] = grouped[grouped['is_outside_ci']]

    return results_outside_ci

# List of columns to consider; every five-column combination is scanned.
cols = ['brand', 'material', 'size', 'laptop_compartment', 'is_waterproof', 'style', 'color']

# Run the scan over all five-column combinations.
results = find_significant_deviations_of_five(train_df, cols, num_bootstrap_samples=1000, confidence_level=0.95)

# For each combination, the share of rows that sit in flagged groups,
# ranked from largest to smallest (ties fall back to the name, descending).
# NOTE(review): `n` is the row count set in the data-loading cell; confirm it
# still equals len(train_df) after prepare_data.
summary = sorted(
    ((flagged['count'].sum() / n, combo_name) for combo_name, flagged in results.items()),
    reverse=True,
)

# Slice rather than index range(10) so that fewer than ten combinations
# cannot raise an IndexError.
for combo_share, combo_name in summary[:10]:
    print(f"{combo_share:.0%}: {combo_name}")

14%: material_size_laptop_compartment_is_waterproof_style
11%: material_laptop_compartment_is_waterproof_style_color
11%: brand_material_size_laptop_compartment_is_waterproof
10%: brand_material_laptop_compartment_is_waterproof_style
9%: material_size_laptop_compartment_is_waterproof_color
9%: brand_material_laptop_compartment_is_waterproof_color
9%: brand_size_laptop_compartment_is_waterproof_style
8%: brand_material_size_laptop_compartment_style
8%: brand_material_size_is_waterproof_style
8%: size_laptop_compartment_is_waterproof_style_color


14%: material_size_laptop_compartment_is_waterproof_style
11%: material_laptop_compartment_is_waterproof_style_color
11%: brand_material_size_laptop_compartment_is_waterproof
10%: brand_material_laptop_compartment_is_waterproof_style
9%: material_size_laptop_compartment_is_waterproof_color
9%: brand_material_laptop_compartment_is_waterproof_color
9%: brand_size_laptop_compartment_is_waterproof_style
8%: brand_material_size_laptop_compartment_style
8%: brand_material_size_is_waterproof_style
8%: size_laptop_compartment_is_waterproof_style_color

13%: material_size_laptop_compartment_is_waterproof_style
12%: material_size_laptop_compartment_is_waterproof_color
11%: material_laptop_compartment_is_waterproof_style_color
10%: brand_material_size_laptop_compartment_is_waterproof
10%: brand_material_laptop_compartment_is_waterproof_style
10%: size_laptop_compartment_is_waterproof_style_color
10%: brand_size_laptop_compartment_is_waterproof_color
9%: material_size_laptop_compartment_style_color
8%: brand_laptop_compartment_is_waterproof_style_color
8%: material_size_is_waterproof_style_color

15%: material_laptop_compartment_is_waterproof_style_color
14%: material_size_laptop_compartment_is_waterproof_style
12%: material_size_laptop_compartment_is_waterproof_color
12%: brand_material_size_laptop_compartment_is_waterproof
9%: brand_material_laptop_compartment_is_waterproof_style
9%: material_size_laptop_compartment_style_color
8%: brand_material_laptop_compartment_is_waterproof_color
8%: brand_material_size_is_waterproof_style
8%: brand_material_size_laptop_compartment_style
8%: material_size_is_waterproof_style_color

11%: material_laptop_compartment_is_waterproof_style_color
10%: material_size_laptop_compartment_is_waterproof_color
10%: material_size_laptop_compartment_is_waterproof_style
8%: size_laptop_compartment_is_waterproof_style_color
8%: brand_material_laptop_compartment_is_waterproof_style
8%: brand_material_laptop_compartment_is_waterproof_color
7%: material_size_laptop_compartment_style_color
7%: brand_size_laptop_compartment_is_waterproof_style
7%: material_size_is_waterproof_style_color
7%: brand_material_size_laptop_compartment_is_waterproof

In [56]:
# Top-10 five-column combinations recorded from four separate runs of the scan.
# Each value is the percentage printed for that combination in that run.
data_fourth = {
    "Test1": {
        "material_size_laptop_compartment_is_waterproof_style": 14,
        "material_laptop_compartment_is_waterproof_style_color": 11,
        "brand_material_size_laptop_compartment_is_waterproof": 11,
        "brand_material_laptop_compartment_is_waterproof_style": 10,
        "material_size_laptop_compartment_is_waterproof_color": 9,
        "brand_material_laptop_compartment_is_waterproof_color": 9,
        "brand_size_laptop_compartment_is_waterproof_style": 9,
        "brand_material_size_laptop_compartment_style": 8,
        "brand_material_size_is_waterproof_style": 8,
        "size_laptop_compartment_is_waterproof_style_color": 8,
    },
    "Test2": {
        "material_size_laptop_compartment_is_waterproof_style": 13,
        "material_size_laptop_compartment_is_waterproof_color": 12,
        "material_laptop_compartment_is_waterproof_style_color": 11,
        "brand_material_size_laptop_compartment_is_waterproof": 10,
        "brand_material_laptop_compartment_is_waterproof_style": 10,
        "size_laptop_compartment_is_waterproof_style_color": 10,
        "brand_size_laptop_compartment_is_waterproof_color": 10,
        "material_size_laptop_compartment_style_color": 9,
        "brand_laptop_compartment_is_waterproof_style_color": 8,
        "material_size_is_waterproof_style_color": 8,
    },
    "Test3": {
        "material_laptop_compartment_is_waterproof_style_color": 15,
        "material_size_laptop_compartment_is_waterproof_style": 14,
        "material_size_laptop_compartment_is_waterproof_color": 12,
        "brand_material_size_laptop_compartment_is_waterproof": 12,
        "brand_material_laptop_compartment_is_waterproof_style": 9,
        "material_size_laptop_compartment_style_color": 9,
        "brand_material_laptop_compartment_is_waterproof_color": 8,
        "brand_material_size_is_waterproof_style": 8,
        "brand_material_size_laptop_compartment_style": 8,
        "material_size_is_waterproof_style_color": 8,
    },
    "Test4": {
        "material_laptop_compartment_is_waterproof_style_color": 11,
        "material_size_laptop_compartment_is_waterproof_color": 10,
        "material_size_laptop_compartment_is_waterproof_style": 10,
        "size_laptop_compartment_is_waterproof_style_color": 8,
        "brand_material_laptop_compartment_is_waterproof_style": 8,
        "brand_material_laptop_compartment_is_waterproof_color": 8,
        "material_size_laptop_compartment_style_color": 7,
        "brand_size_laptop_compartment_is_waterproof_style": 7,
        "material_size_is_waterproof_style_color": 7,
        "brand_material_size_laptop_compartment_is_waterproof": 7,
    },
}

# One row per feature combination, one column per test run; a combination
# that did not make a run's top 10 is NaN in that run's column.
runs_fourth = pd.DataFrame(data_fourth)

# Row-wise statistics across the runs, rounded to whole numbers for display.
average = runs_fourth.mean(axis='columns').round(0)
variance = runs_fourth.var(axis='columns').round(0)
# Number of runs in which each combination was listed.
count = runs_fourth.count(axis='columns')

# Assemble the summary table and rank combinations by their mean share.
df = pd.concat(
    [
        average.rename('Average (%)'),
        variance.rename('Variance'),
        count.rename('Count'),
    ],
    axis='columns',
).sort_values(by='Average (%)', ascending=False)
df

Unnamed: 0,Average (%),Variance,Count
material_size_laptop_compartment_is_waterproof_style,13.0,4.0,4
material_laptop_compartment_is_waterproof_style_color,12.0,4.0,4
material_size_laptop_compartment_is_waterproof_color,11.0,2.0,4
brand_material_size_laptop_compartment_is_waterproof,10.0,5.0,4
brand_size_laptop_compartment_is_waterproof_color,10.0,,1
brand_material_laptop_compartment_is_waterproof_style,9.0,1.0,4
size_laptop_compartment_is_waterproof_style_color,9.0,1.0,3
brand_material_laptop_compartment_is_waterproof_color,8.0,0.0,3
brand_size_laptop_compartment_is_waterproof_style,8.0,2.0,2
brand_material_size_laptop_compartment_style,8.0,0.0,2
