In [6]:
import numpy as np
import pandas as pd
from itertools import combinations
from parse import load_data

# Load the data set used by the rest of the notebook.
df = load_data("data.csv")

# Columns already encoded as 0/1, and the one continuous attribute.
binary_cols = ["Bitter", "Fruit", "Astringent"]
cont_col = "Flavor.intensity"

# Median split of the continuous attribute:
# 1 = at or above the median, 0 = below it.
# NOTE(review): a missing intensity compares False against the median and is
# therefore labelled 0 — confirm load_data leaves no NaN in this column.
median_flavor = df[cont_col].median()
df["FlavorGroup"] = np.where(df[cont_col] >= median_flavor, 1, 0)

# Derive the feature list from binary_cols instead of re-typing the names,
# so the two lists cannot drift apart.
features = binary_cols + ["FlavorGroup"]

# One summary row per (feature pair, value pair) that has at least one sample.
rows = []

for f1, f2 in combinations(features, 2):
    for v1 in [0, 1]:  # Low / High
        for v2 in [0, 1]:
            group_df = df[(df[f1] == v1) & (df[f2] == v2)]

            # Skip empty combinations: they have no median or variance.
            if len(group_df) > 0:
                liking = group_df["Liking"]
                # Series.var() is the sample variance (ddof=1); a group with a
                # single row therefore yields NaN.
                rows.append([f1, v1, f2, v2, len(group_df),
                             liking.median(), liking.var()])

result = pd.DataFrame(rows,
                      columns=["Feature1", "Value1", "Feature2", "Value2",
                               "N_samples", "Median_Liking", "Var_Liking"])

# to_string() prints every row and column on one line each; a plain
# print(result) wraps the wide table and splits Var_Liking into a second block.
print(result.to_string())


      Feature1  Value1     Feature2  Value2  N_samples  Median_Liking  \
0       Bitter       0        Fruit       0       1768            6.0   
1       Bitter       0        Fruit       1        415            7.0   
2       Bitter       1        Fruit       0        916            6.0   
3       Bitter       1        Fruit       1         87            6.0   
4       Bitter       0   Astringent       0       1975            6.0   
5       Bitter       0   Astringent       1        208            5.0   
6       Bitter       1   Astringent       0        850            6.0   
7       Bitter       1   Astringent       1        153            4.0   
8       Bitter       0  FlavorGroup       0        784            6.0   
9       Bitter       0  FlavorGroup       1       1399            7.0   
10      Bitter       1  FlavorGroup       0        209            5.0   
11      Bitter       1  FlavorGroup       1        794            6.0   
12       Fruit       0   Astringent       0       2

When Fruit = 1 and FlavorGroup = 0 (flavor intensity below the median),
the median liking score is high and the variance is small,
indicating a taste that is widely preferred and whose ratings are stable.

High Astringent samples tend to show larger variance,
suggesting that their liking is polarized and people are divided on them.

There are also cases that are “uniformly disliked.”
For example, samples with Bitter=1 and Astringent=1 have a low median liking (around 4)
while the variance is relatively small, meaning that most people dislike them.

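Among all attributes, bitterness appears to be the factor that most strongly influences consumer liking; note, however, that the significance and effect-size check in the next cell covers Bitter only (Cohen's d ≈ 0.34), so its ranking against the other attributes has not been tested directly.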

In [5]:
from scipy.stats import ttest_ind
import numpy as np

# Split Liking scores by the Bitter flag: low = Bitter == 0, high = Bitter == 1.
low = df[df["Bitter"] == 0]["Liking"]
high = df[df["Bitter"] == 1]["Liking"]

# Welch's t-test (equal_var=False) does not assume equal group variances.
stat, p = ttest_ind(low, high, equal_var=False)
# General format: a very small p-value is shown in scientific notation
# instead of being rounded to a misleading 0.0000.
print(f"Welch t-test: p = {p:.3g}")

mean_low = np.mean(low)
mean_high = np.mean(high)

# Cohen's d with the pooled standard deviation. The (n - 1) weights require
# the unbiased sample variances, so ddof=1 is passed explicitly; np.var()
# defaults to ddof=0 (population variance), which understates the pooled SD.
n_low, n_high = len(low), len(high)
sd_pooled = np.sqrt(((n_low - 1) * low.var(ddof=1) + (n_high - 1) * high.var(ddof=1)) /
                    (n_low + n_high - 2))
d = (mean_low - mean_high) / sd_pooled
print(f"Cohen's d = {d:.3f}")
print(f"N_low = {n_low}, N_high = {n_high}")

Welch t-test: p = 0.0000
Cohen's d = 0.338
N_low = 2183, N_high = 1003
