In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split,cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA

In [2]:
# Read the dataset from "data.csv"; the path is relative, so it is resolved
# against the notebook's current working directory.
df = pd.read_csv(filepath_or_buffer="data.csv")

In [3]:
# Response column and the candidate predictor columns.
target = "Liking"
candidate_cols = (
    "Dose", "Grind", "Brew Mass", "Percent Extraction", "pH", "Volume",
    "Brew Temperature", "Pour Temp", "90Sec Temp",
    "Flavor.intensity", "Acidity", "Mouthfeel",
    "Fruit", "Bitter", "Astringent", "Sour", "Sweet",
)
# Keep only the candidates that actually exist in the loaded frame.
feature_cols = [col for col in candidate_cols if col in df.columns]

# Coerce everything to numeric; values that cannot be parsed become NaN.
X = df.loc[:, feature_cols].apply(pd.to_numeric, errors="coerce")
y = pd.to_numeric(df[target].copy(), errors="coerce")

# Add a quadratic term for flavor intensity when the base column is present.
if "Flavor.intensity" in X.columns:
    X["Flavor.intensity_sq"] = X["Flavor.intensity"].pow(2)
feature_cols = list(X.columns)

# Missing-value counts: per predictor column first, then for the target.
for missing_count in (X.isna().sum(), y.isna().sum()):
    print(missing_count)

Dose                   0
Grind                  0
Brew Mass              0
Percent Extraction     0
pH                     0
Volume                 0
Brew Temperature       0
Pour Temp              0
90Sec Temp             0
Flavor.intensity       0
Acidity                0
Mouthfeel              0
Fruit                  0
Bitter                 0
Astringent             0
Sour                   0
Sweet                  0
Flavor.intensity_sq    0
dtype: int64
0


上の出力のとおり、数値変換後も全特徴量と目的変数（Liking）に NaN は 1 件もないので、欠損値を補う必要はない。

In [22]:
# Continuous predictors: median-imputed, Yeo-Johnson transformed, standardized.
# "Pour Temp" is not listed in any group, so remainder="drop" discards it; that
# is also what lets this pipeline run on Xlow / Xhigh, which lack that column.
cont_cols = [
    "Dose", "Grind", "Brew Mass", "Percent Extraction", "pH", "Volume",
    "Brew Temperature", "90Sec Temp",
    "Flavor.intensity", "Acidity", "Mouthfeel",
]

# Columns the notebook treats as binary flags (presumably 0/1 -- TODO confirm):
# imputed only, no scaling.
bin_cols = ["Fruit", "Bitter", "Astringent", "Sour", "Sweet"]

# The squared flavor-intensity term is continuous, not binary. It used to be
# listed in bin_cols and therefore reached the model on its raw scale, which
# made its coefficient incomparable with the standardized predictors. It now
# gets its own impute + standardize branch. For OLS with an intercept this
# affine rescaling does not change predictions, so R^2 is unaffected.
sq_cols = ["Flavor.intensity_sq"]

preprocess = ColumnTransformer(
    transformers=[
        ("cont", Pipeline([
            ("imp", SimpleImputer(strategy="median")),
            ("power", PowerTransformer(method="yeo-johnson")),
            ("scaler", StandardScaler()),
        ]), cont_cols),

        ("bin", Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            # Binary columns are passed through otherwise (no scaling needed).
        ]), bin_cols),

        # Kept last so the transformed column order stays
        # cont_cols + bin_cols + ["Flavor.intensity_sq"], exactly as before.
        ("sq", Pipeline([
            ("imp", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]), sq_cols),
    ],
    remainder="drop",
)

# Model 1: preprocessing followed by ordinary least squares.
pipe1 = Pipeline([
    ("pre", preprocess),
    ("model", LinearRegression()),
])

# Shuffled 5-fold CV with a fixed seed so the fold assignment is reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=0)

scores = cross_val_score(
    pipe1,
    X,
    y,
    cv=cv,
    scoring="r2",
)
scores

array([0.30402513, 0.29619093, 0.35421649, 0.31909687, 0.36682975])

In [23]:
# Baseline for comparison with pipe1: ordinary least squares fitted directly on
# the feature matrix, with no preprocessing step.
pipe2 = Pipeline(steps=[("model", LinearRegression())])

# Same fold configuration as used for pipe1 (5 shuffled folds, seed 0).
cv = KFold(n_splits=5, shuffle=True, random_state=0)

scores = cross_val_score(estimator=pipe2, X=X, y=y, scoring="r2", cv=cv)
scores

array([0.30226895, 0.29671246, 0.35352076, 0.31596678, 0.36753599])

Pour Temp を中央値で高 / 低の 2 群に分け、群ごとに交差検証の R² と回帰係数を比較しよう。

In [24]:
# Split the rows at the median pour temperature: values at or above the median
# form the "high" group (1), everything else the "low" group (0).
median_PourTemp = df["Pour Temp"].median()
is_high_pour = df["Pour Temp"] >= median_PourTemp

# Store the group label on df first so both subsets carry the column too.
df["PourTempGroup"] = is_high_pour.astype(int)

low_temp = df.loc[~is_high_pour]
high_temp = df.loc[is_high_pour]

# Group sizes as (low, high).
(low_temp.shape[0], high_temp.shape[0])

(1548, 1638)

In [25]:
# Predictors for the per-group models: the earlier list without "Pour Temp",
# which is the variable the groups were split on.
feature_cols2 = [
    "Dose", "Grind", "Brew Mass", "Percent Extraction", "pH", "Volume",
    "Brew Temperature", "90Sec Temp",
    "Flavor.intensity", "Acidity", "Mouthfeel",
    "Fruit", "Bitter", "Astringent", "Sour", "Sweet",
]


def build_group_features(group_df):
    """Return a copy of the predictor columns of ``group_df``.

    A squared ``Flavor.intensity`` column is appended when the base column
    is present.
    """
    features = group_df[feature_cols2].copy()
    if "Flavor.intensity" in features.columns:
        features["Flavor.intensity_sq"] = features["Flavor.intensity"] ** 2
    return features


# Low pour-temperature group.
Xlow = build_group_features(low_temp)
ylow = low_temp[target]

# High pour-temperature group.
Xhigh = build_group_features(high_temp)
yhigh = high_temp[target]

# Both design matrices share the same columns; keep the list in feature_cols.
feature_cols = Xhigh.columns.tolist()

In [31]:
# Cross-validated R^2 of the preprocessing + OLS pipeline, low pour-temp group.
scores = cross_val_score(
    estimator=pipe1,
    X=Xlow,
    y=ylow,
    scoring="r2",
    cv=cv,
)
scores

array([0.29917925, 0.25626805, 0.25170738, 0.36879202, 0.34545901])

In [27]:
# Cross-validated R^2 of the plain OLS baseline, low pour-temp group.
scores = cross_val_score(estimator=pipe2, X=Xlow, y=ylow, scoring="r2", cv=cv)
scores

array([0.29832226, 0.25767268, 0.24738166, 0.36488021, 0.34439482])

In [28]:
# Cross-validated R^2 of the preprocessing + OLS pipeline, high pour-temp group.
scores = cross_val_score(estimator=pipe1, X=Xhigh, y=yhigh, scoring="r2", cv=cv)
scores

array([0.36050097, 0.29974221, 0.40804457, 0.39353169, 0.31851054])

In [29]:
# Cross-validated R^2 of the plain OLS baseline, high pour-temp group.
scores = cross_val_score(estimator=pipe2, X=Xhigh, y=yhigh, scoring="r2", cv=cv)
scores

array([0.36096915, 0.30048006, 0.40780249, 0.40085673, 0.32358249])

In [30]:
def fit_coefficients(pipeline, X_group, y_group, name):
    """Fit ``pipeline`` on one group and return its labelled coefficients.

    The pipeline is fitted in place. Labels come from the fitted "pre" step,
    not from ``X_group.columns``: the ColumnTransformer emits columns in
    transformer order, which need not match the input frame's column order.

    Parameters
    ----------
    pipeline : Pipeline with a "pre" transformer step and a linear "model" step.
    X_group, y_group : predictors and target for the group.
    name : name given to the returned Series.

    Returns
    -------
    pandas.Series of coefficients, sorted by descending absolute value.
    """
    pipeline.fit(X_group, y_group)
    raw_names = pipeline.named_steps["pre"].get_feature_names_out()
    # Drop the "<transformer>__" prefix so labels are the plain column names.
    labels = [str(raw).split("__", 1)[-1] for raw in raw_names]
    coefs = pd.Series(
        pipeline.named_steps["model"].coef_,
        index=labels,
        name=name,
    )
    return coefs.sort_values(key=abs, ascending=False)


coef_low = fit_coefficients(pipe1, Xlow, ylow, "coef_low")
coef_high = fit_coefficients(pipe1, Xhigh, yhigh, "coef_high")
# As before, pipe1 (and therefore `model`) ends up fitted on the high group.
model = pipe1.named_steps["model"]

# Align the two coefficient vectors by feature name and take the difference.
coef_df = pd.concat([coef_low, coef_high], axis=1)
coef_df["diff_high_low"] = coef_df["coef_high"] - coef_df["coef_low"]

# Largest between-group differences first.
coef_df.sort_values("diff_high_low", key=abs, ascending=False)


Unnamed: 0,coef_low,coef_high,diff_high_low
Dose,0.497478,-0.65514,-1.152618
Volume,-0.38828,0.413224,0.801504
Percent Extraction,0.183354,-0.305914,-0.489269
Flavor.intensity,3.534808,3.814069,0.279261
Fruit,0.53652,0.337986,-0.198535
Mouthfeel,0.024799,0.181343,0.156544
Grind,-0.070681,0.07121,0.141891
Acidity,-0.132689,0.007738,0.140427
Brew Mass,-0.062647,-0.195091,-0.132445
90Sec Temp,0.092673,-0.028019,-0.120692
