to do:
Opis preprocessingu danych – obsługa brakujących wartości, kodowanie zmiennych kategorycznych, skalowanie i wagi, inżynieria cech

# Load preprocessed data


In [1]:
import pandas as pd
from config import (
    CSV_FILEPATH_EXPLORATION_PREPROCESSED_DATA,
    CSV_FILEPATH_MODEL_READY_DATA_POINTS,
    CSV_FILEPATH_MODEL_READY_DATA_PRICE,
    CSV_FILEPATH_MODEL_READY_DATA_POINTS_FILLED,
    CSV_FILEPATH_MODEL_READY_DATA_PRICE_FILLED,
)

# Preprocessed dataset created in data_exploration.ipynb
df = pd.read_csv(CSV_FILEPATH_EXPLORATION_PREPROCESSED_DATA)

# Keep raw and filled variants
DF_RAW = df.copy()
DF_FILLED = df.copy()


## Missing values: vintage/price


In [2]:
# Add missing indicators and fill with median for vintage and price
for col in ["vintage", "price"]:
    if col in DF_FILLED.columns:
        DF_FILLED[f"{col}_missing"] = DF_FILLED[col].isna().astype(int)
        DF_FILLED[col] = DF_FILLED[col].fillna(DF_FILLED[col].median())


# Categorical encoding

In [3]:
df.describe(include="object").T.sort_values("unique", ascending=False)

Unnamed: 0,count,unique,top,freq
description,129971,119955,"Seductively tart in lemon pith, cranberry and ...",3
designation,92506,37979,Reserve,2009
winery,129971,16757,Wines & Winemakers,222
region_1,108724,1229,Napa Valley,4480
variety,129970,707,Pinot Noir,13272
province,129908,425,California,36247
country,129908,43,US,54504
taster_name,103727,19,Roger Voss,25514


In [4]:
# Columns and target
categorical_cols = [
    "designation",
    "winery",
    "region_1",
    "variety",
    "province",
    "country",
    "taster_name",
]

exclude_cols = ["description"]

TARGET_COL_POINTS = "points"
TARGET_COL_PRICE = "price"
MISSING_TOKEN = "__MISSING__"
TE_COLS = ["winery", "region_1", "variety", "province"]
ONE_HOT_COLS = ["country", "taster_name"]


def _fill_missing(series):
    return series.astype("object").fillna(MISSING_TOKEN)


## Frequency encoding
- designation

In [5]:
def build_frequency_encoding(df):
    designation_filled = _fill_missing(df["designation"])
    designation_counts = designation_filled.value_counts(dropna=False)
    designation_freq = designation_filled.map(designation_counts / len(designation_filled))

    # Target-independent encoders
    encoded_base = pd.DataFrame(index=df.index)
    encoded_base["designation_fe"] = designation_freq.astype(float)
    return encoded_base


## Target encoding with CV + smoothing
- winery, 
- region_1, 
- variety, 
- province

In [6]:
from sklearn.model_selection import KFold
from config import RANDOM_STATE


def target_encode_cv(df, col, y, n_splits=5, smoothing=10, random_state=RANDOM_STATE):
    col_filled = _fill_missing(df[col])
    global_mean = y.mean()
    encoded = pd.Series(index=df.index, dtype=float)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_idx, val_idx in kf.split(df):
        train_col = col_filled.iloc[train_idx]
        train_y = y.iloc[train_idx]

        stats = train_y.groupby(train_col).agg(["mean", "count"])
        smooth = (stats["mean"] * stats["count"] + global_mean * smoothing) / (stats["count"] + smoothing)

        val_col = col_filled.iloc[val_idx]
        encoded.iloc[val_idx] = val_col.map(smooth).fillna(global_mean).values

    # Fit full mapping for inference
    stats_full = y.groupby(col_filled).agg(["mean", "count"])
    smooth_full = (stats_full["mean"] * stats_full["count"] + global_mean * smoothing) / (stats_full["count"] + smoothing)

    return encoded, smooth_full, global_mean


def build_target_encodings(df, target_col, te_cols, n_splits=5, smoothing=10):
    y = df[target_col]
    encoded_te = pd.DataFrame(index=df.index)
    te_mappings = {}

    for col in te_cols:
        te_series, te_map, te_global = target_encode_cv(
            df, col, y, n_splits=n_splits, smoothing=smoothing
        )
        encoded_te[f"{col}_te"] = te_series
        te_mappings[col] = {"map": te_map, "global_mean": te_global}

    return encoded_te, te_mappings


## One-hot encoding
- country, 
- taster_name

In [7]:
# One-hot encoding for low-cardinality

def build_one_hot(df):
    return pd.get_dummies(
        df[ONE_HOT_COLS].apply(_fill_missing),
        prefix=ONE_HOT_COLS,
        prefix_sep="=",
        dtype=int,
    )


## Build final dataset

In [8]:
def build_model_ready_dataset(df, target_col):
    numerical_cols = df.select_dtypes(include=["number"]).columns.tolist()

    if target_col in numerical_cols:
        numerical_cols.remove(target_col)

    encoded_base = build_frequency_encoding(df)
    one_hot = build_one_hot(df)
    encoded_te, te_mappings = build_target_encodings(df, target_col, TE_COLS)

    X = pd.concat([
        df[numerical_cols],
        encoded_base,
        encoded_te,
        one_hot,
    ], axis=1)

    y = df[target_col]
    model_df = pd.concat([X, y], axis=1)
    return model_df, te_mappings


model_df_points_raw, te_mappings_points_raw = build_model_ready_dataset(DF_RAW, TARGET_COL_POINTS)
model_df_price_raw, te_mappings_price_raw = build_model_ready_dataset(DF_RAW, TARGET_COL_PRICE)

model_df_points_filled, te_mappings_points_filled = build_model_ready_dataset(DF_FILLED, TARGET_COL_POINTS)
model_df_price_filled, te_mappings_price_filled = build_model_ready_dataset(DF_FILLED, TARGET_COL_PRICE)

model_df_points_raw.shape, model_df_points_filled.shape
model_df_points = model_df_points_raw
model_df_price = model_df_price_raw



In [9]:
model_df_points_raw.columns


Index(['price', 'vintage', 'designation_fe', 'winery_te', 'region_1_te',
       'variety_te', 'province_te', 'country=Argentina', 'country=Armenia',
       'country=Australia', 'country=Austria',
       'country=Bosnia and Herzegovina', 'country=Brazil', 'country=Bulgaria',
       'country=Canada', 'country=Chile', 'country=China', 'country=Croatia',
       'country=Cyprus', 'country=Czech Republic', 'country=Egypt',
       'country=England', 'country=France', 'country=Georgia',
       'country=Germany', 'country=Greece', 'country=Hungary', 'country=India',
       'country=Israel', 'country=Italy', 'country=Lebanon',
       'country=Luxembourg', 'country=Macedonia', 'country=Mexico',
       'country=Moldova', 'country=Morocco', 'country=New Zealand',
       'country=Peru', 'country=Portugal', 'country=Romania', 'country=Serbia',
       'country=Slovakia', 'country=Slovenia', 'country=South Africa',
       'country=Spain', 'country=Switzerland', 'country=Turkey', 'country=US',
       'c

In [10]:
model_df_points_raw.head()

model_df_points_filled.head()


Unnamed: 0,price,vintage,vintage_missing,price_missing,designation_fe,winery_te,region_1_te,variety_te,province_te,country=Argentina,...,taster_name=Matt Kettmann,taster_name=Michael Schachner,taster_name=Mike DeSimone,taster_name=Paul Gregutt,taster_name=Roger Voss,taster_name=Sean P. Sullivan,taster_name=Susan Kostrzewa,taster_name=Virginie Boone,taster_name=__MISSING__,points
0,25.0,2013.0,0,1,8e-06,88.098092,89.698917,87.362338,88.119743,0,...,0,0,0,0,0,0,0,0,0,87
1,15.0,2011.0,0,0,1.5e-05,87.904461,88.240848,88.842104,89.150171,0,...,0,0,0,0,1,0,0,0,0,87
2,14.0,2013.0,0,0,0.288257,88.15779,89.057677,88.497045,89.036889,0,...,0,0,0,1,0,0,0,0,0,87
3,13.0,2013.0,0,0,6.2e-05,87.026188,86.215713,89.441728,86.254714,0,...,0,0,0,0,0,0,0,0,0,87
4,65.0,2012.0,0,0,8e-06,87.556592,89.109913,89.403383,89.042131,0,...,0,0,0,1,0,0,0,0,0,87


In [11]:
model_df_price_raw.head()

model_df_price_filled.head()


Unnamed: 0,points,vintage,vintage_missing,price_missing,designation_fe,winery_te,region_1_te,variety_te,province_te,country=Argentina,...,taster_name=Matt Kettmann,taster_name=Michael Schachner,taster_name=Mike DeSimone,taster_name=Paul Gregutt,taster_name=Roger Voss,taster_name=Sean P. Sullivan,taster_name=Susan Kostrzewa,taster_name=Virginie Boone,taster_name=__MISSING__,price
0,87,2013.0,0,1,8e-06,29.297388,36.485596,23.328786,27.616244,0,...,0,0,0,0,0,0,0,0,0,25.0
1,87,2011.0,0,0,1.5e-05,30.403802,26.94726,24.940933,29.076308,0,...,0,0,0,0,1,0,0,0,0,15.0
2,87,2013.0,0,0,0.288257,24.520947,35.167639,23.111776,36.532145,0,...,0,0,0,1,0,0,0,0,0,14.0
3,87,2013.0,0,0,6.2e-05,26.470046,44.982028,31.417698,32.864608,0,...,0,0,0,0,0,0,0,0,0,13.0
4,87,2012.0,0,0,8e-06,29.67157,35.070082,46.358587,36.548709,0,...,0,0,0,1,0,0,0,0,0,65.0


# Save model-ready dataset


In [None]:
# Save model-ready dataset
model_df_points_raw.to_csv(CSV_FILEPATH_MODEL_READY_DATA_POINTS, index=False)
model_df_price_raw.to_csv(CSV_FILEPATH_MODEL_READY_DATA_PRICE, index=False)

model_df_points_filled.to_csv(CSV_FILEPATH_MODEL_READY_DATA_POINTS_FILLED, index=False)
model_df_price_filled.to_csv(CSV_FILEPATH_MODEL_READY_DATA_PRICE_FILLED, index=False)