**TODO:** Opis preprocessingu danych – obsługa brakujących wartości, kodowanie zmiennych kategorycznych, skalowanie i wagi, inżynieria cech.

# Load preprocessed data


In [None]:
import pandas as pd
from config import CSV_FILEPATH_EXPLORATION_PREPROCESSED_DATA, CSV_FILEPATH_MODEL_READY_DATA

# Load the preprocessed dataset created in data_exploration.ipynb; the file
# path is the constant imported from the project's config module.
df = pd.read_csv(CSV_FILEPATH_EXPLORATION_PREPROCESSED_DATA)


# Categorical encoding

In [22]:
# Summarise the text/categorical columns, highest cardinality first, to help
# choose an encoding strategy per column.
(
    df.describe(include="object")
    .T
    .sort_values("unique", ascending=False)
)

Unnamed: 0,count,unique,top,freq
description,129971,119955,"Seductively tart in lemon pith, cranberry and ...",3
designation,92506,37979,Reserve,2009
winery,129971,16757,Wines & Winemakers,222
region_1,108724,1229,Napa Valley,4480
variety,129970,707,Pinot Noir,13272
province,129908,425,California,36247
country,129908,43,US,54504
taster_name,103727,19,Roger Voss,25514


In [23]:
# Columns and target
# Categorical feature columns, listed in the same descending-cardinality order
# as the summary table above.
# NOTE(review): neither list below is referenced by the later cells visible in
# this notebook - confirm whether they are still needed.
categorical_cols = [
    "designation",
    "winery",
    "region_1",
    "variety",
    "province",
    "country",
    "taster_name",
]

# Free-text column; none of the encoders below are applied to it.
exclude_cols = ["description"]

# Name of the target column in ``df``.
TARGET_COL = "points"
# Placeholder category that stands in for missing categorical values.
MISSING_TOKEN = "__MISSING__"


def _fill_missing(series):
    """Cast ``series`` to object dtype and replace missing entries with ``MISSING_TOKEN``.

    Returns a new Series; the input is not modified.
    """
    as_object = series.astype("object")
    return as_object.fillna(MISSING_TOKEN)

## Frequency encoding
- designation

In [24]:
# Frequency encoding: each designation is replaced by the share of rows that
# carry it. Missing designations are first mapped to MISSING_TOKEN, so they
# form a category of their own.
designation_filled = _fill_missing(df["designation"])
designation_counts = designation_filled.value_counts(dropna=False)
designation_share = designation_counts / len(designation_filled)
designation_freq = designation_filled.map(designation_share)

# Container for all encoded categorical features; it shares its index with ``df``.
encoded = designation_freq.astype(float).to_frame("designation_fe")

## Target encoding with CV + smoothing
- winery
- region_1
- variety
- province

In [25]:
from sklearn.model_selection import KFold
from config import *

def target_encode_cv(df, col, y, n_splits=5, smoothing=10, random_state=RANDOM_STATE):
    """Smoothed target encoding of one categorical column, computed out-of-fold.

    Every row is encoded with statistics estimated on the other folds only, so
    a row's own target value never contributes to its encoding. For each
    category the encoding is the shrunken mean::

        (mean * count + prior * smoothing) / (count + smoothing)

    where ``prior`` is the target mean of the rows the statistics were fitted
    on. Missing categories are replaced by ``MISSING_TOKEN`` (via
    ``_fill_missing``) and therefore treated as a regular category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``.
    col : str
        Name of the categorical column to encode.
    y : pandas.Series or array-like
        Target values, row-for-row (positionally) aligned with ``df``.
    n_splits : int, default 5
        Number of shuffled K-fold splits.
    smoothing : float, default 10
        Pseudo-count given to the prior; larger values pull rare categories
        harder towards the prior.
    random_state : int
        Seed for the K-fold shuffle.

    Returns
    -------
    encoded : pandas.Series
        Out-of-fold encodings, indexed like ``df``.
    smooth_full : pandas.Series
        Category -> smoothed mean fitted on all rows, for encoding new data.
    global_mean : float
        Target mean over all rows; fallback for categories absent from
        ``smooth_full``.

    Raises
    ------
    ValueError
        If ``y`` and ``df`` differ in length. ``KFold`` raises its own
        ``ValueError`` when ``n_splits`` is invalid for the number of rows.

    Notes
    -----
    ``smooth_full`` is fitted on every row passed in; when a hold-out set is
    used for evaluation, call this function on the training rows only.
    """
    if len(y) != len(df):
        raise ValueError(
            f"y has {len(y)} rows but df has {len(df)}; they must be aligned row-for-row"
        )

    # Work positionally: rows are selected with .iloc below, so grouping must
    # not depend on df and y carrying identical (or unique) index labels.
    keys = _fill_missing(df[col]).reset_index(drop=True)
    target = pd.Series(y).reset_index(drop=True)
    global_mean = target.mean()

    def _smoothed_means(fit_keys, fit_target, prior):
        # Per-category mean shrunk towards ``prior`` with weight ``smoothing``.
        stats = fit_target.groupby(fit_keys).agg(["mean", "count"])
        return (stats["mean"] * stats["count"] + prior * smoothing) / (stats["count"] + smoothing)

    oof = pd.Series(index=df.index, dtype=float)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_idx, val_idx in kf.split(keys):
        train_target = target.iloc[train_idx]
        # The prior comes from the training folds only, so held-out targets do
        # not leak into the encoding through the smoothing term or the
        # unseen-category fallback.
        fold_prior = train_target.mean()
        fold_map = _smoothed_means(keys.iloc[train_idx], train_target, fold_prior)

        # Categories unseen in the training folds fall back to the fold prior.
        oof.iloc[val_idx] = keys.iloc[val_idx].map(fold_map).fillna(fold_prior).to_numpy()

    # Fit full mapping for inference
    smooth_full = _smoothed_means(keys, target, global_mean)

    return oof, smooth_full, global_mean

# Target vector shared by every target encoder fitted below.
y = df[TARGET_COL]

# Columns to target-encode. For each one the out-of-fold encoding becomes a
# feature, and the full-data mapping plus global mean are kept in te_mappings.
te_cols = ["winery", "region_1", "variety", "province"]
te_mappings = {}
for col in te_cols:
    oof_encoding, category_means, prior_mean = target_encode_cv(
        df, col, y, n_splits=5, smoothing=10
    )
    te_mappings[col] = {"map": category_means, "global_mean": prior_mean}
    encoded[f"{col}_te"] = oof_encoding

## One-hot encoding
- country
- taster_name

In [26]:
# Low-cardinality columns get one indicator column per category, named
# "<column>=<category>". Missing values are mapped to MISSING_TOKEN first, so
# they receive an indicator of their own.
one_hot_cols = ["country", "taster_name"]

one_hot_input = df[one_hot_cols].apply(_fill_missing)
one_hot = pd.get_dummies(
    one_hot_input,
    prefix=one_hot_cols,
    prefix_sep="=",
    dtype=int,
)

## Build final dataset

In [27]:
# Assemble the modeling matrix: numeric columns of ``df`` (target excluded),
# followed by the frequency/target encodings and the one-hot indicators.
numerical_cols = [
    name
    for name in df.select_dtypes(include=["number"]).columns
    if name != TARGET_COL
]

feature_blocks = [df[numerical_cols], encoded, one_hot]
X = pd.concat(feature_blocks, axis=1)

y = df[TARGET_COL]

# Quick shape check: X and y should have the same number of rows.
X.shape, y.shape

((129971, 71), (129971,))

In [28]:
# Inspect the final feature names: numeric columns first, then the encoded
# columns, then the one-hot indicators (the order used in the concat above).
X.columns

Index(['price', 'vintage', 'designation_fe', 'winery_te', 'region_1_te',
       'variety_te', 'province_te', 'country=Argentina', 'country=Armenia',
       'country=Australia', 'country=Austria',
       'country=Bosnia and Herzegovina', 'country=Brazil', 'country=Bulgaria',
       'country=Canada', 'country=Chile', 'country=China', 'country=Croatia',
       'country=Cyprus', 'country=Czech Republic', 'country=Egypt',
       'country=England', 'country=France', 'country=Georgia',
       'country=Germany', 'country=Greece', 'country=Hungary', 'country=India',
       'country=Israel', 'country=Italy', 'country=Lebanon',
       'country=Luxembourg', 'country=Macedonia', 'country=Mexico',
       'country=Moldova', 'country=Morocco', 'country=New Zealand',
       'country=Peru', 'country=Portugal', 'country=Romania', 'country=Serbia',
       'country=Slovakia', 'country=Slovenia', 'country=South Africa',
       'country=Spain', 'country=Switzerland', 'country=Turkey', 'country=US',
       'c

In [None]:
# Put the features and the target side by side in a single table.
model_df = pd.concat([X, y], axis=1)
model_df


# Save model-ready dataset


In [None]:
# Save model-ready dataset
# Written as CSV to the path configured in config; index=False keeps the row
# index out of the file.
model_df.to_csv(CSV_FILEPATH_MODEL_READY_DATA, index=False)
