In [None]:
import numpy as np
import pandas as pd
import polars as pl
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
import category_encoders as ce

class TargetEncoderCV(BaseEstimator, TransformerMixin):
    """Cross-validated (out-of-fold) target encoder for a single column.

    ``fit`` works on the first column of ``X``. For every cross-validation
    fold a fresh ``ce.TargetEncoder`` is fitted on the training rows and
    applied to the held-out rows; the stitched out-of-fold result is stored
    in ``encoded_df``. A final encoder fitted on all rows is kept in
    ``final_target_encoder`` and is what ``transform`` applies to new data.

    Attributes set by ``fit``:
    - col: Name of the encoded column (first column of X)
    - target_col: ``name`` of the target series (None for unnamed input)
    - encoded_df: Copy of X, in input row order, with the helper columns
      'target' and 'index' plus '<col>_target_encoded' (out-of-fold values)
    - final_target_encoder: ``ce.TargetEncoder`` fitted on all rows
    """

    def __init__(self, n_folds=5, seed=42, cv_type='kfold', groups=None):
        """
        Parameters:
        - n_folds: Number of folds for cross-validation
        - seed: Random seed for reproducibility (passed to 'kfold' and
          'stratified'; the 'group' splitter is built without it)
        - cv_type: Type of cross-validation ('kfold', 'stratified', 'group')
        - groups: Group labels for group k-fold
        """
        self.n_folds = n_folds
        self.seed = seed
        self.cv_type = cv_type
        self.groups = groups
        self.final_target_encoder = None
        self.col = None

    def get_cv(self):
        """Build the scikit-learn splitter selected by ``cv_type``.

        Returns:
        - A KFold, StratifiedKFold or GroupKFold instance with ``n_folds``
          splits.

        Raises:
        - ValueError: if ``cv_type`` is unknown, or if it is 'group' and
          no ``groups`` were given to the constructor.
        """
        if self.cv_type == 'kfold':
            return KFold(n_splits=self.n_folds, shuffle=True, random_state=self.seed)
        elif self.cv_type == 'stratified':
            return StratifiedKFold(n_splits=self.n_folds, shuffle=True, random_state=self.seed)
        elif self.cv_type == 'group':
            if self.groups is None:
                raise ValueError("For GroupKFold, 'groups' must be provided.")
            return GroupKFold(n_splits=self.n_folds)
        else:
            raise ValueError("cv_type must be 'kfold', 'stratified', or 'group'.")

    def fit(self, X, y=None):
        """Compute out-of-fold encodings and fit the final encoder.

        Parameters:
        - X: pandas or polars DataFrame; only its first column is encoded.
          The caller's object is never modified.
        - y: Target values as a pandas/polars Series or any 1-D array-like
          with one entry per row of X.

        Returns:
        - self

        Raises:
        - ValueError: if ``y`` is None, or (via ``get_cv``) if the CV
          configuration is invalid.
        """
        if y is None:
            raise ValueError("Target variable (y) must be provided for fit method.")

        # Work on a private pandas copy so the helper columns added below
        # never leak into the caller's DataFrame.
        X = X.to_pandas() if isinstance(X, pl.DataFrame) else X.copy()

        if isinstance(y, pl.Series):
            y = y.to_pandas()
        elif not isinstance(y, pd.Series):
            # Lists / ndarrays have neither ``.name`` nor ``.iloc``; wrap
            # them in a Series that shares X's index.
            y = pd.Series(np.asarray(y), index=X.index)

        self.col = X.columns.tolist()[0]
        self.target_col = y.name
        encoded_name = f'{self.col}_target_encoded'

        # Capture the feature before adding helper columns so that a column
        # that happens to be called 'target' or 'index' is still encoded
        # from its real values.
        feature = X[self.col]
        X['target'] = y
        X['index'] = X.index

        cv = self.get_cv()
        split_y = y if self.cv_type == 'stratified' else None
        split_groups = self.groups if self.cv_type == 'group' else None

        fold_frames = []
        fold_positions = []
        for train_idx, val_idx in cv.split(X, y=split_y, groups=split_groups):
            fold_encoder = ce.TargetEncoder()
            fold_encoder.fit(feature.iloc[train_idx], y.iloc[train_idx])

            # Explicit copy: assigning into a bare ``iloc`` slice triggers
            # SettingWithCopyWarning and is not guaranteed to stick.
            X_valid = X.iloc[val_idx].copy()
            X_valid[encoded_name] = fold_encoder.transform(feature.iloc[val_idx])

            fold_frames.append(X_valid)
            fold_positions.append(val_idx)

        # Every row is held out exactly once, so the concatenated positions
        # form a permutation; inverting it restores the input row order
        # even when the index is non-monotonic or contains duplicates.
        restore_order = np.argsort(np.concatenate(fold_positions), kind='stable')
        self.encoded_df = pd.concat(fold_frames).iloc[restore_order]

        # Encoder used for unseen data: fitted on every available row.
        self.final_target_encoder = ce.TargetEncoder()
        self.final_target_encoder.fit(feature, y)

        return self

    def transform(self, X):
        """Append '<col>_target_encoded' using the encoder fitted on all rows.

        Parameters:
        - X: pandas or polars DataFrame containing the column seen in fit.

        Returns:
        - A new DataFrame of the same flavour as the input (polars in,
          polars out) with the encoded column added; the input object is
          left untouched.

        Raises:
        - ValueError: if called before ``fit``.
        """
        if self.final_target_encoder is None:
            raise ValueError("The model has not been fit yet. Call 'fit' first.")

        is_polars = isinstance(X, pl.DataFrame)
        # Copy pandas input so the new column is not written into the
        # caller's frame.
        X = X.to_pandas() if is_polars else X.copy()

        X[f'{self.col}_target_encoded'] = self.final_target_encoder.transform(X[self.col])

        return pl.from_pandas(X) if is_polars else X