In [1]:
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Create a fake dataset with missing values and numerical features
n_samples = 100
n_features = 5

# Create numerical data with missing values
X_fake = pd.DataFrame({
    f"feature_{i}": np.random.randn(n_samples) for i in range(n_features)
})
# Introduce missing values in some columns
missing_rate = 0.2
for col in X_fake.columns:
    X_fake.loc[X_fake.sample(frac=missing_rate).index, col] = np.nan

# Create a binary target variable
y_fake = np.random.choice([0, 1], size=n_samples)

# Display the first few rows of the fake dataset
X_fake.head(), y_fake[:10]

(   feature_0  feature_1  feature_2  feature_3  feature_4
 0   0.496714        NaN   0.357787  -0.828995  -1.594428
 1  -0.138264  -0.420645   0.560785  -0.560181  -0.599375
 2   0.647689        NaN        NaN   0.747294   0.005244
 3   1.523030  -0.802277        NaN        NaN        NaN
 4  -0.234153  -0.161286  -1.377669  -0.020902  -0.450065,
 array([0, 0, 1, 0, 0, 1, 0, 1, 1, 1]))

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

# Custom Transformer for Missing Value Imputation
class MissingValueImputer(BaseEstimator, TransformerMixin):
    def __init__(self, missing_numeric_cols):
        self.missing_numeric_cols = missing_numeric_cols
        self.imputation_models = {}
    
    def fit(self, X, y=None):
        for col in self.missing_numeric_cols:
            if X[col].isnull().any():
                # Prepare data for imputation
                non_missing_data = X[X[col].notnull()]
                train_features = non_missing_data.drop(columns=[col])
                train_target = non_missing_data[col]
                
                # Train the imputation model
                imputation_model = SASDecisionTreeRegressor()
                nominal_features = train_features.select_dtypes(exclude='number').columns.tolist()
                if len(nominal_features)>0:
                    imputation_model.fit(train_features, train_target, nominals=nominal_features)
                else:
                    imputation_model.fit(train_features, train_target)
                
                # Store the model for this column
                self.imputation_models[col] = imputation_model
        return self
    
    def transform(self, X):
        X = X.copy()
        for col, model in self.imputation_models.items():
            # Separate rows with missing values
            missing_data = X[X[col].isnull()]
            if not missing_data.empty:
                missing_features = missing_data.drop(columns=[col])
                imputed_values = model.predict(missing_features)
                
                # Fill missing values
                X.loc[X[col].isnull(), col] = imputed_values
        return X

# Custom Transformer for Scaling
class DataScaler(BaseEstimator, TransformerMixin):
    def __init__(self, numeric_cols):
        self.numeric_cols = numeric_cols
        self.scaler = StandardScaler()
    
    def fit(self, X, y=None):
        self.scaler.fit(X[self.numeric_cols])
        return self
    
    def transform(self, X):
        X = X.copy()
        X[self.numeric_cols] = self.scaler.transform(X[self.numeric_cols])
        return X

# Pipeline Creation
def create_pipeline(model, numeric_cols, nominals=None):
    return Pipeline([
        ('imputer', MissingValueImputer(numeric_cols=numeric_cols, nominals=nominals)),
        ('scaler', DataScaler(numeric_cols=numeric_cols)),
        ('model', model)
    ])

# Example Usage
# pipeline = create_pipeline(
#     model=best_model,  # Replace with your trained model
#     numeric_cols=numeric_columns_list, 
#     nominals=nominal_columns_list
# )

# # Fit the pipeline on the training data
# pipeline.fit(X_train, y_train)

# # Make predictions on new data
# predictions = pipeline.predict(X_new)

In [8]:
missing_cols = [col for col in X_fake.columns if X_fake[col].isnull().any()]

In [9]:
missing_cols

['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4']

In [10]:
missing = MissingValueImputer(numeric_cols=missing_cols, nominals=)

SyntaxError: invalid syntax (354174851.py, line 1)