In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier

class DataPreprocessor:
    def __init__(self,
                 use_one_hot_encoding=False,
                 use_mean_imputation=False,
                 use_mode_imputation=False,
                 use_knn_imputation=False,
                 use_standardization=False,
                 use_min_max_scaling=False,
                 use_noise_injection=False,
                 noise_factor=0.01):

        self.use_one_hot_encoding = use_one_hot_encoding
        self.use_mean_imputation = use_mean_imputation
        self.use_mode_imputation = use_mode_imputation
        self.use_knn_imputation = use_knn_imputation
        self.use_standardization = use_standardization
        self.use_min_max_scaling = use_min_max_scaling
        self.use_noise_injection = use_noise_injection
        self.noise_factor = noise_factor

        # Initialize transformers
        self.ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)  # Fix here
        self.imputer_mean = SimpleImputer(strategy='mean')
        self.imputer_mode = SimpleImputer(strategy='most_frequent')
        self.imputer_knn = KNeighborsClassifier(n_neighbors=5)
        self.scaler_standard = StandardScaler()
        self.scaler_minmax = MinMaxScaler()

    def fit_transform(self, X):
        # Automatically detect categorical and numerical columns
        categorical_cols = X.select_dtypes(include=['object', 'category']).columns
        numerical_cols = X.select_dtypes(exclude=['object', 'category']).columns

        # Handle categorical columns with One-Hot Encoding
        if self.use_one_hot_encoding and len(categorical_cols) > 0:
            X_cat = X[categorical_cols]
            X_encoded = self.ohe.fit_transform(X_cat)
            X_encoded_df = pd.DataFrame(X_encoded, columns=self.ohe.get_feature_names_out(categorical_cols))
            X = X.drop(columns=categorical_cols)
            X = pd.concat([X, X_encoded_df], axis=1)

        # Handle Missing Data Imputation
        if self.use_mean_imputation and len(numerical_cols) > 0:
            X[numerical_cols] = self.imputer_mean.fit_transform(X[numerical_cols])
        if self.use_mode_imputation and len(categorical_cols) > 0:
            X[categorical_cols] = self.imputer_mode.fit_transform(X[categorical_cols])
        if self.use_knn_imputation:
            X = self.knn_imputation(X)

        # Feature Scaling
        if self.use_standardization and len(numerical_cols) > 0:
            X[numerical_cols] = self.scaler_standard.fit_transform(X[numerical_cols])
        if self.use_min_max_scaling and len(numerical_cols) > 0:
            X[numerical_cols] = self.scaler_minmax.fit_transform(X[numerical_cols])

        # Noise Injection for Robustness
        if self.use_noise_injection:
            X = self.inject_noise(X)

        return X

    def transform(self, X):
        # Automatically detect categorical and numerical columns
        categorical_cols = X.select_dtypes(include=['object', 'category']).columns
        numerical_cols = X.select_dtypes(exclude=['object', 'category']).columns

        # Handle categorical columns with One-Hot Encoding
        if self.use_one_hot_encoding and len(categorical_cols) > 0:
            X_cat = X[categorical_cols]
            X_encoded = self.ohe.transform(X_cat)
            X_encoded_df = pd.DataFrame(X_encoded, columns=self.ohe.get_feature_names_out(categorical_cols))
            X = X.drop(columns=categorical_cols)
            X = pd.concat([X, X_encoded_df], axis=1)

        # Handle Missing Data Imputation
        if self.use_mean_imputation and len(numerical_cols) > 0:
            X[numerical_cols] = self.imputer_mean.transform(X[numerical_cols])
        if self.use_mode_imputation and len(categorical_cols) > 0:
            X[categorical_cols] = self.imputer_mode.transform(X[categorical_cols])
        if self.use_knn_imputation:
            X = self.knn_imputation(X)

        # Feature Scaling
        if self.use_standardization and len(numerical_cols) > 0:
            X[numerical_cols] = self.scaler_standard.transform(X[numerical_cols])
        if self.use_min_max_scaling and len(numerical_cols) > 0:
            X[numerical_cols] = self.scaler_minmax.transform(X[numerical_cols])

        # Noise Injection for Robustness
        if self.use_noise_injection:
            X = self.inject_noise(X)

        return X

    def knn_imputation(self, X):
        # KNN Imputation for missing values
        X_filled = X.copy()
        for col in X.columns:
            if X[col].isnull().any():
                knn = self.imputer_knn.fit(X.dropna())
                X_filled[col] = knn.predict(X[col].dropna().values.reshape(-1, 1))
        return X_filled

    def inject_noise(self, X):
        # Inject random noise into the numerical features
        noisy_X = X.copy()
        numeric_cols = noisy_X.select_dtypes(include=[np.number]).columns
        noise = np.random.normal(0, self.noise_factor, size=noisy_X[numeric_cols].shape)
        noisy_X[numeric_cols] += noise
        return noisy_X


# Example usage
if __name__ == "__main__":
    # Generate some sample data
    data = pd.read_csv(r'covtype.csv')

    # Initialize preprocessor
    preprocessor = DataPreprocessor(
        use_one_hot_encoding=True,
        use_mean_imputation=True,
        use_standardization=True,
        use_noise_injection=False,
        noise_factor=0.01
    )

    # Preprocess the data
    processed_data = preprocessor.fit_transform(data)
    print(processed_data)


        Elevation    Aspect     Slope  Horizontal_Distance_To_Hydrology  \
0       -1.297805 -0.935157 -1.482820                         -0.053767   
1       -1.319235 -0.890480 -1.616363                         -0.270188   
2       -0.554907 -0.148836 -0.681563                         -0.006719   
3       -0.622768 -0.005869  0.520322                         -0.129044   
4       -1.301377 -0.988770 -1.616363                         -0.547771   
...           ...       ...       ...                               ...   
581007  -2.012130 -0.023740  0.787408                         -0.867697   
581008  -2.029988 -0.032675  0.653865                         -0.952383   
581009  -2.047847  0.029873  0.386780                         -0.985317   
581010  -2.054990  0.128163  0.119694                         -0.985317   
581011  -2.058562  0.083486 -0.147392                         -0.985317   

        Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
0                       

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim

# Download dataset (replace with the actual dataset path on your system)
# For example:
# data = pd.read_csv('path_to_forest_cover_type.csv')

# Assuming the data is already loaded as 'data'

class AutoMLP:
    def __init__(self, data, target_col, hidden_dims=[64, 32], batch_size=64, epochs=100, learning_rate=0.001, noise_factor=0.01):
        """
        Initialize the AutoMLP class for automatic classification or regression task.

        :param data: pandas DataFrame, contains the input features and target column
        :param target_col: str, name of the target column
        :param hidden_dims: list of int, sizes of the hidden layers
        :param batch_size: int, batch size for training
        :param epochs: int, number of training epochs
        :param learning_rate: float, learning rate for the optimizer
        :param noise_factor: float, factor for noise injection
        """
        self.data = data
        self.target_col = target_col
        self.hidden_dims = hidden_dims
        self.batch_size = batch_size
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.noise_factor = noise_factor

        # Separate features and target
        self.X = self.data.drop(columns=[self.target_col])
        self.y = self.data[self.target_col]

        # Detect task type
        self.is_classification = self._detect_task_type()
        self.output_dim = len(np.unique(self.y)) if self.is_classification else 1

        # Train-test split
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=42)

        # Create model
        self.model = self._build_model()

    def _detect_task_type(self):
        """Detect if the task is classification or regression based on the target column."""
        if len(np.unique(self.y)) <= 10 and np.all(self.y.apply(lambda x: isinstance(x, (int, np.integer)))):
            return True  # Classification (less than or equal to 10 unique values, integers)
        return False  # Regression

    def _build_model(self):
        """Build the MLP model."""
        input_dim = self.X.shape[1]

        layers = []
        layers.append(nn.Linear(input_dim, self.hidden_dims[0]))
        layers.append(nn.ReLU())

        # Hidden layers
        for i in range(1, len(self.hidden_dims)):
            layers.append(nn.Linear(self.hidden_dims[i-1], self.hidden_dims[i]))
            layers.append(nn.ReLU())

        # Output layer
        layers.append(nn.Linear(self.hidden_dims[-1], self.output_dim))

        # For classification, apply Softmax activation on the output layer
        if self.is_classification:
            self.final_activation = nn.Softmax(dim=1)
        else:
            self.final_activation = nn.Identity()

        model = nn.Sequential(*layers)
        return model

    def train_model(self):
        """Train the model."""
        # Convert to PyTorch tensors
        X_train_tensor = torch.tensor(self.X_train.values, dtype=torch.float32)
        y_train_tensor = torch.tensor(self.y_train.values, dtype=torch.long if self.is_classification else torch.float32).view(-1, 1)

        # Loss function and optimizer
        loss_function = nn.CrossEntropyLoss() if self.is_classification else nn.MSELoss()
        optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

        # Training loop
        for epoch in range(self.epochs):
            self.model.train()
            optimizer.zero_grad()

            # Forward pass
            outputs = self.model(X_train_tensor)
            loss = loss_function(outputs, y_train_tensor.squeeze())  # Remove extra dimension

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            if (epoch + 1) % 10 == 0:
                print(f"Epoch [{epoch+1}/{self.epochs}], Loss: {loss.item():.4f}")

    def evaluate(self):
        """Evaluate the model and return metrics."""
        # Convert to PyTorch tensors
        X_test_tensor = torch.tensor(self.X_test.values, dtype=torch.float32)
        y_test_tensor = torch.tensor(self.y_test.values, dtype=torch.long if self.is_classification else torch.float32).view(-1, 1)

        # Evaluate the model
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(X_test_tensor)
            outputs = self.final_activation(outputs)  # Apply final activation (Softmax for classification)

            # For classification, use argmax to get predicted classes
            if self.is_classification:
                predicted = torch.argmax(outputs, dim=1)
                self._print_classification_metrics(predicted, y_test_tensor)
            else:
                predicted = outputs
                self._print_regression_metrics(predicted, y_test_tensor)

            return predicted.numpy(), y_test_tensor.numpy()

    def _print_classification_metrics(self, predicted, true):
        """Print classification metrics (accuracy, precision, recall, F1 score)."""
        accuracy = accuracy_score(true, predicted)
        precision = precision_score(true, predicted, average='weighted', zero_division=0)
        recall = recall_score(true, predicted, average='weighted', zero_division=0)
        f1 = f1_score(true, predicted, average='weighted', zero_division=0)

        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")

    def _print_regression_metrics(self, predicted, true):
        """Print regression metrics (Mean Squared Error)."""
        mse = mean_squared_error(true, predicted)
        print(f"Mean Squared Error: {mse:.4f}")


# Example usage:
if __name__ == "__main__":
    # Load the dataset (replace with the actual dataset path or Kaggle API call)
    # For demonstration, using a dataset like 'load_iris' here

    data = pd.read_csv(r'C:\Users\Shaikh\Documents\SRP\covtype.csv')


    # Initialize preprocessor
    preprocessor = DataPreprocessor(
        use_one_hot_encoding=True,
        use_mean_imputation=True,
        use_standardization=True,
        use_noise_injection=False,
        noise_factor=0.01
    )

    # Preprocess the data
    X_processed = preprocessor.fit_transform(data.drop(columns=['target']))
    data['target'] = data['Cover_Type']  # This should be the target column

    # Initialize AutoMLP (this will automatically detect the task type)
    model = AutoMLP(data=data, target_col='target', hidden_dims=[64, 32], epochs=100)

    # Train the model
    model.train_model()

    # Evaluate the model (for classification: precision, recall, accuracy, F1 score)
    predictions, true_labels = model.evaluate()
    print("Predictions:", predictions)
    print("True labels:", true_labels)


IndexError: Target 7 is out of bounds.

In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, mean_squared_error
import torch
import torch.nn as nn
import torch.optim as optim

# DataPreprocessor Class
class DataPreprocessor:
    def __init__(self,
                 use_one_hot_encoding=False,
                 use_mean_imputation=False,
                 use_mode_imputation=False,
                 use_knn_imputation=False,
                 use_standardization=False,
                 use_min_max_scaling=False,
                 use_noise_injection=False,
                 noise_factor=0.01):

        self.use_one_hot_encoding = use_one_hot_encoding
        self.use_mean_imputation = use_mean_imputation
        self.use_mode_imputation = use_mode_imputation
        self.use_knn_imputation = use_knn_imputation
        self.use_standardization = use_standardization
        self.use_min_max_scaling = use_min_max_scaling
        self.use_noise_injection = use_noise_injection
        self.noise_factor = noise_factor

        self.ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        self.imputer_mean = SimpleImputer(strategy='mean')
        self.imputer_mode = SimpleImputer(strategy='most_frequent')
        self.imputer_knn = KNeighborsClassifier(n_neighbors=5)
        self.scaler_standard = StandardScaler()
        self.scaler_minmax = MinMaxScaler()

    def fit_transform(self, X):
        categorical_cols = X.select_dtypes(include=['object', 'category']).columns
        numerical_cols = X.select_dtypes(exclude=['object', 'category']).columns

        # Imputation
        if self.use_mean_imputation and len(numerical_cols) > 0:
            X[numerical_cols] = self.imputer_mean.fit_transform(X[numerical_cols])
        if self.use_mode_imputation and len(categorical_cols) > 0:
            X[categorical_cols] = self.imputer_mode.fit_transform(X[categorical_cols])

        # One-hot encoding
        if self.use_one_hot_encoding and len(categorical_cols) > 0:
            X_cat = X[categorical_cols]
            X_encoded = self.ohe.fit_transform(X_cat)
            X_encoded_df = pd.DataFrame(X_encoded, columns=self.ohe.get_feature_names_out(categorical_cols), index=X.index)
            X = X.drop(columns=categorical_cols)
            X = pd.concat([X, X_encoded_df], axis=1)

        # Scaling
        if self.use_standardization and len(numerical_cols) > 0:
            X[numerical_cols] = self.scaler_standard.fit_transform(X[numerical_cols])
        elif self.use_min_max_scaling and len(numerical_cols) > 0:
            X[numerical_cols] = self.scaler_minmax.fit_transform(X[numerical_cols])

        # Noise injection
        if self.use_noise_injection:
            X = self.inject_noise(X)

        return X

    def transform(self, X):
        categorical_cols = X.select_dtypes(include=['object', 'category']).columns
        numerical_cols = X.select_dtypes(exclude=['object', 'category']).columns

        if self.use_mean_imputation and len(numerical_cols) > 0:
            X[numerical_cols] = self.imputer_mean.transform(X[numerical_cols])
        if self.use_mode_imputation and len(categorical_cols) > 0:
            X[categorical_cols] = self.imputer_mode.transform(X[categorical_cols])

        if self.use_one_hot_encoding and len(categorical_cols) > 0:
            X_cat = X[categorical_cols]
            X_encoded = self.ohe.transform(X_cat)
            X_encoded_df = pd.DataFrame(X_encoded, columns=self.ohe.get_feature_names_out(categorical_cols), index=X.index)
            X = X.drop(columns=categorical_cols)
            X = pd.concat([X, X_encoded_df], axis=1)

        if self.use_standardization and len(numerical_cols) > 0:
            X[numerical_cols] = self.scaler_standard.transform(X[numerical_cols])
        elif self.use_min_max_scaling and len(numerical_cols) > 0:
            X[numerical_cols] = self.scaler_minmax.transform(X[numerical_cols])

        if self.use_noise_injection:
            X = self.inject_noise(X)

        return X

    def inject_noise(self, X):
        noisy_X = X.copy()
        numeric_cols = noisy_X.select_dtypes(include=[np.number]).columns
        noise = np.random.normal(0, self.noise_factor, size=noisy_X[numeric_cols].shape)
        noisy_X[numeric_cols] += noise
        return noisy_X


In [19]:
# AutoMLP Class
class AutoMLP:
    def __init__(self, X, y, hidden_dims=[64, 32], batch_size=64, epochs=140, learning_rate=0.001):
        """
        :param X: preprocessed pandas DataFrame of features (all numeric)
        :param y: pandas Series or numpy array of target values (classification or regression)
        """
        self.X = X
        self.y = y

        self.hidden_dims = hidden_dims
        self.batch_size = batch_size
        self.epochs = epochs
        self.learning_rate = learning_rate

        self.is_classification = self._detect_task_type()
        self.output_dim = len(np.unique(self.y)) if self.is_classification else 1

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42)

        self.model = self._build_model()

    def _detect_task_type(self):
        if len(np.unique(self.y)) <= 10 and np.all(np.mod(self.y, 1) == 0):
            return True
        return False

    def _build_model(self):
        input_dim = self.X.shape[1]
        layers = []
        layers.append(nn.Linear(input_dim, self.hidden_dims[0]))
        layers.append(nn.ReLU())
        for i in range(1, len(self.hidden_dims)):
            layers.append(nn.Linear(self.hidden_dims[i-1], self.hidden_dims[i]))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(self.hidden_dims[-1], self.output_dim))
        if self.is_classification:
            self.final_activation = nn.Softmax(dim=1)
        else:
            self.final_activation = nn.Identity()
        return nn.Sequential(*layers)

    def train_model(self):
        X_train_tensor = torch.tensor(self.X_train.values, dtype=torch.float32)
        y_train_tensor = torch.tensor(self.y_train.values, dtype=torch.long if self.is_classification else torch.float32)

        loss_fn = nn.CrossEntropyLoss() if self.is_classification else nn.MSELoss()
        optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

        for epoch in range(self.epochs):
            self.model.train()
            optimizer.zero_grad()
            outputs = self.model(X_train_tensor)
            if self.is_classification:
                loss = loss_fn(outputs, y_train_tensor)
            else:
                loss = loss_fn(outputs.squeeze(), y_train_tensor)
            loss.backward()
            optimizer.step()
            if (epoch + 1) % 10 == 0:
                print(f"Epoch [{epoch+1}/{self.epochs}], Loss: {loss.item():.4f}")

    def evaluate(self):
        X_test_tensor = torch.tensor(self.X_test.values, dtype=torch.float32)
        y_test_tensor = torch.tensor(self.y_test.values, dtype=torch.long if self.is_classification else torch.float32)

        self.model.eval()
        with torch.no_grad():
            outputs = self.model(X_test_tensor)
            outputs = self.final_activation(outputs)
            if self.is_classification:
                preds = torch.argmax(outputs, dim=1).numpy()
                self._print_classification_metrics(preds, y_test_tensor.numpy())
            else:
                preds = outputs.squeeze().numpy()
                self._print_regression_metrics(preds, y_test_tensor.numpy())
            return preds, y_test_tensor.numpy()

    def _print_classification_metrics(self, preds, true):
        from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
        print("Classification Metrics:")
        print("Accuracy:", accuracy_score(true, preds))
        print("Precision:", precision_score(true, preds, average='weighted', zero_division=0))
        print("Recall:", recall_score(true, preds, average='weighted', zero_division=0))
        print("F1 Score:", f1_score(true, preds, average='weighted', zero_division=0))

    def _print_regression_metrics(self, preds, true):
        from sklearn.metrics import mean_squared_error
        mse = mean_squared_error(true, preds)
        print("Regression Metrics:")
        print("Mean Squared Error:", mse)


# Usage example
if __name__ == "__main__":
    data = pd.read_csv(r'covtype.csv')

    target_col = 'Cover_Type'

    # Separate features and target
    X = data.drop(columns=[target_col])
    y = data[target_col]

    # Re-index target labels for PyTorch
    y = y - y.min()

    # Preprocess features only
    preprocessor = DataPreprocessor(
        use_one_hot_encoding=True,
        use_mean_imputation=True,
        use_standardization=True,
        use_noise_injection=False,
        noise_factor=0.01
    )
    X_processed = preprocessor.fit_transform(X)

    # Train and evaluate
    model = AutoMLP(X_processed, y, hidden_dims=[64, 32], epochs=100)
    model.train_model()
    preds, true = model.evaluate()

    print("Predictions:", preds)
    print("True labels:", true)


Epoch [10/100], Loss: 1.8654
Epoch [20/100], Loss: 1.6754
Epoch [30/100], Loss: 1.4160
Epoch [40/100], Loss: 1.1675
Epoch [50/100], Loss: 1.0170
Epoch [60/100], Loss: 0.9331
Epoch [70/100], Loss: 0.8703
Epoch [80/100], Loss: 0.8234
Epoch [90/100], Loss: 0.7866
Epoch [100/100], Loss: 0.7585
Classification Metrics:
Accuracy: 0.6824092321196527
Precision: 0.6810717790464706
Recall: 0.6824092321196527
F1 Score: 0.6550432185817675
Predictions: [0 1 1 ... 1 1 6]
True labels: [0 1 1 ... 1 1 6]


In [17]:
print(len(preds))

116203


In [18]:
print(len(X_processed))

581012


In [6]:
# step1_multi_mlp_embeddings.py
import os, json, math, numpy as np, pandas as pd
from dataclasses import dataclass
from typing import Tuple, List, Optional, Dict

# ── sklearn: robust, generic preprocessing ──────────────────────────────────────
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

# ── torch: shallow MLPs + training ─────────────────────────────────────────────
import torch, torch.nn as nn, torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# ----------------------------- CONFIG ------------------------------------------
@dataclass
class PrepConfig:
    impute_num: str = "mean"          # "mean" | "median" | "knn"
    impute_cat: str = "most_frequent" # "most_frequent" | "constant"
    scale: Optional[str] = "standard" # None | "standard" | "minmax"
    noise_std: float = 0.0            # e.g., 0.01 for light noise on numeric features
    knn_k: int = 5

@dataclass
class TrainConfig:
    n_mlps: int = 5
    hidden_dim: int = 128
    n_hidden_layers: int = 3          # shallow by default
    dropout: float = 0.1
    batch_size: int = 512
    epochs: int = 35
    lr: float = 1e-3
    weight_decay: float = 1e-5
    val_size: float = 0.2
    test_size: float = 0.2
    seed: int = 42
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    # early stopping
    patience: int = 5

# ----------------------- PREPROCESSOR (generic) --------------------------------
class TabularPreprocessor:
    def __init__(self, cfg: PrepConfig):
        self.cfg = cfg
        self.column_transformer: Optional[ColumnTransformer] = None
        self.feature_names_: Optional[List[str]] = None
        self.num_cols_: Optional[List[str]] = None
        self.cat_cols_: Optional[List[str]] = None

    def _num_imputer(self):
        if self.cfg.impute_num == "mean":
            return SimpleImputer(strategy="mean")
        if self.cfg.impute_num == "median":
            return SimpleImputer(strategy="median")
        if self.cfg.impute_num == "knn":
            return KNNImputer(n_neighbors=self.cfg.knn_k)
        raise ValueError("impute_num must be mean|median|knn")

    def _scaler(self):
        if self.cfg.scale is None:
            return "passthrough"
        if self.cfg.scale == "standard":
            return StandardScaler()
        if self.cfg.scale == "minmax":
            return MinMaxScaler()
        raise ValueError("scale must be None|standard|minmax")

    def fit(self, X: pd.DataFrame):
        # Treat bools as categorical (common in tabular datasets)
        self.cat_cols_ = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
        self.num_cols_ = X.columns.difference(self.cat_cols_).tolist()

        num_pipe = Pipeline(steps=[
            ("imputer", self._num_imputer()),
            ("scaler", self._scaler()),
        ])

        cat_pipe = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy=self.cfg.impute_cat, fill_value="missing")),
            ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
        ])

        self.column_transformer = ColumnTransformer(
            transformers=[
                ("num", num_pipe, self.num_cols_),
                ("cat", cat_pipe, self.cat_cols_)
            ],
            remainder="drop",
        )
        self.column_transformer.fit(X)

        # Build feature names after fit
        names = []
        if self.num_cols_:
            names += self.num_cols_
        if self.cat_cols_:
            ohe = self.column_transformer.named_transformers_["cat"]["ohe"]
            names += ohe.get_feature_names_out(self.cat_cols_).tolist()
        self.feature_names_ = names
        return self

    def transform(self, X: pd.DataFrame) -> np.ndarray:
        Xt = self.column_transformer.transform(X)
        Xt = np.asarray(Xt, dtype=np.float32)
        # optional light Gaussian noise on numeric part (first len(num_cols_) columns after pipeline)
        if self.cfg.noise_std and self.cfg.noise_std > 0 and len(self.num_cols_) > 0:
            n_num = len(self.num_cols_)
            noise = np.random.normal(0, self.cfg.noise_std, size=(Xt.shape[0], n_num)).astype(np.float32)
            Xt[:, :n_num] += noise
        return Xt

# ----------------------------- TORCH DATASET -----------------------------------
class TabularDataset(Dataset):
    def __init__(self, X: np.ndarray, y: np.ndarray):
        self.X = torch.from_numpy(X.astype(np.float32))
        self.y = torch.from_numpy(y)

    def __len__(self): return self.X.shape[0]
    def __getitem__(self, idx): return self.X[idx], self.y[idx]

# ----------------------------- MODEL -------------------------------------------
class ShallowMLP(nn.Module):
    def __init__(self, in_dim: int, hidden: int, n_hidden_layers: int, dropout: float, out_dim: int, task: str):
        super().__init__()
        layers = []
        dim = in_dim
        for _ in range(n_hidden_layers):
            layers += [nn.Linear(dim, hidden), nn.ReLU(), nn.Dropout(dropout)]
            dim = hidden
        self.backbone = nn.Sequential(*layers) if layers else nn.Identity()
        self.head = nn.Linear(dim, out_dim)
        self.task = task  # "binary" | "multiclass" | "regression"

    def forward(self, x, return_embedding: bool = False):
        emb = self.backbone(x)
        logits = self.head(emb)
        if return_embedding:
            return logits, emb
        return logits

# ----------------------------- UTIL --------------------------------------------
def infer_task_and_outdim(y: np.ndarray) -> Tuple[str, int, np.ndarray]:
    """
    Returns (task, out_dim, y_torch_ready)
    """
    if y.dtype.kind in {"f"} and len(np.unique(y)) > 10:
        return "regression", 1, y.astype(np.float32).reshape(-1, 1)
    # classification
    classes = np.unique(y)
    if len(classes) == 2:
        # map to {0,1}
        mapping = {classes[0]: 0, classes[1]: 1}
        y_ = np.vectorize(mapping.get)(y).astype(np.int64)
        return "binary", 1, y_.reshape(-1)
    else:
        # 0..K-1
        mapping = {c: i for i, c in enumerate(classes)}
        y_ = np.vectorize(mapping.get)(y).astype(np.int64)
        return "multiclass", len(classes), y_.reshape(-1)

def make_loaders(Xtr, ytr, Xva, yva, Xte, yte, bs):
    ds_tr, ds_va, ds_te = TabularDataset(Xtr, ytr), TabularDataset(Xva, yva), TabularDataset(Xte, yte)
    return (DataLoader(ds_tr, batch_size=bs, shuffle=True),
            DataLoader(ds_va, batch_size=bs, shuffle=False),
            DataLoader(ds_te, batch_size=bs, shuffle=False))

def make_loss(task: str):
    if task == "binary":     return nn.BCEWithLogitsLoss()
    if task == "multiclass": return nn.CrossEntropyLoss()
    return nn.MSELoss()  # regression

def metric_from_logits(task: str, logits: torch.Tensor, y: torch.Tensor) -> float:
    if task == "binary":
        preds = (torch.sigmoid(logits).view(-1) > 0.5).long()
        return (preds == y.long()).float().mean().item()
    if task == "multiclass":
        preds = logits.argmax(dim=1)
        return (preds == y.long()).float().mean().item()
    # regression -> negative RMSE so higher is better for early stopping
    mse = nn.functional.mse_loss(logits.view_as(y).float(), y.float()).item()
    return -math.sqrt(mse)

# ----------------------------- TRAIN ONE MLP -----------------------------------
def train_one_mlp(model, loaders, cfg: TrainConfig, task: str):
    tr_loader, va_loader, _ = loaders
    device = cfg.device
    model.to(device)
    opt = optim.AdamW(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)
    loss_fn = make_loss(task)
    best_score, best_state, patience = -1e9, None, cfg.patience

    for epoch in range(cfg.epochs):
        model.train()
        for xb, yb in tr_loader:
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad()
            logits = model(xb)
            if task == "binary":
                loss = loss_fn(logits.view(-1), yb.float())
            elif task == "multiclass":
                loss = loss_fn(logits, yb.long())
            else:
                loss = loss_fn(logits, yb.float())
            loss.backward(); opt.step()

        # validation
        model.eval()
        with torch.no_grad():
            scores = []
            for xb, yb in va_loader:
                xb, yb = xb.to(device), yb.to(device)
                logits = model(xb)
                scores.append(metric_from_logits(task, logits, yb))
            score = float(np.mean(scores)) if scores else -1e9

        if score > best_score + 1e-6:
            best_score, best_state, patience = score, {k: v.cpu().clone() for k, v in model.state_dict().items()}, cfg.patience
        else:
            patience -= 1
            if patience <= 0:
                break

    if best_state is not None:
        model.load_state_dict(best_state)
    return model, best_score

# ----------------------------- EMBEDDING DUMP ----------------------------------
@torch.no_grad()
def dump_embeddings(models: List[ShallowMLP], loader: DataLoader, task: str, device: str) -> np.ndarray:
    """
    Returns array (N_examples, n_mlps, emb_dim)
    """
    # probe emb dim
    for m in models: m.eval().to(device)
    # run one batch to find embedding dimension
    xb0, _ = next(iter(loader))
    xb0 = xb0.to(device)
    _, emb0 = models[0](xb0, return_embedding=True)
    emb_dim = emb0.shape[1]
    # collect
    all_embs = [ [] for _ in models ]
    for xb, _ in loader:
        xb = xb.to(device)
        for i, m in enumerate(models):
            _, emb = m(xb, return_embedding=True)
            all_embs[i].append(emb.cpu().numpy())
    stacked = [np.concatenate(chunks, axis=0) for chunks in all_embs]  # list of (N, emb_dim)
    return np.stack(stacked, axis=1)  # (N, n_mlps, emb_dim)

# ----------------------------- MAIN ENTRY --------------------------------------
def train_mlps_and_save_embeddings(
    df: pd.DataFrame,
    target_col: str,
    out_dir: str = "./artifacts_step1",
    prep_cfg: PrepConfig = PrepConfig(),
    train_cfg: TrainConfig = TrainConfig(),
):
    os.makedirs(out_dir, exist_ok=True)
    rng = np.random.RandomState(train_cfg.seed)

    y_raw = df[target_col].values
    X_raw = df.drop(columns=[target_col])
    print('hello1')
    # split (stratify if classification)
    task0, _, _ = infer_task_and_outdim(y_raw)
    if task0 in ("binary", "multiclass"):
        strat = y_raw
        splitter = StratifiedShuffleSplit(n_splits=1, test_size=train_cfg.test_size, random_state=train_cfg.seed)
        train_idx, test_idx = next(splitter.split(X_raw, strat))
    else:
        idx = np.arange(len(df)); rng.shuffle(idx)
        split = int(len(idx) * (1 - train_cfg.test_size))
        train_idx, test_idx = idx[:split], idx[split:]

    print('hello2')
    X_train, X_test = X_raw.iloc[train_idx].copy(), X_raw.iloc[test_idx].copy()
    y_train_raw, y_test_raw = y_raw[train_idx], y_raw[test_idx]

    # build preprocessor on train, transform all
    prep = TabularPreprocessor(prep_cfg).fit(X_train)
    X_train_t = prep.transform(X_train)
    X_test_t  = prep.transform(X_test)

    # secondary split: train/val
    if task0 in ("binary", "multiclass"):
        trX, vaX, trY_raw, vaY_raw = train_test_split(
            X_train_t, y_train_raw, test_size=train_cfg.val_size, random_state=train_cfg.seed, stratify=y_train_raw
        )
    else:
        trX, vaX, trY_raw, vaY_raw = train_test_split(
            X_train_t, y_train_raw, test_size=train_cfg.val_size, random_state=train_cfg.seed
        )

    # infer final task using training labels only
    task, out_dim, trY = infer_task_and_outdim(trY_raw)
    _, _, vaY = infer_task_and_outdim(vaY_raw)
    _, _, teY = infer_task_and_outdim(y_test_raw)

    tr_loader, va_loader, te_loader = make_loaders(trX, trY, vaX, vaY, X_test_t, teY, train_cfg.batch_size)

    in_dim = trX.shape[1]
    models: List[ShallowMLP] = []
    val_scores: List[float] = []

    for i in range(train_cfg.n_mlps):
        torch.manual_seed(train_cfg.seed + i)
        mlp = ShallowMLP(
            in_dim=in_dim,
            hidden=train_cfg.hidden_dim,
            n_hidden_layers=train_cfg.n_hidden_layers,
            dropout=train_cfg.dropout,
            out_dim=out_dim,
            task=task
        )
        mlp, score = train_one_mlp(mlp, (tr_loader, va_loader, te_loader), train_cfg, task)
        models.append(mlp)
        val_scores.append(score)
        torch.save(mlp.state_dict(), os.path.join(out_dir, f"mlp_{i}.pt"))

    # dump embeddings
    emb_train = dump_embeddings(models, DataLoader(TabularDataset(trX, trY), batch_size=train_cfg.batch_size, shuffle=False),
                                task, train_cfg.device)
    emb_val   = dump_embeddings(models, DataLoader(TabularDataset(vaX, vaY), batch_size=train_cfg.batch_size, shuffle=False),
                                task, train_cfg.device)
    emb_test  = dump_embeddings(models, te_loader, task, train_cfg.device)

    np.save(os.path.join(out_dir, "emb_train.npy"), emb_train)
    np.save(os.path.join(out_dir, "emb_val.npy"),   emb_val)
    np.save(os.path.join(out_dir, "emb_test.npy"),  emb_test)

    # save metadata for the next stage (Transformer)
    with open(os.path.join(out_dir, "meta.json"), "w") as f:
        json.dump({
            "task": task,
            "in_dim": in_dim,
            "embedding_dim": int(emb_train.shape[-1]),
            "n_mlps": train_cfg.n_mlps,
            "feature_names": prep.feature_names_,
            "val_scores": val_scores
        }, f, indent=2)

    return {
        "prep": prep,
        "models": models,
        "paths": {
            "emb_train": os.path.join(out_dir, "emb_train.npy"),
            "emb_val":   os.path.join(out_dir, "emb_val.npy"),
            "emb_test":  os.path.join(out_dir, "emb_test.npy"),
            "meta":      os.path.join(out_dir, "meta.json")
        }
    }

# ----------------------------- USAGE EXAMPLE -----------------------------------
if __name__ == "__main__":
    # Example: load any CSV and specify target column
    df = pd.read_csv("adult.csv")
    result = train_mlps_and_save_embeddings(df, target_col="income")
    pass


hello1
hello2


In [2]:
import pandas as pd

In [3]:
pd.read_csv('covtype.csv').head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,2596,51,3,258,0,510.0,221.0,232.0,148.0,6279.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
1,2590,56,2,212,-6,390.0,220.0,235.0,151.0,6225.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2,2804,139,9,268,65,3180.0,234.0,238.0,135.0,6121.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,2785,155,18,242,118,3090.0,238.0,238.0,122.0,6211.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
4,2595,45,2,153,-1,391.0,220.0,234.0,150.0,6172.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
