In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
import warnings

warnings.filterwarnings('ignore')

# --- 1. PyTorch Components ---

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 
                      'mps' if torch.backends.mps.is_available() else 'cpu')
print(f"Using device for prediction: {DEVICE}")

class TabularDataset(Dataset):
    def __init__(self, features):
        self.features = torch.tensor(features, dtype=torch.float32)
    def __len__(self):
        return len(self.features)
    def __getitem__(self, idx):
        return self.features[idx]

class MLP(nn.Module):
    def __init__(self, input_size):
        super(MLP, self).__init__()
        self.layer_stack = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )
    def forward(self, x):
        return torch.sigmoid(self.layer_stack(x))

# --- 2. Preprocessing Function ---

def preprocess_data(df, scaler=None, fit_scaler=False, all_train_columns=None):
    df_clean = df.copy()
    df_clean.replace([np.inf, -np.inf], np.nan, inplace=True)

    X = df_clean.drop(['id', 'Y'] if 'Y' in df_clean.columns else ['id'], axis=1)
    
    # --- Missing indicators ---
    if fit_scaler:
        cols_with_nan = X.columns[X.isnull().sum() > 0].tolist()
    else:
        # Use only the train NaN columns that exist in test
        cols_with_nan = [col for col in all_train_columns if col in X.columns and '_nan' not in col]
    
    for col in cols_with_nan:
        X[f'{col}_nan'] = X[col].isnull().astype(int)

    # --- Log transform ---
    log_transform_cols = ['x_9', 'x_10', 'x_12', 'x_13']
    for col in log_transform_cols:
        if col in X.columns:
            X[col] = np.log1p(X[col].abs())

    # --- Median Imputation ---
    X_imputed = X.fillna(X.median())

    # --- Ensure columns match training set ---
    if not fit_scaler and all_train_columns is not None:
        X_imputed = X_imputed.reindex(columns=all_train_columns, fill_value=0)

    # --- Scaling ---
    if fit_scaler:
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_imputed)
        return X_scaled, scaler, X_imputed.columns.tolist()
    else:
        X_scaled = scaler.transform(X_imputed)
        return X_scaled

# --- 3. Main Prediction & Submission Function ---

def create_nn_submission(train_file='train.csv', test_file='test.csv', model_path='best_mlp_model.pth'):
    # Load datasets
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)

    # --- Train Data Prep ---
    X_train_scaled, scaler, train_columns = preprocess_data(df_train, fit_scaler=True)

    # --- Test Data Prep ---
    X_test_scaled = preprocess_data(df_test, scaler=scaler, fit_scaler=False, all_train_columns=train_columns)

    # --- Load Model ---
    try:
        input_size = X_test_scaled.shape[1]
        model = MLP(input_size).to(DEVICE)
        model.load_state_dict(torch.load(model_path, map_location=DEVICE))
        model.eval()
    except FileNotFoundError:
        print(f"ERROR: Model file '{model_path}' not found.")
        return
    except Exception as e:
        print(f"ERROR loading model: {e}")
        return

    # --- Dataset & Predictions ---
    test_dataset = TabularDataset(X_test_scaled)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

    test_predictions = []
    with torch.no_grad():
        for X_batch in test_loader:
            X_batch = X_batch.to(DEVICE)
            outputs = model(X_batch)
            test_predictions.extend(outputs.cpu().numpy().flatten())

    # --- Apply Binary Threshold ---
    test_pred_binary = (np.array(test_predictions) > 0.5).astype(int)

    # --- Save Submission ---
    submission = pd.DataFrame({
        'id': df_test['id'],
        'Target': test_predictions
    })

    submission_filename = 'submission_nn_binary.csv'
    submission.to_csv(submission_filename, index=False)

    print(f"\nSubmission file '{submission_filename}' created successfully.")
    print("\nFirst 5 rows:")
    print(submission.head())
    print(f"\nTotal predictions: {len(submission)}")

# --- Execute ---
if __name__ == '__main__':
    create_nn_submission()


Using device for prediction: cuda

Submission file 'submission_nn_binary.csv' created successfully.

First 5 rows:
      id    Target
0  35956  0.123399
1  60927  0.222833
2  79918  0.325785
3  50078  0.106976
4  44080  0.049657

Total predictions: 38670
