In [1]:
import pandas as pd
import torch
import numpy as np
import torch.nn.functional as F
from torch_frame import numerical, categorical, stype, NAStrategy
# Import Dataset and DataLoader from the .data submodule
from torch_frame.data import Dataset, DataLoader
from torch_frame.nn import FTTransformer, EmbeddingEncoder, LinearEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Set a seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

In [3]:
col_to_stype = {
    # 5 Numerical features
    'annual_income': numerical,
    'debt_to_income_ratio': numerical,
    'credit_score': numerical,
    'loan_amount': numerical,
    'interest_rate': numerical,
    
    # 6 Categorical features
    'gender': categorical,
    'marital_status': categorical,
    'education_level': categorical,
    'employment_status': categorical,
    'loan_purpose': categorical,
    'grade_subgrade': categorical,
    'loan_paid_back': numerical,
}
target_col = 'loan_paid_back'

df = pd.read_csv('train.csv')
df

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,1,22108.02,0.166,636,4593.10,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,2,49566.20,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,3,46858.25,0.065,533,4682.48,16.10,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,4,25496.70,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
593989,593989,23004.26,0.152,703,20958.37,10.92,Female,Single,High School,Employed,Business,C3,1.0
593990,593990,35289.43,0.105,559,3257.24,14.62,Male,Single,Bachelor's,Employed,Debt consolidation,F5,1.0
593991,593991,47112.64,0.072,675,929.27,14.13,Female,Married,Bachelor's,Employed,Debt consolidation,C1,1.0
593992,593992,76748.44,0.067,740,16290.40,9.87,Male,Single,Bachelor's,Employed,Debt consolidation,B2,1.0


In [4]:
dataset = Dataset(
    df=df.drop(columns=['id'], errors='ignore'),
    col_to_stype=col_to_stype,
    target_col=target_col,
)

# Materialize the dataset (converts raw data to TensorFrames and computes stats)
dataset.materialize()

Dataset()

In [None]:
N = len(df)
indices = range(N)

train_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42)
train_dataset = dataset[train_idx]
test_dataset = dataset[test_idx]

train_loader = DataLoader(train_dataset.tensor_frame, batch_size=1024, shuffle=True, num_workers=8)

# --- 4. Define Model and Encoders ---
stype_encoder_dict = {
    # Use standard Embedding for all Categorical features
    stype.categorical: EmbeddingEncoder(na_strategy=NAStrategy.MOST_FREQUENT),
    # Use Linear Layer (MLP) for all Numerical features
    stype.numerical: LinearEncoder(na_strategy=NAStrategy.MEAN),
}

# Initialize the FT-Transformer Model
model = FTTransformer(
    channels=64,             # Dimensionality of column embeddings
    num_layers=4,             # Number of Transformer blocks
    out_channels=1,           # Output channels for binary classification logits (0/1)
    col_stats=dataset.col_stats,
    col_names_dict=dataset.tensor_frame.col_names_dict,
    stype_encoder_dict=stype_encoder_dict,
)

# --- 5. Training Loop ---
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# BCEWithLogitsLoss is the standard loss for binary classification with raw logits
criterion = torch.nn.BCEWithLogitsLoss() 

model.train()
epochs = 5
print("Starting training with FT-Transformer...")

for epoch in range(1, epochs + 1):
    total_loss = 0
    for tf in train_loader:
        optimizer.zero_grad()
        
        # Forward pass
        pred = model(tf).squeeze(-1) 
        target = tf.y.float() 
        
        loss = criterion(pred, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f'Epoch {epoch:02d}, Loss: {total_loss / len(train_loader):.4f}')


Starting training with FT-Transformer...
Epoch 01, Loss: 0.2788
Epoch 02, Loss: 0.2609
Epoch 03, Loss: 0.2598
Epoch 04, Loss: 0.2589
Epoch 05, Loss: 0.2588


In [None]:
# # --- 6. Example Evaluation ---
# model.eval()
# test_tf = test_dataset.tensor_frame
# with torch.no_grad():
#     # Make predictions on the whole test set
#     test_pred_logits = model(test_tf).squeeze(-1)
#     test_pred_probs = torch.sigmoid(test_pred_logits)
#     test_pred_labels = (test_pred_probs > 0.5).int()
    
#     # Calculate Accuracy
#     correct = (test_pred_labels == test_tf.y).sum().item()
#     total = len(test_tf.y)
#     accuracy = correct / total
    
#     print(f"\nTest Set Accuracy: {accuracy:.4f} ({correct}/{total})")

In [7]:
test_df = pd.read_csv('test.csv')
test_col_to_stype = col_to_stype.copy()
test_col_to_stype.pop('loan_paid_back', None)
test_dataset = Dataset(
    df=test_df.drop(columns=['id'], errors='ignore'),
    col_to_stype=test_col_to_stype,
    target_col=None, # Crucial: No target column in the test file
)

# Materialize the test dataset
test_dataset.materialize()
test_tf = test_dataset.tensor_frame

# --- Generate Predictions ---
model.eval() # Set model to evaluation mode
with torch.no_grad():
    # 1. Get the raw logits
    test_pred_logits = model(test_tf).squeeze(-1)
    
    # 2. Convert logits to probabilities (required for AUC submission)
    test_predictions_proba = torch.sigmoid(test_pred_logits)

    # test_pred_labels = (test_predictions_proba > 0.5).int()
    
    # # Calculate Accuracy
    # correct = (test_pred_labels == test_tf.y).sum().item()
    # total = len(test_tf.y)
    # accuracy = correct / total

print(f"Generated {len(test_predictions_proba)} probability predictions.")

Generated 254569 probability predictions.


In [9]:
# --- Build submission DataFrame ---
# Ensure you use the ID column from the original test data
submission_df = pd.DataFrame({
    # Use the 'id' column from the original test_df
    'id': test_df['id'].values, 
    # Use the calculated probabilities
    'loan_paid_back': test_predictions_proba 
})

# --- Save submission file ---
submission_df.to_csv("sample_submission.csv", index=False)

print("\n✅ Submission file saved as sample_submission.csv")
print(f"Shape: {submission_df.shape}")
print(submission_df.head())


✅ Submission file saved as sample_submission.csv
Shape: (254569, 2)
       id  loan_paid_back
0  593994        0.924355
1  593995        0.993796
2  593996        0.387770
3  593997        0.935409
4  593998        0.972713


XGBoost

In [18]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from torch_frame import stype

# --- Assuming 'dataset' is successfully materialized from your previous step ---

# 1. Get the complete feature TensorFrame (from the entire dataset)
full_tf = dataset.tensor_frame

print(type(full_tf))

feature_arrays = []

# Iterate through the dictionary of features keyed by stype
for tensor in full_tf.feat_dict.values():
    # Convert the tensor to a CPU NumPy array (required by .numpy())
    np_array = tensor.detach().cpu().numpy()
    feature_arrays.append(np_array)


X_np = np.concatenate(feature_arrays, axis=1)
y_np = full_tf.y.numpy().astype(int) 

# Note: X_np contains the one-hot encoded categorical features and normalized numerical features, 
# ready for use by tree-based models.

# 3. Split the NumPy Data
X_train, X_test, y_train, y_test = train_test_split(
    X_np, 
    y_np, 
    test_size=0.2, 
    random_state=42
)

print(f"XGBoost Training Features Shape: {X_train.shape}")
print(f"XGBoost Testing Features Shape: {X_test.shape}")

<class 'torch_frame.data.tensor_frame.TensorFrame'>
XGBoost Training Features Shape: (475195, 11)
XGBoost Testing Features Shape: (118799, 11)


In [22]:
# 4. Initialize and Train the XGBoost Classifier
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',  # For binary classification
    eval_metric='logloss',
    use_label_encoder=False,      # Suppresses a common warning
    n_estimators=500,             # Number of boosting rounds
    learning_rate=0.05,
    random_state=42
    )

print("\nStarting XGBoost training...")
xgb_model.fit(X_train, y_train)
print("XGBoost training complete.")

# 5. Evaluate the Model (using probability for AUC)
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]  # Get probability for the positive class (1)
y_pred_labels = xgb_model.predict(X_test)

# Calculate key metrics
accuracy = (y_pred_labels == y_test).mean()
auc = roc_auc_score(y_test, y_pred_proba)

print(f"\nXGBoost Accuracy: {accuracy:.4f}")
print(f"XGBoost ROC AUC Score: {auc:.4f}")


Starting XGBoost training...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost training complete.

XGBoost Accuracy: 0.9050
XGBoost ROC AUC Score: 0.9219
