In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e11/sample_submission.csv
/kaggle/input/playground-series-s5e11/train.csv
/kaggle/input/playground-series-s5e11/test.csv


In [2]:
# Load the labelled training data and preview its first five rows.
train_csv = pd.read_csv(
    filepath_or_buffer='/kaggle/input/playground-series-s5e11/train.csv'
)
train_csv.head(n=5)

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


In [3]:
# Target column, and every remaining column except the row identifier as features.
y_col = 'loan_paid_back'
X_col = [name for name in train_csv.columns if name not in ('loan_paid_back', 'id')]

# Feature frame and target series taken from the training data.
X = train_csv[X_col]
y = train_csv[y_col]

In [4]:
from sklearn.model_selection import train_test_split

# Two-stage stratified split: hold out 30% of the rows first, then cut that
# hold-out in half to obtain the validation and test parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

In [5]:
# Column groups, one list per role in the preprocessing / modelling steps.

# Identifier column, excluded from the model features.
unused_col = ['id']

# Continuous inputs.
numeric_col = [
    'annual_income',
    'debt_to_income_ratio',
    'credit_score',
    'loan_amount',
    'interest_rate',
]

# Categorical inputs that get one-hot encoded.
ohe_categorical_col = [
    'gender',
    'marital_status',
    'education_level',
    'employment_status',
    'loan_purpose',
]

# Categorical input that goes through the feature hasher instead.
hash_categorical_col = ['grade_subgrade']

# Prediction target.
output_col = ['loan_paid_back']

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.feature_extraction import FeatureHasher
from scipy import sparse

class SklearnFeatureHasher_stringtoken(BaseEstimator, TransformerMixin):
    """
    Hash string-valued categorical data into a fixed-width sparse matrix.

    Thin scikit-learn transformer around ``FeatureHasher(input_type='string')``.
    ``FeatureHasher`` expects every sample to be an iterable of string tokens,
    so each input row is turned into a list of tokens: a single-token list
    ``[value]`` for 1-D or single-column input, and one token per column for
    multi-column 2-D input.

    Parameters
    ----------
    n_features : int, default=32
        Number of output columns, forwarded to ``FeatureHasher``.

    Notes
    -----
    Missing values are replaced by the literal token ``'___nan___'`` before
    hashing. For multi-column input the tokens carry no column identity, so
    equal values found in different columns are hashed identically.
    """
    def __init__(self, n_features=32):
        self.n_features = n_features
        # Fixed mode of the wrapped hasher; not exposed as a constructor parameter.
        self.input_type = 'string'

    def fit(self, X, y=None):
        """Create the wrapped ``FeatureHasher``; ``X`` and ``y`` are not inspected.

        Returns
        -------
        self
        """
        self._hasher = FeatureHasher(n_features=self.n_features, input_type=self.input_type)
        return self

    def transform(self, X):
        """Hash ``X`` row by row.

        Parameters
        ----------
        X : Series, DataFrame, array-like (1-D or 2-D) or list
            1-D input is treated as one value per sample; 2-D input is treated
            as one sample per row.

        Returns
        -------
        scipy.sparse CSR matrix
            One row per input sample.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called.
        ValueError
            If ``X`` has more than two dimensions.
        """
        # Fail with a clear NotFittedError instead of an AttributeError on `_hasher`.
        check_is_fitted(self, "_hasher")

        if isinstance(X, (pd.Series, pd.DataFrame)):
            values = np.asarray(X.values)
        else:
            values = np.asarray(X)

        if values.ndim > 2:
            raise ValueError(
                "Expected 1-D or 2-D input, got an array with %d dimensions." % values.ndim
            )
        # Keep one sample per input row. Flattening a multi-column input would
        # produce n_rows * n_cols samples and break row alignment with the
        # other transformers.
        if values.ndim < 2:
            values = values.reshape(-1, 1)

        # Convert NaNs to a sentinel string, then force everything to str.
        tokens = np.where(pd.isna(values), '___nan___', values).astype(str)

        # FeatureHasher(input_type='string') expects each sample to be an
        # iterable of strings, so every row becomes a list of tokens.
        samples = tokens.tolist()

        hashed = self._hasher.transform(samples)   # sparse matrix
        # tocsr() returns the matrix itself when it is already CSR.
        return hashed.tocsr()

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

# 3) Create ColumnTransformer: one-hot encode ohe_categorical_col, feature-hash
#    hash_categorical_col and standardise numeric_col. Columns not listed in any
#    transformer are dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        # `sparse_output` replaces the `sparse` keyword, which was deprecated in
        # scikit-learn 1.2 and removed in 1.4.
        ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ohe_categorical_col),
        ('hash_cat', SklearnFeatureHasher_stringtoken(n_features=32), hash_categorical_col),
        ('num', StandardScaler(), numeric_col)
    ]
)

In [8]:
# Fit the preprocessing on the training split only, then encode the training
# and validation splits by hand so the encoded validation matrix is available
# for early stopping.
X_train_t = preprocessor.fit(X_train).transform(X_train)
X_val_t = preprocessor.transform(X_val)



In [9]:
# Encode the held-out test split with the already-fitted preprocessor.
X_test_t = preprocessor.transform(X_test)

In [10]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Binary XGBoost classifier. `early_stopping_rounds` is set on the estimator:
# passing it to fit() was deprecated in XGBoost 1.6 and is rejected by later
# releases. The deprecated, ignored `use_label_encoder` flag is no longer passed.
model = XGBClassifier(
    objective="binary:logistic",
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    eval_metric="logloss",
    early_stopping_rounds=20,   # stop when validation logloss stalls for 20 rounds
    random_state=42
)

# Fit with the validation set monitored for early stopping
model.fit(
    X_train_t, y_train,
    eval_set=[(X_val_t, y_val)],
    verbose=True
)

# Predict + evaluate on the held-out test split
y_pred = model.predict(X_test_t)
y_proba = model.predict_proba(X_test_t)[:,1]   # probability of the positive class
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))



[0]	validation_0-logloss:0.45964
[1]	validation_0-logloss:0.42703
[2]	validation_0-logloss:0.40165
[3]	validation_0-logloss:0.38105
[4]	validation_0-logloss:0.36399
[5]	validation_0-logloss:0.34967
[6]	validation_0-logloss:0.33751
[7]	validation_0-logloss:0.32714
[8]	validation_0-logloss:0.31823
[9]	validation_0-logloss:0.31056
[10]	validation_0-logloss:0.30390
[11]	validation_0-logloss:0.29814
[12]	validation_0-logloss:0.29309
[13]	validation_0-logloss:0.28872
[14]	validation_0-logloss:0.28483
[15]	validation_0-logloss:0.28147
[16]	validation_0-logloss:0.27855
[17]	validation_0-logloss:0.27597
[18]	validation_0-logloss:0.27372
[19]	validation_0-logloss:0.27175
[20]	validation_0-logloss:0.27003
[21]	validation_0-logloss:0.26853
[22]	validation_0-logloss:0.26715
[23]	validation_0-logloss:0.26595
[24]	validation_0-logloss:0.26490
[25]	validation_0-logloss:0.26390
[26]	validation_0-logloss:0.26306
[27]	validation_0-logloss:0.26233
[28]	validation_0-logloss:0.26165
[29]	validation_0-loglos

In [11]:
# Load the competition test set.
test_df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/playground-series-s5e11/test.csv'
)

In [12]:
# Remove the identifier column(s). errors='ignore' keeps this cell re-runnable:
# test_df is rebound here, so on a second run the columns are already gone and
# a plain drop would raise KeyError.
test_df = test_df.drop(columns=unused_col, errors='ignore')
test_df

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade
0,28781.05,0.049,626,11461.42,14.73,Female,Single,High School,Employed,Other,D5
1,46626.39,0.093,732,15492.25,12.85,Female,Married,Master's,Employed,Other,C1
2,54954.89,0.367,611,3796.41,13.29,Male,Single,Bachelor's,Employed,Debt consolidation,D1
3,25644.63,0.110,671,6574.30,9.57,Female,Single,Bachelor's,Employed,Debt consolidation,C3
4,25169.64,0.081,688,17696.89,12.80,Female,Married,PhD,Employed,Business,C1
...,...,...,...,...,...,...,...,...,...,...,...
254564,92835.97,0.068,744,29704.00,13.48,Female,Single,Bachelor's,Employed,Debt consolidation,B2
254565,48846.47,0.091,634,20284.33,9.58,Female,Married,High School,Employed,Debt consolidation,D4
254566,20668.52,0.096,718,26387.55,9.00,Male,Single,Master's,Employed,Debt consolidation,C4
254567,34105.09,0.094,739,11107.36,9.81,Male,Single,Bachelor's,Employed,Business,C2


In [13]:
# Encode the competition test set with the preprocessor fitted on the training split.
test_df_t = preprocessor.transform(test_df)

In [14]:
# Display the encoded test matrix as a quick visual check of the transform output.
test_df_t

array([[ 1.        ,  0.        ,  0.        , ..., -0.99157434,
        -0.51439966,  1.18133489],
       [ 1.        ,  0.        ,  0.        , ...,  0.92298637,
         0.06739779,  0.24513958],
       [ 0.        ,  1.        ,  0.        , ..., -1.26250274,
        -1.62074332,  0.46424912],
       ...,
       [ 0.        ,  1.        ,  0.        , ...,  0.67011986,
         1.63999144, -1.67206891],
       [ 0.        ,  1.        ,  0.        , ...,  1.04941962,
        -0.56550357, -1.26870816],
       [ 1.        ,  0.        ,  0.        , ..., -1.02769813,
         0.60922257, -0.35741166]])

In [15]:
# Second column of predict_proba: the predicted probability of the positive class.
probs = model.predict_proba(test_df_t)[:, 1]

In [16]:
# Display the predicted positive-class probabilities for the test set.
probs

array([0.91166204, 0.98070246, 0.53032005, ..., 0.9683679 , 0.98577744,
       0.8849301 ], dtype=float32)

In [17]:
# Start from the sample submission and overwrite its second column with the
# predicted probabilities.
sub = pd.read_csv(
    filepath_or_buffer='/kaggle/input/playground-series-s5e11/sample_submission.csv'
)
target_col = sub.columns[1]
sub[target_col] = probs

In [18]:
# Display the filled-in submission frame before writing it to disk.
sub

Unnamed: 0,id,loan_paid_back
0,593994,0.911662
1,593995,0.980702
2,593996,0.530320
3,593997,0.929594
4,593998,0.959497
...,...,...
254564,848558,0.988824
254565,848559,0.842186
254566,848560,0.968368
254567,848561,0.985777


In [19]:
# Write the submission file to the working directory, without the index column.
sub.to_csv(path_or_buf='submission_xgboost.csv', index=False)