In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e11/sample_submission.csv
/kaggle/input/playground-series-s5e11/train.csv
/kaggle/input/playground-series-s5e11/test.csv


In [2]:
import optuna
import lightgbm as lgb
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.metrics import accuracy_score, roc_auc_score

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.feature_extraction import FeatureHasher
from scipy import sparse

In [3]:
train_csv = pd.read_csv('/kaggle/input/playground-series-s5e11/train.csv')
train_csv.head()

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


In [4]:
X_col = [x for x in train_csv.columns if x != 'loan_paid_back' and x != 'id']
y_col = 'loan_paid_back'

X = train_csv[X_col]
y = train_csv[y_col]

In [5]:
unused_col = ['id']

numeric_col = ['annual_income', 'debt_to_income_ratio', 'credit_score', 
               'loan_amount', 'interest_rate']

ohe_categorical_col = ['gender', 'marital_status', 'education_level', 'employment_status', 
                       'loan_purpose']

hash_categorical_col = ['grade_subgrade']

output_col = ['loan_paid_back']

In [6]:
class SklearnFeatureHasher_stringtoken(BaseEstimator, TransformerMixin):
    """
    Wrap FeatureHasher with input_type='string' but ensure each sample is an iterable
    of strings (here: a single-token list [value]).
    """
    def __init__(self, n_features=32):
        self.n_features = n_features
        self.input_type = 'string'

    def fit(self, X, y=None):
        self._hasher = FeatureHasher(n_features=self.n_features, input_type=self.input_type)
        return self

    def transform(self, X):
        # Accept Series, DataFrame (single column), 1D array, or list
        if isinstance(X, (pd.Series, pd.DataFrame)):
            arr = X.values.ravel()
        else:
            arr = np.asarray(X).ravel()

        # Convert NaNs to string (or whatever sentinel you prefer)
        arr = np.where(pd.isna(arr), '___nan___', arr).astype(str)

        # IMPORTANT: FeatureHasher(input_type='string') expects each sample to be an iterable
        # of strings (e.g. tokens). Wrap each string into a single-token list:
        samples = [[val] for val in arr]

        hashed = self._hasher.transform(samples)   # returns sparse matrix
        if not sparse.isspmatrix_csr(hashed):
            hashed = hashed.tocsr()
        return hashed

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    stratify=y, 
    random_state=42
)

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

# 3) Create ColumnTransformer - scale numeric cols, passthrough other columns
preprocessor = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False), ohe_categorical_col),
        ('hash_cat', SklearnFeatureHasher_stringtoken(n_features=32), hash_categorical_col),
        ('num', StandardScaler(), numeric_col)
    ]
)

# Manual fit for early stopping
preprocessor.fit(X_train)
X_train   = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)



In [8]:
# study = optuna.create_study(direction="maximize")

In [9]:
# # Defining a target function
# def objective(trial):
#     # Determine hyperparameter values
#     learning_rate = trial.suggest_float("learning_rate", 0.01, 0.1)
#     num_leaves = trial.suggest_int("num_leaves", 2, 256)
#     max_depth = trial.suggest_int("max_depth", -1, 50)
#     min_child_samples = trial.suggest_int("min_child_samples", 5, 100)
#     subsample = trial.suggest_float("subsample", 0.5, 1.0)
#     colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0)
#     n_estimators = trial.suggest_int("n_estimators", 100, 1000)

#     # Create and train the model
#     model = lgb.LGBMClassifier(
#         learning_rate=learning_rate,
#         num_leaves=num_leaves,
#         max_depth=max_depth,
#         min_child_samples=min_child_samples,
#         subsample=subsample,
#         colsample_bytree=colsample_bytree,
#         n_estimators=n_estimators,
#         eval_metric="auc",
#         random_state=42
#     )
    
#     score = cross_val_score(model, X_train, y_train, cv=3, scoring="roc_auc").mean()
    
#     print("ROC AUC:", score)
#     return score

In [10]:
# # Run the study and review the results
# study.optimize(objective, n_trials=20)
# print("Best trial:")
# print(" Value: {}".format(study.best_trial.value))
# print(" Params: {}".format(study.best_trial.params))

In [11]:
model = lgb.LGBMClassifier(
            learning_rate=0.05868694772364138,
            num_leaves=76,
            max_depth=16,
            min_child_samples=51,
            subsample=0.6701419481048738,
            colsample_bytree=0.5139157560970037,
            n_estimators=557,
            eval_metric="auc",
            random_state=42
        )
    
model.fit(X_train, y_train, eval_metric="auc")

# Evaluate the model and return the metric
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]
    
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
print("ROC AUC:", roc_auc)

[LightGBM] [Info] Number of positive: 379595, number of negative: 95600
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021171 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1366
[LightGBM] [Info] Number of data points in the train set: 475195, number of used features: 49
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.798819 -> initscore=1.378932
[LightGBM] [Info] Start training from score 1.378932
ROC AUC: 0.9218640011064837


In [12]:
test_df = pd.read_csv('/kaggle/input/playground-series-s5e11/test.csv')

In [13]:
test_df = test_df.drop(unused_col, axis=1)
test_df

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade
0,28781.05,0.049,626,11461.42,14.73,Female,Single,High School,Employed,Other,D5
1,46626.39,0.093,732,15492.25,12.85,Female,Married,Master's,Employed,Other,C1
2,54954.89,0.367,611,3796.41,13.29,Male,Single,Bachelor's,Employed,Debt consolidation,D1
3,25644.63,0.110,671,6574.30,9.57,Female,Single,Bachelor's,Employed,Debt consolidation,C3
4,25169.64,0.081,688,17696.89,12.80,Female,Married,PhD,Employed,Business,C1
...,...,...,...,...,...,...,...,...,...,...,...
254564,92835.97,0.068,744,29704.00,13.48,Female,Single,Bachelor's,Employed,Debt consolidation,B2
254565,48846.47,0.091,634,20284.33,9.58,Female,Married,High School,Employed,Debt consolidation,D4
254566,20668.52,0.096,718,26387.55,9.00,Male,Single,Master's,Employed,Debt consolidation,C4
254567,34105.09,0.094,739,11107.36,9.81,Male,Single,Bachelor's,Employed,Business,C2


In [14]:
test_df_t   = preprocessor.transform(test_df)

In [15]:
# get probability for positive class
probs = model.predict_proba(test_df_t)[:,1]



In [16]:
probs

array([0.94335093, 0.98049511, 0.53022884, ..., 0.97310962, 0.98586233,
       0.91868282])

In [17]:
sub = pd.read_csv('/kaggle/input/playground-series-s5e11/sample_submission.csv')
sub[sub.columns[1]] = probs
sub.to_csv('submission_lgbm_optuna_cv.csv', index=False)