In [1]:
import numpy as np
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Collect the full path of every file found under the Kaggle input directory,
# then echo them one per line so the available datasets are visible.
input_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

import warnings
warnings.filterwarnings("ignore", category=FutureWarning) # Suppress FutureWarnings

/kaggle/input/playground-series-s4e10/sample_submission.csv
/kaggle/input/playground-series-s4e10/train.csv
/kaggle/input/playground-series-s4e10/test.csv


In [2]:
# Read the training split and print its schema summary
# (column names, non-null counts, dtypes).
train = pd.read_csv(filepath_or_buffer='/kaggle/input/playground-series-s4e10/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58645 entries, 0 to 58644
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          58645 non-null  int64  
 1   person_age                  58645 non-null  int64  
 2   person_income               58645 non-null  int64  
 3   person_home_ownership       58645 non-null  object 
 4   person_emp_length           58645 non-null  float64
 5   loan_intent                 58645 non-null  object 
 6   loan_grade                  58645 non-null  object 
 7   loan_amnt                   58645 non-null  int64  
 8   loan_int_rate               58645 non-null  float64
 9   loan_percent_income         58645 non-null  float64
 10  cb_person_default_on_file   58645 non-null  object 
 11  cb_person_cred_hist_length  58645 non-null  int64  
 12  loan_status                 58645 non-null  int64  
dtypes: float64(3), int64(6), object

In [3]:
# Read the test split and print its schema summary
# (column names, non-null counts, dtypes).
test = pd.read_csv(filepath_or_buffer='/kaggle/input/playground-series-s4e10/test.csv')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39098 entries, 0 to 39097
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          39098 non-null  int64  
 1   person_age                  39098 non-null  int64  
 2   person_income               39098 non-null  int64  
 3   person_home_ownership       39098 non-null  object 
 4   person_emp_length           39098 non-null  float64
 5   loan_intent                 39098 non-null  object 
 6   loan_grade                  39098 non-null  object 
 7   loan_amnt                   39098 non-null  int64  
 8   loan_int_rate               39098 non-null  float64
 9   loan_percent_income         39098 non-null  float64
 10  cb_person_default_on_file   39098 non-null  object 
 11  cb_person_cred_hist_length  39098 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.6+ MB


> Train and Test share the same 12 feature columns; Train additionally contains the target column `loan_status`.

In [4]:
# Store the target as a string label.
# NOTE(review): presumably so the (currently commented-out) seaborn plots treat
# it as a categorical hue/axis; any model fitted on it must cast it back to a number.
train['loan_status'] = train['loan_status'].astype(str)

# Employment length is read as float64 but compared and subtracted as whole
# years later on. Cast it in BOTH splits so train and test are preprocessed
# identically and the derived tenure features end up with the same dtype.
# The .info() outputs show no nulls in this column, so the cast cannot fail on NaN.
for frame in (train, test):
    frame['person_emp_length'] = frame['person_emp_length'].astype(int)

In [5]:
# Numeric predictor columns.
num = [
    'person_age',
    'person_income',
    'person_emp_length',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length',
]

# Categorical (object-dtype) predictor columns.
cat = [
    'person_home_ownership',
    'loan_intent',
    'loan_grade',
    'cb_person_default_on_file',
]

In [6]:
# For each categorical column, print its distinct values in train and then in
# test, so the two splits can be compared line by line.
for column in cat:
    for frame in (train, test):
        print(column, frame[column].unique())

person_home_ownership ['RENT' 'OWN' 'MORTGAGE' 'OTHER']
person_home_ownership ['RENT' 'MORTGAGE' 'OWN' 'OTHER']
loan_intent ['EDUCATION' 'MEDICAL' 'PERSONAL' 'VENTURE' 'DEBTCONSOLIDATION'
 'HOMEIMPROVEMENT']
loan_intent ['HOMEIMPROVEMENT' 'PERSONAL' 'VENTURE' 'DEBTCONSOLIDATION' 'EDUCATION'
 'MEDICAL']
loan_grade ['B' 'C' 'A' 'D' 'E' 'F' 'G']
loan_grade ['F' 'C' 'E' 'A' 'D' 'B' 'G']
cb_person_default_on_file ['N' 'Y']
cb_person_default_on_file ['N' 'Y']


> Train and Test contain the same set of category values in each categorical column.

In [7]:
# Summary statistics of the numeric columns, one row per column.
train.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,58645.0,29322.0,16929.497605,0.0,14661.0,29322.0,43983.0,58644.0
person_age,58645.0,27.550857,6.033216,20.0,23.0,26.0,30.0,123.0
person_income,58645.0,64046.172871,37931.106978,4200.0,42000.0,58000.0,75600.0,1900000.0
person_emp_length,58645.0,4.701015,3.959784,0.0,2.0,4.0,7.0,123.0
loan_amnt,58645.0,9217.556518,5563.807384,500.0,5000.0,8000.0,12000.0,35000.0
loan_int_rate,58645.0,10.677874,3.034697,5.42,7.88,10.75,12.99,23.22
loan_percent_income,58645.0,0.159238,0.091692,0.0,0.09,0.14,0.21,0.83
cb_person_cred_hist_length,58645.0,5.813556,4.029196,2.0,3.0,4.0,8.0,30.0


In [8]:
# Show the rows whose recorded age is above 90.
train.loc[train['person_age'].gt(90)]

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
47336,47336,123,36000,MORTGAGE,7,PERSONAL,B,6700,10.75,0.18,N,4,0


In [9]:
# Show the rows whose recorded employment length is above 50 years.
train.loc[train['person_emp_length'].gt(50)]

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
41079,41079,28,60350,MORTGAGE,123,MEDICAL,D,25000,15.95,0.35,Y,6,1
49252,49252,21,192000,MORTGAGE,123,VENTURE,B,20000,11.49,0.1,N,2,0


In [10]:
# Remove the implausible records surfaced by the two lookups above
# (age > 90, employment length > 50 years).
# Filtering on the conditions instead of hard-coded index labels keeps this
# cell idempotent: dropping by label raises KeyError when the cell is re-run,
# because the labels are already gone.
MAX_PLAUSIBLE_AGE = 90
MAX_PLAUSIBLE_EMP_LENGTH = 50

is_plausible = (
    (train['person_age'] <= MAX_PLAUSIBLE_AGE)
    & (train['person_emp_length'] <= MAX_PLAUSIBLE_EMP_LENGTH)
)
# .copy() gives an independent frame so later column assignments do not
# trigger SettingWithCopyWarning.
train = train.loc[is_plausible].copy()

In [11]:
def add_timeline_features(df):
    """Return a copy of ``df`` with three derived timeline columns appended.

    Columns added, in this order:
      * ``age_start_work``    = person_age - person_emp_length
      * ``age_start_credit``  = person_age - cb_person_cred_hist_length
      * ``emp_before_credit`` = person_emp_length - cb_person_cred_hist_length

    The input frame is not modified. Existing columns with the same names are
    overwritten, so applying the function twice gives the same result.
    """
    return df.assign(
        # Author's sanity expectation: should be positive, or at least 18.
        age_start_work=df['person_age'] - df['person_emp_length'],
        # Author's sanity expectation: should be positive, or at least 18.
        age_start_credit=df['person_age'] - df['cb_person_cred_hist_length'],
        # Author's sanity expectation: should be positive.
        emp_before_credit=df['person_emp_length'] - df['cb_person_cred_hist_length'],
    )


# Apply the identical transformation to both splits so they cannot drift apart.
train = add_timeline_features(train)
test = add_timeline_features(test)


In [12]:
# contains_inf = train.isin([np.inf, -np.inf]).any().any()
# print(contains_inf)

In [13]:
# f, (ax_1, ax_2) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.2, .8)}, figsize=(12,8))
 
# # assigning a graph to each ax
# sns.boxplot(train, x='age_start_work', y='loan_status', orient='h', ax=ax_1, width=.5)
# sns.histplot(train, x='age_start_work', hue='loan_status', ax=ax_2)
# ax_1.set(xlabel="")
# plt.show()

In [14]:
# drop_list = train[train['age_start_work']<10].index
# train = train.drop(drop_list, axis=0)

In [15]:
# f, (ax_1, ax_2) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.2, .8)}, figsize=(12,8))
 
# # assigning a graph to each ax
# sns.boxplot(train, x='age_start_credit', y='loan_status', width=.3, orient='h', ax=ax_1)
# sns.histplot(train, x='age_start_credit', hue='loan_status', ax=ax_2)
# ax_1.set(xlabel="")
# plt.show()

In [16]:
# f, (ax_1, ax_2) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.2, .8)}, figsize=(12,8))
 
# # assigning a graph to each ax
# sns.boxplot(train, x='emp_before_credit', y='loan_status', width=.3, orient='h', ax=ax_1)
# sns.histplot(train, x='emp_before_credit', hue='loan_status', ax=ax_2)
# ax_1.set(xlabel="")
# plt.show()

# Model 1: Decision Tree Regressor

In [17]:
# train_base = train.copy()

# feature = train_base.columns.drop('loan_status')
# X_train = train_base[feature]
# y_train = train_base['loan_status']
# X_train = pd.get_dummies(X_train, columns=cat)
# X_test = pd.get_dummies(test, columns=cat)

# from sklearn.tree import DecisionTreeRegressor

# # Define the model
# model_base = DecisionTreeRegressor(random_state=1, max_leaf_nodes=750)

# # Fit the model
# model_base.fit(X_train,y_train)

# # Predict target
# preds = model_base.predict(X_test)

# submission = pd.DataFrame({'id': X_test.id, 'loan_status': preds})
# submission.to_csv('submission.csv', index=False)

# Model 2: DTR with One-Hot-Encoding

In [18]:
# train_2 = train.copy()

# feature = train_2.columns.drop('loan_status')
# X_train = train_2[feature]
# y_train = train_2['loan_status']

# # One Hot Encoding
# from sklearn.preprocessing import OneHotEncoder
# OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
# Xtrain_cat_OH = pd.DataFrame(OH_encoder.fit_transform(X_train[cat]))
# Xtest_cat_OH = pd.DataFrame(OH_encoder.transform(test[cat]))

# # Put index back
# Xtrain_cat_OH.index = X_train.index
# Xtest_cat_OH.index = test.index

# # Drop categorical columns
# Xtrain_num = X_train.drop(cat, axis=1)
# Xtest_num = test.drop(cat, axis=1)

# # Concatenate numerical columns with OHC columns
# X_train_OH = pd.concat([Xtrain_num, Xtrain_cat_OH], axis=1)
# X_test_OH = pd.concat([Xtest_num, Xtest_cat_OH], axis=1)

# X_train_OH.columns = X_train_OH.columns.astype(str)
# X_test_OH.columns = X_test_OH.columns.astype(str)

# # Define the model
# from sklearn.tree import DecisionTreeRegressor
# model = DecisionTreeRegressor(random_state=1, max_leaf_nodes=500)

# # Fit the model
# model.fit(X_train_OH, y_train)

# # Predict target
# preds = model.predict(X_test_OH)

# submission = pd.DataFrame({'id': test.id, 'loan_status': preds})
# submission.to_csv('submission.csv', index=False)

# Model 3: Random Forest 

In [19]:
# Model 3: random forest on numeric columns plus one-hot-encoded categoricals.
# Fits on `train`, predicts on `test`, and writes `submission.csv`.
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder

# `id` is a row identifier, not a predictor, so it is excluded together with
# the target. Everything else in `train` is used as a feature.
feature = train.columns.drop(['id', 'loan_status'])
X_train = train[feature]
# The target was cast to str earlier in the notebook; a regressor needs numbers.
y_train = train['loan_status'].astype(int)

# One Hot Encoding
# `sparse_output` replaces the `sparse` keyword, which was deprecated in
# scikit-learn 1.2 and later removed.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_encoder.fit(X_train[cat])
OH_columns = OH_encoder.get_feature_names_out(cat)

# Keep the original row index: rows were removed from `train`, so its index is
# not a contiguous range and must be preserved for the concat below to align.
Xtrain_cat_OH = pd.DataFrame(
    OH_encoder.transform(X_train[cat]), columns=OH_columns, index=X_train.index
)
Xtest_cat_OH = pd.DataFrame(
    OH_encoder.transform(test[cat]), columns=OH_columns, index=test.index
)

# Drop categorical columns; selecting `feature` from `test` guarantees the
# same columns in the same order as the training matrix.
Xtrain_num = X_train.drop(columns=cat)
Xtest_num = test[feature].drop(columns=cat)

# Concatenate numerical columns with OHC columns
X_train_OH = pd.concat([Xtrain_num, Xtrain_cat_OH], axis=1)
X_test_OH = pd.concat([Xtest_num, Xtest_cat_OH], axis=1)

# Define the model
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Fit the model
model.fit(X_train_OH, y_train)

# Predict target: with a 0/1 target the regressor's output is the mean vote of
# the trees, used here as a score for `loan_status`.
preds = model.predict(X_test_OH)

submission = pd.DataFrame({'id': test['id'], 'loan_status': preds})
submission.to_csv('submission.csv', index=False)

* Sub 1: 0.75 | Minimal data cleaning | Decision Tree Regression, default max leaf nodes.
* Sub 2: 0.88329 | Minimal data cleaning | DTR, max leaf nodes 500.
* Sub 3: 0.87646 | Remove outliers based on age started working | DTR, max leaf nodes 500.
* Sub 4: 0.75465 | Minimal data cleaning | DTR, max leaf nodes 1500.
* Sub 5: 0.88329 | Minimal data cleaning | One-hot encoding | DTR, max leaf nodes 500.