In [156]:
import numpy as np
import pandas as pd
import warnings

# Notebook-wide settings.
# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation/convergence warnings from the libraries used below.
warnings.filterwarnings("ignore")
# Show all columns when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)


In [157]:
# Read the training set, the test set and the sample submission template
# from the current working directory, in that order.
train_data, test_data, submission_data = (
    pd.read_csv(csv_path)
    for csv_path in ("./train.csv", "./test.csv", "./sample_submission.csv")
)


In [158]:
# List the column names of the training frame.
train_data.columns


Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [159]:
from sklearn.preprocessing import LabelEncoder

# Separate the identifier and the target from the predictors. The test set
# has no target column, so only the identifier is removed there.
X = train_data.drop(columns=["Loan_ID", "Loan_Status"])
X_test = test_data.drop(columns="Loan_ID")

# Encode the target labels as integers; `le` is kept so predictions can be
# mapped back to the original labels with `le.inverse_transform`.
le = LabelEncoder()
y = le.fit_transform(train_data["Loan_Status"])

In [160]:
# Continuous columns: imputed with KNN and later turned into ratio features.
num_cols = [
    "ApplicantIncome",
    "CoapplicantIncome",
    "LoanAmount",
    "Loan_Amount_Term",
]

# Columns handled as categories (imputed with a constant, then one-hot
# encoded). Credit_History holds numeric codes but is treated as a category.
# The order here determines the order of the one-hot output columns.
cat_cols = [
    "Gender", "Married", "Dependents", "Education",
    "Self_Employed", "Property_Area", "Credit_History",
]


In [161]:
# # Feature Creation

# X["ApplicantIncomeByLoanAmount"] = X["ApplicantIncome"] / X["LoanAmount"]
# X["CoapplicantIncomeByLoadAmount"] = X["CoapplicantIncome"] / X["LoanAmount"]
# X["LoanAmountByLoanAmountTerm"] = X["LoanAmount"] / X["Loan_Amount_Term"]
# X.drop(["ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term"], axis=1, inplace=True)


In [162]:
# Preview the feature frame before any imputation or feature engineering.
X.head()


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban


In [163]:
# Count missing values per feature column.
X.isna().sum()


Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
dtype: int64

In [164]:
from sklearn.impute import KNNImputer

# Fill gaps in the numeric columns from nearest neighbours. The imputer is
# fitted on the training rows only and then reused for the test rows, so no
# test information leaks into the fit.
# NOTE(review): the columns are imputed on their raw scale, so the income
# columns presumably dominate the neighbour distance — consider scaling first.
knnimp = KNNImputer()
train_numeric_filled = knnimp.fit_transform(X[num_cols])
X[num_cols] = train_numeric_filled

test_numeric_filled = knnimp.transform(X_test[num_cols])
X_test[num_cols] = test_numeric_filled


In [165]:
# Re-check missing counts after the numeric columns were KNN-imputed.
X.isna().sum()


Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History       50
Property_Area         0
dtype: int64

In [166]:
from sklearn.impute import SimpleImputer

# Replace missing categorical values with the literal label "missing", so
# that missingness becomes a category of its own. Fitted on train, applied
# unchanged to test.
si = SimpleImputer(strategy="constant", fill_value="missing")
train_categorical_filled = si.fit_transform(X[cat_cols])
X[cat_cols] = train_categorical_filled

test_categorical_filled = si.transform(X_test[cat_cols])
X_test[cat_cols] = test_categorical_filled


In [167]:
# Confirm that no missing values remain after both imputation steps.
X.isnull().sum()


Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

In [168]:
# Preview the feature frame after imputation.
X.head()


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,0,Graduate,No,5849.0,0.0,147.8,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,4583.0,1508.0,128.0,360.0,1.0,Rural
2,Male,Yes,0,Graduate,Yes,3000.0,0.0,66.0,360.0,1.0,Urban
3,Male,Yes,0,Not Graduate,No,2583.0,2358.0,120.0,360.0,1.0,Urban
4,Male,No,0,Graduate,No,6000.0,0.0,141.0,360.0,1.0,Urban


In [169]:
def add_ratio_features(frame):
    """Return a new frame where the raw amount columns are replaced by ratios.

    Three ratio columns are appended (in this order) and the four raw numeric
    columns they are derived from are dropped:

    * ``ApplicantIncomeByLoanAmount``   = ApplicantIncome / LoanAmount
    * ``CoapplicantIncomeByLoadAmount`` = CoapplicantIncome / LoanAmount
    * ``LoanAmountByLoanAmountTerm``    = LoanAmount / Loan_Amount_Term

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ApplicantIncome, CoapplicantIncome, LoanAmount and
        Loan_Amount_Term. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the ratio columns added and the raw columns removed.

    Raises
    ------
    KeyError
        If any of the four raw columns is absent (e.g. the cell is re-run on
        a frame that has already been transformed).
    """
    raw_cols = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term"]
    with_ratios = frame.assign(
        ApplicantIncomeByLoanAmount=frame["ApplicantIncome"] / frame["LoanAmount"],
        # The "Load" spelling is kept on purpose: the scaling cell further
        # down selects this column by exactly this name.
        CoapplicantIncomeByLoadAmount=frame["CoapplicantIncome"] / frame["LoanAmount"],
        LoanAmountByLoanAmountTerm=frame["LoanAmount"] / frame["Loan_Amount_Term"],
    )
    return with_ratios.drop(columns=raw_cols)


# Apply the identical transformation to train and test so the two frames
# cannot drift apart.
X = add_ratio_features(X)
X_test = add_ratio_features(X_test)

In [170]:
# Preview the feature frame after the raw amounts were replaced by ratios.
X.head()


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area,ApplicantIncomeByLoanAmount,CoapplicantIncomeByLoadAmount,LoanAmountByLoanAmountTerm
0,Male,No,0,Graduate,No,1.0,Urban,39.573748,0.0,0.410556
1,Male,Yes,1,Graduate,No,1.0,Rural,35.804688,11.78125,0.355556
2,Male,Yes,0,Graduate,Yes,1.0,Urban,45.454545,0.0,0.183333
3,Male,Yes,0,Not Graduate,No,1.0,Urban,21.525,19.65,0.333333
4,Male,No,0,Graduate,No,1.0,Urban,42.553191,0.0,0.391667


In [171]:
# Make every categorical column a plain string in both frames so the encoder
# sees one uniform type per column.
for frame in (X, X_test):
    frame[cat_cols] = frame[cat_cols].astype(str)


In [172]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns, dropping the first level of each.
# The encoder is fitted on train and reused on test.
# NOTE(review): with drop="first" plus handle_unknown="ignore", a category
# seen only in the test set is encoded as all zeros, i.e. the same as the
# dropped level, per the scikit-learn docs — confirm this is acceptable.
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore", drop="first")
train_encoded = ohe.fit_transform(X[cat_cols])
test_encoded = ohe.transform(X_test[cat_cols])

# Wrap the encoded arrays in frames that share the source frames' indexes so
# they can be joined back row-for-row.
X_ohe = pd.DataFrame(
    train_encoded, index=X.index, columns=ohe.get_feature_names_out()
)
X_test_ohe = pd.DataFrame(
    test_encoded, index=X_test.index, columns=ohe.get_feature_names_out()
)

In [173]:
# Preview the one-hot encoded categorical columns of the training set.
X_ohe.head()


Unnamed: 0,Gender_Male,Gender_missing,Married_Yes,Married_missing,Dependents_1,Dependents_2,Dependents_3+,Dependents_missing,Education_Not Graduate,Self_Employed_Yes,Self_Employed_missing,Property_Area_Semiurban,Property_Area_Urban,Credit_History_1.0,Credit_History_missing
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [174]:
# Attach the one-hot columns (aligned on the row index) and remove the
# original categorical columns they replace.
X = X.join(X_ohe).drop(columns=list(cat_cols))
X_test = X_test.join(X_test_ohe).drop(columns=list(cat_cols))


In [175]:
# Preview the fully prepared test features (ratio + one-hot columns).
X_test.head()


Unnamed: 0,ApplicantIncomeByLoanAmount,CoapplicantIncomeByLoadAmount,LoanAmountByLoanAmountTerm,Gender_Male,Gender_missing,Married_Yes,Married_missing,Dependents_1,Dependents_2,Dependents_3+,Dependents_missing,Education_Not Graduate,Self_Employed_Yes,Self_Employed_missing,Property_Area_Semiurban,Property_Area_Urban,Credit_History_1.0,Credit_History_missing
0,52.0,0.0,0.305556,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,24.412698,11.904762,0.35,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,24.038462,8.653846,0.577778,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,23.4,25.46,0.277778,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,42.0,0.0,0.216667,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0


In [176]:
# from sklearn.ensemble import RandomForestClassifier

# clf1 = RandomForestClassifier(random_state=39)

# from sklearn.model_selection import GridSearchCV

# param_grid = [
#     {
#         "n_estimators": np.arange(50, 500, 50),
#         "max_depth": [3, 5, 7, None],
#         "max_features": ['sqrt', 'log2'],
#     }
# ]

# gs = GridSearchCV(clf1, param_grid=param_grid, scoring="accuracy", cv=10, n_jobs=-1)

In [177]:
# gs.fit(X, y)


In [178]:
# gs.best_estimator_ #0.8095452141723956
# RandomForestClassifier(max_depth=5, n_estimators=150, n_jobs=-1, random_state=39)

In [179]:
# gs.best_estimator_


In [180]:
# gs.best_score_


In [181]:
from sklearn.ensemble import RandomForestClassifier

# Classifier 1: a shallow random forest with a fixed seed. It is trained on
# the unscaled features.
rf_params = {
    "n_estimators": 100,
    "max_depth": 5,
    "max_features": "sqrt",
    "min_samples_leaf": 2,
    "min_samples_split": 8,
    "random_state": 39,
}
clf1 = RandomForestClassifier(**rf_params)
clf1.fit(X, y)

# Encoded class predictions for the test set.
predictions1 = clf1.predict(X_test)

In [182]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

# The continuous ratio features; only these are rescaled (the one-hot
# columns are already 0/1).
ratio_cols = [
    "ApplicantIncomeByLoanAmount",
    "CoapplicantIncomeByLoadAmount",
    "LoanAmountByLoanAmountTerm",
]

# Work on copies so X / X_test stay unscaled for the other classifiers.
X_new = X.copy(deep=True)
X_test_new = X_test.copy(deep=True)

# Scale to [0, 1] using the training range; the test set reuses that range.
ss = MinMaxScaler()
X_new[ratio_cols] = ss.fit_transform(X_new[ratio_cols])
X_test_new[ratio_cols] = ss.transform(X_test_new[ratio_cols])

# NOTE: this rebinds the shared `y` to float64, so every later cell that
# uses `y` sees float labels as well.
y = y.astype("float64")

# Classifier 2: AdaBoost over L2-regularised logistic regression.
estimator = LogisticRegression(random_state=39, solver="liblinear", C=1.0, penalty="l2")
clf2 = AdaBoostClassifier(estimator=estimator, learning_rate=1.0, n_estimators=275)
clf2.fit(X_new, y)

predictions2 = clf2.predict(X_test_new)

In [183]:
from xgboost import XGBClassifier

# Classifier 3: gradient-boosted trees with a small learning rate and both
# L1 and L2 regularisation switched off. It is trained on the unscaled
# features.
xgb_params = {
    "reg_lambda": 0.0,
    "reg_alpha": 0.0,
    "n_estimators": 200,
    "min_child_weight": 7,
    "max_depth": 7,
    "learning_rate": 0.01,
}
clf3 = XGBClassifier(**xgb_params)
clf3.fit(X, y)

# Encoded class predictions for the test set.
predictions3 = clf3.predict(X_test)

In [184]:
# submission_data["Loan_Status"] = le.inverse_transform(predictions3)
# submission_data.to_csv("submission.csv", index=False)


In [190]:
from scipy import stats

# Majority vote across the three classifiers: one row per model, one column
# per test sample, then the most common label down each column.
stacked_predictions = np.vstack([predictions1, predictions2, predictions3])

# Older SciPy returns the mode with shape (1, n_samples); SciPy >= 1.11
# (keepdims=False by default) returns shape (n_samples,). The previous
# `[0][0]` indexing only worked for the former and collapsed the result to a
# single scalar on newer SciPy. `np.ravel` gives a 1-D vote vector in both.
final_predictions = np.ravel(stats.mode(stacked_predictions, axis=0).mode)

# Stacking int and float predictions yields floats; cast back to integer
# class codes.
final_predictions = final_predictions.astype("int")
final_predictions

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,

In [192]:
# Map the encoded predictions back to the original labels and write the
# submission file.
# NOTE(review): this submits clf2's predictions (`predictions2`), not the
# majority-vote `final_predictions` computed in the previous cell — confirm
# which one is meant to be the final submission.
submission_labels = le.inverse_transform(predictions2.astype(int))
submission_data["Loan_Status"] = submission_labels
submission_data.to_csv("submission.csv", index=False)


#### Individual submission score:

- clf1 = 0.7847222222222222.
- clf2 = 0.7777777777777778.
- clf3 = 0.7777777777777778.

#### Final submission score:

0.7777777777777778

The ensemble does not beat the best individual model, so the next step is to try a different algorithm (e.g. HistGradientBoosting) for the 2nd classifier.


OrderedDict([('base_estimator__C', 1.0),
('base_estimator__penalty', 'l2'),
('learning_rate', 1.0),
('n_estimators', 278)])


OrderedDict([('base_estimator__C', 0.5241864607868151),
('base_estimator__penalty', 'l2'),
('learning_rate', 0.9907934701154619),
('n_estimators', 474)])

- 0.8095980962453726


('base_estimator__C', 1.0),
('base_estimator__penalty', 'l2'),
('learning_rate', 1.0),
('n_estimators', 275)

- 0.8112374405076679


MinMaxScaler performs the best!
