### Stack Classifier
combine all other models into one
# Steps:
- First encode categorical columns using either label encoder or one-hot encoder
- Split data using train\_test split and use stratify on the Default column
- Create a list of weak learners and add learners that perform well on this dataset to the list such as adaboost, xgboost, random forests and naive bernoulli bayes.
- Set up the parameter grid(for hyperparameters) this is different from a normal parameter grid as it has parameters for all the weak learners.
- Drop unnecessary columns like LoanID
- Create a level1 classifier such as xgboost or randomforest. 
- Conduct a randomized search over the param\_grid to get best parameters (train the model)
- Use the best parameters to provide predictions or labels for test data

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder



df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')


hard_label_enc = []

one_hot_cols = []
for column in df.columns:
    if column == 'LoanID':
        continue
    elif column in hard_label_enc:
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        test_df[column] = le.transform(df[column])
    elif df[column].dtype == 'object':
        one_hot_cols.append(column)

df = pd.get_dummies(df, columns=one_hot_cols)
test_df = pd.get_dummies(test_df, columns=one_hot_cols)

df = df.drop(columns=['LoanID'])

ids = test_df['LoanID']
test_df = test_df.drop(columns=['LoanID'])
print('encoding done')


x_train, x_validate, y_train, y_validate = train_test_split(df.drop(columns=['Default']), df['Default'], test_size=0.2, random_state=17)



# create the stacking
level0 = [] #  this has weak learners
# level0.append(('knn', KNeighborsClassifier()))
level0.append(('dtre', DecisionTreeClassifier()))
level0.append(('rfc', RandomForestClassifier()))
level0.append(('ada', AdaBoostClassifier()))
level0.append(('xgb', XGBClassifier()))
level0.append(('bnb', BernoulliNB(binarize=True)))
# level0.append(('gbc',GradientBoostingClassifier()))


param_grid = {
    # 'knn__n_neighbors': [3, 5, 7, 10],
    # 'knn__weights': ['uniform', 'distance'],
    'dtre__max_depth': [None, 5, 10],
    'dtre__min_samples_split': [2, 5, 10],
    'rfc__n_estimators': [50],
    'rfc__max_depth': [None, 5, 10],
    'rfc__min_samples_split': [2, 5, 10],
    'rfc__min_samples_leaf': [1, 2, 4],
    'rfc__bootstrap': [True, False],
    'rfc__max_features': ['auto', 'sqrt', 'log2'],
    # 'gnb__var_smoothing': [1e-9, 1e-8, 1e-7],  # for GaussianNB
    'ada__n_estimators': [30],
    'ada__learning_rate': [0.01, 0.1, 1.0],
    'xgb__n_estimators': [50],
    'xgb__max_depth': [3, 5, 7, 10],
    'xgb__learning_rate': [0.01, 0.1, 0.5],
    'xgb__subsample': [0.7, 0.8, 1.0],
    'xgb__colsample_bytree': [0.7, 0.8, 1.0],
    'xgb__gamma': [0, 0.1, 0.2],
    'xgb__min_child_weight': [1, 2, 3],
    # 'final_estimator__n_estimators': [30],
    # 'final_estimator__max_depth': [None, 5, 10],
    # 'final_estimator__min_samples_split': [2, 5, 10]
}

# {'min_samples_split': 5, 'min_samples_leaf': 2, 'max_leaf_nodes': 40, 'max_depth': 10, 'criterion': 'gini'} dtree
# {'max_depth': 3, 'gamma': 0.2, 'eta': 0.3} xgb


# level1 = RandomForestClassifier() #gives the best way to combine the weak learners
level1 = XGBClassifier(max_depth=3, gamma=0.2, eta=0.3)
model = StackingClassifier(estimators=level0, final_estimator=level1)


rs = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=10, scoring='accuracy', cv=5, verbose=1, random_state=17, n_jobs=-1)

print('going to fit')
rs.fit(x_train, y_train)
print('fit done')
best_model = rs.best_estimator_
best_hparams = rs.best_params_

print("Best Hyper Parameters are", best_hparams)


train_acc = accuracy_score(y_train, best_model.predict(x_train))
valid_acc = accuracy_score(y_validate, best_model.predict(x_validate))

print(f'Training Accuracy: {train_acc}')
print(f'Validation Accuracy: {valid_acc}')


y_pred = best_model.predict(test_df)

new_df = pd.DataFrame({'LoanID': ids, 'Default': y_pred})
new_df.to_csv('./csv_submissions/stack.csv', index=False)

encoding done
going to fit
Fitting 5 folds for each of 10 candidates, totalling 50 fits


15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "/home/prateek/anaconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/prateek/anaconda3/lib/python3.12/site-packages/sklearn/ensemble/_stacking.py", line 669, in fit
    return super().fit(X, y_encoded, sample_weight)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/prateek/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

fit done
Best Hyper Parameters are {'xgb__subsample': 1.0, 'xgb__n_estimators': 50, 'xgb__min_child_weight': 1, 'xgb__max_depth': 7, 'xgb__learning_rate': 0.1, 'xgb__gamma': 0, 'xgb__colsample_bytree': 0.7, 'rfc__n_estimators': 50, 'rfc__min_samples_split': 2, 'rfc__min_samples_leaf': 1, 'rfc__max_features': 'log2', 'rfc__max_depth': 5, 'rfc__bootstrap': True, 'dtre__min_samples_split': 10, 'dtre__max_depth': None, 'ada__n_estimators': 30, 'ada__learning_rate': 0.01}
Training Accuracy: 0.8887291107018069
Validation Accuracy: 0.886454865870374
