In [None]:
import pandas as pd
from datetime import datetime
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector, f_classif, SelectKBest
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [None]:
# Load the cleaned training set and the competition set; both are indexed
# by the "loan_id" column so rows can be identified by loan later on.
df_train, df_test = (
    pd.read_csv(csv_path, index_col="loan_id")
    for csv_path in ("../data/clean/df-train.csv", "../data/clean/df-comp.csv")
)


In [None]:
# Training features: every column of the training frame except the target.
X_train = df_train.drop(columns=["status"])
# Target as a 1-D Series. Selecting with [["status"]] would produce a
# one-column DataFrame, which scikit-learn estimators convert with a
# DataConversionWarning on every fit (including each CV fit during
# feature selection).
y_train = df_train["status"]

# Competition features: the same "status" column is removed from the
# competition frame before prediction.
X_test = df_test.drop(columns=["status"])


In [None]:
# --- Run configuration -------------------------------------------------
# Toggle the optional pipeline stages here.
oversample = True
feature_selection = True
# Fixed seed so the synthetic samples generated by SMOTE are identical on
# every run; without it each run produces a different submission.
SEED = 42
params = {"C": 0.01, "class_weight": "balanced", "max_iter": 500, "solver": "newton-cg"}
pipeline = []

# Candidate estimators: keep exactly one `model_instance` assignment active.
# model_instance = RandomForestClassifier(criterion='entropy', max_depth=15, n_estimators=200, n_jobs=-1)
model_instance = LogisticRegression(**params)
# model_instance = XGBClassifier(gamma=1.5, max_depth=5, min_child_weight=1, reg_alpha= 0.1)

# NOTE: despite its name, `rfe` is a SequentialFeatureSelector, not recursive
# feature elimination. With n_features_to_select="auto" and a `tol`, features
# are added until the cross-validated score improves by less than `tol`.
rfe = SequentialFeatureSelector(model_instance, n_features_to_select="auto", tol=0.07)
# rfe = SelectKBest(f_classif, k=10)

if oversample:
    # `n_jobs` is intentionally not passed: it is deprecated on SMOTE in
    # imbalanced-learn >= 0.10. `random_state` makes the resampling reproducible.
    pipeline.append(("sampling", SMOTE(random_state=SEED)))

if feature_selection:
    pipeline.append(("rfe", rfe))

# The classifier is always the final step.
pipeline.append(("model", model_instance))

pipe = Pipeline(steps=pipeline)


In [None]:
# Fit the whole pipeline on the training data, then score the competition
# set. `fit` returns the fitted pipeline, so the calls can be chained.
# Column index 1 of predict_proba is the probability of the second entry
# in the fitted classifier's class ordering.
y_result = pipe.fit(X_train, y_train).predict_proba(X_test)[:, 1]

In [None]:
# Submission frame: one row per competition loan id with its predicted probability.
result = pd.DataFrame({"Id": df_test.index, "Predicted": y_result})

# Day-of-month plus wall-clock time tags both output files of this run.
# Named `timestamp` (not `time`) to avoid shadowing the stdlib module name.
timestamp = datetime.now().strftime("%d_%H-%M-%S")
result.to_csv(f"../results/result-{timestamp}.csv", index=False)

# Sidecar note describing how the submission was produced. The model label
# is derived from the estimator's class so it stays correct when a different
# `model_instance` is activated in the configuration cell.
model_name = type(model_instance).__name__
run_options = f"{'Feature Selection ' if feature_selection else ''}{'Oversample' if oversample else ''}"
with open(f"../results/result-{timestamp}.txt", "w") as f:
    f.write(f"{model_name}\n{run_options}\n{model_instance.get_params()}")
