In [24]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SequentialFeatureSelector, f_classif, SelectKBest
from datetime import datetime
from sklearn.linear_model import LogisticRegression


In [25]:
df_train = pd.read_csv('../data/clean/df-train.csv')
df_test = pd.read_csv('../data/clean/df-comp.csv')

In [26]:
X_train = df_train.drop(columns='status', axis=1)
y_train = df_train[['status']]

X_test = df_test.drop(columns='status', axis=1)

In [27]:
oversample = True
feature_selection = False

pipeline = []
#model_instance = RandomForestClassifier(criterion='entropy', max_depth=15, n_estimators=200, n_jobs=-1)
model_instance = LogisticRegression(max_iter=5000, solver='saga', class_weight='balanced')
#rfe = SequentialFeatureSelector(model_instance, n_features_to_select="auto", tol=0.07)
rfe = SelectKBest(f_classif, k=10)

if oversample:
  pipeline.append(('sampling', SMOTE(n_jobs=-1)))

if feature_selection:
  pipeline.append(('rfe', rfe))

pipeline.append(("model", model_instance))

pipe = Pipeline(steps=pipeline)

In [28]:
pipe.fit(X_train, y_train)
y_result = pipe.predict_proba(X_test)[:, 0]

  y = column_or_1d(y, warn=True)


In [29]:
result = pd.DataFrame({"Id": X_test["loan_id"], "Predicted": y_result})
result.drop_duplicates(inplace=True)
time = datetime.now().strftime('%d_%H-%M-%S')
result.to_csv(f"./results/result-{time}.csv", index=False)
with open(f"./results/result-{time}.txt", 'w') as f:
    f.write(f"Random Forest\n{'Feature Selection ' if feature_selection else ''}{'Oversample' if oversample else ''}\n{model_instance.get_params()}")

In [30]:
df_test

Unnamed: 0,loan_id,amount,duration,payments,status,frequency,has_disponent,gender,ratio_of_urban_inhabitants,average_salary,...,avg_balance,min_balance,max_balance,std_balance,negative_balance,last_balance_negative,has_card,age_at_loan,days_between,same_district
0,5895,93960,60,1566,,1,0,1,0.535,8390,...,54520.202247,800.0,88246.7,13768.555214,0,0,0,34,452,0
1,5172,50976,36,1416,,1,0,1,0.535,8390,...,28050.299187,400.0,48735.9,9147.726881,0,0,0,18,496,0
2,6207,184620,60,3077,,1,0,0,0.535,8390,...,34785.534513,1100.0,65517.7,10154.957756,0,0,0,34,606,0
3,7122,260640,36,7240,,1,0,0,0.524,8620,...,31518.182051,-718.6,88731.8,20629.925861,1,0,0,47,490,0
4,7067,136368,24,5682,,1,0,1,0.524,8620,...,60593.840678,600.0,119446.2,21974.288190,0,0,0,28,260,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349,7294,39168,24,1632,,1,0,0,0.634,9920,...,55438.851852,300.0,81495.6,14622.737828,0,0,0,19,347,0
350,6321,38496,12,3208,,1,0,0,0.634,9920,...,42732.336111,800.0,92238.0,27353.589391,0,0,1,34,390,0
351,6469,99744,24,4156,,2,0,0,0.634,9920,...,40126.968627,700.0,79967.9,16479.640951,0,0,0,27,523,0
352,5614,253560,60,4226,,1,0,0,0.899,10446,...,39822.711111,900.0,76509.7,13798.585863,0,0,0,47,645,0
