# Modeling

First let's open our train dataset.

In [1]:
import pandas as pd

_train_df = pd.read_csv('./train.csv')
_train_df.head()

Unnamed: 0,loan_id,account_id,loan_date,loan_amount,duration,payments,status,birth_number,district_id,gender,...,trans_type_std_withdrawal,trans_type_std_credit,days_last_trans,last_balance_l,max_balance_l,age_months,bal_per_month,trans_per_month,owner_age_at,account_age
0,4959,-5.199338,-1.543098,-0.430727,-0.53022,-0.216904,1,-0.952023,-5.199338,5.199338,...,0.063341,0.063341,0.178175,0.11165,0.236579,-0.249863,-0.187886,-0.268923,0.816627,-0.269066
1,4961,-3.03425,0.604585,-1.335178,-5.199338,-0.622082,-1,-1.619856,-0.458679,-5.199338,...,0.321971,0.321971,2.455101,0.544529,1.112094,0.049948,-0.808732,0.553766,1.807354,0.114185
2,4973,-2.603792,0.619855,0.356532,-0.53022,1.07555,1,-1.024053,-0.604585,-5.199338,...,-1.119968,-1.119968,-0.781781,-0.935819,-0.112402,1.029957,-0.425841,0.651081,1.029957,1.029957
3,4996,-2.356668,1.544916,-0.301747,-5.199338,1.367558,1,-0.902159,0.01266,-5.199338,...,1.399657,1.399657,0.403108,0.925573,0.51689,-1.025023,1.439869,-0.166433,1.029957,-1.021681
4,5002,-2.272299,-1.006949,-0.067424,-5.199338,2.301079,1,-1.496373,0.987837,5.199338,...,-0.048867,-0.048867,0.403108,-0.21349,-0.265617,-0.987837,0.381108,-0.544847,1.399657,-0.979545


In [2]:
_train_df['status'].value_counts()

 1    282
-1     46
Name: status, dtype: int64

In [3]:
_train_df.loc[_train_df['status'] == 1, 'status'] = 2
_train_df.loc[_train_df['status'] == -1, 'status'] = 1
_train_df.loc[_train_df['status'] == 2, 'status'] = 0

In [4]:
_train_df['status'].value_counts()

0    282
1     46
Name: status, dtype: int64

In [5]:
from imblearn.over_sampling import SMOTE

x_train_df = _train_df.drop(['status'], axis=1)
y_train_df = _train_df['status']

sm = SMOTE(random_state=42)
x_train_df, y_train_df = sm.fit_resample(x_train_df, y_train_df)


In [6]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# params = [
#     {'class_weight': [{0: 1, 1: v} for v in range(1, 7)], 'C': [1.0, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001]},
#     {'class_weight': [{0: 1, 1: v} for v in range(1, 10)], 'solver': ['liblinear'], 'penalty': ['l1'], 'C': [1.0, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001]}
# ]
# params = [
#   {'C': [1.0, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001], 'class_weight': [{0: 1, 1: v} for v in range(1, 10)]},
# ]
# params = {'p': [1, 2], 'weights': ['uniform', 'distance']}
# params = {'class_weight': [{0: 1, 1: v} for v in range(1, 7)]}
# params = {
#     'n_neighbors': [2, 5, 7],
#     'weights': ['uniform', 'distance'],
#     'metric': ['euclidean', 'manhattan']
# }
# params = {
#     'hidden_layer_sizes': [(64,), (32,)],
#     'alpha': [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001],
#     'batch_size': ['auto', 64],
#     'learning_rate': ['constant', 'invscaling', 'adaptive'],
#     'tol': [1e-4, 1e-6, 1e-8]
# }
params = {
    'class_weight': [{0: 1, 1: v} for v in range(1, 7)],
    'C': [20.0, 10.0, 1.0],
    'penalty': ['l2', 'none']
}
grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000000),
    # estimator=SVC(probability=True),
    # estimator=KNeighborsClassifier(),
    # estimator=MLPClassifier(max_iter=100000, random_state=42),
    param_grid=params,
    scoring={'precision': make_scorer(precision_score), 'recall': make_scorer(recall_score), 'auc': make_scorer(roc_auc_score)},
    refit='auc',
    cv=4,
    n_jobs=-1,
)
grid.fit(x_train_df, y_train_df)

GridSearchCV(cv=4, estimator=LogisticRegression(max_iter=1000000), n_jobs=-1,
             param_grid={'C': [20.0, 21.0, 22.0, 23.0, 24.0, 19.0, 18.0, 17.0,
                               16.0, 15.0, 14.0, 13.0, 12.0, 11.0],
                         'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 2},
                                          {0: 1, 1: 3}, {0: 1, 1: 4},
                                          {0: 1, 1: 5}, {0: 1, 1: 6}],
                         'penalty': ['l2', 'none'],
                         'tol': [0.0001, 1e-06, 1e-08, 0.01]},
             refit='auc',
             scoring={'auc': make_scorer(roc_auc_score),
                      'precision': make_scorer(precision_score),
                      'recall': make_scorer(recall_score)})

In [7]:
# pd.DataFrame(grid.cv_results_)

In [8]:
estimator = grid.best_estimator_
estimator

LogisticRegression(C=20.0, class_weight={0: 1, 1: 3}, max_iter=1000000)

In [9]:
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(y_train_df, estimator.predict(x_train_df)).ravel()
print(f'True positives: {tp}')
print(f'True negatives: {tn}')
print(f'False positives: {fp}')
print(f'False negatives: {fn}')

True positives: 282
True negatives: 249
False positives: 33
False negatives: 0


In [10]:
roc_auc_score(y_train_df, estimator.predict(x_train_df))

0.9414893617021276

In [11]:
_test_df = pd.read_csv('./test.csv')
_test_df.head()

Unnamed: 0,loan_id,account_id,loan_date,loan_amount,duration,payments,status,birth_number,district_id,gender,...,trans_type_std_withdrawal,trans_type_std_credit,days_last_trans,last_balance_l,max_balance_l,age_months,bal_per_month,trans_per_month,owner_age_at,account_age
0,4962,-5.199338,-0.007078,-1.469873,-5.199338,-0.63527,0,0.218844,0.967422,5.199338,...,-1.029957,-1.029957,-0.731217,1.092947,2.105326,0.38788,0.217059,2.359463,-0.255962,0.374541
1,4967,-2.946355,1.635315,1.178285,5.199338,0.520571,0,-0.310862,-0.472789,5.199338,...,-1.414318,-1.414318,-0.731217,-1.528477,-1.058357,0.0,0.328428,1.519353,0.362241,-0.015923
2,4968,-2.92151,0.57446,-0.059357,0.430727,-0.76722,0,-1.399657,-0.501298,-5.199338,...,0.25065,0.25065,0.908458,0.01041,-0.354863,-0.870846,-0.395023,0.544847,1.549706,-0.870846
3,4986,-2.339828,-0.504125,-0.112187,-5.199338,1.859844,0,-1.08568,1.746017,5.199338,...,-1.370489,-1.370489,-0.335333,0.208095,-0.20463,0.211167,-0.790363,1.220448,1.051717,0.205333
4,4988,-2.310821,-0.023236,0.869136,-0.088734,1.214261,0,0.650173,0.055913,5.199338,...,2.147025,2.147025,0.403108,-1.339243,-0.525566,1.194396,-0.296193,-1.168782,-0.604585,1.22064


In [12]:
x_test_df = _test_df.drop(['status'], axis=1)

In [13]:
from os import listdir, mkdir
import logging
try:
    mkdir('log/')
except:
    pass

n = len([f for f in listdir('log/') if f.endswith('csv')])

logging.basicConfig(filename='log/submissions.log',
                    level=logging.INFO, format='%(message)s')
logging.info(f'[submission-{n}.csv] Score of {roc_auc_score(y_train_df, estimator.predict(x_train_df))}')


results = {
    'Id': x_test_df['loan_id'],
    'Predicted': estimator.predict_proba(x_test_df)[:, 1]
}

df = pd.DataFrame(results)
df.to_csv(f'log/submission-{n}.csv', index=False)
