# Modeling

First, let's load our training dataset.

In [None]:
import pandas as pd

# Read the training data from train.csv in the current working directory.
_train_df = pd.read_csv('./train.csv')
# Bare last expression: the notebook renders the first rows as a preview.
_train_df.head()

In [None]:
# Show how many rows fall into each value of the target column `status`.
_train_df['status'].value_counts()

In [None]:
# Flip the binary target so that rows labelled 0 become 1 and rows labelled 1
# become 0. A dict-based replace swaps both labels in a single pass, so no
# temporary placeholder value is needed; the previous three-step swap used 2 as
# a placeholder and would have silently turned any pre-existing 2 into 0.
# Values other than 0 and 1 are left untouched.
# NOTE: re-running this cell flips the labels back again.
_train_df['status'] = _train_df['status'].replace({0: 1, 1: 0})

In [None]:
# Show the per-value row counts of `status` again to inspect the current labels.
_train_df['status'].value_counts()

In [None]:
# Separate the target from the features: `status` is what the model predicts,
# every remaining column is passed to the model as an input feature.
# NOTE(review): identifier-like columns (the test frame carries `loan_id`) are
# presumably still among the features here -- confirm that this is intended.
y_train_df = _train_df['status']
x_train_df = _train_df.drop(columns=['status'])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Hyper-parameter grid: only the weight of class 1 is tuned (1x to 6x the
# weight of class 0), to compensate for class imbalance.
params = {'class_weight': [{0: 1, 1: v} for v in range(1, 7)]}
# Alternative grids, presumably for the other estimators imported above
# (SVC / nearest neighbours / MLP) -- kept for reference:
# params = [
#   {'C': [1, 10, 100, 1000], 'class_weight': [{0: 1, 1: v} for v in range(1, 7)]},
# ]
# params = {'p': [1, 2], 'weights': ['uniform', 'distance']}
# params = {'activation': ['logistic', 'identity', 'relu', 'tanh']}

# 4-fold cross-validated grid search over `params`, run on all CPU cores.
# Three metrics are recorded per candidate; the candidate with the best mean
# 'auc' is refit on the full training data (refit='auc').
grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000000),
    # estimator=SVC(probability=True),
    param_grid=params,
    # Built-in scorer names are used instead of make_scorer(...):
    # make_scorer(roc_auc_score) feeds hard 0/1 predictions into the AUC, which
    # collapses the ROC curve to a single operating point. The 'roc_auc' scorer
    # evaluates the model's continuous scores instead, matching the
    # probabilities that are exported for submission. The dict keys are
    # unchanged, so the cv_results_ column names stay the same.
    scoring={'precision': 'precision', 'recall': 'recall', 'auc': 'roc_auc'},
    refit='auc',
    cv=4,
    n_jobs=-1,
)
grid.fit(x_train_df, y_train_df)

In [None]:
# Render the grid-search results (grid.cv_results_) as a table for inspection.
pd.DataFrame(grid.cv_results_)

In [None]:
# Keep the model that the grid search refit with its best parameter setting
# (selected by the 'auc' metric, see refit='auc' in the GridSearchCV call).
estimator = grid.best_estimator_
# Bare last expression: the notebook displays the chosen estimator.
estimator

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion-matrix counts of the selected model on the training data itself
# (not a held-out set). For binary labels, ravel() yields the four cells in
# the order tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(
    y_train_df,
    estimator.predict(x_train_df),
).ravel()
print(
    f'True positives: {tp}',
    f'True negatives: {tn}',
    f'False positives: {fp}',
    f'False negatives: {fn}',
    sep='\n',
)

In [None]:
# ROC AUC of the selected model on the training data. The AUC is computed from
# the predicted probability of class 1 rather than from hard predict() labels:
# thresholded labels reduce the ROC curve to a single point and do not measure
# how well the model ranks samples, which is what the exported probabilities
# are judged on.
roc_auc_score(y_train_df, estimator.predict_proba(x_train_df)[:, 1])

In [None]:
# Read the test data from test.csv in the current working directory.
_test_df = pd.read_csv('./test.csv')
# Bare last expression: the notebook renders the first rows as a preview.
_test_df.head()

In [None]:
# Test-time feature matrix: the test frame minus its `status` column.
x_test_df = _test_df.drop(columns=['status'])

In [None]:
from os import listdir, makedirs, mkdir
import logging

# Make sure the output directory exists. exist_ok=True tolerates an already
# existing directory without a bare `except: pass`, so real failures (e.g.
# missing permissions) surface here instead of being silently swallowed.
makedirs('log/', exist_ok=True)

# Index of this submission = number of CSV files already written to log/.
n = len([f for f in listdir('log/') if f.endswith('csv')])

# Append one line per submission to log/submissions.log.
logging.basicConfig(filename='log/submissions.log',
                    level=logging.INFO, format='%(message)s')
# Training-set ROC AUC based on the predicted probability of class 1 -- the
# same quantity that is exported below -- rather than on hard 0/1 labels.
train_auc = roc_auc_score(y_train_df, estimator.predict_proba(x_train_df)[:, 1])
logging.info(f'[submission-{n}.csv] Score of {train_auc}')

# Submission table: one row per test loan with its predicted probability of
# class 1 (second column of predict_proba).
results = {
    'Id': x_test_df['loan_id'],
    'Predicted': estimator.predict_proba(x_test_df)[:, 1]
}

df = pd.DataFrame(results)
df.to_csv(f'log/submission-{n}.csv', index=False)
