# Install

In [None]:
!pip install numpy pandas scikit-learn matplotlib



# calc_all_metrics

In [20]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
import os

from typing import Any, Dict, Tuple

import numpy as np
import pandas as pd
from sklearn.metrics import (
    mean_absolute_percentage_error,
    mean_squared_log_error,
    roc_auc_score
)
import matplotlib.pyplot as plt

In [22]:

def calc_all_metrics(data: Any) -> Dict[str, float]:
    def is_credit_issued(x: Any):
        ratio = x['__price_predict'] / x['__price_doc']
        if x['__priority'] <= 0:
            value = 0.0
        elif 0.9 < ratio < 1.0:
            value = x['__price_predict']
        elif 1.0 <= ratio < 1.1:
            value = x['__price_doc']
        else:
            value = 0.0

        return value

    def calc_profit(x: pd.DataFrame) -> np.array:
        if x['is_credit'] == 0.0:
            return 0.0
        if x['__churn'] == 1:
            return -x['debt'] * 2.0
        if x['debt'] < 5:
            return x['debt'] * 0.3
        if x['debt'] < 9:
            return x['debt'] * 0.4
        if x['debt'] >= 9:
            return x['debt'] * 0.5

    max_account = 25e3

    s = (
        data[['__priority', '__churn', '__churn_prob', '__price_doc', '__price_predict']]
        .sort_values('__priority', ascending=False)
        .copy(True)
    )

    s['debt'] = s.apply(is_credit_issued, axis=1)
    s['debt_cum'] = s['debt'].cumsum()
    s['is_credit'] = 0
    s.loc[(s['debt'] > 0) & (s['debt_cum'] <= max_account), 'is_credit'] = 1
    s['profit'] = s.apply(calc_profit, axis=1)

    total_profit = round(s['profit'].sum(), 2)
    good_credits_count = int(s['is_credit'].sum())
    good_credits_debt = int(s[s['is_credit'] == 1]['debt'].sum())
    bad_credits_count = s[s['is_credit'] == 1]['__churn'].sum()

    return {
        'total_profit': int(total_profit),
        'issue_amount': good_credits_debt,
        'bad_loans': round(bad_credits_count / (good_credits_count + bad_credits_count) * 100.0, 1),
        'churn_auc': round(roc_auc_score(y_true=s['__churn'], y_score=s['__churn_prob']), 3),
        'price_nmsle': round(
            -mean_squared_log_error(y_true=s['__price_doc'], y_pred=s['__price_predict']),
            3,
        ),
    }


METRICS_DESC = {
    'total_profit': 'Итоговая полученная прибыль (Ключевая метрика), млн руб.',
    'issue_amount': 'Итоговая выданная сумма (25 000 максимум), млн руб.',
    'bad_loans': 'Доля выданных кредитов с задолженностью, %',
    'churn_auc': 'Метрика ROC AUC по модели предсказания задолженности',
    'price_nmsle': 'Метрика Negative Mean Squared Logarithmic Error по модели предсказания стоимости',
}

# Constants

In [23]:
import datetime

RANDOM_STATE = 47

now = datetime.datetime.now().strftime('%Y-%m-%d_%H%M')
SUBMISSION_PATH = f'/content/drive/MyDrive/challenge-mortgage/data/ENot_{now}.csv'
SUBMISSION_PATH

'/content/drive/MyDrive/challenge-mortgage/data/ENot_2024-12-01_1017.csv'

# Read datasets

In [24]:
data = pd.read_csv('/content/drive/MyDrive/challenge-mortgage/data/train.csv')
submission = pd.read_csv('/content/drive/MyDrive/challenge-mortgage/data/test.csv')
data.shape, submission.shape

((20483, 61), (9988, 59))

# Train / Test split

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression


train, test = train_test_split(data, test_size=0.5, random_state=RANDOM_STATE)
train.shape, test.shape, submission.shape

((10241, 61), (10242, 61), (9988, 59))

# Prepare features and dataset

In [27]:
remove_features = train.columns[train.columns.str.startswith('__')].tolist()
remove_features

['__churn', '__price_doc']

In [28]:
continuous_features = list(set(train.dtypes[train.dtypes != 'object'].index.tolist())
                           - set(remove_features))

continuous_features2 = list(set(train.dtypes[train.dtypes != 'object'].index.tolist())
                           - set(remove_features))

len(continuous_features)

39

In [29]:
X_train = train[continuous_features].fillna(0.)
X_test = test[continuous_features].fillna(0.)
X_sub = submission[continuous_features].fillna(0.)

In [30]:
import pandas as pd

def clean_feature_name(name):
    return ''.join(c if c.isalnum() or c == '_' else '_' for c in name)

# Предположим, df — это ваш DataFrame
data.columns = [clean_feature_name(col) for col in data.columns]

# Fit models

In [31]:
import lightgbm as lgb
import pandas as pd

# Предполагаем, что у вас уже есть DataFrame train, test и submission
# Обучение модели LightGBM
reg_model = lgb.LGBMRegressor(random_state=RANDOM_STATE)
reg_model.fit(X_train, train['__price_doc'])

# Получаем предсказания для тренировочного, тестового и отправляемого наборов данных
train['__price_predict'] = reg_model.predict(X_train)
test['__price_predict'] = reg_model.predict(X_test)
submission['__price_predict'] = reg_model.predict(X_sub)

# Обработка случаев, когда предсказанная цена ниже 0.1
train.loc[train['__price_predict'] < 0.1, '__price_predict'] = 0.1
test.loc[test['__price_predict'] < 0.1, '__price_predict'] = 0.1
submission.loc[submission['__price_predict'] < 0.1, '__price_predict'] = 0.1

# Выводим первые несколько предсказанных значений для проверки
print(train[['__price_doc', '__price_predict']].head())
print(test[['__price_predict']].head())
print(submission[['__price_predict']].head())

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001530 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4968
[LightGBM] [Info] Number of data points in the train set: 10241, number of used features: 39
[LightGBM] [Info] Start training from score 6.783591
       __price_doc  __price_predict
15135     4.700000         4.771922
5305      5.800000         4.995096
11477     4.328140         4.137917
615       9.200000         7.786944
10674     5.581796         5.927803
       __price_predict
20408         9.564849
6310          8.368296
7795          4.290726
10308         7.672831
19643         4.596409
   __price_predict
0         8.333478
1         5.415152
2         5.426103
3         3.637653
4        16.219056


In [32]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Предположим, что train — это ваш исходный DataFrame
# Выбираем необходимые признаки
selected_features = [
    'total_revolving_bal', 'credit_limit', 'total_trans_ct', 'avg_utilization_ratio', 'months_inactive_12_mon', 'avg_open_to_buy', 'total_relationship_count'

]

# Создаем новый DataFrame с выбранными признаками
X_train_selected = train[selected_features]

# Создаем и обучаем модель логистической регрессии
clf_model = LogisticRegression(random_state=RANDOM_STATE)
clf_model.fit(X_train_selected, train['__churn'])

# Получаем вероятности оттока для тренировочного набора данных
train['__churn_prob'] = clf_model.predict_proba(X_train_selected)[:, 1]

# Аналогично для тестового набора данных
X_test_selected = test[selected_features]

test['__churn_prob'] = clf_model.predict_proba(X_test_selected)[:, 1]

# И для набора данных для отправки (submission)
X_sub_selected = submission[selected_features]

submission['__churn_prob'] = clf_model.predict_proba(X_sub_selected)[:, 1]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [33]:
from sklearn.ensemble import GradientBoostingClassifier
clf_model = GradientBoostingClassifier(random_state=RANDOM_STATE)
clf_model.fit(X_train, train['__churn'])
train['__churn_prob'] = clf_model.predict_proba(X_train)[:, 1]
test['__churn_prob'] = clf_model.predict_proba(X_test)[:, 1]
submission['__churn_prob'] = clf_model.predict_proba(X_sub)[:, 1]

# Select Priority Algorithm

In [34]:
def alg(x):
    """
    Приоритет в выдаче кредита (от -∞ до +∞)

    +∞: максимальный приоритет, решение о выдаче будет рассматриваться первым
     0: - минимальный приоритет, решение о выдаче будет рассматриваться последним
    -1: - данная сделка не будет рассматриваться
    """
    return ((1-(x['__churn_prob'])- 0.85))


train['__priority'] = train.apply(alg, axis=1)
test['__priority'] = test.apply(alg, axis=1)
submission['__priority'] = submission.apply(alg, axis=1)

In [35]:
train['__churn_prob'].describe()

Unnamed: 0,__churn_prob
count,10241.0
mean,0.159578
std,0.286316
min,0.000883
25%,0.003851
50%,0.011588
75%,0.134316
max,0.996296


# Calculate statistics

In [36]:
pd.concat([
    train['__price_predict'].describe(),
    train['__price_doc'].describe(),
    train['__churn_prob'].describe(),
    train['__churn'].describe(),
], axis=1)

Unnamed: 0,__price_predict,__price_doc,__churn_prob,__churn
count,10241.0,10241.0,10241.0,10241.0
mean,6.783591,6.783591,0.159578,0.159555
std,3.620455,4.285971,0.286316,0.36621
min,2.083366,0.19,0.000883,0.0
25%,4.885893,4.582,0.003851,0.0
50%,5.884699,6.0,0.011588,0.0
75%,7.336345,7.8694,0.134316,0.0
max,57.867214,77.0,0.996296,1.0


# Calculate key metrics

In [37]:
score = pd.concat([
    pd.Series(calc_all_metrics(train), name='train'),
    pd.Series(calc_all_metrics(test), name='test'),
], axis=1)

# добавляем колонку с описанием метрики
score['desc'] = score.index.map(METRICS_DESC)
score

Unnamed: 0,train,test,desc
total_profit,9996.0,7789.0,Итоговая полученная прибыль (Ключевая метрика)...
issue_amount,24991.0,21718.0,"Итоговая выданная сумма (25 000 максимум), млн..."
bad_loans,0.5,1.6,"Доля выданных кредитов с задолженностью, %"
churn_auc,0.979,0.968,Метрика ROC AUC по модели предсказания задолже...
price_nmsle,-0.098,-0.145,Метрика Negative Mean Squared Logarithmic Erro...


# Submission

Файл с результатами вашего решения должен содержать только следующие колонки ['__price_predict', '__churn_prob', '__priority']

In [38]:
mysub = submission[['__price_predict', '__churn_prob', '__priority']]
mysub.to_csv(SUBMISSION_PATH, index=False)

if mysub.shape != (9988, 3):
    raise ValueError('Неправильный размер submission файла')

In [39]:
!ls -lh ../data/submissions/*.csv

ls: cannot access '../data/submissions/*.csv': No such file or directory
