In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
import os
from custom import calc_all_metrics
import seaborn as sns

In [2]:
# `data` is the frame that gets split and used for fitting below; `submission`
# is the frame that is only scored and exported at the end of the notebook.
data, submission = (
    pd.read_csv(csv_name) for csv_name in ('train_pr.csv', 'test_pr.csv')
)

In [3]:
# Hold out 20% of the rows for evaluation. The split is seeded so that a
# Restart & Run All reproduces the same partition (and therefore the same
# metrics). Each part is copied explicitly because later cells add columns to
# `train` / `test`; writing into the raw split result can emit pandas'
# SettingWithCopyWarning.
train, test = (
    part.copy()
    for part in train_test_split(data, test_size=0.2, random_state=47)
)

# Columns whose name starts with '__' are excluded from the feature set
# (the targets '__price_doc' and '__churn' used below follow this naming).
remove_features = train.columns[train.columns.str.startswith('__')].tolist()

# Every non-object column that is not excluded above, kept in the frame's own
# column order. Building this list through set arithmetic would make the
# feature order depend on string hashing, which can differ between kernels.
_excluded = set(remove_features)
continuous_features = [
    column
    for column in train.dtypes[train.dtypes != 'object'].index
    if column not in _excluded
]

# Last expression of the cell, so the shapes are actually displayed.
train.shape, test.shape

In [4]:
# Feature matrices for the three frames: the same column list is applied to
# each one and missing values are replaced with 0.
X_train, X_test, X_sub = (
    frame[continuous_features].fillna(0.)
    for frame in (train, test, submission)
)

In [5]:
# Gradient-boosted regressor for the '__price_doc' target, fitted on the
# training split only.
reg_model = GradientBoostingRegressor(
    n_estimators=99,
    max_depth=9,
    random_state=47,
)
reg_model.fit(X_train, train['__price_doc'])

# Score every frame with the fitted model. Any prediction below 0.1 (which
# covers the "value below 0" case the original note mentions) is raised to 0.1.
# NOTE(review): presumably this keeps log-based metrics such as MSLE defined —
# confirm against calc_all_metrics.
for frame, features in ((train, X_train), (test, X_test), (submission, X_sub)):
    predicted_price = reg_model.predict(features)
    frame['__price_predict'] = np.where(predicted_price < 0.1, 0.1, predicted_price)

In [6]:
# Gradient-boosted classifier for the '__churn' label, fitted on the training
# split only.
log_model = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=4,
    random_state=47,
)
log_model.fit(X_train, train['__churn'])

# Per-class probability matrices for each frame.
train_prob, test_prob, sub_prob = (
    log_model.predict_proba(features) for features in (X_train, X_test, X_sub)
)

# Keep the probability for class 1 (column index 1 of predict_proba's output).
for frame, class_probs in (
    (train, train_prob),
    (test, test_prob),
    (submission, sub_prob),
):
    frame['__churn_prob'] = class_probs[:, 1]

In [7]:
from scipy.optimize import linprog

In [20]:
def alg1(y, risk_budget_multiplier=8000):
    """
    Prioritization algorithm #1 for mortgage issuance.

    Solves a linear program with one weight per row, each bounded to [0, 1]:
    maximise the sum of weight * expected income, subject to the sum of
    weight * variance term not exceeding a budget. Per row,

        expected income = (1 - churn_prob) * price_predict
        variance term   = churn_prob * (1 - churn_prob) * price_predict ** 2

    and the budget is ``median(variance term) * risk_budget_multiplier``.

    Parameters
    ----------
    y : pandas.DataFrame
        Must contain the columns '__churn_prob' and '__price_predict'.
        The frame is only read; no columns are added to it.
    risk_budget_multiplier : float, default 8000
        Factor applied to the median variance term to obtain the right-hand
        side of the single inequality constraint.

    Returns
    -------
    pandas.Series
        Solver weight * price_predict * (1 - churn_prob) for every row,
        carrying the index of ``y``. Empty when ``y`` has no rows.

    Raises
    ------
    RuntimeError
        If the solver does not report a successful solve.
    """
    churn = y['__churn_prob']
    price = y['__price_predict']
    keep_prob = 1 - churn

    # Computed as local Series instead of being written back into `y`, so the
    # caller's frame is not modified as a side effect.
    expected_income = keep_prob * price
    variance = churn * keep_prob * price ** 2

    # With no rows there is nothing to optimise (and the median is undefined).
    if len(y) == 0:
        return expected_income

    # linprog minimises, so the objective is the negated expected income.
    opt = linprog(
        c=-expected_income.to_numpy(),
        A_ub=variance.to_numpy().reshape(1, -1),
        b_ub=[variance.median() * risk_budget_multiplier],
        bounds=[(0, 1)] * len(y),
        method='highs',
    )

    # Fail with the solver's own message rather than with an opaque error on
    # a missing solution vector.
    if not opt.success:
        raise RuntimeError(f'linprog failed (status {opt.status}): {opt.message}')

    return opt.x * price * keep_prob


# alg1 works on a whole frame at once (one optimisation per frame), so it is
# called once per frame and its result stored as the '__priority' column.
for frame in (train, test, submission):
    frame['__priority'] = alg1(frame)

In [21]:
# Metrics on the training split. Both models were fitted on this same split,
# so these numbers are optimistic compared with the held-out split.
calc_all_metrics(train)

{'total_profit': 4995.896268970735,
 '%profit': 50.0,
 'good_credits_count': 599,
 'good_credits_debt': 9991.79253794147,
 'bad_credits_count': 0,
 'bad_credits_losses': 0.0,
 'bad_credits_%losses': 0.0,
 'churn_rocauc': 0.9816,
 'price_msle': 0.0451}

In [22]:
# Metrics on the held-out 20% split, which neither model was fitted on.
calc_all_metrics(test)

{'total_profit': 3042.0011843188877,
 '%profit': 34.38,
 'good_credits_count': 1413,
 'good_credits_debt': 8847.248876343709,
 'bad_credits_count': 37,
 'bad_credits_losses': 212.8948970194385,
 'bad_credits_%losses': 7.0,
 'churn_rocauc': 0.9734,
 'price_msle': 0.1475}

In [23]:
# Keep only the three prediction columns for the submission file.
mysub = submission[['__price_predict', '__churn_prob', '__priority']]

# Validate the shape before writing, so that a malformed frame never
# creates or overwrites submission.csv.
if mysub.shape != (9988, 3):
    raise ValueError('Не корректный размер файла с предсказаниями')

mysub.to_csv('submission.csv', index=False)