In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the data-loading cells below read
# their CSV/model files from paths under this mount point.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
import pickle
import matplotlib.pyplot as plt
from ipywidgets import interactive

### Препроцессинг винтажей из прошлого дз

In [None]:
# --- Load scores/features and the two vintage extracts ----------------------
df = pd.read_csv('/content/gdrive/My Drive/data_scores_and_features.csv')

data_vintages_1 = pd.read_csv('/content/gdrive/My Drive/vintages_1.csv')
data_vintages_2 = pd.read_csv('/content/gdrive/My Drive/vintages_2.csv')

# Row labels are deliberately kept as read from each file (no ignore_index):
# the label-based drop below depends on them.
data_vintages = pd.concat([data_vintages_1, data_vintages_2])

# NOTE(review): the label 1246634 is hard-coded; because the two files are
# concatenated without re-indexing, this removes every row carrying that label
# (possibly one per file) — confirm that is intended.
data_vintages = data_vintages.drop([1246634])

data_vintages['statement_num'] = data_vintages['statement_num'].apply(int)

# Previously this dropna() result was discarded, so the call had no effect.
# It is applied after the label-based drop above so that the hard-coded label
# is still present when it is removed.
data_vintages = data_vintages.dropna(subset=['user_id'])

# NOTE(review): hard-coded row labels removed from the feature frame — the
# reason is not visible in this notebook.
df = df.drop([32506, 68471, 93755])

# --- Load the fitted model --------------------------------------------------
filename = '/content/gdrive/My Drive/model.sav'
# pickle can execute arbitrary code while loading: only use a trusted file.
# The context manager closes the file handle (it used to be left open).
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)

features = [
            'max_util',
            'soc_dem_score',
            'other_util',
            'last_credit_time_years',
            'credits_4y',
            'avg_limit_mortgage',
            'min_limit_mfo',
            'close_balance_amt',
            'avg_active_time',
]

# --- Feature preparation applied before scoring -----------------------------
df['avg_active_time'] = df['avg_active_time'].fillna(df['avg_active_time'].median())
df['avg_limit_mortgage'] = -df['avg_limit_mortgage']
df['close_balance_amt'] = -df['close_balance_amt']
df['max_util'] = df['max_util'].clip(upper=2)

# Probability of the positive class from the loaded model.
df['model_pd'] = model.predict_proba(df[features])[:,1]

# --- Split clients into three equal-sized buckets by ascending model_pd -----
# After sorting with ignore_index=True the index is the rank, so integer
# division by a third of the row count gives the bucket; np.minimum folds the
# remainder rows into the last bucket (2).
df = df.sort_values(['model_pd'], ignore_index=True)
df['pd_bucket'] = np.minimum(df.index // (df.shape[0]//3),2)
print(df.groupby(['pd_bucket'])['model_pd'].mean())

# Attach bucket and PD to every vintage row; rows with any missing value
# (including clients without a score) are dropped.
data_vintages = data_vintages.merge(df[['user_id', 'pd_bucket', 'model_pd']], how='left', on='user_id').dropna()

pd_bucket
0    0.026104
1    0.048586
2    0.096909
Name: model_pd, dtype: float32


In [None]:
def polynom_approximation(df, start=1, finish=20, degree = 1, curve_type='def_rate'):
    """
    Smooth a per-statement curve with a least-squares polynomial.

    Parameters
    ----------

    df : pd.DataFrame
         frame indexed by statement number that holds the curve values
         in the column named by ``curve_type``; it is not modified

    start : int
            first statement (index label, inclusive) of the smoothing window

    finish : int
             last statement (index label, inclusive) of the smoothing window

    degree : int
             degree of the fitted polynomial

    curve_type : str
                 column to smooth, e.g. one of
                 ['def_rate', 'dlq_ratio', 'clo_rate']

    Returns
    -------

    df_res : pd.DataFrame
             copy of ``df`` with an extra column 'Полином': original values
             before ``start``, fitted values on ``start..finish``, and the
             constant fitted value at ``finish + 1`` for all later statements

    Raises
    ------

    ValueError
        if no index label falls inside ``start..finish``
    """
    # Work on a real copy: pd.DataFrame(df) does not copy the underlying data,
    # so adding a column to it could leak into the caller's frame.
    df_res = df.copy()
    # Float dtype so fitted values are never written into an integer column.
    df_res['Полином'] = df_res[curve_type].astype(float)

    window = df_res.loc[start:finish, curve_type]
    if window.empty:
        raise ValueError(
            f"no statements in the range {start}..{finish} to fit '{curve_type}'"
        )

    # Fit against the statement labels actually present in the window rather
    # than assuming every integer from start to finish exists.
    statements = window.index.to_numpy(dtype=float)
    polynomial = np.poly1d(
        np.polyfit(statements, window.to_numpy(dtype=float), degree)
    )

    df_res.loc[start:finish, 'Полином'] = polynomial(statements)
    # Beyond the window the curve is held flat at the value extrapolated one
    # statement past `finish`.
    df_res.loc[finish+1:, 'Полином'] = polynomial(finish+1)
    return df_res

In [None]:
def linear_interpolation(curve_1, curve_2, score_1_mean, score_2_mean, score, n_statements=24):
  """
  Linearly interpolate a per-statement curve between two bucket curves.

  Parameters
  ----------

  curve_1 : np.array
            per-statement values of the curve for the bucket whose mean score
            bounds the target score from below; position 0 holds statement 1

  curve_2 : np.array
            per-statement values of the curve for the bucket whose mean score
            bounds the target score from above; position 0 holds statement 1

  score_1_mean : float
                 mean score of the lower bucket

  score_2_mean : float
                 mean score of the upper bucket

  score : float
          score for which the curve is interpolated

  n_statements : int, default 24
                 number of leading statements taken from each input curve

  Returns
  -------

  score_curve : np.array
                array of length ``n_statements + 1``; element 0 is 0 and
                element ``i`` is the interpolated value built from position
                ``i - 1`` of the input curves

  Raises
  ------

  ValueError
      if the two bucket means are equal, or an input curve has fewer than
      ``n_statements`` values
  """
  if score_2_mean == score_1_mean:
    raise ValueError("score_1_mean and score_2_mean must differ to interpolate")

  lower = np.asarray(curve_1, dtype=float)[:n_statements]
  upper = np.asarray(curve_2, dtype=float)[:n_statements]
  if lower.shape[0] < n_statements or upper.shape[0] < n_statements:
    raise ValueError(
        f"both curves need at least {n_statements} values, "
        f"got {lower.shape[0]} and {upper.shape[0]}"
    )

  # Element 0 stays 0; elements 1..n_statements take the interpolated values,
  # i.e. input position i - 1 lands at output position i.
  score_curve = np.zeros(n_statements + 1)
  score_curve[1:] = lower + (score-score_1_mean)*(upper-lower)/(score_2_mean-score_1_mean)
  return score_curve

### Для pd = 5 % для интерполяции будем использовать 1 и 2 бакеты

### Также, чтобы построить диаграмму изменения PV, нам нужно построить кривые при pd = 4 % и pd = 6 % 

### Кривая def rate

In [None]:
# A default counts as "new" only for accounts that were active in the
# previous statement.
data_vintages = data_vintages.eval('def_new_flg = def_flg * prev_act_flg', inplace=False)

# Default rate per (pd_bucket, statement_num) = new defaults / previously
# active accounts. The string 'sum' is used instead of np.sum: passing the
# NumPy callable to .agg is deprecated in recent pandas.
def_vintages = (
    data_vintages
    .groupby(['pd_bucket', 'statement_num'])
    .agg(def_cnt=('def_new_flg', 'sum'), prev_act_cnt=('prev_act_flg', 'sum'))
    .eval('def_rate = def_cnt / prev_act_cnt', inplace=False)
)

# Mean model PD of buckets 1 and 2 (computed once, looked up by bucket label).
pd_bucket_means = df.groupby(['pd_bucket'])['model_pd'].mean()
pd_1_mean = pd_bucket_means.loc[1]
pd_2_mean = pd_bucket_means.loc[2]

def_1_bucket = def_vintages.query('pd_bucket == 1').reset_index().set_index(['statement_num'])
def_2_bucket = def_vintages.query('pd_bucket == 2').reset_index().set_index(['statement_num'])

# Smooth each bucket's def_rate with a degree-2 polynomial on statements 5..24.
def_polynom_1 = polynom_approximation(def_1_bucket, start=5, finish=24, degree=2)
def_polynom_2 = polynom_approximation(def_2_bucket, start=5, finish=24, degree=2)

#### pd = 5

In [None]:
# Default-rate curve for pd = 0.05, interpolated between the smoothed curves
# of pd buckets 1 and 2.
def_curve_pd_5 = linear_interpolation(
    curve_1=def_polynom_1['Полином'].to_numpy(),
    curve_2=def_polynom_2['Полином'].to_numpy(),
    score_1_mean=pd_1_mean,
    score_2_mean=pd_2_mean,
    score=0.05,
)

# Print the curve statement by statement, in percent.
for i in range(len(def_curve_pd_5)):
    def_rate = def_curve_pd_5[i]
    print(f"def rate for statement {i} = {def_rate * 100} %")

def rate for statement 0 = 0.0 %
def rate for statement 1 = 0.0 %
def rate for statement 2 = 0.0 %
def rate for statement 3 = 0.0 %
def rate for statement 4 = 2.498287992323106 %
def rate for statement 5 = 0.48803983239302845 %
def rate for statement 6 = 0.5571066354681323 %
def rate for statement 7 = 0.6201451804014636 %
def rate for statement 8 = 0.6771554671930227 %
def rate for statement 9 = 0.7281374958428093 %
def rate for statement 10 = 0.7730912663508233 %
def rate for statement 11 = 0.8120167787170651 %
def rate for statement 12 = 0.8449140329415344 %
def rate for statement 13 = 0.8717830290242314 %
def rate for statement 14 = 0.8926237669651558 %
def rate for statement 15 = 0.907436246764308 %
def rate for statement 16 = 0.9162204684216876 %
def rate for statement 17 = 0.918976431937295 %
def rate for statement 18 = 0.9157041373111299 %
def rate for statement 19 = 0.906403584543192 %
def rate for statement 20 = 0.8910747736334821 %
def rate for statement 21 = 0.86971770458199

#### pd = 4

In [None]:
# Default-rate curve for pd = 0.04, interpolated between the smoothed curves
# of pd buckets 1 and 2.
def_curve_pd_4 = linear_interpolation(
    curve_1=def_polynom_1['Полином'].to_numpy(),
    curve_2=def_polynom_2['Полином'].to_numpy(),
    score_1_mean=pd_1_mean,
    score_2_mean=pd_2_mean,
    score=0.04,
)

# Print the curve statement by statement, in percent.
for i in range(len(def_curve_pd_4)):
    def_rate = def_curve_pd_4[i]
    print(f"def rate for statement {i} = {def_rate * 100} %")

def rate for statement 0 = 0.0 %
def rate for statement 1 = 0.0 %
def rate for statement 2 = 0.0 %
def rate for statement 3 = 0.0 %
def rate for statement 4 = 2.343649210661973 %
def rate for statement 5 = 0.3894177660335274 %
def rate for statement 6 = 0.45412520216568475 %
def rate for statement 7 = 0.5132469254701275 %
def rate for statement 8 = 0.5667829359468561 %
def rate for statement 9 = 0.6147332335958705 %
def rate for statement 10 = 0.6570978184171702 %
def rate for statement 11 = 0.6938766904107557 %
def rate for statement 12 = 0.725069849576627 %
def rate for statement 13 = 0.7506772959147837 %
def rate for statement 14 = 0.7706990294252261 %
def rate for statement 15 = 0.7851350501079543 %
def rate for statement 16 = 0.7939853579629678 %
def rate for statement 17 = 0.7972499529902671 %
def rate for statement 18 = 0.7949288351898521 %
def rate for statement 19 = 0.7870220045617226 %
def rate for statement 20 = 0.7735294611058788 %
def rate for statement 21 = 0.754451204822

#### pd = 6

In [None]:
# Default-rate curve for pd = 0.06, interpolated between the smoothed curves
# of pd buckets 1 and 2.
def_curve_pd_6 = linear_interpolation(
    curve_1=def_polynom_1['Полином'].to_numpy(),
    curve_2=def_polynom_2['Полином'].to_numpy(),
    score_1_mean=pd_1_mean,
    score_2_mean=pd_2_mean,
    score=0.06,
)

# Print the curve statement by statement, in percent.
for i in range(len(def_curve_pd_6)):
    def_rate = def_curve_pd_6[i]
    print(f"def rate for statement {i} = {def_rate * 100} %")

def rate for statement 0 = 0.0 %
def rate for statement 1 = 0.0 %
def rate for statement 2 = 0.0 %
def rate for statement 3 = 0.0 %
def rate for statement 4 = 2.6529267739842393 %
def rate for statement 5 = 0.5866618987525295 %
def rate for statement 6 = 0.6600880687705799 %
def rate for statement 7 = 0.7270434353327996 %
def rate for statement 8 = 0.787527998439189 %
def rate for statement 9 = 0.8415417580897481 %
def rate for statement 10 = 0.8890847142844764 %
def rate for statement 11 = 0.9301568670233743 %
def rate for statement 12 = 0.964758216306442 %
def rate for statement 13 = 0.992888762133679 %
def rate for statement 14 = 1.0145485045050855 %
def rate for statement 15 = 1.0297374434206616 %
def rate for statement 16 = 1.0384555788804073 %
def rate for statement 17 = 1.0407029108843227 %
def rate for statement 18 = 1.0364794394324073 %
def rate for statement 19 = 1.0257851645246614 %
def rate for statement 20 = 1.0086200861610852 %
def rate for statement 21 = 0.98498420434167

### DLQ Ratio

In [None]:
# Delinquency ratio per (pd_bucket, statement_num) = delinquent accounts /
# active accounts. The string 'sum' is used instead of np.sum: passing the
# NumPy callable to .agg is deprecated in recent pandas.
dlq_vintages = (
    data_vintages
    .groupby(['pd_bucket', 'statement_num'])
    .agg(dlq_cnt=('dlq_flg', 'sum'), act_cnt=('act_flg', 'sum'))
    .eval('dlq_ratio = dlq_cnt / act_cnt', inplace=False)
)

dlq_1_bucket = dlq_vintages.query('pd_bucket == 1').reset_index().set_index(['statement_num'])
dlq_2_bucket = dlq_vintages.query('pd_bucket == 2').reset_index().set_index(['statement_num'])

# Smooth each bucket's dlq_ratio with a degree-2 polynomial on statements 4..22.
dlq_polynom_1 = polynom_approximation(dlq_1_bucket, start=4, finish=22, degree=2, curve_type='dlq_ratio')
dlq_polynom_2 = polynom_approximation(dlq_2_bucket, start=4, finish=22, degree=2, curve_type='dlq_ratio')

#### pd = 5

In [None]:
# DLQ-ratio curve for pd = 0.05, interpolated between the smoothed curves of
# pd buckets 1 and 2.
dlq_curve_pd_5 = linear_interpolation(
    curve_1=dlq_polynom_1['Полином'].to_numpy(),
    curve_2=dlq_polynom_2['Полином'].to_numpy(),
    score_1_mean=pd_1_mean,
    score_2_mean=pd_2_mean,
    score=0.05,
)

# Print the curve statement by statement, in percent.
for i in range(len(dlq_curve_pd_5)):
    dlq = dlq_curve_pd_5[i]
    print(f"dlq ratio for statement {i} = {dlq * 100} %")

dlq ratio for statement 0 = 0.0 %
dlq ratio for statement 1 = 7.5868618784402955 %
dlq ratio for statement 2 = 8.80551094535839 %
dlq ratio for statement 3 = 9.782699880737974 %
dlq ratio for statement 4 = 8.400802999223357 %
dlq ratio for statement 5 = 9.020824741829804 %
dlq ratio for statement 6 = 9.603893427226645 %
dlq ratio for statement 7 = 10.150009055413872 %
dlq ratio for statement 8 = 10.659171626391496 %
dlq ratio for statement 9 = 11.131381140159512 %
dlq ratio for statement 10 = 11.566637596717916 %
dlq ratio for statement 11 = 11.964940996066716 %
dlq ratio for statement 12 = 12.326291338205907 %
dlq ratio for statement 13 = 12.650688623135492 %
dlq ratio for statement 14 = 12.938132850855466 %
dlq ratio for statement 15 = 13.188624021365833 %
dlq ratio for statement 16 = 13.402162134666593 %
dlq ratio for statement 17 = 13.578747190757745 %
dlq ratio for statement 18 = 13.718379189639288 %
dlq ratio for statement 19 = 13.821058131311226 %
dlq ratio for statement 20 = 13

#### pd = 4

In [None]:
# DLQ-ratio curve for pd = 0.04, interpolated between the smoothed curves of
# pd buckets 1 and 2.
dlq_curve_pd_4 = linear_interpolation(
    curve_1=dlq_polynom_1['Полином'].to_numpy(),
    curve_2=dlq_polynom_2['Полином'].to_numpy(),
    score_1_mean=pd_1_mean,
    score_2_mean=pd_2_mean,
    score=0.04,
)

# Print the curve statement by statement, in percent.
for i in range(len(dlq_curve_pd_4)):
    dlq = dlq_curve_pd_4[i]
    print(f"dlq ratio for statement {i} = {dlq * 100} %")

dlq ratio for statement 0 = 0.0 %
dlq ratio for statement 1 = 7.2425007913315 %
dlq ratio for statement 2 = 8.408135517173172 %
dlq ratio for statement 3 = 9.26680351560585 %
dlq ratio for statement 4 = 7.764261851569948 %
dlq ratio for statement 5 = 8.350510646239956 %
dlq ratio for statement 6 = 8.902815468233747 %
dlq ratio for statement 7 = 9.42117631755132 %
dlq ratio for statement 8 = 9.905593194192676 %
dlq ratio for statement 9 = 10.356066098157815 %
dlq ratio for statement 10 = 10.772595029446732 %
dlq ratio for statement 11 = 11.155179988059436 %
dlq ratio for statement 12 = 11.503820973995921 %
dlq ratio for statement 13 = 11.81851798725619 %
dlq ratio for statement 14 = 12.099271027840242 %
dlq ratio for statement 15 = 12.346080095748073 %
dlq ratio for statement 16 = 12.55894519097969 %
dlq ratio for statement 17 = 12.737866313535088 %
dlq ratio for statement 18 = 12.882843463414268 %
dlq ratio for statement 19 = 12.993876640617227 %
dlq ratio for statement 20 = 13.0709658

#### pd = 6

In [None]:
# DLQ-ratio curve for pd = 0.06, interpolated between the smoothed curves of
# pd buckets 1 and 2.
dlq_curve_pd_6 = linear_interpolation(
    curve_1=dlq_polynom_1['Полином'].to_numpy(),
    curve_2=dlq_polynom_2['Полином'].to_numpy(),
    score_1_mean=pd_1_mean,
    score_2_mean=pd_2_mean,
    score=0.06,
)

# Print the curve statement by statement, in percent.
for i in range(len(dlq_curve_pd_6)):
    dlq = dlq_curve_pd_6[i]
    print(f"dlq ratio for statement {i} = {dlq * 100} %")

dlq ratio for statement 0 = 0.0 %
dlq ratio for statement 1 = 7.931222965549092 %
dlq ratio for statement 2 = 9.202886373543608 %
dlq ratio for statement 3 = 10.298596245870097 %
dlq ratio for statement 4 = 9.037344146876768 %
dlq ratio for statement 5 = 9.69113883741965 %
dlq ratio for statement 6 = 10.304971386219538 %
dlq ratio for statement 7 = 10.878841793276425 %
dlq ratio for statement 8 = 11.412750058590316 %
dlq ratio for statement 9 = 11.906696182161207 %
dlq ratio for statement 10 = 12.360680163989098 %
dlq ratio for statement 11 = 12.774702004073996 %
dlq ratio for statement 12 = 13.148761702415893 %
dlq ratio for statement 13 = 13.48285925901479 %
dlq ratio for statement 14 = 13.776994673870691 %
dlq ratio for statement 15 = 14.031167946983594 %
dlq ratio for statement 16 = 14.245379078353498 %
dlq ratio for statement 17 = 14.419628067980403 %
dlq ratio for statement 18 = 14.553914915864311 %
dlq ratio for statement 19 = 14.648239622005221 %
dlq ratio for statement 20 = 14

In [None]:
# Scores file uses ';' as the field separator and ',' as the decimal mark.
x_scores = pd.read_csv('/content/gdrive/My Drive/data_scores.csv', sep=';', decimal=',')

# Split clients into three equal-sized buckets by ascending x_score. After
# sorting with ignore_index=True the index is the rank; np.minimum folds the
# remainder rows into the last bucket (2).
# The bucket width is derived from x_scores itself — it was previously taken
# from df.shape[0], a different frame whose row count need not match.
x_scores = x_scores.sort_values(['x_score'], ignore_index=True)
x_scores['x_bucket'] = np.minimum(x_scores.index // (x_scores.shape[0]//3),2)
x_bucket_means = x_scores.groupby(['x_bucket'])['x_score'].mean()
print(x_bucket_means)

# Attach x_bucket / x_score to every vintage row and drop rows with any
# missing value. `suffixes=None` was removed: pandas expects a tuple there and
# the default is equivalent because no non-key column names overlap.
# NOTE(review): re-running this cell merges the columns a second time.
data_vintages = data_vintages.merge(x_scores[['user_id', 'x_bucket', 'x_score']], how='left', on='user_id').dropna()

# Mean x_score of buckets 0 and 1, used as interpolation anchors.
x_score_0_mean = x_bucket_means.loc[0]
x_score_1_mean = x_bucket_means.loc[1]

x_bucket
0    0.133269
1    0.220834
2    0.379446
Name: x_score, dtype: float64


### Для x_score = 15 % для интерполяции будем использовать 0 и 1 бакеты

### Аналогично для построения диаграммы изменения PV нам понадобятся кривые при x_score = 14 % и x_score = 16 %

In [None]:
# A closure counts as "new" only for accounts that were active in the
# previous statement.
data_vintages = data_vintages.eval('clo_new_flg = clo_flg * prev_act_flg', inplace=False)

# Closure rate per (x_bucket, statement_num) = new closures / previously
# active accounts. The string 'sum' is used instead of np.sum: passing the
# NumPy callable to .agg is deprecated in recent pandas.
clo_vintages = (
    data_vintages
    .groupby(['x_bucket', 'statement_num'])
    .agg(clo_cnt=('clo_new_flg', 'sum'), prev_act_cnt=('prev_act_flg', 'sum'))
    .eval('clo_rate = clo_cnt / prev_act_cnt', inplace=False)
)

clo_0_bucket = clo_vintages.query('x_bucket == 0').reset_index().set_index(['statement_num'])
clo_1_bucket = clo_vintages.query('x_bucket == 1').reset_index().set_index(['statement_num'])

# Smooth each bucket's clo_rate with a degree-2 polynomial on statements 2..24.
clo_polynom_0 = polynom_approximation(clo_0_bucket, start=2, finish=24, degree=2, curve_type='clo_rate')
clo_polynom_1 = polynom_approximation(clo_1_bucket, start=2, finish=24, degree=2, curve_type='clo_rate')

#### x_score = 15 %

In [None]:
# Closure-rate curve for x_score = 0.15, interpolated between the smoothed
# curves of x buckets 0 and 1.
clo_curve_x_score_15 = linear_interpolation(
    curve_1=clo_polynom_0['Полином'].to_numpy(),
    curve_2=clo_polynom_1['Полином'].to_numpy(),
    score_1_mean=x_score_0_mean,
    score_2_mean=x_score_1_mean,
    score=0.15,
)

# Print the curve statement by statement, in percent.
for i in range(len(clo_curve_x_score_15)):
    clo = clo_curve_x_score_15[i]
    print(f"clo rate for statement {i} = {clo * 100} %")

clo rate for statement 0 = 0.0 %
clo rate for statement 1 = 3.538941082782338 %
clo rate for statement 2 = 2.2830407965409654 %
clo rate for statement 3 = 2.3083539506010853 %
clo rate for statement 4 = 2.338600903025087 %
clo rate for statement 5 = 2.3737816538129715 %
clo rate for statement 6 = 2.4138962029647386 %
clo rate for statement 7 = 2.458944550480388 %
clo rate for statement 8 = 2.5089266963599206 %
clo rate for statement 9 = 2.5638426406033354 %
clo rate for statement 10 = 2.623692383210633 %
clo rate for statement 11 = 2.688475924181813 %
clo rate for statement 12 = 2.7581932635168753 %
clo rate for statement 13 = 2.832844401215821 %
clo rate for statement 14 = 2.912429337278649 %
clo rate for statement 15 = 2.996948071705359 %
clo rate for statement 16 = 3.086400604495952 %
clo rate for statement 17 = 3.1807869356504272 %
clo rate for statement 18 = 3.280107065168786 %
clo rate for statement 19 = 3.3843609930510263 %
clo rate for statement 20 = 3.49354871929715 %
clo rate

#### x_score = 14 %

In [None]:
# Closure-rate curve for x_score = 0.14, interpolated between the smoothed
# curves of x buckets 0 and 1.
clo_curve_x_score_14 = linear_interpolation(
    curve_1=clo_polynom_0['Полином'].to_numpy(),
    curve_2=clo_polynom_1['Полином'].to_numpy(),
    score_1_mean=x_score_0_mean,
    score_2_mean=x_score_1_mean,
    score=0.14,
)

# Print the curve statement by statement, in percent.
for i in range(len(clo_curve_x_score_14)):
    clo = clo_curve_x_score_14[i]
    print(f"clo rate for statement {i} = {clo * 100} %")

clo rate for statement 0 = 0.0 %
clo rate for statement 1 = 3.3055083247475068 %
clo rate for statement 2 = 2.1429764377308604 %
clo rate for statement 3 = 2.1684996613341383 %
clo rate for statement 4 = 2.199033778242738 %
clo rate for statement 5 = 2.234578788456661 %
clo rate for statement 6 = 2.275134691975906 %
clo rate for statement 7 = 2.3207014888004736 %
clo rate for statement 8 = 2.3712791789303638 %
clo rate for statement 9 = 2.426867762365577 %
clo rate for statement 10 = 2.4874672391061123 %
clo rate for statement 11 = 2.55307760915197 %
clo rate for statement 12 = 2.6236988725031503 %
clo rate for statement 13 = 2.6993310291596533 %
clo rate for statement 14 = 2.7799740791214793 %
clo rate for statement 15 = 2.865628022388627 %
clo rate for statement 16 = 2.9562928589610977 %
clo rate for statement 17 = 3.051968588838891 %
clo rate for statement 18 = 3.152655212022007 %
clo rate for statement 19 = 3.2583527285104448 %
clo rate for statement 20 = 3.3690611383042066 %
clo r

#### x_score = 16 %

In [None]:
# Closure-rate curve for x_score = 0.16, interpolated between the smoothed
# curves of x buckets 0 and 1.
clo_curve_x_score_16 = linear_interpolation(
    curve_1=clo_polynom_0['Полином'].to_numpy(),
    curve_2=clo_polynom_1['Полином'].to_numpy(),
    score_1_mean=x_score_0_mean,
    score_2_mean=x_score_1_mean,
    score=0.16,
)

# Print the curve statement by statement, in percent.
for i in range(len(clo_curve_x_score_16)):
    clo = clo_curve_x_score_16[i]
    print(f"clo rate for statement {i} = {clo * 100} %")

clo rate for statement 0 = 0.0 %
clo rate for statement 1 = 3.7723738408171705 %
clo rate for statement 2 = 2.423105155351071 %
clo rate for statement 3 = 2.4482082398680327 %
clo rate for statement 4 = 2.478168027807436 %
clo rate for statement 5 = 2.512984519169283 %
clo rate for statement 6 = 2.5526577139535713 %
clo rate for statement 7 = 2.5971876121603032 %
clo rate for statement 8 = 2.6465742137894774 %
clo rate for statement 9 = 2.7008175188410943 %
clo rate for statement 10 = 2.759917527315154 %
clo rate for statement 11 = 2.8238742392116567 %
clo rate for statement 12 = 2.8926876545306013 %
clo rate for statement 13 = 2.966357773271989 %
clo rate for statement 14 = 3.044884595435819 %
clo rate for statement 15 = 3.1282681210220917 %
clo rate for statement 16 = 3.216508350030807 %
clo rate for statement 17 = 3.3096052824619644 %
clo rate for statement 18 = 3.4075589183155652 %
clo rate for statement 19 = 3.5103692575916083 %
clo rate for statement 20 = 3.618036300290095 %
clo 