In [1]:
import numpy as np
import pandas as pd

import itertools

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score, learning_curve
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from scipy.stats import mannwhitneyu
from scipy.stats import chi2_contingency

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [3]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    print('TRAIN\n\n' + classification_report(y_train_true, y_train_pred))
    print('TEST\n\n' + classification_report(y_test_true, y_test_pred))
    print('CONFUSION MATRIX\n')
    print(pd.crosstab(y_test_true, y_test_pred))

In [4]:
def evaluate_preds(model, X_train, X_test, y_train, y_test):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

**Пути к директориям и файлам**

In [5]:
TRAIN_DATASET_PATH = 'course_project_train.csv'
TEST_DATASET_PATH = 'course_project_test.csv'

In [6]:
train_df = pd.read_csv(TRAIN_DATASET_PATH)
train_df.head(1)

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0


In [7]:
train_df.shape

(7500, 17)

In [8]:
test_df = pd.read_csv(TEST_DATASET_PATH)
test_df.head()

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score
0,Rent,,4 years,0.0,9.0,12.5,220968.0,0.0,70.0,0.0,debt consolidation,Short Term,162470.0,105906.0,6813.0,
1,Rent,231838.0,1 year,0.0,6.0,32.7,55946.0,0.0,8.0,0.0,educational expenses,Short Term,78298.0,46037.0,2318.0,699.0
2,Home Mortgage,1152540.0,3 years,0.0,10.0,13.7,204600.0,0.0,,0.0,debt consolidation,Short Term,200178.0,146490.0,18729.0,7260.0
3,Home Mortgage,1220313.0,10+ years,0.0,16.0,17.0,456302.0,0.0,70.0,0.0,debt consolidation,Short Term,217382.0,213199.0,27559.0,739.0
4,Home Mortgage,2340952.0,6 years,0.0,11.0,23.6,1207272.0,0.0,,0.0,debt consolidation,Long Term,777634.0,425391.0,42605.0,706.0


In [9]:
train_df_base_solution = train_df.copy()
train_df_base_solution.head(1)

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0


#### заполнение количественных признаков медианами для базового решения

In [10]:
train_df_base_solution['Annual Income'] =\
    train_df_base_solution['Annual Income'].fillna(train_df_base_solution['Annual Income'].median())

train_df_base_solution['Months since last delinquent'] =\
    train_df_base_solution['Months since last delinquent'].fillna(train_df_base_solution['Months since last delinquent'].median())

train_df_base_solution['Credit Score'] =\
    train_df_base_solution['Credit Score'].fillna(train_df_base_solution['Credit Score'].median())

In [11]:
print(train_df_base_solution['Annual Income'].isna().sum())
print(train_df_base_solution['Months since last delinquent'].isna().sum())
print(train_df_base_solution['Credit Score'].isna().sum())

0
0
0


#### заполнение категориальных признаков модами для базового решения

In [12]:
train_df_base_solution['Years in current job'] =\
    train_df_base_solution['Years in current job'].fillna(train_df_base_solution['Years in current job'].mode()[0])
train_df_base_solution['Bankruptcies'] =\
    train_df_base_solution['Bankruptcies'].fillna(train_df_base_solution['Bankruptcies'].mode()[0])

In [13]:
print(train_df_base_solution['Years in current job'].isna().sum())
print(train_df_base_solution['Bankruptcies'].isna().sum())

0
0


#### проверка на отсутствие пропусков в датафрейме для базового решения

In [14]:
train_df_base_solution.isna().sum().sum()

0

In [15]:
train_df_base_solution.dtypes

Home Ownership                   object
Annual Income                   float64
Years in current job             object
Tax Liens                       float64
Number of Open Accounts         float64
Years of Credit History         float64
Maximum Open Credit             float64
Number of Credit Problems       float64
Months since last delinquent    float64
Bankruptcies                    float64
Purpose                          object
Term                             object
Current Loan Amount             float64
Current Credit Balance          float64
Monthly Debt                    float64
Credit Score                    float64
Credit Default                    int64
dtype: object

#### получение get_dummies для категориальных переменных и удаление переменной из которой делали get_dummies

In [16]:
list_of_features_to_get_dummies = ['Home Ownership',
                                   'Years in current job',
                                   'Purpose', 
                                   'Term']

for feature in list_of_features_to_get_dummies:
    # получение get_dummies
    train_df_base_solution = pd.concat([train_df_base_solution, 
                                        pd.get_dummies(pd.get_dummies(train_df_base_solution[feature],
                                        prefix=feature))],
                                        axis=1)
    # удаление признака
    train_df_base_solution = train_df_base_solution.drop([feature], axis=1)

In [17]:
train_df_base_solution.dtypes

Annual Income                     float64
Tax Liens                         float64
Number of Open Accounts           float64
Years of Credit History           float64
Maximum Open Credit               float64
Number of Credit Problems         float64
Months since last delinquent      float64
Bankruptcies                      float64
Current Loan Amount               float64
Current Credit Balance            float64
Monthly Debt                      float64
Credit Score                      float64
Credit Default                      int64
Home Ownership_Have Mortgage        uint8
Home Ownership_Home Mortgage        uint8
Home Ownership_Own Home             uint8
Home Ownership_Rent                 uint8
Years in current job_1 year         uint8
Years in current job_10+ years      uint8
Years in current job_2 years        uint8
Years in current job_3 years        uint8
Years in current job_4 years        uint8
Years in current job_5 years        uint8
Years in current job_6 years      

#### Разбиение на train и test

In [18]:
TARGET_NAME = 'Credit Default'

X = train_df_base_solution.drop(TARGET_NAME, axis=1)
y = train_df_base_solution[TARGET_NAME]

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    shuffle=True,
                                                    test_size=0.3,
                                                    random_state=21,
                                                    stratify=y
                                                   )

display(y_train.value_counts(normalize=True), y_test.value_counts(normalize=True))

0    0.718286
1    0.281714
Name: Credit Default, dtype: float64

0    0.718222
1    0.281778
Name: Credit Default, dtype: float64

#### Масштабрование данных

In [19]:
scaler = StandardScaler()

X_train_norm = X_train.copy()
X_test_norm = X_test.copy()

X_train_norm = scaler.fit_transform(X_train_norm)
X_test_norm = scaler.transform(X_test_norm)

In [20]:
model_lr = LogisticRegression()
model_lr.fit(X_train_norm, y_train)

evaluate_preds(model_lr, X_train_norm, X_test_norm, y_train, y_test)

TRAIN

              precision    recall  f1-score   support

           0       0.78      0.97      0.86      3771
           1       0.81      0.29      0.43      1479

    accuracy                           0.78      5250
   macro avg       0.80      0.63      0.65      5250
weighted avg       0.79      0.78      0.74      5250

TEST

              precision    recall  f1-score   support

           0       0.77      0.97      0.86      1616
           1       0.77      0.25      0.38       634

    accuracy                           0.77      2250
   macro avg       0.77      0.61      0.62      2250
weighted avg       0.77      0.77      0.72      2250

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1569   47
1                474  160


In [21]:
model_tree = DecisionTreeClassifier(random_state=21,
                                    class_weight={0:1, 1:2.55},
                                    max_depth=4
                                    )
model_tree.fit(X_train, y_train)

evaluate_preds(model_tree, X_train, X_test, y_train, y_test)

TRAIN

              precision    recall  f1-score   support

           0       0.81      0.82      0.82      3771
           1       0.53      0.51      0.52      1479

    accuracy                           0.73      5250
   macro avg       0.67      0.67      0.67      5250
weighted avg       0.73      0.73      0.73      5250

TEST

              precision    recall  f1-score   support

           0       0.80      0.81      0.81      1616
           1       0.51      0.49      0.50       634

    accuracy                           0.72      2250
   macro avg       0.65      0.65      0.65      2250
weighted avg       0.72      0.72      0.72      2250

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1312  304
1                322  312


# ТЕКУЩИЕ ВОЗМОЖНОСТИ

In [22]:
TARGET_NAME = 'Credit Default'

In [23]:
def generate_dummies_from_home_ownership(X):
    X['G_home_ownership_home_mortgage'] = 0
    X.loc[X['Home Ownership'] == 'Home Mortgage', 'G_home_ownership_home_mortgage'] = 1
    
    X['G_home_ownership_rent'] = 0
    X.loc[X['Home Ownership'] == 'Rent', 'G_home_ownership_rent'] = 1
    
    X['G_home_ownership_own_home'] = 0
    X.loc[X['Home Ownership'] == 'Own Home', 'G_home_ownership_own_home'] = 1
    
    X['G_home_ownership_have_mortgage'] = 0
    X.loc[X['Home Ownership'] == 'Have Mortgage', 'G_home_ownership_have_mortgage'] = 1
    
    X['G_home_ownership_other'] = 0
    X.loc[~X['Home Ownership'].isin(['Home Mortgage', 'Rent', 'Own Home', 'Have Mortgage']), 'G_home_ownership_other'] = 1
    
    return X

In [24]:
def generate_G_home_ownership_reduced_to_binary(X):
    X['G_home_ownership_reduced_to_binary'] = 0
    X.loc[~X['Home Ownership'].isin(['Own Home', 'Rent']), 'G_home_ownership_reduced_to_binary'] = 1
    return X

In [25]:
def generate_G_years_in_current_job_reduced_to_binary(X):
    X['G_years_in_current_job_reduced_to_binary'] = 0
    X.loc[X['Years in current job'].isna(), 'G_years_in_current_job_reduced_to_binary'] = 1
    return X

In [26]:
def generate_G_purpose_reduced_to_binary(X):
    X['G_purpose_reduced_to_binary'] = 0
    X.loc[X['Purpose'].isin(['business loan', 'small business']), 'G_purpose_reduced_to_binary'] = 1
    return X

In [27]:
def generate_G_term_transformed_to_binary(X):
    X['G_term_transformed_to_binary'] = 0
    X.loc[X['Term'] == 'Long Term', 'G_term_transformed_to_binary'] = 1
    return X

In [28]:
def generate_G_tax_liens_reduced_to_binary(X):
    X['G_tax_liens_reduced_to_binary'] = 0
    X.loc[X['Tax Liens'].isin([2, 3, 4, 5]), 'G_tax_liens_reduced_to_binary'] = 1   
    return X

In [29]:
def generate_G_number_of_credit_problems_reduced_to_binary(X):
    X['G_number_of_credit_problems_reduced_to_binary'] = 0
    X.loc[~X['Number of Credit Problems'].isin([0, 1]), 'G_number_of_credit_problems_reduced_to_binary'] = 1   
    return X

In [30]:
def generate_G_bankruptcies_reduced_to_binary(X):
    X['G_bankruptcies_reduced_to_binary'] = 0
    X.loc[~X['Number of Credit Problems'].isin([0, 1]), 'G_bankruptcies_reduced_to_binary'] = 1  
    return X

In [31]:
def generate_G_anual_income_was_nan(X):
    X['G_anual_income_was_nan'] = 0
    X.loc[X['Annual Income'].isna(), 'G_anual_income_was_nan'] = 1  
    return X

In [32]:
def generate_target_encoded_feature_for_a_numerical_feature(df, number_of_categories, 
                                                            feature_name, target_name, 
                                                            push_right_min_outliers_in_feature_name, 
                                                            push_left_max_outliers_in_feature_name):
    # функция генерирует новый признак
    # путём разбиения пространства оригинального признака на заданное количество равных интервалов
    # и просчитывает пропорцию / вероятность  объектов с целевой переменной == 1
    # к общему числу объектов на каждом интервале
    # с присвоением значения вероятности какждому объекту в виде значения нового признака.
    # NB! если в оригинальном признаке есть пропуски, то для них будет создана отдельная,
    # дополнительная подкатегория внутри нового признака, вдобавок к заданному количеству в параметре number_of_categories.
    # при генерации нового признака используя действительную функцию
    # следует проверить целесообразность такого действия 
    # при помощи другой, написанной выше функции check_whether_categorization_with_target_encoding_is_valid
    # NB! функция может присваивать 2.5% минимальных и 2.5% максимальных значений (выбросы) в оригинальном признаке
    # значения равные 2.5% от минимума и 2.5% от максимума соответственно.
    
    X = df.copy()
    
    # присваивание 2.5% минимальных и 2.5% максимальных значений (выбросам) в оригинальном признаке
    # значений равных 2.5% от минимума и 2.5% от максимума соответственно
    value_q_0025 = np.quantile(X.loc[~X[feature_name].isna(), feature_name], q=0.025)
    value_q_0975 = np.quantile(X.loc[~X[feature_name].isna(), feature_name], q=0.975)
    
    if push_right_min_outliers_in_feature_name:
        X.loc[X[feature_name] < value_q_0025, feature_name] = value_q_0025
        
    if push_left_max_outliers_in_feature_name:
        X.loc[X[feature_name] > value_q_0975, feature_name] = value_q_0975
    
    # выявление минимального и максимального значений на пространстве значений исходного признака
    feature_min_value = X[feature_name].min()
    feature_max_value = X[feature_name].max()
    
    # создание интервалов из пространства значений исходного признака
    limits_of_categories = np.linspace(feature_min_value, feature_max_value, number_of_categories + 1)
    
    # генерация имени нового признака
    new_feature_name = 'G_' + f'{feature_name}'.lower().replace(' ', '_') + '_target_encoded'
    
    # заполнение нового признака средним значением целевой переменной -
    # перестраховка, чтобы избежать возможности пропусков
    X[new_feature_name] = X[target_name].mean()
    
    # заполнение пропусков в признаке средним значением целевой переменной на объектах,
    # где есть пропуски в признаках
    X.loc[df[feature_name].isna(), new_feature_name] = X.loc[df[feature_name].isna(), target_name].mean()
    
    # присвоение каждому объекту значения нового признака равного вероятности 
    # появления значения целевой переменной == 1 на заданном интервале оригинального признака
    i = 0
    while True:
        if i == number_of_categories:
            break

        try:
            category_stat =\
                X.loc[(df[feature_name] >= limits_of_categories[i]) & 
                      (df[feature_name] < limits_of_categories[i + 1])][target_name].value_counts()
            
            X.loc[(df[feature_name] >= limits_of_categories[i]) & 
                  (df[feature_name] < limits_of_categories[i + 1]), 
                  new_feature_name] = round((category_stat[1] / sum(category_stat)), 3) 
            
        except:
            None

        i += 1
        
    return X

In [33]:
df = train_df.copy()

In [34]:
df.head(1)

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0


In [35]:
df = generate_dummies_from_home_ownership(df)
df.head(1)

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,...,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default,G_home_ownership_home_mortgage,G_home_ownership_rent,G_home_ownership_own_home,G_home_ownership_have_mortgage,G_home_ownership_other
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,...,99999999.0,47386.0,7914.0,749.0,0,0,0,1,0,0


In [36]:
df = generate_G_home_ownership_reduced_to_binary(df)
df.head(1)

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,...,Current Credit Balance,Monthly Debt,Credit Score,Credit Default,G_home_ownership_home_mortgage,G_home_ownership_rent,G_home_ownership_own_home,G_home_ownership_have_mortgage,G_home_ownership_other,G_home_ownership_reduced_to_binary
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,...,47386.0,7914.0,749.0,0,0,0,1,0,0,0


In [37]:
df = generate_G_years_in_current_job_reduced_to_binary(df)
df.head(1)

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,...,Monthly Debt,Credit Score,Credit Default,G_home_ownership_home_mortgage,G_home_ownership_rent,G_home_ownership_own_home,G_home_ownership_have_mortgage,G_home_ownership_other,G_home_ownership_reduced_to_binary,G_years_in_current_job_reduced_to_binary
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,...,7914.0,749.0,0,0,0,1,0,0,0,1


In [38]:
df = generate_G_purpose_reduced_to_binary(df)
df.head(1)

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,...,Credit Score,Credit Default,G_home_ownership_home_mortgage,G_home_ownership_rent,G_home_ownership_own_home,G_home_ownership_have_mortgage,G_home_ownership_other,G_home_ownership_reduced_to_binary,G_years_in_current_job_reduced_to_binary,G_purpose_reduced_to_binary
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,...,749.0,0,0,0,1,0,0,0,1,0


In [39]:
df = generate_G_term_transformed_to_binary(df)
df.head(1)

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,...,Credit Default,G_home_ownership_home_mortgage,G_home_ownership_rent,G_home_ownership_own_home,G_home_ownership_have_mortgage,G_home_ownership_other,G_home_ownership_reduced_to_binary,G_years_in_current_job_reduced_to_binary,G_purpose_reduced_to_binary,G_term_transformed_to_binary
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,...,0,0,0,1,0,0,0,1,0,0


In [40]:
df = generate_G_tax_liens_reduced_to_binary(df)
df.head(1)

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,...,G_home_ownership_home_mortgage,G_home_ownership_rent,G_home_ownership_own_home,G_home_ownership_have_mortgage,G_home_ownership_other,G_home_ownership_reduced_to_binary,G_years_in_current_job_reduced_to_binary,G_purpose_reduced_to_binary,G_term_transformed_to_binary,G_tax_liens_reduced_to_binary
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,...,0,0,1,0,0,0,1,0,0,0


In [41]:
df = generate_G_number_of_credit_problems_reduced_to_binary(df)
df.head(1)

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,...,G_home_ownership_rent,G_home_ownership_own_home,G_home_ownership_have_mortgage,G_home_ownership_other,G_home_ownership_reduced_to_binary,G_years_in_current_job_reduced_to_binary,G_purpose_reduced_to_binary,G_term_transformed_to_binary,G_tax_liens_reduced_to_binary,G_number_of_credit_problems_reduced_to_binary
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,...,0,1,0,0,0,1,0,0,0,0


In [42]:
df = generate_G_bankruptcies_reduced_to_binary(df)
df.head(1)

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,...,G_home_ownership_own_home,G_home_ownership_have_mortgage,G_home_ownership_other,G_home_ownership_reduced_to_binary,G_years_in_current_job_reduced_to_binary,G_purpose_reduced_to_binary,G_term_transformed_to_binary,G_tax_liens_reduced_to_binary,G_number_of_credit_problems_reduced_to_binary,G_bankruptcies_reduced_to_binary
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,...,1,0,0,0,1,0,0,0,0,0


In [43]:
df = generate_G_anual_income_was_nan(df)
df.head(1)

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,...,G_home_ownership_have_mortgage,G_home_ownership_other,G_home_ownership_reduced_to_binary,G_years_in_current_job_reduced_to_binary,G_purpose_reduced_to_binary,G_term_transformed_to_binary,G_tax_liens_reduced_to_binary,G_number_of_credit_problems_reduced_to_binary,G_bankruptcies_reduced_to_binary,G_anual_income_was_nan
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,...,0,0,0,1,0,0,0,0,0,0


In [44]:
df.isna().sum()

Home Ownership                                      0
Annual Income                                    1557
Years in current job                              371
Tax Liens                                           0
Number of Open Accounts                             0
Years of Credit History                             0
Maximum Open Credit                                 0
Number of Credit Problems                           0
Months since last delinquent                     4081
Bankruptcies                                       14
Purpose                                             0
Term                                                0
Current Loan Amount                                 0
Current Credit Balance                              0
Monthly Debt                                        0
Credit Score                                     1557
Credit Default                                      0
G_home_ownership_home_mortgage                      0
G_home_ownership_rent       

In [45]:
df = generate_target_encoded_feature_for_a_numerical_feature(df=df, 
                                                             number_of_categories=10, 
                                                             feature_name='Annual Income',
                                                             target_name=TARGET_NAME, 
                                                             push_right_min_outliers_in_feature_name=True, 
                                                             push_left_max_outliers_in_feature_name=True)
df.head(1)

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,...,G_home_ownership_other,G_home_ownership_reduced_to_binary,G_years_in_current_job_reduced_to_binary,G_purpose_reduced_to_binary,G_term_transformed_to_binary,G_tax_liens_reduced_to_binary,G_number_of_credit_problems_reduced_to_binary,G_bankruptcies_reduced_to_binary,G_anual_income_was_nan,G_annual_income_target_encoded
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,...,0,0,1,0,0,0,0,0,0,0.34


In [46]:
df = generate_target_encoded_feature_for_a_numerical_feature(df=df, 
                                                             number_of_categories=10, 
                                                             feature_name='Current Loan Amount',
                                                             target_name=TARGET_NAME, 
                                                             push_right_min_outliers_in_feature_name=False, 
                                                             push_left_max_outliers_in_feature_name=False)

In [47]:
df = generate_target_encoded_feature_for_a_numerical_feature(df=df, 
                                                             number_of_categories=10, 
                                                             feature_name='Maximum Open Credit',
                                                             target_name=TARGET_NAME, 
                                                             push_right_min_outliers_in_feature_name=True, 
                                                             push_left_max_outliers_in_feature_name=True)

In [48]:
df = generate_target_encoded_feature_for_a_numerical_feature(df=df, 
                                                             number_of_categories=5, 
                                                             feature_name='Number of Open Accounts',
                                                             target_name=TARGET_NAME, 
                                                             push_right_min_outliers_in_feature_name=False, 
                                                             push_left_max_outliers_in_feature_name=True)

In [49]:
df = generate_target_encoded_feature_for_a_numerical_feature(df=df, 
                                                             number_of_categories=10, 
                                                             feature_name='Years of Credit History',
                                                             target_name=TARGET_NAME, 
                                                             push_right_min_outliers_in_feature_name=False, 
                                                             push_left_max_outliers_in_feature_name=False)

#### заполнение количественных признаков медианами для базового решения

In [50]:
df['Annual Income'] = df['Annual Income'].fillna(df['Annual Income'].median())

df['Months since last delinquent'] = df['Months since last delinquent'].fillna(df['Months since last delinquent'].median())

df['Credit Score'] = df['Credit Score'].fillna(df['Credit Score'].median())

In [51]:
print(df['Annual Income'].isna().sum())
print(df['Months since last delinquent'].isna().sum())
print(df['Credit Score'].isna().sum())

0
0
0


#### заполнение категориальных признаков модами для базового решения

In [52]:
df['Years in current job'] = df['Years in current job'].fillna(df['Years in current job'].mode()[0])
df['Bankruptcies'] = df['Bankruptcies'].fillna(df['Bankruptcies'].mode()[0])

In [53]:
print(df['Years in current job'].isna().sum())
print(df['Bankruptcies'].isna().sum())

0
0


#### проверка на отсутствие пропусков в датафрейме для базового решения

In [54]:
df.isna().sum().sum()

0

In [55]:
df.dtypes

Home Ownership                                    object
Annual Income                                    float64
Years in current job                              object
Tax Liens                                        float64
Number of Open Accounts                          float64
Years of Credit History                          float64
Maximum Open Credit                              float64
Number of Credit Problems                        float64
Months since last delinquent                     float64
Bankruptcies                                     float64
Purpose                                           object
Term                                              object
Current Loan Amount                              float64
Current Credit Balance                           float64
Monthly Debt                                     float64
Credit Score                                     float64
Credit Default                                     int64
G_home_ownership_home_mortgage 

In [56]:
df = df.drop(['Home Ownership', 
              'Years in current job', 
              'Purpose', 
              'Term'], axis=1)

In [57]:
df.columns

Index(['Annual Income', 'Tax Liens', 'Number of Open Accounts',
       'Years of Credit History', 'Maximum Open Credit',
       'Number of Credit Problems', 'Months since last delinquent',
       'Bankruptcies', 'Current Loan Amount', 'Current Credit Balance',
       'Monthly Debt', 'Credit Score', 'Credit Default',
       'G_home_ownership_home_mortgage', 'G_home_ownership_rent',
       'G_home_ownership_own_home', 'G_home_ownership_have_mortgage',
       'G_home_ownership_other', 'G_home_ownership_reduced_to_binary',
       'G_years_in_current_job_reduced_to_binary',
       'G_purpose_reduced_to_binary', 'G_term_transformed_to_binary',
       'G_tax_liens_reduced_to_binary',
       'G_number_of_credit_problems_reduced_to_binary',
       'G_bankruptcies_reduced_to_binary', 'G_anual_income_was_nan',
       'G_annual_income_target_encoded',
       'G_current_loan_amount_target_encoded',
       'G_maximum_open_credit_target_encoded',
       'G_number_of_open_accounts_target_encoded',
   

#### удаление объектов

In [58]:
df = df.drop(['Annual Income', 
              'Tax Liens', 
              'Number of Open Accounts', 
              'Years of Credit History', 
              'Maximum Open Credit', 
              'Number of Credit Problems', 
              'Months since last delinquent',
              'Bankruptcies'], axis=1)

df = df.drop([
#                 'Current Credit Balance',
#               'Monthly Debt',
#               'Credit Score'
], axis=1)

In [59]:
df.dtypes

Current Loan Amount                              float64
Current Credit Balance                           float64
Monthly Debt                                     float64
Credit Score                                     float64
Credit Default                                     int64
G_home_ownership_home_mortgage                     int64
G_home_ownership_rent                              int64
G_home_ownership_own_home                          int64
G_home_ownership_have_mortgage                     int64
G_home_ownership_other                             int64
G_home_ownership_reduced_to_binary                 int64
G_years_in_current_job_reduced_to_binary           int64
G_purpose_reduced_to_binary                        int64
G_term_transformed_to_binary                       int64
G_tax_liens_reduced_to_binary                      int64
G_number_of_credit_problems_reduced_to_binary      int64
G_bankruptcies_reduced_to_binary                   int64
G_anual_income_was_nan         

#### Разбиение на train и test

In [60]:
TARGET_NAME = 'Credit Default'

X = df.drop(TARGET_NAME, axis=1)
y = df[TARGET_NAME]

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    shuffle=True,
                                                    test_size=0.3,
                                                    random_state=21,
                                                    stratify=y
                                                   )

display(y_train.value_counts(normalize=True), y_test.value_counts(normalize=True))

0    0.718286
1    0.281714
Name: Credit Default, dtype: float64

0    0.718222
1    0.281778
Name: Credit Default, dtype: float64

# ПРИСВОИМ ОБЪЕКТАМ С Current Loan Amount' == 99999999.0 ЗНАЧЕНИЕ TARGET == 0

In [61]:
X_train.shape

(5250, 22)

In [62]:
X_test.shape

(2250, 22)

In [63]:
X_train[X_train['Current Loan Amount'] == 99999999.0]['Current Loan Amount'].count()

603

In [64]:
X_test[X_test['Current Loan Amount'] == 99999999.0]['Current Loan Amount'].count()

267

In [65]:
indexes_of_X_train_with_target_0 = X_train[X_train['Current Loan Amount'] == 99999999.0]['Current Loan Amount'].index
indexes_of_X_test_with_target_0 = X_test[X_test['Current Loan Amount'] == 99999999.0]['Current Loan Amount'].index

In [66]:
X_train = X_train.drop(['Current Loan Amount'], axis=1)
X_test = X_test.drop(['Current Loan Amount'], axis=1)

In [67]:
y_train[indexes_of_X_train_with_target_0].value_counts()

0    603
Name: Credit Default, dtype: int64

In [68]:
y_test[indexes_of_X_test_with_target_0].value_counts()

0    267
Name: Credit Default, dtype: int64

In [69]:
X_train_reduced = X_train.drop(indexes_of_X_train_with_target_0)
y_train_reduced = y_train.drop(indexes_of_X_train_with_target_0)
X_test_reduced = X_test.drop(indexes_of_X_test_with_target_0)
y_test_reduced = y_test.drop(indexes_of_X_test_with_target_0)

In [70]:
X_train.shape

(5250, 21)

In [71]:
X_test.shape

(2250, 21)

In [72]:
y_train.value_counts()

0    3771
1    1479
Name: Credit Default, dtype: int64

In [73]:
y_test.value_counts()

0    1616
1     634
Name: Credit Default, dtype: int64

In [74]:
X_train_reduced.columns

Index(['Current Credit Balance', 'Monthly Debt', 'Credit Score',
       'G_home_ownership_home_mortgage', 'G_home_ownership_rent',
       'G_home_ownership_own_home', 'G_home_ownership_have_mortgage',
       'G_home_ownership_other', 'G_home_ownership_reduced_to_binary',
       'G_years_in_current_job_reduced_to_binary',
       'G_purpose_reduced_to_binary', 'G_term_transformed_to_binary',
       'G_tax_liens_reduced_to_binary',
       'G_number_of_credit_problems_reduced_to_binary',
       'G_bankruptcies_reduced_to_binary', 'G_anual_income_was_nan',
       'G_annual_income_target_encoded',
       'G_current_loan_amount_target_encoded',
       'G_maximum_open_credit_target_encoded',
       'G_number_of_open_accounts_target_encoded',
       'G_years_of_credit_history_target_encoded'],
      dtype='object')

## model

#### DecisionTreeClassifier

In [75]:
# model = DecisionTreeClassifier(random_state=21,
#                                     class_weight={0:1, 1:2.13},
#                                     max_depth=5)

#### XGBClassifier

In [76]:
import xgboost as xgb, lightgbm as lgbm, catboost as catb
from xgboost import XGBClassifier

In [77]:
# %%time
# model = xgb.XGBClassifier(random_state=21, 
#                           n_estimators=20
#                           )

#### LightGBM

In [78]:
from lightgbm import LGBMClassifier

In [79]:
# %%time
# model = lgbm.LGBMClassifier(random_state=21, 
#                             class_weight={0:1, 1:2.13},
#                             n_estimators=7
#                            )

#### CatBoost

In [80]:
from catboost import CatBoostClassifier

In [81]:
%%time
model = catb.CatBoostClassifier(silent=True, random_state=21)

Wall time: 4 ms


In [82]:
model.fit(X_train_reduced, y_train_reduced)
y_train_pred_reduced = model.predict(X_train_reduced)
y_test_pred_reduced = model.predict(X_test_reduced)

In [83]:
y_train_pred = y_train.copy()
y_test_pred = y_test.copy()

In [84]:
np.array(y_test_pred)

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [85]:
y_train_pred.loc[y_train_reduced.index] = y_train_pred_reduced
y_train_pred.loc[indexes_of_X_train_with_target_0] = 0
y_test_pred.loc[y_test_reduced.index] = y_test_pred_reduced
y_test_pred.loc[indexes_of_X_test_with_target_0] = 0
y_train_pred = np.array(y_train_pred)
y_test_pred = np.array(y_test_pred)

In [86]:
get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.84      0.99      0.91      3771
           1       0.95      0.52      0.67      1479

    accuracy                           0.86      5250
   macro avg       0.89      0.75      0.79      5250
weighted avg       0.87      0.86      0.84      5250

TEST

              precision    recall  f1-score   support

           0       0.78      0.94      0.85      1616
           1       0.67      0.33      0.45       634

    accuracy                           0.77      2250
   macro avg       0.73      0.64      0.65      2250
weighted avg       0.75      0.77      0.74      2250

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1513  103
1                422  212
