In [793]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import xgboost as xgb, lightgbm as lgbm, catboost as catb
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score, learning_curve
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
import warnings
# Suppresses every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries imported above.
warnings.simplefilter('ignore')

In [794]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    """Print per-class metrics for the train and test splits.

    The train report is printed first, then the test report, followed by a
    cross-tabulation of true versus predicted labels for the test split.
    Nothing is returned.
    """
    sections = (
        ('TRAIN\n\n', y_train_true, y_train_pred),
        ('TEST\n\n', y_test_true, y_test_pred),
    )
    # Each report is computed and printed before the next one is started.
    for heading, truth, predicted in sections:
        print(heading + classification_report(truth, predicted))

    print('CONFUSION MATRIX\n')
    confusion = pd.crosstab(y_test_true, y_test_pred)
    print(confusion)

In [795]:
def balance_df_by_target(df, target_name, random_state=None):
    """Oversample the rarest class by appending whole copies of its rows.

    The number of extra copies is ``floor(major_count / minor_count) - 1``, so
    the minority class ends up at most as large as the majority class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the target column. It is not modified.
    target_name : str
        Name of the target column.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` to make the shuffling
        reproducible. ``None`` (the default) keeps the sampling unseeded.

    Returns
    -------
    pandas.DataFrame
        The balanced frame with its rows shuffled.
    """
    target_counts = df[target_name].value_counts()
    # Unused categories of a categorical target show up with a zero count and
    # would cause a division by zero below.
    target_counts = target_counts[target_counts > 0]

    # Nothing to balance against when fewer than two classes are present.
    if len(target_counts) < 2:
        return df.sample(frac=1, random_state=random_state)

    # idxmax/idxmin return class *labels*; argmax/argmin return positions,
    # which are not valid keys for the label-based lookups below.
    major_class_name = target_counts.idxmax()
    minor_class_name = target_counts.idxmin()

    disbalance_coeff = int(
        target_counts.loc[major_class_name] // target_counts.loc[minor_class_name]
    ) - 1

    # Always copy from the original minority rows so that every row is
    # replicated the same number of times.
    minority_rows = df[df[target_name] == minor_class_name]
    extra_copies = [
        minority_rows.sample(frac=1, random_state=random_state)
        for _ in range(disbalance_coeff)
    ]

    if extra_copies:
        # A single concat replaces the removed DataFrame.append.
        df = pd.concat([df] + extra_copies, ignore_index=True)

    return df.sample(frac=1, random_state=random_state)

In [796]:
def convert_category(df, cat_feature_names):
    """Cast the listed columns of ``df`` to the pandas ``category`` dtype.

    The frame is modified in place and the same object is returned.
    """
    for feature in cat_feature_names:
        df[feature] = df[feature].astype('category')

    return df

In [797]:
def processing_omissions(df, cat_feature_names, num_feature_names):
    """Fill missing values and return only the listed feature columns.

    Numeric columns are filled with their column mean, categorical columns
    with their most frequent value. Columns not named in either list (for
    example the target) are absent from the result. The input is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    cat_feature_names : list of str
        Categorical columns, filled with the mode.
    num_feature_names : list of str
        Numeric columns, filled with the mean.

    Returns
    -------
    pandas.DataFrame
        Numeric columns followed by categorical columns.
    """
    # Statistics are computed on the relevant subset only: averaging the whole
    # frame fails as soon as it holds a non-numeric column.
    numeric_part = df[num_feature_names]
    df_num = numeric_part.fillna(numeric_part.mean())

    categorical_part = df[cat_feature_names]
    modes = categorical_part.mode()
    # mode() is empty when there is nothing to count (no columns or only
    # missing values); in that case there is no fill value to apply.
    if modes.empty:
        df_cat = categorical_part
    else:
        df_cat = categorical_part.fillna(modes.iloc[0])

    return pd.concat([df_num, df_cat], axis=1)

In [798]:
def scaling(df, num_feature_names):
    """Return a copy of ``df`` with the listed numeric columns standardized.

    A fresh ``StandardScaler`` is fitted on the given frame on every call, so
    the columns are scaled with that frame's own statistics. The input frame
    is left untouched.
    """
    standardized = StandardScaler().fit_transform(df[num_feature_names])

    result = df.copy()
    result[num_feature_names] = standardized
    return result

In [799]:
# Training CSV; the target column is read from the frame loaded from this file.
DATASET_PATH = 'course_project_train.csv'
# Test CSV that is scored at the end of the notebook.
# NOTE(review): the PREP_ prefix is misleading, the file is preprocessed later on.
PREP_DATASET_PATH = 'course_project_test.csv'

In [800]:
# Raw training frame; later cells read the target column from it.
df_train_base = pd.read_csv(DATASET_PATH)

In [801]:
# Frame loaded from the test CSV; it is scored by the fitted model further down.
df_valid = pd.read_csv(PREP_DATASET_PATH)

In [802]:
# Columns treated as numeric: mean-imputed and standardized.
NUM_FEATURE_NAMES = [
    'Annual Income',
    'Tax Liens',
    'Number of Open Accounts',
    'Years of Credit History',
    'Maximum Open Credit',
    'Number of Credit Problems',
    'Bankruptcies',
    'Current Loan Amount',
    'Current Credit Balance',
    'Monthly Debt',
    'Credit Score',
    'Months since last delinquent',
]

# Columns treated as categorical: mode-imputed and cast to the category dtype.
CAT_FEATURE_NAMES = [
    'Home Ownership',
    'Years in current job',
    'Purpose',
    'Term',
]

# Label column.
TARGET_NAME = 'Credit Default'

In [803]:
# Impute missing values; the result holds only the feature columns.
df_train = processing_omissions(
    df=df_train_base,
    cat_feature_names=CAT_FEATURE_NAMES,
    num_feature_names=NUM_FEATURE_NAMES,
)
df_train.head(n=2)

Unnamed: 0,Annual Income,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Bankruptcies,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Months since last delinquent,Home Ownership,Years in current job,Purpose,Term
0,482087.0,0.0,11.0,26.3,685960.0,1.0,1.0,99999999.0,47386.0,7914.0,749.0,34.6926,Own Home,10+ years,debt consolidation,Short Term
1,1025487.0,0.0,15.0,15.3,1181730.0,0.0,0.0,264968.0,394972.0,18373.0,737.0,34.6926,Own Home,10+ years,debt consolidation,Long Term


In [804]:
# Cast the categorical features to the category dtype, then inspect the dtypes.
df_train = convert_category(df=df_train, cat_feature_names=CAT_FEATURE_NAMES)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   Annual Income                 7500 non-null   float64 
 1   Tax Liens                     7500 non-null   float64 
 2   Number of Open Accounts       7500 non-null   float64 
 3   Years of Credit History       7500 non-null   float64 
 4   Maximum Open Credit           7500 non-null   float64 
 5   Number of Credit Problems     7500 non-null   float64 
 6   Bankruptcies                  7500 non-null   float64 
 7   Current Loan Amount           7500 non-null   float64 
 8   Current Credit Balance        7500 non-null   float64 
 9   Monthly Debt                  7500 non-null   float64 
 10  Credit Score                  7500 non-null   float64 
 11  Months since last delinquent  7500 non-null   float64 
 12  Home Ownership                7500 non-null   ca

In [805]:
# Standardize the numeric features.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed further down, so held-out rows contribute to the scaling statistics.
df_train = scaling(df=df_train, num_feature_names=NUM_FEATURE_NAMES)
df_train.head(n=2)

Unnamed: 0,Annual Income,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Bankruptcies,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Months since last delinquent,Home Ownership,Years in current job,Purpose,Term
0,-1.175263,-0.110953,-0.026674,1.133645,-0.016174,1.664779,2.545372,2.76052,-0.762772,-0.872085,-0.281552,-4.852876e-16,Own Home,10+ years,debt consolidation,Short Term
1,-0.453071,-0.110953,0.788223,-0.428528,0.014763,-0.340979,-0.337765,-0.36362,0.330781,0.004909,-0.289955,-4.852876e-16,Own Home,10+ years,debt consolidation,Long Term


In [806]:
# Features come from the preprocessed frame, numeric columns first.
X = df_train[NUM_FEATURE_NAMES + CAT_FEATURE_NAMES]
# The preprocessed frame no longer carries the target, so it is read from the raw frame.
y = df_train_base[TARGET_NAME]

# 70/30 shuffled hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    shuffle=True,
    random_state=21,
)

In [807]:
# Re-attach the target so that the training rows can be oversampled by class.
df_for_balancing = pd.concat(objs=[X_train, y_train], axis=1)
df_balanced = balance_df_by_target(df=df_for_balancing, target_name=TARGET_NAME)

# Class sizes after balancing.
df_balanced[TARGET_NAME].value_counts()

0    3771
1    2958
Name: Credit Default, dtype: int64

In [808]:
# Split the balanced frame back into features and target.
y_train = df_balanced[TARGET_NAME]
X_train = df_balanced.drop(columns=[TARGET_NAME])

In [809]:
# Features and target side by side for each split.
# NOTE(review): neither frame is referenced again in the visible part of the notebook.
train = pd.concat(objs=[X_train, y_train], axis=1)
test = pd.concat(objs=[X_test, y_test], axis=1)

In [810]:
# CatBoost receives the categorical columns by name; the seed is fixed.
model_catb = catb.CatBoostClassifier(
    cat_features=CAT_FEATURE_NAMES,
    random_state=21,
    silent=True,
)
model_catb.fit(X_train, y_train)

# Predictions on the balanced training rows and on the untouched test split.
y_train_pred = model_catb.predict(X_train)
y_test_pred = model_catb.predict(X_test)

get_classification_report(
    y_train_true=y_train,
    y_train_pred=y_train_pred,
    y_test_true=y_test,
    y_test_pred=y_test_pred,
)

TRAIN

              precision    recall  f1-score   support

           0       0.87      0.94      0.90      3771
           1       0.91      0.82      0.87      2958

    accuracy                           0.89      6729
   macro avg       0.89      0.88      0.89      6729
weighted avg       0.89      0.89      0.89      6729

TEST

              precision    recall  f1-score   support

           0       0.81      0.85      0.83      1616
           1       0.56      0.49      0.52       634

    accuracy                           0.75      2250
   macro avg       0.68      0.67      0.67      2250
weighted avg       0.74      0.75      0.74      2250

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1373  243
1                325  309


In [811]:
# The hold-out file must go through the same transformation the model was
# trained on: fill values and scaling statistics come from the training data,
# not from the hold-out file itself. `df_valid` is left untouched, so this
# cell can be re-run safely.
train_filled = processing_omissions(df_train_base, CAT_FEATURE_NAMES, NUM_FEATURE_NAMES)
# Same fit as the one performed when the training features were scaled.
train_scaler = StandardScaler().fit(train_filled[NUM_FEATURE_NAMES])

valid_num = df_valid[NUM_FEATURE_NAMES].fillna(train_filled[NUM_FEATURE_NAMES].mean())
valid_cat = df_valid[CAT_FEATURE_NAMES].fillna(train_filled[CAT_FEATURE_NAMES].mode().iloc[0])

df_valid_prepared = convert_category(pd.concat([valid_num, valid_cat], axis=1), CAT_FEATURE_NAMES)
df_valid_prepared[NUM_FEATURE_NAMES] = train_scaler.transform(df_valid_prepared[NUM_FEATURE_NAMES])

X_valid = df_valid_prepared[NUM_FEATURE_NAMES + CAT_FEATURE_NAMES]

In [812]:
# Predict with the fitted CatBoost model on the prepared hold-out features.
y_valid_pred = model_catb.predict(X_valid)
y_valid_pred

array([0, 1, 1, ..., 0, 0, 1])

In [816]:
# Write the predicted labels as integers, one per line; without `fmt`,
# savetxt would emit them in '%.18e' scientific notation.
np.savetxt('Credit Default', y_valid_pred, fmt='%d')