In [None]:
!pip install catboost

In [3]:
import pandas as pd
import numpy as np
import warnings
import os
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

In [7]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


# Load Data

In [4]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [5]:
data = pd.read_csv('/content/drive/MyDrive/5001/Loan_default_classification-main/data/process_train.csv')
data = reduce_mem_usage(data)
X_train = data.drop('isDefault', axis=1)
y_train = data['isDefault']
# 数据集划分
#X_train_split, X_test, y_train_split, y_test = train_test_split(X_train, y_train, test_size=0.2)

Memory usage of dataframe is 678400128.00 MB
Memory usage after optimization is: 166400128.00 MB
Decreased by 75.5%


# CatBoost

In [6]:
cb = CatBoostClassifier(
            loss_function="Logloss",
            eval_metric="AUC",
            task_type="CPU",
            learning_rate=0.1,
            iterations=500,
            random_seed=2020,
            od_type="Iter",
            depth=7)


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


In [9]:
n_folds = 5
sk = StratifiedKFold(n_splits = n_folds, shuffle = True, random_state = 2019)
roc_auc_mean_score = 0
auc_mean_score = 0
precision_mean_score = 0
recall_mean_score = 0
f1macro_mean_score = 0
for train, val in sk.split(X_train, y_train):
  xx_train = X_train.iloc[train]
  yy_train = y_train.iloc[train]
  xx_val = X_train.iloc[val]
  yy_val = y_train.iloc[val]
  clf = cb.fit(xx_train,yy_train, eval_set=(xx_val,yy_val),verbose=500)
  yy_pred_valid = clf.predict(xx_val,prediction_type='Probability')[:,-1]
  
  y_pre = [(1 if item > 0.5 else 0) for item in yy_pred_valid]
  print('cat验证的auc:{}'.format(accuracy_score(yy_val, y_pre)))
  auc_mean_score += accuracy_score(yy_val, y_pre) / n_folds

  print('cat验证的roc_auc:{}'.format(roc_auc_score(yy_val, y_pre)))
  roc_auc_mean_score += roc_auc_score(yy_val, y_pre) / n_folds


  print('cat验证的precision:{}'.format(precision_score(yy_val, y_pre)))
  precision_mean_score += precision_score(yy_val, y_pre) / n_folds


  print('cat验证的recall:{}'.format(recall_score(yy_val, y_pre)))
  recall_mean_score += recall_score(yy_val, y_pre) / n_folds


  print('cat验证的f1_macro:{}'.format(f1_score(yy_val, y_pre, average='macro')))
  f1macro_mean_score += f1_score(yy_val, y_pre, average='macro') / n_folds
  # y_pred_valid = clf.predict(X_test,prediction_type='Probability')[:,-1]
  # answers.append(y_pred_valid)



0:	test: 0.6912263	best: 0.6912263 (0)	total: 443ms	remaining: 3m 41s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.73395541
bestIteration = 478

Shrink model to first 479 iterations.
cat验证的auc:0.806725
cat验证的roc_auc:0.5460296636146533
cat验证的precision:0.5809014267185474
cat验证的recall:0.11224234070546958
cat验证的f1_macro:0.5392193924802364
0:	test: 0.6947614	best: 0.6947614 (0)	total: 437ms	remaining: 3m 38s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.7363541919
bestIteration = 361

Shrink model to first 362 iterations.
cat验证的auc:0.807075
cat验证的roc_auc:0.5455662405356784
cat验证的precision:0.587891927951968
cat验证的recall:0.11042541194160767
cat验证的f1_macro:0.5382491773729325
0:	test: 0.6918150	best: 0.6918150 (0)	total: 448ms	remaining: 3m 43s
499:	test: 0.7366034	best: 0.7366085 (496)	total: 2m 59s	remaining: 0us

bestTest = 0.7366084503
bestIteration = 496

Shrink model to first 497 iterations.
cat验证的auc:0.80696875
cat验证的roc_auc:0.54773414323219
c

In [12]:
print("auc:", auc_mean_score) 
print("roc_auc:", roc_auc_mean_score)
print("precision:", precision_mean_score) 
print("recall:", recall_mean_score) 
print("f1macro:", f1macro_mean_score) 

auc: 0.8070075000000001
roc_auc: 0.5466576760115419
precision: 0.5841729164614144
recall: 0.11344527285257816
f1macro: 0.5402183110608284
