In [32]:
import pandas as pd
import numpy as np
import warnings
import os
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

In [None]:
!pip install catboost

# Load Data

In [1]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [None]:
data = pd.read_csv('/content/drive/MyDrive/5001/Loan_default_classification-main/data/process_train.csv')
data = reduce_mem_usage(data)
X_train = data.drop('isDefault', axis=1)
y_train = data['isDefault']
# 数据集划分
X_train_split, X_test, y_train_split, y_test = train_test_split(X_train, y_train, test_size=0.2)

# CatBoost

In [None]:
cb = CatBoostClassifier(
            loss_function="Logloss",
            eval_metric="AUC",
            task_type="CPU",
            learning_rate=0.1,
            iterations=500,
            random_seed=2020,
            od_type="Iter",
            depth=7)


In [23]:
n_folds = 5
sk = StratifiedKFold(n_splits = n_folds, shuffle = True, random_state = 2019)
mean_score = 0
answers = []
for train, val in sk.split(X_train_split, y_train_split):
  x_train = X_train_split.iloc[train]
  y_train = y_train_split.iloc[train]
  x_val = X_train_split.iloc[val]
  y_val = y_train_split.iloc[val]
  clf = cb.fit(x_train,y_train, eval_set=(x_val,y_val),verbose=500)
  yy_pred_valid = clf.predict(x_val,prediction_type='Probability')[:,-1]
  print('cat验证的auc:{}'.format(roc_auc_score(y_val, yy_pred_valid)))
  mean_score += roc_auc_score(y_val, yy_pred_valid) / n_folds
  y_pred_valid = clf.predict(X_test,prediction_type='Probability')[:,-1]
  answers.append(y_pred_valid)



0:	test: 0.6916952	best: 0.6916952 (0)	total: 464ms	remaining: 3m 51s
499:	test: 0.7350327	best: 0.7350564 (494)	total: 2m 26s	remaining: 0us

bestTest = 0.7350563679
bestIteration = 494

Shrink model to first 495 iterations.
cat验证的auc:0.7350563678533308
0:	test: 0.6909177	best: 0.6909177 (0)	total: 340ms	remaining: 2m 49s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.7337436477
bestIteration = 453

Shrink model to first 454 iterations.
cat验证的auc:0.7337436476653862
0:	test: 0.6948259	best: 0.6948259 (0)	total: 348ms	remaining: 2m 53s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.7374157829
bestIteration = 459

Shrink model to first 460 iterations.
cat验证的auc:0.7374157829146317
0:	test: 0.6911005	best: 0.6911005 (0)	total: 352ms	remaining: 2m 55s
499:	test: 0.7334069	best: 0.7334069 (499)	total: 2m 25s	remaining: 0us

bestTest = 0.7334069469
bestIteration = 499

cat验证的auc:0.733406946873947
0:	test: 0.6944213	best: 0.6944213 (0)	total: 340ms	rem

In [30]:
cat_pre=sum(answers)/n_folds
print(roc_auc_score(y_test, cat_pre))

0.7348004969314569


# XGBOOST

In [None]:
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = XGBClassifier(learning_rate=0.1, n_estimators=150, max_depth=4, min_child_weight=2, subsample=0.7,objective='binary:logistic')
 
vclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('xgb', clf3)], voting='soft', weights=[2, 1, 1])
vclf = vclf .fit(X_train_split,y_train_split)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
