In [None]:
%%time
import datetime
import operator
import os
import sys
import time
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from catboost import CatBoostRegressor
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from scipy.stats import ks_2samp
from sklearn import manifold
from sklearn import metrics
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

sns.set_style('whitegrid')
warnings.filterwarnings("ignore")

# Show what is available in the input directory, then read both CSV splits.
print(os.listdir("../input"))
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [None]:
# EDA
print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
# Dataset size: row/column counts and the first rows, train first, then test.
for frame in (train, test):
    n_rows, n_cols = frame.shape[0], frame.shape[1]
    print('Rows: ', n_rows, 'Columns: ', n_cols)
    print(frame.head())

In [None]:
print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
# Distribution of the label.
print(train['target'].value_counts())
# Name the column through explicit keywords: a Series passed positionally is
# interpreted as `x` by older seaborn releases but as `data` by newer ones.
sns.countplot(x='target', data=train)
sns.set_style('whitegrid')

In [None]:
print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
# Missing values per feature: absolute count and percentage of rows.
null_flags = train.isnull()
total = null_flags.sum().sort_values(ascending=False)
percent = (null_flags.sum() / null_flags.count() * 100).sort_values(ascending=False)
missing_train_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
print(missing_train_data)

In [None]:
print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
# Number of distinct values per feature (the first two columns are skipped).
for col in train.columns[2:]:
    distinct = train[col].nunique()
    print("Number of unique values of {} : {}".format(col, distinct))
# Take a closer look at the feature var_68, in train and then in test.
for frame in (train, test):
    print('-------------------------------')
    print(frame['var_68'].value_counts())

In [None]:
print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
# Correlation coefficient between every feature and the label.
# Restrict to numeric columns explicitly: `train` still holds the string
# column ID_code, which recent pandas no longer drops silently in corr().
corr = train.select_dtypes(include='number').corr()
print(abs(corr['target']).sort_values(ascending=False))

In [None]:
print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
# Visual comparison of every feature's distribution between the two classes,
# annotated with the two-sample Kolmogorov-Smirnov statistic and p-value.
target_mask = train['target'] == 1
non_target_mask = train['target'] == 0
statistics_array = []
for col in train.columns[2:]:
    negatives = train.loc[non_target_mask, col]
    positives = train.loc[target_mask, col]
    statistic, pvalue = ks_2samp(negatives, positives)
    statistics_array.append(statistic)

    fig, ax = plt.subplots(1, 1, figsize=(10, 4))
    sns.kdeplot(negatives, ax=ax, label='Target == 0')
    sns.kdeplot(positives, ax=ax, label='Target == 1')
    # The labels above are only rendered once a legend is requested.
    ax.legend()

    # General format for the p-value so that very small values are not
    # displayed as zero (the previous '{:5f}' set a width, not a precision).
    ax.set_title('name: {}, statistics: {:.5f}, pvalue: {:.5g}'.format(col, statistic, pvalue))
    plt.show()
    # One figure is created per feature; release it once it has been shown.
    plt.close(fig)

In [None]:
# Features removed from both splits before modelling.
# NOTE(review): the selection criterion is not visible in this notebook - confirm.
dropped_features = ['var_185', 'var_27', 'var_30', 'var_17', 'var_38', 'var_41', 'var_126', 'var_103']
test_x = test.drop(['ID_code'] + dropped_features, axis=1)
train_x = train.drop(['ID_code', 'target'] + dropped_features, axis=1)
train_y = train['target']

In [None]:
# # Normalization (feature standardization) - currently disabled
# scaler = StandardScaler()
# train_scaler_x = scaler.fit_transform(train_x)
# test_scaler_x = scaler.transform(test_x)
# train_x = pd.DataFrame(train_scaler_x,columns=train_x.columns.tolist())
# test_x = pd.DataFrame(test_scaler_x,columns=train_x.columns.tolist())

In [None]:
n_fold = 5
# Seed the shuffle so that fold membership, and therefore every CV score and
# out-of-fold prediction derived from it, is identical on every run.
folds = KFold(n_splits=n_fold, shuffle=True, random_state=42)
# # Under-sampling
# ros=RandomUnderSampler(random_state=42)
# x_resampled,y_resampled=ros.fit_sample(train_x.values,train_y.values)
# x_resampled = pd.DataFrame(x_resampled,columns=train_x.columns.tolist())
# # # Over-sampling
# # x_resampled, y_resampled = SMOTE(kind='borderline1').fit_sample(train_x.values, train_y.values)

In [None]:
# # Backward feature search (backward elimination) - currently disabled
# def modeling_cross_validation(X=x_resampled.values ,y=y_resampled,featurename=train_x.columns.tolist(),params=None, folds=folds, model_type='lgb',model=None):
#     scores = []
#     feature_importance = pd.DataFrame()
#     for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
#         print('Fold', fold_n, 'started at', time.ctime())
#         X_train, X_valid = X[train_index], X[valid_index]
#         y_train, y_valid = y[train_index], y[valid_index]
#         if model_type == 'lgb':
#             train_data = lgb.Dataset(data=X_train, label=y_train)
#             valid_data = lgb.Dataset(data=X_valid, label=y_valid)
#             model = lgb.train(params,train_data,num_boost_round=20000,
#                     valid_sets = [train_data, valid_data],verbose_eval=2000,early_stopping_rounds = 200)
#             y_pred_valid = model.predict(X_valid)
            
#         if model_type == 'xgb':
#             train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=featurename)
#             valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=featurename)
#             watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
#             model = xgb.train(dtrain=train_data, num_boost_round=20000, evals=watchlist, early_stopping_rounds=200, verbose_eval=1000, params=params)
#             y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=featurename), ntree_limit=model.best_ntree_limit)
            
#         if model_type == 'rcv':
#             model = RidgeCV(alphas=(0.01, 0.1, 1.0, 10.0, 100.0), scoring='neg_mean_absolute_error', cv=3)
#             model.fit(X_train, y_train)
#             print(model.alpha_)

#             y_pred_valid = model.predict(X_valid).reshape(-1,)
#             score = mean_absolute_error(y_valid, y_pred_valid)
        
#         if model_type == 'sklearn':
#             model = model
#             model.fit(X_train, y_train)
            
#             y_pred_valid = model.predict(X_valid).reshape(-1,)
#             score = mean_absolute_error(y_valid, y_pred_valid)
        
#         if model_type == 'cat':
#             model = CatBoostRegressor(iterations=20000,  eval_metric='auc', **params)
#             model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True, verbose=False)
#             y_pred_valid = model.predict(X_valid)
        
#         fpr, tpr, thresholds = metrics.roc_curve(y_valid, y_pred_valid, pos_label=1)
#         scores.append(metrics.auc(fpr, tpr))   
#     return  np.mean(scores)           

# def featureSelect(init_cols):
#     params = {'num_leaves': 10,
#          'min_data_in_leaf': 42,
#          'objective': 'binary',
#          'max_depth': 18,
#          'learning_rate': 0.01,
#          'boosting': 'gbdt',
#          'bagging_freq': 6,
#          'bagging_fraction': 0.8,
#          'feature_fraction': 0.9,
#          'bagging_seed': 11,
#          'reg_alpha': 2,
#          'reg_lambda': 5,
#          'random_state': 42,
#          'metric': 'auc',
#          'verbosity': -1,
#          'subsample': 0.9,
#          'min_gain_to_split': 0.01077313523861969,
#          'min_child_weight': 19.428902804238373,
#          'num_threads': 4}
#     best_cols = init_cols.copy()
#     best_score = modeling_cross_validation(X=x_resampled[best_cols].values, y=y_resampled,featurename=best_cols,params=params)
    
#     print("初始CV score: {:<8.8f}".format(best_score))
#     for f in init_cols:

#         best_cols.remove(f)
#         score = modeling_cross_validation(X=x_resampled[best_cols].values, y=y_resampled,featurename=best_cols,params=params)
#         diff = score - best_score 
#         print('-'*10)
#         if diff > 0.00005:
#             print("当前移除特征: {}, CV score: {:<8.8f}, 最佳cv score: {:<8.8f}, 有效果,删除！！".format(f,score,best_score))
#             best_score = score
#         else:
#             print("当前移除特征: {}, CV score: {:<8.8f}, 最佳cv score: {:<8.8f}, 没效果,保留！！".format(f,score,best_score))
#             best_cols.append(f)
#     print('-'*10)
#     print("优化后CV score: {:<8.8f}".format(best_score))
    
#     return best_cols

# best_features = featureSelect(train_x.columns.tolist())
# print(best_features)

In [None]:
def train_model(X=train_x.values, y=train_y.values, featurename=train_x.columns.tolist(),
                X_test=test_x, params=None, folds=folds, model_type='lgb',
                plot_feature_importance=False, model=None):
    """Fit one model per cross-validation fold and average the test predictions.

    For every split produced by ``folds`` a model of the requested type is
    trained on the training part, scored with ROC AUC on the validation part,
    and used to predict ``X_test``. Validation predictions are written into an
    out-of-fold vector; test predictions are averaged over the folds.

    Parameters
    ----------
    X : array indexed with the integer index arrays yielded by ``folds.split``.
    y : labels aligned with ``X`` and indexed the same way.
    featurename : list of feature names, used for the xgboost ``DMatrix``
        objects and to label the LightGBM importances.
    X_test : features that every fold's model predicts.
    params : parameter dict forwarded to ``lgb.train``, ``xgb.train`` or
        ``CatBoostRegressor``, depending on ``model_type``.
    folds : splitter exposing ``split(X, y)``.
    model_type : one of ``'lgb'``, ``'xgb'``, ``'rcv'``, ``'sklearn'``, ``'cat'``.
    plot_feature_importance : for ``'lgb'`` and ``'xgb'`` only, draw an
        importance chart and additionally return the importance frame.
    model : estimator with ``fit``/``predict``; required for ``'sklearn'``.

    Returns
    -------
    ``(oof, prediction)``, or ``(oof, prediction, feature_importance)`` when
    ``plot_feature_importance`` is true and ``model_type`` is ``'lgb'`` or
    ``'xgb'``. For ``'lgb'`` the frame holds one row per feature and fold with
    the importance already scaled by ``1 / n_splits`` (summing a feature's
    rows gives its fold average); for ``'xgb'`` it holds one row per feature
    with the importance averaged over all folds, sorted ascending.

    Raises
    ------
    ValueError
        If ``model_type`` is not supported, if ``model_type == 'sklearn'``
        without a ``model``, or if ``folds`` yields no split.
    """
    supported_types = ('lgb', 'xgb', 'rcv', 'sklearn', 'cat')
    if model_type not in supported_types:
        raise ValueError(
            'Unknown model_type {!r}; expected one of {}'.format(model_type, supported_types))
    if model_type == 'sklearn' and model is None:
        raise ValueError("model_type='sklearn' requires an estimator passed via `model`")

    oof = np.zeros(len(X))
    prediction = np.zeros(len(X_test))
    scores = []
    fold_importances = []

    # `y` is handed to the splitter so that label-aware splitters
    # (e.g. StratifiedKFold) work too; plain KFold ignores it.
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print('Fold', fold_n, 'started at', time.ctime())
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        if model_type == 'lgb':
            train_data = lgb.Dataset(data=X_train, label=y_train)
            valid_data = lgb.Dataset(data=X_valid, label=y_valid)
            model = lgb.train(params, train_data, num_boost_round=20000,
                              valid_sets=[train_data, valid_data], verbose_eval=1000,
                              early_stopping_rounds=200)

            # Score validation and test data with the same (best) iteration.
            y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration)

        elif model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=featurename)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=featurename)
            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data, num_boost_round=20000, evals=watchlist,
                              early_stopping_rounds=200, verbose_eval=1000, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=featurename),
                                         ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=featurename),
                                   ntree_limit=model.best_ntree_limit)

        elif model_type == 'rcv':
            model = RidgeCV(alphas=(0.01, 0.1, 1.0, 10.0, 100.0),
                            scoring='neg_mean_absolute_error', cv=3)
            model.fit(X_train, y_train)
            print(model.alpha_)

            y_pred_valid = model.predict(X_valid).reshape(-1,)
            score = mean_absolute_error(y_valid, y_pred_valid)
            print(f'Fold {fold_n}. MAE: {score:.4f}.')
            print('')
            y_pred = model.predict(X_test).reshape(-1,)

        elif model_type == 'sklearn':
            # The caller-supplied estimator is refitted on every fold.
            model.fit(X_train, y_train)

            y_pred_valid = model.predict(X_valid).reshape(-1,)
            score = mean_absolute_error(y_valid, y_pred_valid)
            print(f'Fold {fold_n}. MAE: {score:.4f}.')
            print('')

            y_pred = model.predict(X_test).reshape(-1,)

        else:  # model_type == 'cat'
            # NOTE(review): CatBoost documents this metric as 'AUC'; confirm
            # that the lowercase spelling is accepted by the installed version.
            model = CatBoostRegressor(iterations=20000, eval_metric='auc', **(params or {}))
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[],
                      use_best_model=True, verbose=False)
            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        oof[valid_index] = y_pred_valid.reshape(-1,)
        fpr, tpr, _ = metrics.roc_curve(y_valid, y_pred_valid, pos_label=1)
        scores.append(metrics.auc(fpr, tpr))

        prediction += y_pred

        # Keep each fold's importances; they are combined once after the loop.
        if model_type == 'lgb':
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = featurename
            fold_importance["importance"] = model.feature_importance()
            fold_importance["fold"] = fold_n + 1
            fold_importances.append(fold_importance)
        elif model_type == 'xgb':
            fold_scores = sorted(model.get_fscore().items(), key=operator.itemgetter(1))
            fold_importance = pd.DataFrame(fold_scores, columns=['feature', 'importance'])
            fold_importance["fold"] = fold_n + 1
            fold_importances.append(fold_importance)

    # Average over the folds that actually ran rather than a global constant,
    # so a splitter with a different number of splits is handled correctly.
    n_splits = len(scores)
    if n_splits == 0:
        raise ValueError('`folds` produced no train/validation splits')
    prediction /= n_splits
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    if model_type == 'lgb':
        feature_importance = pd.concat(fold_importances, axis=0)
        feature_importance["importance"] /= n_splits
        if plot_feature_importance:
            # Restrict the chart to the 50 features with the highest mean importance.
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 26))
            sns.barplot(x="importance", y="feature",
                        data=best_features.sort_values(by="importance", ascending=False))
            plt.title('LGB Features (avg over folds)')

            return oof, prediction, feature_importance
        return oof, prediction

    if model_type == 'xgb':
        # A feature missing from a fold's scores contributes zero to the
        # average, hence sum / n_splits instead of a per-feature mean.
        feature_importance = (pd.concat(fold_importances, axis=0)
                              .groupby('feature', as_index=False)['importance'].sum())
        feature_importance['importance'] /= n_splits
        feature_importance = feature_importance.sort_values('importance').reset_index(drop=True)
        if plot_feature_importance:
            # DataFrame.plot opens its own figure; no separate plt.figure call,
            # which would only leave an empty figure behind.
            ax = feature_importance.plot(kind='barh', x='feature', y='importance',
                                         legend=False, figsize=(6, 10))
            ax.set_title('XGB Features (avg over folds)')
            ax.set_xlabel('relative importance')
            plt.show()
            return oof, prediction, feature_importance
        return oof, prediction

    return oof, prediction

In [None]:
# LightGBM parameters for the binary objective, evaluated with AUC.
# The former 'subsample': 0.9 entry was dropped: LightGBM documents
# `subsample` as an alias of `bagging_fraction`, so supplying both with
# different values is contradictory. The canonical `bagging_fraction` is the
# value that is kept, which leaves the effective configuration unchanged.
params = {'num_leaves': 10,
         'min_data_in_leaf': 42,
         'objective': 'binary',
         'max_depth': 18,
         'learning_rate': 0.01,
         'boosting': 'gbdt',
         'bagging_freq': 6,
         'bagging_fraction': 0.8,
         'feature_fraction': 0.9,
         'bagging_seed': 11,
         'reg_alpha': 2,
         'reg_lambda': 5,
         'random_state': 42,
         'metric': 'auc',
         'verbosity': -1,
         'min_gain_to_split': 0.01077313523861969,
         'min_child_weight': 19.428902804238373,
         'num_threads': 4}
# With plot_feature_importance=True, train_model returns three values for 'lgb'.
oof_lgb, prediction_lgb, feature_importance_lgb = train_model(params=params, model_type='lgb',plot_feature_importance=True)
# params = {'eta': 0.05, 
#               'max_depth': 3, 
#               'subsample': 0.9, 
#               'colsample_bytree': 0.9, 
#               'objective': 'binary:logistic', 
#               'eval_metric': 'auc', 
#               'silent': True, 
#               'nthread': 4}
# oof_xgb, prediction_xgb, feature_importance_xgb = train_model(params=params, model_type='xgb',plot_feature_importance=True)

In [None]:
# Pair every test ID with its fold-averaged LightGBM prediction and save it.
submission = pd.DataFrame({
    "ID_code": test.ID_code.values,
    "target": prediction_lgb,
})
submission.to_csv("submission.csv", index=False)