In [2]:
import torch
from torch.autograd import Variable
from torch import nn
from torch.nn import init
import torch.utils.data as Data
import torch.nn.functional as F
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import os
import matplotlib.pyplot as plt
import warnings
# Select the non-interactive 'agg' matplotlib backend.
# NOTE(review): the `%matplotlib inline` magic at the end of this cell sets a
# backend again — confirm which of the two is actually intended.
plt.switch_backend('agg')

# Ignore every warning category from here on.
warnings.simplefilter("ignore")

# Directory holding the train/validation CSV files this notebook reads and writes.
TrainTestDataDir = '/home/songyue/homeCredit/HomeCreditDefaultRisk/Data/TrainTestData'
# Directory for model-related files (e.g. FeatureRandomSelection.json is written here).
DNNModelsdIR = '/home/songyue/homeCredit/HomeCreditDefaultRisk/DNN/models'
%matplotlib inline

In [3]:
def GenMyDNN(InputSize=220, OutputSize=1, DropPro=0.5):
    """Build the fully connected network as a ``torch.nn.Sequential``.

    The stack is eight Linear layers
    (InputSize -> InputSize -> 512 -> 256 -> 128 -> 64 -> 32 -> 16 -> OutputSize)
    interleaved with ReLU and Dropout modules and finished with a Sigmoid.

    Args:
        InputSize: number of input features (also the width of the first layer).
        OutputSize: number of output units.
        DropPro: accepted for interface compatibility; it is not used, every
            dropout probability below is a fixed constant.

    Returns:
        The assembled ``torch.nn.Sequential`` (sub-modules are indexed 0..20).
    """
    # Each stage is (in_features, out_features, modules that follow the Linear).
    # In the trailing list, 'relu' means a ReLU and a float means Dropout(p).
    # Note that the first stage has no activation, only dropout.
    stages = [
        (InputSize, InputSize, [0.3]),
        (InputSize, 512, ['relu', 0.7]),
        (512, 256, ['relu', 0.5]),
        (256, 128, [0.5, 'relu']),
        (128, 64, [0.3, 'relu']),
        (64, 32, [0.1, 'relu']),
        (32, 16, ['relu']),
    ]

    layers = []
    for fan_in, fan_out, trailing in stages:
        layers.append(torch.nn.Linear(fan_in, fan_out))
        for item in trailing:
            if item == 'relu':
                layers.append(torch.nn.ReLU())
            else:
                layers.append(torch.nn.Dropout(item))

    # Output head: project to OutputSize and squash into (0, 1).
    layers.append(torch.nn.Linear(16, OutputSize))
    layers.append(torch.nn.Sigmoid())
    return torch.nn.Sequential(*layers)


def loadData(DataName, shuffle=True, random_state=None):
    """Read one CSV from ``TrainTestDataDir`` into a DataFrame.

    Args:
        DataName: file name relative to ``TrainTestDataDir``.
        shuffle: when True (the default, matching the previous behaviour) the
            rows are returned in random order; pass False to keep file order.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. ``None`` keeps the unseeded behaviour.

    Returns:
        The loaded DataFrame (first CSV column used as the index), or ``None``
        after printing a message when the file does not exist.
    """
    DataPath = os.path.join(TrainTestDataDir, DataName)
    if not os.path.exists(DataPath):
        print('%s does not exist!' % DataPath)
        return None
    OriginData = pd.read_csv(DataPath, index_col=0)
    if shuffle:
        # frac=1 samples every row exactly once, i.e. a full permutation.
        OriginData = OriginData.sample(frac=1, random_state=random_state)
    return OriginData


def NormalData(TrainData, TestData):
    """Standardize the large-magnitude columns of ``TrainData`` in place.

    Column statistics are fitted on the rows of ``TrainData`` and ``TestData``
    pooled together; every column whose pooled mean is greater than 1 in
    absolute value is replaced in ``TrainData`` by its standardized values.

    Args:
        TrainData: DataFrame to transform; it is modified in place.
        TestData: DataFrame used only to fit the pooled statistics; it is
            neither modified nor returned.

    Returns:
        The same ``TrainData`` object, after scaling.
    """
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    AllData = pd.concat([TrainData, TestData])
    for col in AllData.columns:
        # Threshold is |mean| > 1 (the old comment said 100, the code never did).
        if abs(AllData[col].mean()) > 1:
            # StandardScaler expects a 2-D (n_samples, 1) array per column.
            scaler = StandardScaler().fit(AllData[col].values.reshape(-1, 1))
            scaled = scaler.transform(TrainData[col].values.reshape(-1, 1))
            TrainData[col] = scaled.ravel()
    return TrainData

In [61]:
# Load every training chunk (train1.csv ... train10.csv) and stack them in one
# pd.concat call. This replaces the repeated DataFrame.append, which re-copied
# the growing frame on every iteration and was removed in pandas 2.0.
# ignore_index=True gives the same fresh 0..n-1 index as reset_index(drop=True).
TrainData = pd.concat(
    [loadData('train%d.csv' % i) for i in range(1, 11)],
    ignore_index=True,
)
TrainData.to_csv(os.path.join(TrainTestDataDir, 'AllTrain.csv'))

In [62]:
# Show (rows, columns) of the combined training frame.
TrainData.shape

(530936, 222)

In [59]:
# Reload the combined training set. AllTrain.csv is written with its index as
# the first column, so read it back with index_col=0 (as loadData does);
# otherwise that index comes back as a spurious 'Unnamed: 0' data column.
TrainData = pd.read_csv(os.path.join(TrainTestDataDir, 'AllTrain.csv'), index_col=0)
TrainData.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_traintest,AMT_ANNUITY_traintest,AMT_GOODS_PRICE_traintest,...,NAME_TYPE_SUITE_other_prevapp,NAME_CLIENT_TYPE_New,NAME_CLIENT_TYPE_Refreshed,NAME_CLIENT_TYPE_Repeater,NAME_PORTFOLIO_Cards,NAME_PORTFOLIO_Cash,NAME_PORTFOLIO_POS,NAME_PORTFOLIO_other,CommonCosSimi,TARGET
0,411042,1,0,1,1,0,157500.0,1458414.0,38601.0,1273500.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.431354,0
1,426735,1,0,1,1,0,315000.0,835605.0,29736.0,697500.0,...,0.0,0.055556,0.0,0.944444,0.055556,0.277778,0.166667,0.5,0.930577,0
2,441703,1,0,0,1,0,144000.0,99000.0,10791.0,99000.0,...,0.071429,0.357143,0.0,0.642857,0.285714,0.357143,0.214286,0.142857,0.997743,1
3,406257,1,0,1,1,2,225000.0,219042.0,22572.0,193500.0,...,0.0,0.25,0.0,0.75,0.0,0.0,0.75,0.25,0.968531,0
4,114569,1,1,1,1,0,180000.0,301464.0,20277.0,238500.0,...,0.0,0.5,0.0,0.5,0.166667,0.166667,0.5,0.166667,0.997575,0


In [46]:
# Drop the ID and label columns in place with a single call.
TrainData.drop(['SK_ID_CURR', 'TARGET'], axis=1, inplace=True)

In [28]:
# Draw 121 distinct column positions at random (the `random` module is not seeded here).
FeatureRandomSelection = random.sample(range(TrainData.shape[1]),121)

In [48]:
import json

DNNnum = 100
# Build DNNnum independent random subsets, each holding 121 distinct column
# positions of TrainData (presumably one subset per DNN — confirm).
FeatureRandomSelection = [
    random.sample(range(TrainData.shape[1]), 121) for _ in range(DNNnum)
]

# Persist the subsets as JSON so the same column choices can be read back later.
SelectionPath = os.path.join(DNNModelsdIR, 'FeatureRandomSelection.json')
with open(SelectionPath, 'w') as fp:
    json.dump(FeatureRandomSelection, fp)

In [47]:
# Number of columns currently in TrainData.
TrainData.shape[1]

220

In [49]:
# Preview the columns picked by the first random subset (selected by position).
TrainData.iloc[:,FeatureRandomSelection[0]].head()

Unnamed: 0,DAYS_CREDIT_UPDATE,FLAG_OWN_CAR,EXT_SOURCE_2,NAME_CONTRACT_STATUS_Canceled,AMT_PAYMENT_minus,FLAG_DOCUMENT_7,isEqualofAMT_PAYMENT,credit_previous_application_count,NAME_FAMILY_STATUS_Widow,REGION_RATING_CLIENT_W_CITY,...,NAME_CASH_LOAN_PURPOSE_XNA,CREDIT_ACTIVE_Closed,NAME_CONTRACT_STATUS_Active_prevapp,FLAG_EMAIL,DAYS_CREDIT,DEF_60_CNT_SOCIAL_CIRCLE,NAME_SELLER_INDUSTRY,AMT_CREDIT_traintest,AMT_REQ_CREDIT_BUREAU_QRT,FLAG_WORK_PHONE
0,-991.0,0,0.662821,0.0,-113.42625,0,0.0,4.0,0,2,...,0.0,1.0,0.0,0,-2563.5,1.0,1.0,728460.0,0.0,1
1,-759.166667,0,0.453786,0.5,0.0,0,0.0,2.0,0,2,...,0.5,0.833333,0.0,0,-1260.5,1.0,0.5,528633.0,0.0,0
2,-1.0,0,0.192014,0.0,0.0,0,0.0,1.0,0,2,...,0.0,-1.0,0.0,0,-1.0,0.0,1.0,724581.0,0.0,1
3,-695.1,0,0.264745,0.333333,-17.396907,0,0.063158,7.0,0,2,...,0.833333,0.8,0.768421,0,-1484.6,0.0,0.166667,360000.0,0.0,0
4,-1.0,0,0.530606,0.0,-969.113571,0,0.0,2.0,0,3,...,0.5,-1.0,0.0,0,-1.0,0.0,1.0,814041.0,0.0,0


In [51]:
# Start loading the training data: this cell reads only the first chunk, train1.csv.
TrainData = loadData('train1.csv')

In [56]:
# Show (rows, columns) of TrainData.
TrainData.shape

(530936, 222)

In [55]:
# Add chunks train2.csv ... train10.csv to the frame that is already loaded.
# A single pd.concat replaces the per-chunk DataFrame.append (removed in pandas 2.0
# and quadratic in the number of chunks). The original row labels are kept.
# NOTE: re-running this cell appends the chunks a second time.
TrainData = pd.concat(
    [TrainData] + [loadData('train%d.csv' % i) for i in range(2, 11)]
)

### SMOTE 解决样本不均衡

In [45]:
# Build the training set from the ten chunks: positives (TARGET == 1) are taken
# from train1.csv only, negatives (TARGET == 0) from every chunk.
# NOTE(review): presumably each chunk repeats the same positive rows — confirm.
TrainData = loadData('train1.csv')
TrainDataPos = TrainData[TrainData.TARGET == 1]

# Collect the negative parts in a list and concatenate once; this replaces the
# repeated DataFrame.append, which was removed in pandas 2.0.
NegativeParts = [TrainData[TrainData.TARGET == 0]]
for i in range(2, 11):
    TrainDataTmp = loadData('train%d.csv' % i)
    NegativeParts.append(TrainDataTmp[TrainDataTmp.TARGET == 0])
TrainDataNeg = pd.concat(NegativeParts)

# Negatives first, then positives, with a fresh 0..n-1 index.
TrainData = pd.concat([TrainDataNeg, TrainDataPos], ignore_index=True)

In [46]:
# Write the frame to AllTrain.csv (index included), replacing any existing file at that path.
TrainData.to_csv(os.path.join(TrainTestDataDir,'AllTrain.csv'))

In [5]:
# Show (rows, columns) of TrainData.
TrainData.shape 

(307511, 222)

In [6]:
# Sum of TARGET divided by the row count, i.e. the share of positives when TARGET is 0/1.
TrainData.TARGET.sum()/len(TrainData)

0.08072881945686496

In [7]:
# Shuffle all rows (frac=1 keeps every row); no random_state is given, so the order differs per run.
TrainData = TrainData.sample(frac=1)

In [8]:
# Replace the index with a fresh 0..n-1 range, discarding the old labels.
TrainData.reset_index(drop=True, inplace=True)

In [10]:
# Preview the first rows.
TrainData.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_traintest,AMT_ANNUITY_traintest,AMT_GOODS_PRICE_traintest,...,NAME_TYPE_SUITE_other_prevapp,NAME_CLIENT_TYPE_New,NAME_CLIENT_TYPE_Refreshed,NAME_CLIENT_TYPE_Repeater,NAME_PORTFOLIO_Cards,NAME_PORTFOLIO_Cash,NAME_PORTFOLIO_POS,NAME_PORTFOLIO_other,CommonCosSimi,TARGET
0,136061,1,1,1,1,0,58500.0,254700.0,14350.5,225000.0,...,0.0,0.142857,0.0,0.857143,0.0,0.285714,0.428571,0.285714,0.98355,0
1,247842,1,0,0,1,0,337500.0,1066752.0,31320.0,931500.0,...,0.0,0.1,0.0,0.9,0.1,0.5,0.2,0.2,0.992946,0
2,250871,1,0,1,1,2,202500.0,646920.0,25065.0,540000.0,...,0.0,0.125,0.25,0.625,0.25,0.125,0.25,0.375,0.95764,1
3,365156,1,1,0,1,0,270000.0,1125000.0,104107.5,1125000.0,...,0.0,0.333333,0.0,0.666667,0.333333,0.0,0.333333,0.333333,0.980595,0
4,210965,1,1,1,1,0,360000.0,1718473.5,50247.0,1345500.0,...,0.0,0.0,0.0,1.0,0.0,0.857143,0.142857,0.0,0.999174,0


In [9]:
import pandas_ml as pdml

In [12]:
# Move the ID column out of the frame: pop returns it and removes it in place.
TrainDataID = TrainData.pop('SK_ID_CURR')

In [13]:
# Move the label column out of the frame: pop returns it and removes it in place.
TrainDataTARGET = TrainData.pop('TARGET')

In [14]:
# Show (rows, columns) after the ID and label columns were removed.
TrainData.shape

(307511, 220)

In [15]:
 # Wrap the feature values and the label array in a pandas_ml ModelFrame, keeping the feature column names.
 df = pdml.ModelFrame(TrainData.values, target=TrainDataTARGET.values,columns=TrainData.columns.tolist())

In [16]:
# Preview the ModelFrame.
df.head()

Unnamed: 0,.target,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_traintest,AMT_ANNUITY_traintest,AMT_GOODS_PRICE_traintest,...,NAME_TYPE_SUITE_family,NAME_TYPE_SUITE_other_prevapp,NAME_CLIENT_TYPE_New,NAME_CLIENT_TYPE_Refreshed,NAME_CLIENT_TYPE_Repeater,NAME_PORTFOLIO_Cards,NAME_PORTFOLIO_Cash,NAME_PORTFOLIO_POS,NAME_PORTFOLIO_other,CommonCosSimi
0,0,1.0,1.0,1.0,1.0,0.0,58500.0,254700.0,14350.5,225000.0,...,0.142857,0.0,0.142857,0.0,0.857143,0.0,0.285714,0.428571,0.285714,0.98355
1,0,1.0,0.0,0.0,1.0,0.0,337500.0,1066752.0,31320.0,931500.0,...,0.3,0.0,0.1,0.0,0.9,0.1,0.5,0.2,0.2,0.992946
2,1,1.0,0.0,1.0,1.0,2.0,202500.0,646920.0,25065.0,540000.0,...,0.125,0.0,0.125,0.25,0.625,0.25,0.125,0.25,0.375,0.95764
3,0,1.0,1.0,0.0,1.0,0.0,270000.0,1125000.0,104107.5,1125000.0,...,0.333333,0.0,0.333333,0.0,0.666667,0.333333,0.0,0.333333,0.333333,0.980595
4,0,1.0,1.0,1.0,1.0,0.0,360000.0,1718473.5,50247.0,1345500.0,...,0.428571,0.0,0.0,0.0,1.0,0.0,0.857143,0.142857,0.0,0.999174


In [17]:
# Count the rows per label value before resampling.
df.target.value_counts()

0    282686
1     24825
Name: .target, dtype: int64

In [19]:
# Create a SMOTE over-sampler through the ModelFrame's imbalance accessor, using default arguments.
# NOTE(review): no random_state is passed, so the resampling is presumably not reproducible — confirm.
sampler = df.imbalance.over_sampling.SMOTE()

In [20]:
# Display the sampler's configuration.
sampler

SMOTE(k=None, k_neighbors=5, kind='regular', m=None, m_neighbors=10, n_jobs=1,
   out_step=0.5, random_state=None, ratio='auto', svm_estimator=None)

In [21]:
# Run the sampler on the ModelFrame; `sampled` holds the resampled data.
sampled = df.fit_sample(sampler)

In [26]:
# Preview the resampled frame.
sampled.head()

Unnamed: 0,.target,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_traintest,AMT_ANNUITY_traintest,AMT_GOODS_PRICE_traintest,...,NAME_TYPE_SUITE_family,NAME_TYPE_SUITE_other_prevapp,NAME_CLIENT_TYPE_New,NAME_CLIENT_TYPE_Refreshed,NAME_CLIENT_TYPE_Repeater,NAME_PORTFOLIO_Cards,NAME_PORTFOLIO_Cash,NAME_PORTFOLIO_POS,NAME_PORTFOLIO_other,CommonCosSimi
0,0,1.0,1.0,1.0,1.0,0.0,58500.0,254700.0,14350.5,225000.0,...,0.142857,0.0,0.142857,0.0,0.857143,0.0,0.285714,0.428571,0.285714,0.98355
1,0,1.0,0.0,0.0,1.0,0.0,337500.0,1066752.0,31320.0,931500.0,...,0.3,0.0,0.1,0.0,0.9,0.1,0.5,0.2,0.2,0.992946
2,1,1.0,0.0,1.0,1.0,2.0,202500.0,646920.0,25065.0,540000.0,...,0.125,0.0,0.125,0.25,0.625,0.25,0.125,0.25,0.375,0.95764
3,0,1.0,1.0,0.0,1.0,0.0,270000.0,1125000.0,104107.5,1125000.0,...,0.333333,0.0,0.333333,0.0,0.666667,0.333333,0.0,0.333333,0.333333,0.980595
4,0,1.0,1.0,1.0,1.0,0.0,360000.0,1718473.5,50247.0,1345500.0,...,0.428571,0.0,0.0,0.0,1.0,0.0,0.857143,0.142857,0.0,0.999174


In [27]:
# Show (rows, columns) of the resampled frame.
sampled.shape

(565372, 221)

In [28]:
# Count the rows per label value after resampling.
sampled.target.value_counts()

1    282686
0    282686
Name: .target, dtype: int64

In [29]:
# Per-column null counts, sorted ascending; head() therefore shows the columns with the FEWEST nulls.
sampled.isnull().sum().sort_values().head()

.target                                     0
NAME_CONTRACT_STATUS_Active_traintest       0
NAME_CONTRACT_STATUS_Completed_traintest    0
NAME_CONTRACT_STATUS_other_traintest        0
NAME_CONTRACT_STATUS_Signed_traintest       0
dtype: int64

In [30]:
# Preview rows whose label is 1 in the resampled frame.
sampled[sampled.target==1].head()

Unnamed: 0,.target,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_traintest,AMT_ANNUITY_traintest,AMT_GOODS_PRICE_traintest,...,NAME_TYPE_SUITE_family,NAME_TYPE_SUITE_other_prevapp,NAME_CLIENT_TYPE_New,NAME_CLIENT_TYPE_Refreshed,NAME_CLIENT_TYPE_Repeater,NAME_PORTFOLIO_Cards,NAME_PORTFOLIO_Cash,NAME_PORTFOLIO_POS,NAME_PORTFOLIO_other,CommonCosSimi
2,1,1.0,0.0,1.0,1.0,2.0,202500.0,646920.0,25065.0,540000.0,...,0.125,0.0,0.125,0.25,0.625,0.25,0.125,0.25,0.375,0.95764
34,1,1.0,1.0,0.0,1.0,0.0,198000.0,1216201.5,35689.5,1062000.0,...,0.333333,0.0,0.333333,0.0,0.666667,0.0,0.0,1.0,0.0,0.999403
42,1,1.0,0.0,1.0,0.0,0.0,315000.0,873000.0,31059.0,873000.0,...,0.692308,0.0,0.0,0.153846,0.846154,0.076923,0.615385,0.307692,0.0,0.998715
45,1,1.0,1.0,0.0,1.0,0.0,225000.0,635962.5,37998.0,549000.0,...,0.5,0.0,0.5,0.0,0.5,0.5,0.0,0.5,0.0,0.985067
65,1,1.0,1.0,0.0,1.0,0.0,171000.0,314100.0,19111.5,225000.0,...,0.6,0.0,0.0,0.0,1.0,0.2,0.2,0.2,0.4,0.942613


In [31]:
# Print the index range, column dtypes and memory usage of the resampled frame.
sampled.info()

<class 'pandas_ml.core.frame.ModelFrame'>
RangeIndex: 565372 entries, 0 to 565371
Columns: 221 entries, .target to CommonCosSimi
dtypes: float64(220), int64(1)
memory usage: 953.3 MB


In [32]:
# Write the resampled frame to AllTrainSMOTE.csv (index included).
sampled.to_csv(os.path.join(TrainTestDataDir,'AllTrainSMOTE.csv'))

In [33]:
# Load the resampled training data back from AllTrainSMOTE.csv and preview it.
TrainData = loadData('AllTrainSMOTE.csv')
TrainData.head()

Unnamed: 0,.target,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_traintest,AMT_ANNUITY_traintest,AMT_GOODS_PRICE_traintest,...,NAME_TYPE_SUITE_family,NAME_TYPE_SUITE_other_prevapp,NAME_CLIENT_TYPE_New,NAME_CLIENT_TYPE_Refreshed,NAME_CLIENT_TYPE_Repeater,NAME_PORTFOLIO_Cards,NAME_PORTFOLIO_Cash,NAME_PORTFOLIO_POS,NAME_PORTFOLIO_other,CommonCosSimi
213866,0,1.0,1.0,1.0,1.0,0.0,202500.0,528687.0,30478.5,436500.0,...,0.6,0.0,0.2,0.0,0.8,0.2,0.2,0.4,0.2,0.977479
252540,0,1.0,1.0,0.0,0.0,0.0,144000.0,225000.0,17554.5,225000.0,...,0.5,0.0,0.5,0.0,0.5,0.0,0.5,0.5,0.0,0.998196
79746,0,1.0,0.0,0.0,1.0,0.0,135000.0,497520.0,52920.0,450000.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.448261
71423,1,1.0,1.0,0.0,1.0,0.0,135000.0,521280.0,25209.0,450000.0,...,0.5,0.0,0.5,0.0,0.5,0.5,0.5,0.0,0.0,0.998841
477271,1,1.0,0.374379,0.0,0.625621,0.0,149076.465979,295307.068043,23385.865287,241847.068043,...,0.593595,0.0,0.406405,0.5,0.093595,0.0,0.0,0.906405,0.093595,0.990804


In [34]:
# Show (rows, columns) of the reloaded frame.
TrainData.shape

(565372, 221)

In [38]:
# Rename the '.target' column to 'TARGET', in place.
TrainData.rename(columns={'.target':'TARGET'},inplace=True)

In [43]:
# Preview the first rows.
TrainData.head()

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_traintest,AMT_ANNUITY_traintest,AMT_GOODS_PRICE_traintest,...,NAME_TYPE_SUITE_family,NAME_TYPE_SUITE_other_prevapp,NAME_CLIENT_TYPE_New,NAME_CLIENT_TYPE_Refreshed,NAME_CLIENT_TYPE_Repeater,NAME_PORTFOLIO_Cards,NAME_PORTFOLIO_Cash,NAME_PORTFOLIO_POS,NAME_PORTFOLIO_other,CommonCosSimi
0,0,1.0,1.0,1.0,1.0,0.0,202500.0,528687.0,30478.5,436500.0,...,0.6,0.0,0.2,0.0,0.8,0.2,0.2,0.4,0.2,0.977479
1,0,1.0,1.0,0.0,0.0,0.0,144000.0,225000.0,17554.5,225000.0,...,0.5,0.0,0.5,0.0,0.5,0.0,0.5,0.5,0.0,0.998196
2,0,1.0,0.0,0.0,1.0,0.0,135000.0,497520.0,52920.0,450000.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.448261
3,1,1.0,1.0,0.0,1.0,0.0,135000.0,521280.0,25209.0,450000.0,...,0.5,0.0,0.5,0.0,0.5,0.5,0.5,0.0,0.0,0.998841
4,1,1.0,0.374379,0.0,0.625621,0.0,149076.465979,295307.068043,23385.865287,241847.068043,...,0.593595,0.0,0.406405,0.5,0.093595,0.0,0.0,0.906405,0.093595,0.990804


In [40]:
# Replace the index with a fresh 0..n-1 range, discarding the old labels.
TrainData.reset_index(drop=True, inplace=True)

In [42]:
# Write TrainData to AllTrainSMOTE.csv (index included), replacing any existing file at that path.
TrainData.to_csv(os.path.join(TrainTestDataDir,'AllTrainSMOTE.csv'))

In [44]:
# Display the rows whose TARGET is 1 (no head(): the whole filtered frame is the cell output).
TrainData[TrainData.TARGET==1]

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_traintest,AMT_ANNUITY_traintest,AMT_GOODS_PRICE_traintest,...,NAME_TYPE_SUITE_family,NAME_TYPE_SUITE_other_prevapp,NAME_CLIENT_TYPE_New,NAME_CLIENT_TYPE_Refreshed,NAME_CLIENT_TYPE_Repeater,NAME_PORTFOLIO_Cards,NAME_PORTFOLIO_Cash,NAME_PORTFOLIO_POS,NAME_PORTFOLIO_other,CommonCosSimi
3,1,1.000000,1.000000,0.000000,1.000000,0.000000,135000.000000,5.212800e+05,25209.000000,4.500000e+05,...,0.500000,0.000000,0.500000,0.000000,0.500000,0.500000,0.500000,0.000000,0.000000,0.998841
4,1,1.000000,0.374379,0.000000,0.625621,0.000000,149076.465979,2.953071e+05,23385.865287,2.418471e+05,...,0.593595,0.000000,0.406405,0.500000,0.093595,0.000000,0.000000,0.906405,0.093595,0.990804
6,1,1.000000,1.000000,0.000000,1.000000,0.000000,67500.000000,2.589816e+05,22201.203174,2.155273e+05,...,-0.789496,-0.578993,-0.578993,-0.789496,-0.789496,-0.789496,-0.789496,-0.578993,-0.789496,-0.155847
11,1,1.000000,0.902040,0.902040,0.902040,0.000000,116908.192622,1.106574e+06,30057.741970,9.405918e+05,...,0.676530,0.000000,0.245102,0.000000,0.754898,0.000000,0.039184,0.941224,0.019592,0.994936
12,1,1.000000,0.398676,0.000000,1.000000,0.601324,118031.048739,2.998641e+05,18918.314514,2.331179e+05,...,0.199338,0.200441,0.399779,0.000000,0.600221,0.000000,0.000000,0.799559,0.200441,0.972266
13,1,1.000000,0.829934,0.170066,0.829934,0.170066,183826.481902,2.597510e+05,29015.435445,2.250000e+05,...,0.325235,0.000000,0.116510,0.000000,0.883490,0.000000,0.349530,0.233020,0.417450,0.958240
14,1,1.000000,0.000000,1.000000,0.241711,1.758289,157500.000000,3.425506e+05,17040.898858,2.529385e+05,...,0.620855,0.000000,0.500000,0.000000,0.500000,0.000000,0.000000,1.000000,0.000000,0.985190
15,1,0.115413,1.000000,0.115413,0.884587,0.000000,120461.287068,1.817939e+05,9618.553551,1.763645e+05,...,0.353835,0.028853,0.205771,0.234624,0.559606,0.000000,0.000000,0.588459,0.411541,0.986805
16,1,1.000000,0.000000,0.312366,1.000000,0.687634,272792.050168,6.140945e+05,28940.398955,5.160655e+05,...,0.640672,0.062473,0.148427,0.000000,0.851573,0.148427,0.062473,0.234382,0.554718,0.980887
17,1,1.000000,0.515448,0.000000,1.000000,0.515448,141402.413142,1.163272e+05,9003.044687,1.125000e+05,...,0.000000,0.000000,0.121138,0.121138,0.757724,0.371138,0.257724,0.371138,0.000000,0.998732


In [3]:
def loadData(DataName):
    """Read one CSV from ``TrainTestDataDir`` without shuffling.

    Args:
        DataName: file name relative to ``TrainTestDataDir``.

    Returns:
        The DataFrame in file order, with the first CSV column as its index,
        or ``None`` after printing a message when the file does not exist.
    """
    DataPath = os.path.join(TrainTestDataDir, DataName)
    if os.path.exists(DataPath):
        # Rows are returned exactly as stored; this version does not shuffle.
        return pd.read_csv(DataPath, index_col=0)
    print('%s does not exist!' % DataPath)
    return None

In [8]:
print('loading AllTrain...')
# Read the combined training set and give it a fresh 0..n-1 index.
TrainData = loadData('AllTrain.csv').reset_index(drop=True)
print(TrainData.shape)
# Report the class balance by counting the rows that carry each label.
print('Num of positive samples:', (TrainData.TARGET == 1).sum())
print('Num of negtive samples:', (TrainData.TARGET == 0).sum())

loading AllTrain...
(307511, 222)
Num of positive samples: 24825
Num of negtive samples: 282686


In [9]:
# Separate the label from the inputs: pop returns TARGET and removes it in place.
TrainTarget = TrainData.pop('TARGET')
# Remember the remaining column names so frames can be rebuilt from raw arrays.
TrainColumns = TrainData.columns

In [27]:
# Look up the row(s) for one specific applicant ID.
TrainData[TrainData.SK_ID_CURR==368365]

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_traintest,AMT_ANNUITY_traintest,AMT_GOODS_PRICE_traintest,...,NAME_TYPE_SUITE_family,NAME_TYPE_SUITE_other_prevapp,NAME_CLIENT_TYPE_New,NAME_CLIENT_TYPE_Refreshed,NAME_CLIENT_TYPE_Repeater,NAME_PORTFOLIO_Cards,NAME_PORTFOLIO_Cash,NAME_PORTFOLIO_POS,NAME_PORTFOLIO_other,CommonCosSimi
88197,368365,1,1,0,1,0,220500.0,807984.0,26703.0,697500.0,...,0.6,0.0,0.2,0.0,0.8,0.2,0.4,0.4,0.0,0.999108


In [10]:
# Display the saved column names.
TrainColumns

Index(['SK_ID_CURR', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR',
       'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT_traintest', 'AMT_ANNUITY_traintest',
       'AMT_GOODS_PRICE_traintest',
       ...
       'NAME_TYPE_SUITE_family', 'NAME_TYPE_SUITE_other_prevapp',
       'NAME_CLIENT_TYPE_New', 'NAME_CLIENT_TYPE_Refreshed',
       'NAME_CLIENT_TYPE_Repeater', 'NAME_PORTFOLIO_Cards',
       'NAME_PORTFOLIO_Cash', 'NAME_PORTFOLIO_POS', 'NAME_PORTFOLIO_other',
       'CommonCosSimi'],
      dtype='object', length=221)

In [11]:
# Split into training (70%) and validation (30%) sets with a fixed random_state.
# `.values` hands plain NumPy arrays to train_test_split, so column names are
# not carried over; the results are arrays, not DataFrames.
# NOTE(review): no `stratify` argument is given, so the label ratio of the two parts is left to chance.
X_train, X_val, y_train, y_val = train_test_split(TrainData.values,TrainTarget.values, test_size=0.3,random_state=611)

In [12]:
# Show (rows, columns) of the training split.
X_train.shape

(215257, 221)

In [13]:
# Rebuild a labelled DataFrame from the training array using the saved column names.
Traindf = pd.DataFrame(data=X_train,columns=TrainColumns)

In [21]:
# Preview the training frame.
Traindf.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_traintest,AMT_ANNUITY_traintest,AMT_GOODS_PRICE_traintest,...,NAME_TYPE_SUITE_other_prevapp,NAME_CLIENT_TYPE_New,NAME_CLIENT_TYPE_Refreshed,NAME_CLIENT_TYPE_Repeater,NAME_PORTFOLIO_Cards,NAME_PORTFOLIO_Cash,NAME_PORTFOLIO_POS,NAME_PORTFOLIO_other,CommonCosSimi,TARGET
0,159336,1.0,1.0,0.0,1.0,0.0,67500.0,203760.0,11376.0,180000.0,...,0.0,0.333333,0.333333,0.333333,0.0,0.333333,0.666667,0.0,0.999124,0
1,346354,1.0,0.0,0.0,1.0,0.0,220500.0,675000.0,32472.0,675000.0,...,0.0,0.25,0.0,0.75,0.25,0.0,0.75,0.0,0.952306,0
2,128100,0.0,1.0,1.0,1.0,2.0,180000.0,540000.0,27000.0,540000.0,...,0.0,0.5,0.5,0.0,0.0,0.0,1.0,0.0,0.99466,0
3,231205,1.0,1.0,1.0,0.0,0.0,193500.0,508495.5,24592.5,454500.0,...,0.0,0.066667,0.0,0.933333,0.066667,0.733333,0.133333,0.066667,0.998972,0
4,383460,0.0,1.0,0.0,1.0,0.0,90000.0,135000.0,6750.0,135000.0,...,0.0,0.2,0.0,0.8,0.0,0.6,0.2,0.2,0.993438,0


In [20]:
# Cast the ID column back to integers and attach the training labels as TARGET.
Traindf['SK_ID_CURR'] = Traindf['SK_ID_CURR'].astype(int)
Traindf['TARGET'] = y_train

In [22]:
# Rebuild the validation frame from the split array, cast the ID column back
# to integers and attach the validation labels as TARGET.
Valdf = pd.DataFrame(X_val, columns=TrainColumns)
Valdf['SK_ID_CURR'] = Valdf['SK_ID_CURR'].astype(int)
Valdf['TARGET'] = y_val

In [23]:
# Preview the validation frame.
Valdf.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_traintest,AMT_ANNUITY_traintest,AMT_GOODS_PRICE_traintest,...,NAME_TYPE_SUITE_other_prevapp,NAME_CLIENT_TYPE_New,NAME_CLIENT_TYPE_Refreshed,NAME_CLIENT_TYPE_Repeater,NAME_PORTFOLIO_Cards,NAME_PORTFOLIO_Cash,NAME_PORTFOLIO_POS,NAME_PORTFOLIO_other,CommonCosSimi,TARGET
0,368365,1.0,1.0,0.0,1.0,0.0,220500.0,807984.0,26703.0,697500.0,...,0.0,0.2,0.0,0.8,0.2,0.4,0.4,0.0,0.999108,0
1,346089,1.0,0.0,1.0,1.0,0.0,292500.0,675000.0,32602.5,675000.0,...,0.0,0.333333,0.0,0.666667,0.0,0.0,1.0,0.0,0.993392,0
2,263003,0.0,1.0,0.0,1.0,0.0,99000.0,270000.0,13500.0,270000.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.999994,0
3,126436,1.0,0.0,1.0,0.0,0.0,162000.0,450000.0,20979.0,450000.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.99939,0
4,235801,1.0,1.0,0.0,0.0,0.0,94500.0,717003.0,21091.5,598500.0,...,0.0,0.333333,0.0,0.666667,0.0,0.333333,0.666667,0.0,0.997723,0


In [24]:
# Show (rows, columns) of the validation frame.
Valdf.shape

(92254, 222)

In [25]:
# Show (rows, columns) of the training frame.
Traindf.shape

(215257, 222)

In [26]:
# Save the training and validation frames as CSV files (index included).
Traindf.to_csv(os.path.join(TrainTestDataDir,'train.csv'))
Valdf.to_csv(os.path.join(TrainTestDataDir,'validation.csv'))

In [31]:
# Feature matrix for the whole of TrainData: every column except the ID and the label.
# DataFrame.drop keeps the remaining columns in their original order, whereas
# Index.difference returns them sorted, which silently reorders the features and
# breaks any position-based column selection. errors='ignore' keeps the old
# tolerance for a label that is already absent.
# NOTE: this rebinds X_train, replacing the training split array of the same name.
X_train = TrainData.drop(columns=['SK_ID_CURR', 'TARGET'], errors='ignore').values

In [32]:
# Show (rows, columns) of the feature matrix.
X_train.shape

(307511, 220)

In [33]:
# Preview the training frame.
Traindf.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_traintest,AMT_ANNUITY_traintest,AMT_GOODS_PRICE_traintest,...,NAME_TYPE_SUITE_other_prevapp,NAME_CLIENT_TYPE_New,NAME_CLIENT_TYPE_Refreshed,NAME_CLIENT_TYPE_Repeater,NAME_PORTFOLIO_Cards,NAME_PORTFOLIO_Cash,NAME_PORTFOLIO_POS,NAME_PORTFOLIO_other,CommonCosSimi,TARGET
0,159336,1.0,1.0,0.0,1.0,0.0,67500.0,203760.0,11376.0,180000.0,...,0.0,0.333333,0.333333,0.333333,0.0,0.333333,0.666667,0.0,0.999124,0
1,346354,1.0,0.0,0.0,1.0,0.0,220500.0,675000.0,32472.0,675000.0,...,0.0,0.25,0.0,0.75,0.25,0.0,0.75,0.0,0.952306,0
2,128100,0.0,1.0,1.0,1.0,2.0,180000.0,540000.0,27000.0,540000.0,...,0.0,0.5,0.5,0.0,0.0,0.0,1.0,0.0,0.99466,0
3,231205,1.0,1.0,1.0,0.0,0.0,193500.0,508495.5,24592.5,454500.0,...,0.0,0.066667,0.0,0.933333,0.066667,0.733333,0.133333,0.066667,0.998972,0
4,383460,0.0,1.0,0.0,1.0,0.0,90000.0,135000.0,6750.0,135000.0,...,0.0,0.2,0.0,0.8,0.0,0.6,0.2,0.2,0.993438,0


In [36]:
# Per-column null counts, sorted ascending; head() therefore shows the columns with the FEWEST nulls.
Valdf.isnull().sum().sort_values(ascending=True).head()

SK_ID_CURR                                  0
NAME_CONTRACT_STATUS_Completed_traintest    0
NAME_CONTRACT_STATUS_other_traintest        0
NAME_CONTRACT_STATUS_Signed_traintest       0
CNT_INSTALMENT                              0
dtype: int64