# 个贷违约预测

# 方案介绍
1. 对于样本不平衡的问题，修改损失函数的权重，将负样本的权重设为0.2，正样本为1.0
1. 对于连续数值型变量，对它们进行均值-方差归一化（即标准化：减去均值后除以标准差）
1. 对于分类变量，在网络中进行重编码（即增加全连接层，用于模拟embedding）
1. 本方案直接弃用了地区编码、时间等信息

In [25]:
# 读取数据
import pandas as pd

# Compute is limited, so only 10000 records from train_public are used as
# the training set.
train_df = pd.read_csv(
    'data/data130186/train_public.csv',
)

# Print the column names first, then the (rows, columns) size of the data set.
for summary in (train_df.columns, train_df.shape):
    print(summary)

Index(['loan_id', 'user_id', 'total_loan', 'year_of_loan', 'interest',
       'monthly_payment', 'class', 'employer_type', 'industry', 'work_year',
       'house_exist', 'censor_status', 'issue_date', 'use', 'post_code',
       'region', 'debt_loan_ratio', 'del_in_18month', 'scoring_low',
       'scoring_high', 'known_outstanding_loan', 'known_dero',
       'pub_dero_bankrup', 'recircle_b', 'recircle_u', 'initial_list_status',
       'app_type', 'earlies_credit_mon', 'title', 'policy_code', 'f0', 'f1',
       'f2', 'f3', 'f4', 'early_return', 'early_return_amount',
       'early_return_amount_3mon', 'isDefault'],
      dtype='object')
(10000, 39)


# 构造dataset

In [26]:
# 飞桨的主库，paddle 根目录下保留了常用API的别名，当前包括：paddle.tensor、paddle.framework、paddle.device目录下的所有API
import paddle
import numpy as np

class MyDateset(paddle.io.Dataset):
    """Dataset that turns one loan CSV into (numeric, categorical, label) samples.

    Numeric columns are standardised with the mean and standard deviation of a
    reference CSV. Categorical columns are replaced by the position of their
    value in the sorted list of values found in that same reference CSV.
    All rows are encoded once in ``__init__`` so ``__getitem__`` is a cheap
    array lookup.
    """

    def __init__(self,csv_dir,standard_csv_dir='data/data130186/train_public.csv',mode = 'train'):
        """Load the data and pre-compute every encoded sample.

        Args:
            csv_dir: Path of the CSV file whose rows become the samples.
            standard_csv_dir: Path of the reference CSV used to compute the
                normalisation statistics and the category-to-index tables.
            mode: ``'train'`` makes ``__getitem__`` return the ``isDefault``
                value as the label; any other value returns ``loan_id``.
        """
        super(MyDateset, self).__init__()

        # Rows to serve.
        # csv_dir = 'data/data130186/train_public.csv'
        self.df = pd.read_csv(csv_dir)

        # Per-column mean and standard deviation of the reference file.
        # numeric_only=True keeps this working when the file also holds
        # non-numeric columns; recent pandas versions raise otherwise.
        st_df = pd.read_csv(standard_csv_dir)
        self.mean_df = st_df.mean(numeric_only=True)
        self.std_df = st_df.std(numeric_only=True)

        # Numeric columns / categorical columns / columns that are ignored.
        self.num_item = ['total_loan', 'year_of_loan', 'interest','monthly_payment',
        'debt_loan_ratio', 'del_in_18month', 'scoring_low','scoring_high', 'known_outstanding_loan', 'known_dero','pub_dero_bankrup', 'recircle_b', 'recircle_u', 
        'f0', 'f1','f2', 'f3', 'f4', 'early_return', 'early_return_amount','early_return_amount_3mon']
        self.un_num_item = ['class','employer_type','industry','work_year','house_exist', 'censor_status',
        'use',
        'initial_list_status','app_type',
        'policy_code']
        self.un_use_item = ['loan_id', 'user_id',
        'issue_date', 
        'post_code', 'region',
        'earlies_credit_mon','title']

        # Lookup table: for every categorical column, the distinct values of
        # the reference file. The list is sorted explicitly because set
        # iteration order is not sorted and, for strings, can change from one
        # interpreter session to the next, which would give the same category
        # a different index at training time and at prediction time.
        # Missing values are left out, so they are encoded as -1.
        un_num_item_list = {}
        for item in self.un_num_item:
            values = st_df[item].dropna().unique().tolist()
            un_num_item_list[item] = sorted(values, key=self._sort_key)
        self.un_num_item_list = un_num_item_list

        self.mode = mode

        # Encode every row once instead of on every __getitem__ call.
        self._num_data = self._normalise_numeric()
        self._emb_data = self._encode_categorical()

    @staticmethod
    def _sort_key(value):
        """Order values by type name first so mixed-type columns stay sortable."""
        return (type(value).__name__, value)

    def _normalise_numeric(self):
        """Return a float32 array (rows x numeric columns) of standardised values."""
        mean = self.mean_df[self.num_item]
        # A constant reference column has std 0; divide by 1 instead so the
        # result is 0 rather than inf/nan.
        std = self.std_df[self.num_item].replace(0, 1.0)
        # A missing value is treated as a raw 0 before normalisation.
        raw = self.df[self.num_item].fillna(0)
        return ((raw - mean) / std).to_numpy(dtype=np.float32)

    def _encode_categorical(self):
        """Return a float32 array (rows x categorical columns) of category indices.

        A value that is missing, or that does not occur in the reference
        file, is encoded as -1. A column absent from the data is -1 for
        every row.
        """
        codes = np.full((len(self.df), len(self.un_num_item)), -1, dtype=np.float32)
        for col, item in enumerate(self.un_num_item):
            if item not in self.df.columns:
                continue
            lookup = {value: pos for pos, value in enumerate(self.un_num_item_list[item])}
            codes[:, col] = [lookup.get(value, -1) for value in self.df[item].tolist()]
        return codes

    def __getitem__(self, index):
        """Return ``(data, emb_data, label)`` for the row at ``index``.

        ``data`` holds the standardised numeric features and ``emb_data`` the
        categorical indices, both as float32 tensors. ``label`` is an int64
        numpy value: ``isDefault`` in train mode, otherwise ``loan_id`` so a
        prediction can be matched back to its loan.
        """
        data = paddle.to_tensor(self._num_data[index]).astype('float32')
        emb_data = paddle.to_tensor(self._emb_data[index]).astype('float32')

        label_column = 'isDefault' if self.mode == 'train' else 'loan_id'
        label = np.array(self.df[label_column][index]).astype('int64')
        return data,emb_data,label

    def __len__(self):
        """Return the number of rows in the loaded CSV."""
        return len(self.df)

In [27]:
# Training mode: build the dataset and inspect its first sample.
dataset = MyDateset('data/data130186/train_public.csv')
sample = dataset[0]
data, emb_data, label = sample

# Whole sample first, then the two feature shapes, the label and the
# categorical features on their own.
print(sample)
print(data.shape)
print(emb_data.shape)
print(label)
print(emb_data)

(Tensor(shape=[21], dtype=float32, place=Place(cpu), stop_gradient=True,
       [ 1.94507027, -0.56161529, -0.36030969,  2.81924415, -1.06214857,
        -0.35715219, -1.39864016, -1.26400948, -1.57160521, -0.37241074,
        -0.36666167, -0.41815358,  1.46703112, -1.42198610, -0.03773430,
        -0.61069232, -1.16881323, -0.84118545,  1.17932808,  2.56085277,
        -0.52783436]), Tensor(shape=[10], dtype=float32, place=Place(cpu), stop_gradient=True,
       [5. , 2. , 11., 5. , 0. , 1. , 2. , 0. , 0. , 0. ]), array(0))
[21]
[10]
0
Tensor(shape=[10], dtype=float32, place=Place(cpu), stop_gradient=True,
       [5. , 2. , 11., 5. , 0. , 1. , 2. , 0. , 0. , 0. ])


# 构造网络

In [28]:
# paddle.nn：组网相关的API，包括 Linear、卷积 Conv2D、循环神经网络LSTM、损失函数CrossEntropyLoss、激活函数ReLU等
# Linear：神经网络的全连接层函数，包含所有输入权重相加的基本神经元结构
# class MyNet(paddle.nn.Layer):
#     def __init__(self):
#         super(MyNet,self).__init__()
#         # 输入维度是21，输出维度是512
#         self.fc = paddle.nn.Linear(in_features=21, out_features=512)

#         # 输入维度是10，输出维度是2048
#         # 因为emb_data.shape=[10]
#         self.emb1 = paddle.nn.Linear(in_features=10,out_features=2048)
#         # 输入维度是2048，输出维度是512
#         self.emb2 = paddle.nn.Linear(in_features=2048,out_features=512)

#         # 输入维度是1024，输出维度是2
#         self.out = paddle.nn.Linear(in_features=1024,out_features=2)

#     def forward(self,data,emb_data):
#         x = self.fc(data)
#         # x.shape=[21]变为x.shape=[512]

#         x = paddle.nn.functional.relu(x)

#         emb = self.emb1(emb_data)
#         # emb_data.shape=[10]变为emb_data.shape=[2048]
#         emb = paddle.nn.functional.relu(emb)

#         emb = self.emb2(emb)
#         # emb_data.shape=[2048]变为emb_data.shape=[512]
#         emb = paddle.nn.functional.relu(emb)

#         # 对输入沿参数 axis 轴进行联结，返回一个新的 Tensor
#         # 连结
#         x = paddle.concat([x,emb],axis=-1)
#         # concat[x,emb].shape=[512+512=1024]

#         x = self.out(x)
        
#         x = paddle.nn.functional.sigmoid(x)
#         # x = paddle.nn.functional.relu(x)
#         return x

构造更复杂的网络

In [29]:
class MyNet(paddle.nn.Layer):
    """Stack of fully connected layers producing two sigmoid scores per sample.

    The 31 input features are the numeric and categorical feature vectors
    joined together. The Sigmoid and Dropout sub-layers are created but
    ``forward`` does not call them, so the hidden layers are applied back to
    back without an activation in between.
    """

    def __init__(self):
        super().__init__()

        # Hidden stage 1: 31 -> 128
        self.fc1 = paddle.nn.Linear(31, 128)
        self.Sigmoid1 = paddle.nn.Sigmoid()
        self.dropout1 = paddle.nn.Dropout(p=0.2)

        # Hidden stage 2: 128 -> 512
        self.fc2 = paddle.nn.Linear(128, 512)
        self.Sigmoid2 = paddle.nn.Sigmoid()
        self.dropout2 = paddle.nn.Dropout(p=0.3)

        # Hidden stage 3: 512 -> 2048
        self.fc3 = paddle.nn.Linear(512, 2048)
        self.Sigmoid3 = paddle.nn.Sigmoid()
        self.dropout3 = paddle.nn.Dropout(p=0.2)

        # Hidden stage 4: 2048 -> 1024
        self.fc4 = paddle.nn.Linear(2048, 1024)
        self.Sigmoid4 = paddle.nn.Sigmoid()
        self.dropout4 = paddle.nn.Dropout(p=0.2)

        # Output layer: one score per class.
        self.out = paddle.nn.Linear(1024, 2)

    def forward(self,data,emb_data):
        """Join both feature tensors on the last axis and run the linear stack.

        Returns the element-wise sigmoid of the two output scores.
        """
        hidden = paddle.concat([data, emb_data], axis=-1)

        # Only the linear layers are applied; the per-stage Sigmoid/Dropout
        # sub-layers are deliberately left out of the chain.
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4, self.out):
            hidden = layer(hidden)

        return paddle.nn.functional.sigmoid(hidden)

# 训练

In [30]:
# Build the training reader: shuffled mini-batches of 1000 samples, with the
# final partial batch kept.
train_dataset = MyDateset('data/data130186/train_public.csv')
train_dataloader = paddle.io.DataLoader(train_dataset, batch_size=1000, shuffle=True, drop_last=False)

In [31]:
# Build the model and switch it to training mode.
model = MyNet()
# model_dict = paddle.load('model.pdparams')
# model.set_dict(model_dict)
model.train()
max_epoch=100
opt = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())

# Per-class loss weights for the imbalanced labels: class 0 -> 0.2,
# class 1 -> 1.0. Built once because it never changes between steps.
class_weight = paddle.to_tensor([0.2,1.0])

# Total number of optimisation steps taken so far.
now_step=0
# A checkpoint is written only when the batch loss drops below this value...
minLoss = 0.48
# ...or when the epoch AUC rises above this value.
maxAuc = 0.843
m = paddle.metric.Auc()
for epoch in range(max_epoch):
    # Start each epoch with an empty metric. Without the reset the AUC keeps
    # accumulating predictions made by all earlier, less-trained states of
    # the model, so it lags far behind the model's current quality.
    m.reset()

    for step, data in enumerate(train_dataloader):
        now_step+=1

        data,emb_data, label = data
        pre = model(data,emb_data)

        loss = paddle.nn.functional.cross_entropy(pre,label,weight=class_weight,reduction='mean')
        loss.backward()
        opt.step()
        opt.clear_gradients()

        # Keep the weights of the batch with the lowest loss seen so far.
        loss_value = loss.mean().numpy()
        if minLoss > loss_value:
            minLoss = loss_value
            paddle.save(model.state_dict(), 'model/modelMinLoss_mynet_{}.pdparams'.format(minLoss))

        # Running AUC over the batches of the current epoch.
        m.update(preds=pre, labels=label)
        resAccumulate = m.accumulate()
        print("epoch: {}, batch: {}, Auc is:{}, loss is: {}".format(epoch, step, resAccumulate,loss_value))

    # Compare the AUC over the whole epoch, not a single noisy batch, with
    # the best value so far before writing the "max AUC" checkpoint.
    resAccumulate = m.accumulate()
    if maxAuc < resAccumulate:
        maxAuc = resAccumulate
        paddle.save(model.state_dict(), 'model/modelmaxAuc_mynet_{}.pdparams'.format(maxAuc))


# Save the final weights to modelM_mynet.pdparams.
paddle.save(model.state_dict(), 'modelM_mynet.pdparams')

epoch: 0, batch: 0, Auc is:0.5499751871448442, loss is: [0.69469315]
epoch: 0, batch: 1, Auc is:0.5517433456012897, loss is: [0.69435847]
epoch: 0, batch: 2, Auc is:0.553727997412739, loss is: [0.6808947]
epoch: 0, batch: 3, Auc is:0.562773089702137, loss is: [0.680034]
epoch: 0, batch: 4, Auc is:0.5686370742145967, loss is: [0.6775569]
epoch: 0, batch: 5, Auc is:0.5886830818152956, loss is: [0.6707368]
epoch: 0, batch: 6, Auc is:0.5940370714375736, loss is: [0.6798332]
epoch: 0, batch: 7, Auc is:0.6058022277917015, loss is: [0.66747534]
epoch: 0, batch: 8, Auc is:0.6103651077784652, loss is: [0.6681505]
epoch: 0, batch: 9, Auc is:0.6176003362312057, loss is: [0.6649606]
epoch: 1, batch: 0, Auc is:0.6273525722939826, loss is: [0.6651412]
epoch: 1, batch: 1, Auc is:0.6336310187465297, loss is: [0.6570655]
epoch: 1, batch: 2, Auc is:0.6416815717042497, loss is: [0.6543868]
epoch: 1, batch: 3, Auc is:0.6502967956321495, loss is: [0.6488952]
epoch: 1, batch: 4, Auc is:0.6552567138108704, l

In [32]:
# 
# # 构造模型
# model = MyNet()

# # model_dict = paddle.load('model.pdparams')
# # model.set_dict(model_dict)

# model.train()

# max_epoch=15
# # 定义优化算法，使用随机梯度下降SGD
# opt = paddle.optimizer.SGD(learning_rate=0.15, parameters=model.parameters())

# # 训练
# now_step=0
# for epoch in range(max_epoch):
#     for step, data in enumerate(train_dataloader):
#         now_step+=1

#         data,emb_data, label = data
#         pre = model(data,emb_data)
#         # binary_cross_entropy
#         loss = paddle.nn.functional.cross_entropy                                      
#         # loss = paddle.nn.functional.square_error_cost(pre,label.reshape([-1,1]).astype('float32'))
#         # loss = paddle.mean(loss)
#         loss.backward()
#         opt.step()
#         opt.clear_gradients()
#         if now_step%1==0:
#             print("epoch: {}, batch: {}, loss is: {}".format(epoch, step, loss.mean().numpy()))

# # 保存模型到model.pdparams
# paddle.save(model.state_dict(), 'model.pdparams')

# 预测

In [36]:
# Restore a trained model and build the reader for the test set.
model = MyNet()

# Alternative checkpoints:
# model_dict = paddle.load('/home/aistudio/model/modelMinLoss_mynet_[0.47939792].pdparams')
# model_dict = paddle.load('/home/aistudio/model/modelmaxAuc_mynet_0.8430892418873612.pdparams')
model_dict = paddle.load('/home/aistudio/model/modelmaxAuc_mynet_0.844108951825.pdparams')
model.set_dict(model_dict)

# Evaluation mode for prediction.
model.eval()

# In 'test' mode the dataset yields loan_id in place of the label.
test_dataset = MyDateset('data/data130187/test_public.csv', mode='test')

# One sample per batch, in file order, nothing dropped.
test_dataloader = paddle.io.DataLoader(test_dataset, batch_size=1, shuffle=False, drop_last=False)

In [34]:
# # 将结果保存在result.csv中
# result = []
# for step, data in enumerate(test_dataloader):
#     data ,emb_data, loan_id = data
#     pre = model(data,emb_data)
#     result.append([loan_id.numpy()[0], pre[:,1].numpy()[0]])
#     # result.append([loan_id.numpy()[0], np.argmax(pre.numpy())])

# pd.DataFrame(result,columns=['id','isDefault']).to_csv('result.csv',index=None)

In [37]:
# Score the test set and write one default score per loan to result.csv.
result = []
# Prediction only: no_grad stops paddle from recording the autograd graph
# for every forward pass.
with paddle.no_grad():
    for step, data in enumerate(test_dataloader):
        data ,emb_data, loan_id = data
        pre = model(data,emb_data)
        # Column 1 of the output is the score for class 1 (isDefault == 1).
        # Flattening both arrays makes the loop work for any batch size,
        # not only batch_size=1.
        ids = loan_id.numpy().reshape(-1)
        probs = pre[:,1].numpy().reshape(-1)
        result.extend([one_id, prob] for one_id, prob in zip(ids, probs))

pd.DataFrame(result,columns=['id','isDefault']).to_csv('result.csv',index=False)