In [2]:
# Merge data
import pandas as pd
from functools import reduce
# Dataset name -> CSV file it is read from (and written back to below).
path_dict = dict(
    bond_data='bond_data.csv',
    enterprise_data='enterprise_data_10.csv',
    spread_data='spread_data.csv',
    macro_data='macro_data.csv',
)

# Read every table up front, in path_dict order.
frames = {name: pd.read_csv(csv_path) for name, csv_path in path_dict.items()}

# For each table: report its shape, order rows by bond code then trade date,
# and write the ordered table back over the file it was loaded from.
# NOTE(review): this rewrites the input CSVs on every run — confirm that
# overwriting the source files is intended.
for name, csv_path in path_dict.items():
    table = frames[name]
    print(table.shape)
    table = table.sort_values(by=['债券代码', '交易日期'])
    table.to_csv(csv_path, index=False)
    frames[name] = table

# Expose the sorted tables under the names the rest of the notebook uses.
bond_data = frames['bond_data']
enterprise_data = frames['enterprise_data']
spread_data = frames['spread_data']
macro_data = frames['macro_data']

# Key columns shared by all four tables (bond code, trade date).
merge_keys = ['债券代码', '交易日期']

# Chain full outer joins over the tables so that any (bond, date) pair present
# in at least one table is kept; columns a table does not supply become NaN.
all_data = reduce(
    lambda left, right: pd.merge(left, right, on=merge_keys, how='outer'),
    [bond_data, enterprise_data, macro_data, spread_data],
)
# 'all_data.csv' is written exactly once, after filtering (below); writing the
# unfiltered merge first would only be overwritten.

# Remove every bond that has at least one row whose spread exceeds the cap.
# Rows with a missing spread compare False and therefore never trigger a drop.
max_spread = 26
bonds_to_drop = all_data.loc[all_data['spread'] > max_spread, '债券代码'].unique()
all_data = all_data[~all_data['债券代码'].isin(bonds_to_drop)]

# Keep only bonds whose row count lies in [min_span, max_span] (inclusive).
# Note: "time span" here is the number of rows per bond, not a calendar range.
time_span = all_data.groupby('债券代码').size()
min_span = 20
max_span = 160
valid_bonds = time_span[(time_span >= min_span) & (time_span <= max_span)].index

all_data = all_data[all_data['债券代码'].isin(valid_bonds)]
all_data.to_csv('all_data.csv', index=False)

# Recompute per-bond row counts on the filtered data, persist and report them.
time_span = all_data.groupby('债券代码').size()
time_span.to_csv('time_span.csv')
print(f' {all_data.shape}')
print(f'max time span: {time_span.max()}')
print(f'min time span: {time_span.min()}')

(77413, 14)
(77413, 13)
(77413, 3)
(77413, 17)
 (56867, 41)
max time span: 160
min time span: 20


In [3]:
# Split data
# Each bond's rows are split by position into train / validation / test at an
# 8:1:1 ratio, so every bond contributes to all three sets.
TRAIN_END = 0.8  # cumulative fraction of a bond's rows where the train slice ends
VALID_END = 0.9  # cumulative fraction where the validation slice ends

train_parts, valid_parts, test_parts = [], [], []
for _, bond_rows in all_data.groupby('债券代码'):
    # The slices below are positional, so they are only a chronological split
    # if the rows are in date order. Sort explicitly (stable) instead of
    # relying on whatever order an earlier cell happened to leave behind.
    # NOTE(review): if 交易日期 is stored as text this ordering is lexicographic —
    # confirm the date format sorts chronologically (e.g. YYYY-MM-DD).
    bond_rows = bond_rows.sort_values('交易日期', kind='mergesort')

    n_rows = len(bond_rows)
    train_stop = int(TRAIN_END * n_rows)
    valid_stop = int(VALID_END * n_rows)

    train_parts.append(bond_rows.iloc[:train_stop])
    valid_parts.append(bond_rows.iloc[train_stop:valid_stop])
    test_parts.append(bond_rows.iloc[valid_stop:])

train_data = pd.concat(train_parts)
valid_data = pd.concat(valid_parts)
test_data = pd.concat(test_parts)

print(train_data.shape)
print(valid_data.shape)
print(test_data.shape)
train_data.to_csv('train_data.csv', index=False)
valid_data.to_csv('valid_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

(44901, 41)
(5591, 41)
(6375, 41)
