In [6]:
import pandas as pd 
import numpy as np
import torch.nn as nn
import torch
data = pd.read_csv('alphanet_codetest_daily.csv')
# 定义数据图片开始和结束的时间
time_start = 20190801
time_end = 20211001


stock = pd.DataFrame(data.groupby('ticker')['timestamp'].count() == 487).reset_index()
merge = data.merge(stock, on='ticker', how='left')
merge = merge[merge['timestamp_y'] == True]
merge['timestamp'] = merge['timestamp_x']
merge = merge[['timestamp','ticker','open','high','low','close','vwap','volume','pct_chg','turnover','free_turnover']] # feature
merge = merge[(merge['timestamp'] >= time_start) & (merge['timestamp'] <= time_end)]
merge

Unnamed: 0,timestamp,ticker,open,high,low,close,vwap,volume,pct_chg,turnover,free_turnover
0,20190923,1,15.34,15.47,15.18,15.38,15.3122,1403282.00,0.2608,0.7428,1.5038
1,20190923,2,26.49,26.49,26.00,26.15,26.1471,603530.46,-2.4254,0.6212,1.0882
2,20190923,4,20.36,20.65,19.61,20.04,20.0171,29893.32,-4.7076,3.6188,5.9394
3,20190923,5,3.37,3.37,3.27,3.30,3.2947,111437.30,-2.0772,1.0533,1.3495
4,20190923,6,5.34,5.40,5.21,5.38,5.2993,136997.47,1.1278,1.0161,1.5887
...,...,...,...,...,...,...,...,...,...,...,...
1958151,20210923,688168,52.80,53.30,52.00,52.45,52.5581,5919.73,1.0792,2.0771,2.1589
1958162,20210923,688188,446.80,449.00,425.20,438.14,438.5992,4444.64,-1.6830,1.6540,1.6540
1958216,20210923,688321,39.20,39.30,38.64,38.90,38.8391,14252.01,0.0772,0.5631,0.6717
1958221,20210923,688333,199.87,201.01,192.01,200.34,196.4810,4126.57,0.6177,0.9092,0.9092


In [199]:

class DataExtraction(object):
    def __init__(self,data,stride):
        if len(data.shape)!=4:
            raise Exception('Input data dimensions should be [N,C,H,W]')
        self.data = np.array(data)
        self.stride = stride
        self.data_length = data.shape[3]
        self.feat_num = data.shape[2] # 9
        self.num , self.num_rev = generate_Num_and_ReversedNum(self.feat_num)
        self.conv_feat = len(self.num)
        self.step_list = self.generate_Step_List(self.data_length,self.stride)
        self.extracted_data = self.Extraction(self.data,self.feat_num,self.conv_feat,self.stride)
    def Extraction(self,data,feat_num,conv_feat,stride):
        batch = nn.BatchNorm1d(conv_feat,affine=True)
        batch2 = nn.BatchNorm1d(feat_num,affine=True)
        conv1 = self.ts_cov4d(self.data,self.stride,self.num,self.num_rev,self.step_list).to(torch.float)
        bc1 = batch(conv1)
        conv2 = self.ts_corr4d(self.data,self.stride,self.num,self.num_rev,self.step_list).to(torch.float)
        bc2 = batch(conv2)
        conv3 = ts_stddev4d(data,stride).to(torch.float)
        bc3 = batch2(conv3)
        conv4 = ts_zscore(data,stride).to(torch.float)
        bc4 = batch2(conv4)
        conv5 = ts_return(data,stride).to(torch.float)
        bc5 = batch2(conv5)
        conv6 = ts_decaylinear(data,stride).to(torch.float)
        bc6 = batch2(conv6)

        feat_cat = torch.cat([bc1,bc2,bc3,bc4,bc5,bc6],axis = 1) # 特征聚合
        shape = feat_cat.shape
        feat_cat = feat_cat.reshape(shape[0],1,shape[1],shape[2])
        # Pooling
        ts_max = ts_pool(feat_cat,3,method = 'max')
        ts_max = nn.BatchNorm1d(108,affine = True)(ts_max)
        ts_min = ts_pool(feat_cat ,3,method = 'min')
        ts_min = nn.BatchNorm1d(108,affine = True)(ts_min)
        ts_mean = ts_pool(feat_cat ,3,method = 'mean')
        ts_mean = nn.BatchNorm1d(108,affine = True)(ts_mean)
        data_pool = torch.cat([ts_max,ts_min,ts_mean],axis = 1)
        data_pool.shape
        data_pool = data_pool.flatten(start_dim = 1)
        return data_pool
    def generateC(self,l1):
        if len(l1) == 1:
            return []
        v = [[l1[0],i] for i in l1[1:]]
        l1 = l1[1:]
        return v+generateC(l1)
    def generate_Num_and_ReversedNum(self,feat_nums):
        list1 = list(range(feat_nums))
        num = generateC(list1)
        num_rev = []
        for l in num:
            l1 = l.copy()
            l1.reverse()
            num_rev.append(l1)
        return num , num_rev
    def generate_Step_List(self,data_length,stride):
        # 构建步长列表，如果数据长度不能整除，则取剩下长度，如果剩下长度小于5，则与上一步结合一起
        if data_length % stride == 0:
            step_list = list(range(0,data_length+stride,stride))
        elif data_length % stride<=5:
            mod = data_length % stride
            step_list = list(range(0,data_length-stride,stride))+[data_length]
        else:
            mod = data_length % stride
            step_list = list(range(0,data_length+stride-mod,stride))+[data_length]
        return step_list
    def ts_cov4d(self,data,stride,num,num_rev,step_list):
        '''计算4维数据的协方差'''
        '''data:[N,C,H,W],,W:price length,N:batch size'''
        l = []
        #计算的过程中务必保持keepdims=True
        for i in range(len(step_list)-1):
            start = step_list[i]
            end = step_list[i+1]
            sub_data1 = data[:,:,num,start:end] # (2000, 1, 36, 2, 10)
            sub_data2 = data[:,:,num_rev,start:end]
            mean1 = sub_data1.mean(axis = 4,keepdims = True) # (2000, 1, 36, 2, 1)
            mean2 = sub_data2.mean(axis = 4,keepdims = True)
            spread1 = sub_data1 - mean1 # (2000, 1, 36, 2, 10)
            spread2 = sub_data2 - mean2
            cov = ((spread1*spread2).sum(axis = 4,keepdims = True)/(sub_data1.shape[4] - 1)).mean(axis = 3,keepdims = True) # (2000, 1, 36, 1, 1)
            l.append(cov)
        corr = np.squeeze(np.array(l)).transpose(1,2,0).reshape(-1,self.conv_feat,len(step_list)-1) # (2000, 1, 36, 3)
        return torch.from_numpy(corr)
    def ts_corr4d(self,data,stride,num,num_rev,step_list):
        '''计算4维数据的相关系数'''
        '''data:[N,C,H,W],,W:price length,N:batch size'''
        l = []
        for i in range(len(step_list)-1):
            start = step_list[i]
            end = step_list[i+1]
            sub_data1 = data[:,:,num,start:end]
            sub_data2 = data[:,:,num_rev,start:end]
            std1 = sub_data1.std(axis = 4,keepdims = True)
            std2 = sub_data2.std(axis = 4,keepdims = True)
            std = (std1*std2).mean(axis = 3,keepdims = True)
            l.append(std)
        std = np.squeeze(np.array(l)).transpose(1,2,0).reshape(-1,self.conv_feat,len(step_list)-1)
        cov = self.ts_cov4d(data,stride,num,num_rev,step_list)
        fct = (sub_data1.shape[4]-1)/sub_data1.shape[4]
        return (cov/torch.from_numpy(std))*fct
    def ts_stddev4d(self,data,stride):
        if len(data.shape)!=4:
            raise Exception('Input data dimensions should be [N,C,H,W]')
        data_length , feat_num , conv_feat, num , num_rev,step_list= data_info(data,stride)
        l = []
        for i in range(len(step_list)-1):
            start = step_list[i]
            end = step_list[i+1]
            sub_data1 = data[:,:,:,start:end]
            std1 = sub_data1.std(axis = 3,keepdims = True)
            l.append(std1)
        std = np.squeeze(np.array(l)).transpose(1,2,0).reshape(-1,feat_num,len(step_list)-1)
        return torch.from_numpy(std)
    def ts_zscore(self,data,stride):
        if len(data.shape)!=4:
            raise Exception('Input data dimensions should be [N,C,H,W]')
        data_length , feat_num , conv_feat, num , num_rev,step_list= data_info(data,stride)
        l = []
        for i in range(len(step_list)-1):
            start = step_list[i]
            end = step_list[i+1]
            sub_data1 = data[:,:,:,start:end]
            mean = sub_data1.mean(axis = 3,keepdims = True)
            std = sub_data1.std(axis = 3,keepdims = True)
            z_score = mean/std
            l.append(z_score)
        z_score = np.squeeze(np.array(l)).transpose(1,2,0).reshape(-1,feat_num,len(step_list)-1)
        return torch.from_numpy(z_score)
    def ts_return(self,data,stride):
        if len(data.shape)!=4:
            raise Exception('Input data dimensions should be [N,C,H,W]')
        data_length , feat_num , conv_feat, num , num_rev,step_list= data_info(data,stride)
        data[data == 0] = 1e-9
        l = []
        for i in range(len(step_list)-1):
            start = step_list[i]
            end = step_list[i+1]
            sub_data1 = data[:,:,:,start:end]
            ret = sub_data1[:,:,:,-1]/sub_data1[:,:,:,0] - 1
            l.append(ret)
        z_data = np.squeeze(np.array(l)).transpose(1,2,0).reshape(-1,feat_num,len(step_list)-1)
        z_data[z_data > 1] = 1
        return torch.from_numpy(z_data)
    def ts_decaylinear(self,data,stride):
        if len(data.shape)!=4:
            raise Exception('Input data dimensions should be [N,C,H,W]')
        data_length , feat_num , conv_feat, num , num_rev,step_list= data_info(data,stride)
        l = []
        for i in range(len(step_list)-1):
            start = step_list[i]
            end = step_list[i+1]
            time_spread = end - start
            weight = np.arange(1,time_spread+1)
            weight = weight/(weight.sum())
            sub_data1 = (data[:,:,:,start:end]*weight).mean(axis = 3,keepdims = True)
            l.append(sub_data1)
        decay_data = np.squeeze(np.array(l)).transpose(1,2,0).reshape(-1,feat_num,len(step_list)-1)
        return torch.from_numpy(decay_data)
    def ts_pool(self,data,stride,method):
        if type(data) == torch.Tensor:
            data = data.detach().numpy()
        if data.shape[-1] <= stride:
            step_list = [0,data.shape[-1]]
        if len(data.shape)!=4:
            raise Exception('Input data dimensions should be [N,C,H,W]')
        data_length , feat_num , conv_feat, num , num_rev,step_list= data_info(data,stride)
        l = []
        for i in range(len(step_list)-1):
            start = step_list[i]
            end = step_list[i+1]
            if method == 'max':
                sub_data1 = data[:,:,:,start:end].max(axis = 3,keepdims = True)
            if method == 'min':
                sub_data1 = data[:,:,:,start:end].min(axis = 3,keepdims = True)
            if method == 'mean':
                sub_data1 = data[:,:,:,start:end].mean(axis = 3,keepdims = True)
            l.append(sub_data1)
        try:
            pool_data = np.squeeze(np.array(l)).transpose(1,2,0).reshape(-1,feat_num,len(step_list) - 1)
        except:
            pool_data = np.squeeze(np.array(l)).reshape(-1,feat_num,len(step_list) - 1)
        return torch.from_numpy(pool_data)

In [202]:
dataE = DataExtraction(x,10)
dataE.extracted_data.shape

torch.Size([1520, 324])

In [2]:
def generateC(l1):
    if len(l1) == 1:
        return []
    v = [[l1[0],i] for i in l1[1:]]
    l1 = l1[1:]
    return v+generateC(l1)
def generate_Num_and_ReversedNum(feat_nums):
    list1 = list(range(feat_nums))
    num = generateC(list1)
    num_rev = []
    for l in num:
        l1 = l.copy()
        l1.reverse()
        num_rev.append(l1)
    return num , num_rev
def data_info(data,stride):
    if len(data.shape)!=4:
        raise Exception('Input data dimensions should be [N,C,H,W]')
    data_length = data.shape[3] # 30
    feat_num = data.shape[2] # 9
    """num:组合数对列表,num_rev:num的翻转列表"""
    num , num_rev = generate_Num_and_ReversedNum(feat_num)
    conv_feat = len(num)
    # 构建步长列表，如果数据长度不能整除，则取剩下长度，如果剩下长度小于5，则与上一步结合一起
    if data_length % stride == 0:
        step_list = list(range(0,data_length+stride,stride))
    elif data_length % stride<=5:
        mod = data_length % stride
        step_list = list(range(0,data_length-stride,stride))+[data_length]
    else:
        mod = data_length % stride
        step_list = list(range(0,data_length+stride-mod,stride))+[data_length]
    return data_length,feat_num,conv_feat,num,num_rev,step_list

In [158]:
day_back = 3 # 回溯几天进行计算
day = 30 # 一次提取几天的数,一般默认为30
stride = 10 # 一次学习多少天


x , y , x_delay = [] , [], [] # 初始数据集

for count , ticker in enumerate(merge['ticker'].drop_duplicates()[:10]):
#     print(count)
    one_data = merge[merge['ticker'] == ticker]
    one_data['pct_change_shift'] = (one_data['close'].shift(-day_back) - one_data['close']) / one_data['close'] * 100
    one_data = one_data.set_index(['timestamp','ticker'])
    one_data = one_data.dropna() # 丢弃因为回溯而产生的空值
    array = np.array(one_data)

    for i in range(0,array.shape[0] - day ,3): # 其中3 代表取数的步长，ex.每两天取一次数，步长为3
        x.append(array[i:i+day,:-1].T)
        y.append(array[i+day-1][-1])
x  , y = np.array(x) , np.array(y).reshape(-1,1) # x = (153, 9, 30) , y = (153,1)
x = torch.from_numpy(x.reshape(x.shape[0],1,x.shape[1],x.shape[2])) # x = (153, 1, 9, 30)
print("x.shape: ",x.shape)
print("y.shape: ",y.shape)

x.shape:  torch.Size([1520, 1, 9, 30])
y.shape:  (1520, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_data['pct_change_shift'] = (one_data['close'].shift(-day_back) - one_data['close']) / one_data['close'] * 100


In [204]:
def ts_cov4d(data,stride):
    '''计算4维数据的协方差'''
    '''data:[N,C,H,W],,W:price length,N:batch size'''
    if len(data.shape)!=4:
        raise Exception('Input data dimensions should be [N,C,H,W]')
    data_length , feat_num , conv_feat, num , num_rev,step_list= data_info(data,stride)
    l = []
    #计算的过程中务必保持keepdims=True
    for i in range(len(step_list)-1):
        start = step_list[i]
        end = step_list[i+1]
        sub_data1 = data[:,:,num,start:end] # (2000, 1, 36, 2, 10)
        sub_data2 = data[:,:,num_rev,start:end]
        mean1 = sub_data1.mean(axis = 4,keepdims = True) # (2000, 1, 36, 2, 1)
        mean2 = sub_data2.mean(axis = 4,keepdims = True)
        spread1 = sub_data1 - mean1 # (2000, 1, 36, 2, 10)
        spread2 = sub_data2 - mean2
        cov = ((spread1*spread2).sum(axis = 4,keepdims = True)/(sub_data1.shape[4] - 1)).mean(axis = 3,keepdims = True) # (2000, 1, 36, 1, 1)
        l.append(cov)
    corr = np.squeeze(np.array(l)).transpose(1,2,0).reshape(-1,conv_feat,len(step_list)-1) # (2000, 1, 36, 3)
    return torch.from_numpy(corr)
def ts_corr4d(data,stride):
    '''计算4维数据的相关系数'''
    '''data:[N,C,H,W],,W:price length,N:batch size'''
    if len(data.shape)!=4:
        raise Exception('Input data dimensions should be [N,C,H,W]')
    data_length , feat_num , conv_feat, num , num_rev,step_list= data_info(data,stride)
    l = []
    for i in range(len(step_list)-1):
        start = step_list[i]
        end = step_list[i+1]
        sub_data1 = data[:,:,num,start:end]
        sub_data2 = data[:,:,num_rev,start:end]
        std1 = sub_data1.std(axis = 4,keepdims = True)
        std2 = sub_data2.std(axis = 4,keepdims = True)
        std = (std1*std2).mean(axis = 3,keepdims = True)
        l.append(std)
    std = np.squeeze(np.array(l)).transpose(1,2,0).reshape(-1,conv_feat,len(step_list)-1)
    cov = ts_cov4d(data,stride)
    fct = (sub_data1.shape[4]-1)/sub_data1.shape[4]
    return (cov/torch.from_numpy(std))*fct
def ts_stddev4d(data,stride):
    if len(data.shape)!=4:
        raise Exception('Input data dimensions should be [N,C,H,W]')
    data_length , feat_num , conv_feat, num , num_rev,step_list= data_info(data,stride)
    l = []
    for i in range(len(step_list)-1):
        start = step_list[i]
        end = step_list[i+1]
        sub_data1 = data[:,:,:,start:end]
        std1 = sub_data1.std(axis = 3,keepdims = True)
        l.append(std1)
    std = np.squeeze(np.array(l)).transpose(1,2,0).reshape(-1,feat_num,len(step_list)-1)
    return torch.from_numpy(std)
def ts_zscore(data,stride):
    if len(data.shape)!=4:
        raise Exception('Input data dimensions should be [N,C,H,W]')
    data_length , feat_num , conv_feat, num , num_rev,step_list= data_info(data,stride)
    l = []
    for i in range(len(step_list)-1):
        start = step_list[i]
        end = step_list[i+1]
        sub_data1 = data[:,:,:,start:end]
        mean = sub_data1.mean(axis = 3,keepdims = True)
        std = sub_data1.std(axis = 3,keepdims = True)
        z_score = mean/std
        l.append(z_score)
    z_score = np.squeeze(np.array(l)).transpose(1,2,0).reshape(-1,feat_num,len(step_list)-1)
    return torch.from_numpy(z_score)
def ts_return(data,stride):
    if len(data.shape)!=4:
        raise Exception('Input data dimensions should be [N,C,H,W]')
    data_length , feat_num , conv_feat, num , num_rev,step_list= data_info(data,stride)
    data[data == 0] = 1e-9
    l = []
    for i in range(len(step_list)-1):
        start = step_list[i]
        end = step_list[i+1]
        sub_data1 = data[:,:,:,start:end]
        ret = sub_data1[:,:,:,-1]/sub_data1[:,:,:,0] - 1
        l.append(ret)
    z_data = np.squeeze(np.array(l)).transpose(1,2,0).reshape(-1,feat_num,len(step_list)-1)
    z_data[z_data > 1] = 1
    return torch.from_numpy(z_data)
def ts_decaylinear(data,stride):
    if len(data.shape)!=4:
        raise Exception('Input data dimensions should be [N,C,H,W]')
    data_length , feat_num , conv_feat, num , num_rev,step_list= data_info(data,stride)
    l = []
    for i in range(len(step_list)-1):
        start = step_list[i]
        end = step_list[i+1]
        time_spread = end - start
        weight = np.arange(1,time_spread+1)
        weight = weight/(weight.sum())
        sub_data1 = (data[:,:,:,start:end]*weight).mean(axis = 3,keepdims = True)
        l.append(sub_data1)
    decay_data = np.squeeze(np.array(l)).transpose(1,2,0).reshape(-1,feat_num,len(step_list)-1)
    return torch.from_numpy(decay_data)
def ts_pool(data,stride,method):
    if type(data) == torch.Tensor:
        data = data.detach().numpy()
    if data.shape[-1] <= stride:
        step_list = [0,data.shape[-1]]
    if len(data.shape)!=4:
        raise Exception('Input data dimensions should be [N,C,H,W]')
    data_length , feat_num , conv_feat, num , num_rev,step_list= data_info(data,stride)
    l = []
    for i in range(len(step_list)-1):
        start = step_list[i]
        end = step_list[i+1]
        if method == 'max':
            sub_data1 = data[:,:,:,start:end].max(axis = 3,keepdims = True)
        if method == 'min':
            sub_data1 = data[:,:,:,start:end].min(axis = 3,keepdims = True)
        if method == 'mean':
            sub_data1 = data[:,:,:,start:end].mean(axis = 3,keepdims = True)
        l.append(sub_data1)
    try:
        pool_data = np.squeeze(np.array(l)).transpose(1,2,0).reshape(-1,feat_num,len(step_list) - 1)
    except:
        pool_data = np.squeeze(np.array(l)).reshape(-1,feat_num,len(step_list) - 1)
    return torch.from_numpy(pool_data)

In [213]:
feat_num = 9


batch = nn.BatchNorm1d(36,affine=True)
batch2 = nn.BatchNorm1d(9,affine=True)
conv1 = ts_cov4d(x,10).to(torch.float)
bc1 = batch(conv1)
conv2 = ts_corr4d(x,10).to(torch.float)
bc2 = batch(conv2)
conv3 = ts_stddev4d(x,10).to(torch.float)
bc3 = batch2(conv3)
conv4 = ts_zscore(x,10).to(torch.float)
bc4 = batch2(conv4)
conv5 = ts_return(x,10).to(torch.float)
bc5 = batch2(conv5)
conv6 = ts_decaylinear(x,10).to(torch.float)
bc6 = batch2(conv6)

feat_cat = torch.cat([bc1,bc2,bc3,bc4,bc5,bc6],axis = 1) # 特征聚合
shape = feat_cat.shape
feat_cat = feat_cat.reshape(shape[0],1,shape[1],shape[2])
feat_cat.shape
# Pooling
ts_max = ts_pool(feat_cat ,3,method = 'max')
ts_max.shape
# ts_max = nn.BatchNorm1d(108,affine = True)(ts_max)
# ts_min = ts_pool(feat_cat ,3,method = 'min')
# ts_min = nn.BatchNorm1d(108,affine = True)(ts_min)
# ts_mean = ts_pool(feat_cat ,3,method = 'mean')
# ts_mean = nn.BatchNorm1d(108,affine = True)(ts_mean)
# data_pool = torch.cat([ts_max,ts_min,ts_mean],axis = 1)
# data_pool.shape
# data_pool = data_pool.flatten(start_dim = 1)
# type(data_pool)
# pipline = nn.Sequential(nn.Linear(324,30),
#                         nn.ReLU(),
#                       nn.Dropout(0.5),
#                        nn.Linear(30,1))
# output = pipline(data_pool)
# print(output,'\n',output.size())

torch.Size([1520, 108, 1])

tensor([[ 0.5180, -0.1648, -0.0511],
        [ 0.8800, -0.4084, -0.0650],
        [ 0.6671, -0.2635,  0.0542],
        [ 0.8957, -0.2528, -0.0577],
        [ 0.8204, -0.3303,  0.0068],
        [    nan,     nan,     nan],
        [    nan,     nan,     nan],
        [    nan,     nan,     nan],
        [    nan,     nan,     nan]], grad_fn=<SelectBackward>)

In [7]:
delay = 30 # 设置delay的天数
delay_data = data.shift(delay)
delay_data.to_csv('delay.csv')
delay_data

Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,vwap,volume,pct_chg,turnover,free_turnover,pct_change_shift
timestamp,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-09-23,1,,,,,,,,,,
2019-09-24,1,,,,,,,,,,
2019-09-25,1,,,,,,,,,,
2019-09-26,1,,,,,,,,,,
2019-09-27,1,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
2021-09-15,1,18.04,18.07,17.60,17.81,17.8197,729620.08,-0.4472,0.3760,0.8483,0.280741
2021-09-16,1,17.70,18.05,17.58,17.68,17.7759,621573.88,-0.7299,0.3203,0.7227,7.805430
2021-09-17,1,17.55,17.90,17.45,17.86,17.7260,567454.13,1.0181,0.2924,0.6597,10.470325
2021-09-22,1,17.91,19.25,17.84,19.06,18.9058,1752798.00,6.7189,0.9032,2.0379,3.934942


In [22]:
array = np.array(data)
delay_array = np.array(delay_data)
day = 30
x , y , x_delay = [] , [], []
# for i in range(0,array.shape[0] - day ,3): # 其中3 代表取数的步长，ex.每两天取一次数，步长为3
for i in range(delay,delay + 4 ,3): # 其中3 代表取数的步长，ex.每两天取一次数，步长为3
    x.append(array[i:i+day,:-1].T)
    y.append(array[i+day-1][-1])
    x_delay.append(delay_array[i:i+day,:-1].T)
x  , y , x_delay= np.array(x) , np.array(y).reshape(-1,1) , np.array(x_delay) # x = (153, 9, 30) , y = (153,1)
x = x.reshape(x.shape[0],1,x.shape[1],x.shape[2]) # x = (153, 1, 9, 30)
x_delay = x_delay.reshape(x_delay.shape[0],1,x_delay.shape[1],x_delay.shape[2]) # x = (153, 1, 9, 30)
x_delay = (x-x_delay)/x_delay

  x_delay = (x-x_delay)/x_delay


In [191]:
stride = 10
# 为了cov 和 corr提取并行计算产生计算列表
def generateC(l1):
    if len(l1) == 1:
        return []
    v = [[l1[0],i] for i in l1[1:]]
    l1 = l1[1:]
    return v+generateC(l1)
def generate_Num_and_ReversedNum(feat_nums):
    list1 = list(range(feat_nums))
    num = generateC(list1)
    num_rev = []
    for l in num:
        l1 = l.copy()
        l1.reverse()
        num_rev.append(l1)
    return num , num_rev
def data_info(data,stride):
    if len(data.shape)!=4:
        raise Exception('Input data dimensions should be [N,C,H,W]')
    data_length = data.shape[3] # 30
    feat_num = data.shape[2] # 9
    """num:组合数对列表,num_rev:num的翻转列表"""
    num , num_rev = generate_Num_and_ReversedNum(feat_num)
    conv_feat = len(num)
    # 构建步长列表，如果数据长度不能整除，则取剩下长度，如果剩下长度小于5，则与上一步结合一起
    if data_length % stride == 0:
        step_list = list(range(0,data_length+stride,stride))
    elif data_length % stride<=5:
        mod = data_length % stride
        step_list = list(range(0,data_length-stride,stride))+[data_length]
    else:
        mod = data_length % stride
        step_list = list(range(0,data_length+stride-mod,stride))+[data_length]
    return data_length,feat_num,conv_feat,num,num_rev,step_list
def ts_cov4d(data,stride):
    '''计算4维数据的协方差'''
    '''data:[N,C,H,W],,W:price length,N:batch size'''
    if len(data.shape)!=4:
        raise Exception('Input data dimensions should be [N,C,H,W]')
    data_length , feat_num , conv_feat, num , num_rev,step_list= data_info(data,stride)
    l = []
    #计算的过程中务必保持keepdims=True
    for i in range(len(step_list)-1):
        start = step_list[i]
        end = step_list[i+1]
        sub_data1 = data[:,:,num,start:end] # (2000, 1, 36, 2, 10)
        sub_data2 = data[:,:,num_rev,start:end]
        mean1 = sub_data1.mean(axis = 4,keepdims = True) # (2000, 1, 36, 2, 1)
        mean2 = sub_data2.mean(axis = 4,keepdims = True)
        spread1 = sub_data1 - mean1 # (2000, 1, 36, 2, 10)
        spread2 = sub_data2 - mean2
        cov = ((spread1*spread2).sum(axis = 4,keepdims = True)/(sub_data1.shape[4] - 1)).mean(axis = 3,keepdims = True) # (2000, 1, 36, 1, 1)
        l.append(cov)
    corr = np.squeeze(np.array(l)).transpose(1,2,0).reshape(-1,1,conv_feat,len(step_list)-1) # (2000, 1, 36, 3)
    return torch.from_numpy(corr)
def ts_corr4d(data,stride):
    '''计算4维数据的相关系数'''
    '''data:[N,C,H,W],,W:price length,N:batch size'''
    if len(data.shape)!=4:
        raise Exception('Input data dimensions should be [N,C,H,W]')
    data_length , feat_num , conv_feat, num , num_rev,step_list= data_info(data,stride)
    l = []
    for i in range(len(step_list)-1):
        start = step_list[i]
        end = step_list[i+1]
        sub_data1 = data[:,:,num,start:end]
        sub_data2 = data[:,:,num_rev,start:end]
        std1 = sub_data1.std(axis = 4,keepdims = True)
        std2 = sub_data2.std(axis = 4,keepdims = True)
        std = (std1*std2).mean(axis = 3,keepdims = True)
        l.append(std)
    std = np.squeeze(np.array(l)).transpose(1,2,0).reshape(-1,1,conv_feat,len(step_list)-1)
    cov = ts_cov4d(data,stride)
    fct = (sub_data1.shape[4]-1)/sub_data1.shape[4]
    return (cov/torch.from_numpy(std))*fct
def ts_stddev4d(data,stride):
    if len(data.shape)!=4:
        raise Exception('Input data dimensions should be [N,C,H,W]')
    data_length , feat_num , conv_feat, num , num_rev,step_list= data_info(data,stride)
    l = []
    for i in range(len(step_list)-1):
        start = step_list[i]
        end = step_list[i+1]
        sub_data1 = data[:,:,:,start:end]
        std1 = sub_data1.std(axis = 3,keepdims = True)
        l.append(std1)
    std = np.squeeze(np.array(l)).transpose(1,2,0).reshape(-1,1,feat_num,len(step_list)-1)
    return torch.from_numpy(std)
def ts_zscore(data,stride):
    if len(data.shape)!=4:
        raise Exception('Input data dimensions should be [N,C,H,W]')
    data_length , feat_num , conv_feat, num , num_rev,step_list= data_info(data,stride)
    l = []
    for i in range(len(step_list)-1):
        start = step_list[i]
        end = step_list[i+1]
        sub_data1 = data[:,:,:,start:end]
        mean = sub_data1.mean(axis = 3,keepdims = True)
        std = sub_data1.std(axis = 3,keepdims = True)
        z_score = mean/std
        l.append(z_score)
    z_score = np.squeeze(np.array(l)).transpose(1,2,0).reshape(-1,1,feat_num,len(step_list)-1)
    return torch.from_numpy(z_score)
def ts_decaylinear(data,stride):
    if len(data.shape)!=4:
        raise Exception('Input data dimensions should be [N,C,H,W]')
    data_length , feat_num , conv_feat, num , num_rev,step_list= data_info(data,stride)
    l = []
    for i in range(len(step_list)-1):
        start = step_list[i]
        end = step_list[i+1]
        time_spread = end - start
        weight = np.arange(1,time_spread+1)
        weight = weight/(weight.sum())
        sub_data1 = (data[:,:,:,start:end]*weight).mean(axis = 3,keepdims = True)
        l.append(sub_data1)
    decay_data = np.squeeze(np.array(l)).transpose(1,2,0).reshape(-1,1,feat_num,len(step_list)-1)
    return torch.from_numpy(decay_data)
def ts_pool(data,stride,method):
    if type(data) == torch.Tensor:
        data = data.detach().numpy()
    if data.shape[-1] <= stride:
        step_list = [0,data.shape[-1]]
    if len(data.shape)!=4:
        raise Exception('Input data dimensions should be [N,C,H,W]')
    data_length , feat_num , conv_feat, num , num_rev,step_list= data_info(data,stride)
    l = []
    for i in range(len(step_list)-1):
        start = step_list[i]
        end = step_list[i+1]
        if method == 'max':
            sub_data1 = data[:,:,:,start:end].max(axis = 3,keepdims = True)
        if method == 'min':
            sub_data1 = data[:,:,:,start:end].min(axis = 3,keepdims = True)
        if method == 'mean':
            sub_data1 = data[:,:,:,start:end].mean(axis = 3,keepdims = True)
        l.append(sub_data1)
    try:
        pool_data = np.squeeze(np.array(l)).transpose(1,2,0).reshape(-1,1,feat_num,len(step_list) - 1)
    except:
        pool_data = np.squeeze(np.array(l)).reshape(-1,1,feat_num,len(step_list) - 1)
    return torch.from_numpy(pool_data)

In [197]:
batch = nn.BatchNorm2d(1,affine=True)
conv1 = ts_cov4d(x,10).to(torch.float)
sigmoid1 = torch.sigmoid(conv1)
bc1 = batch(sigmoid1)
conv2 = ts_corr4d(x,10).to(torch.float)
bc2 = batch(conv2)
conv3 = ts_stddev4d(x,10).to(torch.float)
sigmoid3 = torch.sigmoid(conv3)
bc3 = batch(sigmoid3)
conv4 = ts_zscore(x,10).to(torch.float)
bc4 = batch(conv4)
conv5 = ts_decaylinear(x,10).to(torch.float)
sigmoid5 = torch.sigmoid(conv5)
bc5 = batch(sigmoid5)
feat_cat = torch.cat([bc1,bc2,bc3,bc4,bc5],axis = 2) # 特征聚合
print("after conv shape: ",feat_cat.shape)
# # Pooling
ts_max = ts_pool(feat_cat ,3,method = 'max')
ts_max = batch(ts_max)
ts_min = ts_pool(feat_cat ,3,method = 'min')
ts_min = batch(ts_min)
ts_mean = ts_pool(feat_cat ,3,method = 'mean')
ts_mean = batch(ts_mean)
data_pool = torch.cat([ts_max,ts_min,ts_mean],axis = 2)
data_pool = data_pool.flatten(start_dim = 1)
print("after pooling shape: ",data_pool.shape)
pipline = nn.Sequential(nn.Linear(297,30),
                        nn.ReLU(),
                      nn.Dropout(0.5),
                       nn.Linear(30,1))
output = pipline(data_pool)
output.shape

after conv shape:  torch.Size([2, 1, 99, 3])
after pooling shape:  torch.Size([2, 297])


torch.Size([2, 1])

In [126]:
test = x[0][0]
num = [0,10,20,30]
for time in range(len(num) - 1):
    for i in range(test.shape[0]):
            print(np. (test[i,num[time]:num[time+1]]))
#             print(pd.DataFrame(test[[i,j],num[time]:num[time+1]].T).corr().iloc[0,1])
            print("________________________________________________")

0.2340598214132446
________________________________________________
0.2401020616321322
________________________________________________
0.2673200329193451
________________________________________________
0.2802213410859351
________________________________________________
0.25701352279598066
________________________________________________
352014.12266699696
________________________________________________
1.2616693901731941
________________________________________________
0.18140541998518125
________________________________________________
0.4092543805507767
________________________________________________
0.17472549899771372
________________________________________________
0.1620617166390632
________________________________________________
0.12737739202856974
________________________________________________
0.1481080686525891
________________________________________________
0.13685220641261184
________________________________________________
137800.56595758555
________________________