In [30]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import copy

In [31]:
class PositionalEncoding(nn.Module):
	'''
	The positional encoding class is used in the encoder and decoder layers.
	It's role is to inject sequence order information into the data since self-attention
	mechanisms are permuatation equivariant. Naturally, this is not required in the static
	transformer since there is no concept of 'order' in a portfolio.'''

	def __init__(self, window, d_model):
		super().__init__()

		self.register_buffer('d_model', torch.tensor(d_model, dtype = torch.float64))

		pe = torch.zeros(window, d_model)
		for pos in range(window):
			for i in range(0, d_model, 2):
			  pe[pos, i] = np.sin(pos / (10000 ** ((2 * i)/d_model)))
				
			for i in range(1, d_model, 2):
			  pe[pos, i] = np.cos(pos / (10000 ** ((2 * (i + 1))/d_model)))             
				
		pe = pe.unsqueeze(0)
		self.register_buffer('pe', pe)
	
	def forward(self, x):
		return x * torch.sqrt(self.d_model) + self.pe[:,:x.size(1)]

In [32]:
def create_mask(seq_len):
	'''
	Create a mask to be used in the decoder.
	Returns a mask of shape (1, seq_len, seq_len)
	'''
	no_peak_mask = np.triu(np.ones((seq_len, seq_len)), k = 1).astype('uint8')
	return torch.from_numpy(no_peak_mask)

def get_clones(module, N):
	'''
	This helper function is used to create copies of encoder and decoder layers.
	These copies of encoder/decoder layers are used to construct the
	complete stacked encoder/decoder modules.
	'''
	return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

In [33]:
def scaled_dot_product_attention(k, q, v, mask = None):
	'''
	k : (batch, seq_len_k, heads, d_model)
	q : (batch, seq_len_q, heads, d_model)
	v : (batch, seq_len_v, heads, d_model)
	require seq_len_k == seq_len_v
	'''

	b, _, h, d = k.shape

	k = k.transpose(1, 2).contiguous().view(b * h, -1, d)
	q = q.transpose(1, 2).contiguous().view(b * h, -1, d)
	v = v.transpose(1, 2).contiguous().view(b * h, -1, d)
	
	scores = torch.matmul(q, k.transpose(1, 2))
	if mask is not None:
		scores = scores.masked_fill(mask == 0, -1e9)
	scores = F.softmax(scores,dim=2)

	# Scaled dot-product.
	scores = torch.matmul(scores, v).view(b, h, -1, d)
	return scores.transpose(1, 2).contiguous().view(b, -1, h * d)

In [34]:
class MultiHeadAttention(nn.Module):
	'''This is a Mult-Head wide self-attention class.'''
	def __init__(self, heads, d_model, dropout = 0.1):
		super().__init__()
		
		self.h = heads
		self.d_model = d_model

		self.dropout = nn.Dropout(dropout)
		
		self.q_linear = nn.Linear(d_model, heads * d_model,bias=False)
		self.v_linear = nn.Linear(d_model, heads * d_model,bias=False)
		self.k_linear = nn.Linear(d_model, heads * d_model,bias=False)
		  
		self.unifyheads = nn.Linear(heads * d_model, d_model)

	def forward(self, q, k, v, mask = None):

		b = q.shape[0]

		k = self.k_linear(k).view(b, -1, self.h, self.d_model)
		q = self.q_linear(q).view(b, -1, self.h, self.d_model)
		v = self.v_linear(v).view(b, -1, self.h, self.d_model)

		output = scaled_dot_product_attention(k, q, v, mask = mask)
		output = self.unifyheads(output)

		return output

In [35]:
class FeedForward(nn.Module):
	'''This is a pointwise feedforward network.'''
	def __init__(self, d_model, dff, dropout = 0.1):
		super().__init__()
		
		self.ff = nn.Sequential(
			nn.Linear(d_model, dff),
			nn.ReLU(),
			nn.Dropout(dropout),
			nn.Linear(dff, d_model))
	
	def forward(self, x):
		x = self.ff(x)
		return x 

In [36]:
class EncoderLayer(nn.Module):
	'''Encoder layer class.'''
	def __init__(self, heads, d_model, dff, dropout = 0.1):
		super().__init__()

		self.norm_1 = nn.LayerNorm(d_model)
		self.norm_2 = nn.LayerNorm(d_model)

		self.attn = MultiHeadAttention(heads, d_model, dropout = dropout)
		self.ff = FeedForward(d_model, dff)

		self.dropout_1 = nn.Dropout(dropout)
		self.dropout_2 = nn.Dropout(dropout)
	
	def forward(self, x):
		attn_out = self.dropout_1(self.attn(x, x, x))
		x = self.norm_1(x + attn_out)

		ffn_out = self.ff(x)
		x = self.norm_2(x + ffn_out)

		return x

In [37]:
class Encoder(nn.Module):
	'''Stacked encoder layers.'''
	def __init__(self, N, pe_window, heads, inp_dim, d_model, dff, dropout):
		super().__init__()

		self.N = N
		self.embedding = nn.Linear(inp_dim, d_model)  # change the meaning of embedding in finance data
		self.pe = PositionalEncoding(pe_window, d_model)
		self.dynamiclayers = get_clones(EncoderLayer(heads, d_model, dff, dropout = dropout), N)

	def forward(self, x):
		# x (batch, seq_len, inp_dim)

		x = self.embedding(x) # (batch, seq_len, d_model)
		
		x = self.pe(x) # (batch, seq_len, d_model)

		for i in range(self.N):
		  x = self.dynamiclayers[i](x) # (batch, seq_len, d_model)

		return x # (batch, seq_len, d_model)

In [38]:
class DecoderLayer(nn.Module):
	'''Decoder Layer class'''
	def __init__(self, heads, d_model, dff, dropout = 0.1):
		super().__init__()

		self.norm_1 = nn.LayerNorm(d_model)
		self.norm_2 = nn.LayerNorm(d_model)
		self.norm_3 = nn.LayerNorm(d_model)

		self.attn_1 = MultiHeadAttention(heads, d_model, dropout = dropout)
		self.attn_2 = MultiHeadAttention(heads, d_model, dropout = dropout)
		self.ff = FeedForward(d_model, dff)

		self.dropout_1 = nn.Dropout(dropout)
		self.dropout_2 = nn.Dropout(dropout)
		self.dropout_3 = nn.Dropout(dropout)

	def forward(self, x, enc_out, mask = None):
		# x (batch, seq_len, d_model)
		# enc_out (batch, enc_seq_len, d_model)

		attn_1_out = self.dropout_1(self.attn_1(x, x, x, mask = mask))
		x = self.norm_1(x + attn_1_out) # (batch, seq_len, d_model)

		attn_2_out = self.dropout_2(self.attn_2(x, enc_out, enc_out))
		x = self.norm_2(x + attn_2_out) # (batch, seq_len, d_model)

		ffn_out = self.dropout_3(self.ff(x))
		x = self.norm_3(x + ffn_out) # (batch, seq_len, d_model)

		return x # (batch, seq_len, d_model)

In [39]:
class Decoder(nn.Module):
	'''Stacked decoder layers.'''
	def __init__(self, N, pe_window, heads, inp_dim, d_model, dff, dropout = 0.1):
		super().__init__()

		self.N = N
		self.embedding = nn.Linear(inp_dim, d_model)
		self.pe = PositionalEncoding(pe_window, d_model)
		self.decoderlayers = get_clones(DecoderLayer(heads, d_model, dff, dropout = dropout), N)

	def forward(self, x, enc_out, mask = None):
		# x (batch, seq_len, inp_dim)
		# enc_out (batch, enc_seq_len, d_model)

		x = self.embedding(x) # (batch, seq_len, d_model)

		x = self.pe(x) # (batch, seq_len, d_model)
		
		for i in range(self.N):
			x = self.decoderlayers[i](x, enc_out, mask = mask) # (batch, seq_len, d_model)
		
		return x # (batch, seq_len, d_model)


In [43]:
class Ml4fTransformer(nn.Module):
	'''
	Main transformer class.
	experiment : selects sigmoid final activation if 'movement' else linear
	inp_dim_e : number of dimensions of encoder input
	inp_dim_d : number of dimensions of decoder input
	d_model : model embedding dimension
	dff : hidden dimension of feed forward network
	N_e : number of encoder layers
	N_d : number of decoder layers
	heads : number of heads
	'''
	def __init__(self, inp_dim_e, inp_dim_d, d_model = 20,
		dff = 80, N_e = 1, N_d = 1, heads = 4, dropout = 0.1, pe_window = 100):
		super().__init__()

		assert d_model % heads == 0
		
		self.encoder = Encoder(N_e, pe_window, heads, inp_dim_e, d_model, dff, dropout = dropout)
		
		self.decoder = Decoder(N_d, pe_window, heads, inp_dim_d, d_model, dff, dropout = dropout)
		
		self.map = nn.Linear(d_model, inp_dim_d)
	
	def forward(self, x, y, mask = None):
		'''
		x (batch, in_seq_len, inp_dim_e)
		y (batch, tar_seq_len, inp_dim_d)
		'''

		enc_out = self.encoder(x) # (batch, in_seq_len, d_model)

		dec_out = self.decoder(y, enc_out, mask = mask) # (batch, tar_seq_len, d_model)

		final = self.map(dec_out) # (batch, tar_seq_len, out_dim_d)

		return final

In [44]:
class LrSchedule:
	'''
	Implements the learning rate schedule from "Attention is all you need".
	Learning rate given by (1/sqrt(d_model)) * min(steps^-0.5, steps * warmup_steps ^-1.5).
	Init with a pytorch.optim optimizer then call as usual in the training loop.
	'''
	def __init__(self, optimizer, warmup_steps, d_model, scale = 1.0):
		self.optimizer_ = optimizer 
		self.warmup_steps = warmup_steps
		self.d_model = d_model
		self.scale = scale
		self.step_ = 1

	def zero_grad(self):
		self.optimizer_.zero_grad()

	def get_lr(self, step):
		min_ = np.minimum(step ** -0.5, step * (self.warmup_steps ** -1.5))
		return (self.d_model ** -0.5) * min_

	def update_lr(self):
		lr = self.scale * self.get_lr(self.step_)
		self.step_ += 1

		for param_group in self.optimizer_.param_groups:
			param_group['lr'] = lr

	def step(self):
		self.update_lr()
		self.optimizer_.step()

	def reset_lr(self, lr = 1e-3):
		for param_group in self.optimizer_.param_groups:
			param_group['lr'] = lr

In [102]:
import pandas as pd 
import numpy as np

result = pd.read_csv('data/1_before2016.csv')
data = result.drop('meanest', axis=1)
x = data.drop('actual', axis=1)
mean_normal = np.mean(x, axis=0)
std_normal = np.std(x, axis=0)
data.loc[:, 'actq':'fincfy'] = (x - mean_normal)/std_normal

xseq = []
yseq = []
for name, content in data.groupby('tic'):
    for i in range(len(content.index)-10):
        train_seq = []
        train_label = []
        for j in range(i, i + 10):
            train_seq.append(content.iloc[j]['actual'])
        for j in range(i+1, i + 11):
            train_label.append(content.iloc[j]['actual'])
        xseq.append(train_seq)
        yseq.append(train_label)
        

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


In [103]:
result1 = pd.read_csv('data/1_after2016.csv')
data1 = result1.drop('meanest', axis=1)
x1 = data1.drop('actual', axis=1)
data1.loc[:, 'actq':'fincfy'] = (x1 - mean_normal)/std_normal

xseqt = []
yseqt = []
for name, content in data1.groupby('tic'):
    for i in range(len(content.index)-10):
        train_seq = []
        train_label = []
        for j in range(i, i + 10):
            train_seq.append(content.iloc[j]['actual'])
        for j in range(i+1, i + 11):
            train_label.append(content.iloc[j]['actual'])
        xseqt.append(train_seq)
        yseqt.append(train_label)

In [108]:
import random

model = Ml4fTransformer(1, 1)

train = [(xseq[i], yseq[i]) for i in range(len(xseq))]
random.shuffle(train)

N = len(xseq)
train_N = (int(N/64)-20) * 64
valid_N = N - train_N

train_x = torch.FloatTensor([train[i][0] for i in range(train_N)])
train_x = train_x.reshape(train_x.shape[0], -1, 1)
train_y = torch.FloatTensor([train[i][1] for i in range(train_N)])
train_y = train_y.reshape(train_y.shape[0], -1, 1)

valid_x = torch.FloatTensor([train[i][0] for i in range(train_N, N)])
valid_x = valid_x.reshape(valid_x.shape[0], -1, 1)
valid_y = torch.FloatTensor([train[i][1] for i in range(train_N, N)])
valid_y = valid_y.reshape(valid_y.shape[0], -1, 1)

test_x = torch.FloatTensor(xseqt)
test_y = torch.FloatTensor(yseqt)
test_y = test_y.reshape(test_y.shape[0], -1, 1)

mask = create_mask(train_y.shape[1]-1)

In [109]:
criteria = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
scheduler = LrSchedule(optimizer, 1e-1, 20)

In [110]:
train_x.shape, train_y.shape, valid_x.shape, valid_y.shape

(torch.Size([16896, 10, 1]),
 torch.Size([16896, 10, 1]),
 torch.Size([1333, 10, 1]),
 torch.Size([1333, 10, 1]))

In [111]:
minloss = 100
for epoch in range(100):
    total_loss = 0
    for step in range(int(train_x.shape[0]/64)):
        # 生成数据
        src = train_x[step*64 : (step*64+64)]
        tgt = train_y[step*64 : (step*64+64), :-1, :]
        tgt_y = train_y[step*64 : (step*64+64), 1:, :]
        # 清空梯度
        scheduler.zero_grad()
        # 进行transformer的计算
        out = model(src, tgt, mask)
    
        #print(out.contiguous().view(-1, out.size()[-1]))
        #print(tgt_y.contiguous().view(-1, tgt_y.size()[-1]))
        loss = criteria(out[:, -1, :], tgt_y[:, -1, :])
        #print(loss)

        # 计算梯度
        loss.backward()
        # 更新参数
        scheduler.step()

        total_loss += loss

    if epoch % 10 == 0:
        print("Epoch {}, train_loss: {}".format(epoch, total_loss))
    
        with torch.no_grad():
            v_src = valid_x
            v_tgt = valid_y[:, :-1, :]
            v_tgt_y = valid_y[:, 1:, :]
            v_out = model(v_src, v_tgt, mask)
            v_loss = criteria(v_out[:, -1, :], v_tgt_y[:, -1, :])
            if(v_loss < minloss):
              minloss = v_loss
              #torch.save(model, 'drive/MyDrive/transformer.pth')
            print("Epoch {}, valid_loss: {}".format(epoch, v_loss))


Epoch 0, train_loss: 162.93341064453125


NameError: name 'minloss' is not defined

In [96]:
def withinten(y_true, y_pred):
    sum1 = 0
    for i in range(len(y_pred)):
        if(y_pred[i] < 1.1*y_true[i] and y_pred[i] > 0.9*y_true[i]):
            sum1+=1
    return sum1/len(y_pred)

In [99]:
from sklearn.metrics import r2_score

def test(model, test_x, test_y):
  model.eval()
  with torch.no_grad():
      v_src = test_x
      v_tgt = test_y[:, :-1, :]
      v_tgt_y = test_y[:, 1:, :]
      v_out = model(v_src, v_tgt, mask)
      v_loss = criteria(v_out[:, -1, :], v_tgt_y[:, -1, :]).item()
      r2 = r2_score(v_tgt_y[:, -1, :], v_out[:, -1, :])
      within10 = withinten(v_tgt_y[:, -1, :], v_out[:, -1, :])
  return v_loss, r2, within10


In [100]:
loss, r2, within10 = test(model, test_x, test_y)
loss, r2, within10

(1.9408961534500122, 0.25726783087586114, 0.1706792777300086)