Решим задачу с помощью GRU

In [84]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd

df = pd.read_csv("./data/data_proc.csv")
df = df.dropna(subset=['close', 'close + 1 hour', 'sent_scores'])
df['delta'] = df['close + 1 hour'] - df['close']



class TimeSeriesDataset(Dataset):
    def __init__(self, df, window_size=60):
        self.window_size = window_size
        self.df = df
        
        self.scaler_views = MinMaxScaler(feature_range=(0, 1))
        
        self.raw_prices = self.df['close'].values.astype(np.float32)
        
        raw_views = self.df['views'].values.reshape(-1, 1)
        self.views_norm = self.scaler_views.fit_transform(raw_views).flatten()
        
        self.sent_norm = self.df['sent_scores'].values.astype(np.float32)
        
        self.targets = self.df['delta'].values.astype(np.float32)

    def __len__(self):
        return len(self.df) - self.window_size

    def __getitem__(self, idx):
        
        raw_price_sequence = self.raw_prices[idx : idx + self.window_size]
        
        price_start = raw_price_sequence[0]
        if price_start == 0:
            price_norm_sequence = raw_price_sequence
        else:
            price_norm_sequence = (raw_price_sequence / price_start) - 1.0
            
        sent_sequence = self.sent_norm[idx : idx + self.window_size]
        views_sequence = self.views_norm[idx : idx + self.window_size]
        
        x_sequence = np.column_stack([
            price_norm_sequence, 
            sent_sequence, 
            views_sequence
        ])
        
        y_target = self.targets[idx + self.window_size - 1]
        
        return (torch.tensor(x_sequence, dtype=torch.float32), 
                torch.tensor(y_target, dtype=torch.float32))

dataset = TimeSeriesDataset(df, window_size=60)
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

for seq, target in dataloader:
    print(f"Input shape: {seq}")
    print(f"Target shape: {target}")
    break

Input shape: tensor([[[ 0.0000e+00, -4.5221e-04,  1.7646e-02],
         [-8.9999e-03,  1.4142e-04,  4.5124e-04],
         [-8.7464e-03,  2.1406e-01,  3.5301e-03],
         ...,
         [-1.6479e-02,  4.5778e-01,  1.8536e-04],
         [-1.6479e-02,  1.9380e-04,  1.4391e-03],
         [-1.4070e-02, -3.6771e-01,  5.6405e-03]],

        [[ 0.0000e+00,  1.4142e-04,  4.5124e-04],
         [ 2.5582e-04,  2.1406e-01,  3.5301e-03],
         [ 2.5582e-04,  1.4937e-01,  1.9924e-04],
         ...,
         [-7.5467e-03,  1.9380e-04,  1.4391e-03],
         [-5.1164e-03, -3.6771e-01,  5.6405e-03],
         [-2.9419e-03,  1.8459e-01,  1.0740e-03]],

        [[ 0.0000e+00,  2.1406e-01,  3.5301e-03],
         [ 0.0000e+00,  1.4937e-01,  1.9924e-04],
         [ 6.3944e-04,  9.8597e-02,  4.7762e-04],
         ...,
         [-5.3709e-03, -3.6771e-01,  5.6405e-03],
         [-3.1970e-03,  1.8459e-01,  1.0740e-03],
         [-2.5576e-04, -5.2848e-03,  4.0681e-04]],

        ...,

        [[ 0.0000e+00, -2

In [None]:
# import torch
# from torch.utils.data import Dataset, DataLoader
# from sklearn.preprocessing import MinMaxScaler
# import numpy as np
# import pandas as pd

# df = pd.read_csv("./data/data_proc.csv")
# df['delta'] = df['close + 1 hour'] - df['close']
# df = df.dropna()
# features = [feature for feature in df.columns if 'close' or 'sent_scores' or 'views' in feature]
# features.remove('close + 1 hour')
# features.remove('close + 30 minutes')
# features.remove('date')
# features.remove('Unnamed: 0')
# features.remove('sent_labels')
# seq_columns = [feature for feature in features if 'close' in feature]
# print(features)

# class TimeSeriesDataset(Dataset):
#     def __init__(self, df):
#         self.df = df.reset_index(drop=True)
#         self.scaler_price = MinMaxScaler()
#         self.scaler_views = MinMaxScaler() 
        
#         all_prices = np.concatenate([self.df[col].values for col in seq_columns]).reshape(-1, 1)
#         self.scaler_price.fit(all_prices)
        
#         self.df['views_norm'] = self.scaler_views.fit_transform(self.df['views'].values.reshape(-1, 1)).flatten()

#     def __len__(self):
#         return len(self.df)
    
#     def __getitem__(self, idx):
#         row = self.df.iloc[idx]
#         sent = row['sent_scores']
#         views = row['views_norm']
#         prices = row['close']
        
#         #prices = [row[feature] for feature in seq_columns]
#         prices_norm = self.scaler_price.transform(np.array(prices).reshape(-1, 1)).flatten()
        
#         sequence_length = len(prices_norm)
#         seq_data = np.column_stack([
#             prices,
#             sent,
#             views        
#         ])
#         seq = torch.tensor(seq_data, dtype=torch.float32)
        
#         target = torch.tensor(row['delta'], dtype=torch.float32) 
#         return seq, target

# dataset = TimeSeriesDataset(df)
# dataloader = DataLoader(dataset, batch_size=32)

['views', 'close', 'close + -10 minutes', 'close + -20 minutes', 'close + -30 minutes', 'close + -40 minutes', 'close + -50 minutes', 'close + -60 minutes', 'close + -70 minutes', 'close + -80 minutes', 'close + -90 minutes', 'close + -100 minutes', 'close + -110 minutes', 'close + -120 minutes', 'close + -130 minutes', 'close + -140 minutes', 'close + -150 minutes', 'close + -160 minutes', 'close + -170 minutes', 'close + -180 minutes', 'close + -190 minutes', 'sent_scores', 'delta']


In [85]:
import torch.nn as nn

class GRUModel(nn.Module):
    def __init__(self, input_size=3, hidden_size=8, num_layers=6, output_size=1):
        super(GRUModel, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        # x: [batch, seq_len, input_size]
        out, hn = self.gru(x)
        final_out = self.fc(out[:, -1, :]) 
        return final_out

In [None]:
df_t = df #.iloc[:int(0.5 * len(df))]
train_size = int(0.8 * len(df_t))
df_train = df_t.iloc[:train_size]
df_val = df_t.iloc[train_size:]

train_dataset = TimeSeriesDataset(df_train)
val_dataset = TimeSeriesDataset(df_val) 

train_dataloader = DataLoader(train_dataset, batch_size=32)
val_dataloader = DataLoader(val_dataset, batch_size=32) 

model = GRUModel(input_size=3)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 50
patience = 20
best_val_loss = float('inf')
early_stop_counter = 0

for epoch in range(num_epochs):
    # Train phase
    model.train()
    train_loss = 0
    for batch_seq, batch_target in train_dataloader:
        batch_seq, batch_target = batch_seq.to(device), batch_target.to(device)
        #print(batch_seq.shape)
        optimizer.zero_grad()
        output = model(batch_seq)
        loss = criterion(output.squeeze(), batch_target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_loss /= len(train_dataloader)
    
    # Validation phase
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch_seq, batch_target in val_dataloader:
            batch_seq, batch_target = batch_seq.to(device), batch_target.to(device)
            output = model(batch_seq)
            loss = criterion(output.squeeze(), batch_target)
            val_loss += loss.item()
    val_loss /= len(val_dataloader)
    
    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
    
    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        early_stop_counter = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print(f'Early stopping at epoch {epoch+1}')
            break

Epoch 1, Train Loss: 4.6696, Val Loss: 11.4927


In [51]:
for epoch in range(num_epochs):
    # Train phase
    model.train()
    train_loss = 0
    for batch_seq, batch_target in train_dataloader:
        batch_seq, batch_target = batch_seq.to(device), batch_target.to(device)
        optimizer.zero_grad()
        output = model(batch_seq)
        loss = criterion(output.squeeze(), batch_target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_loss /= len(train_dataloader)
    
    # Validation phase
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch_seq, batch_target in val_dataloader:
            batch_seq, batch_target = batch_seq.to(device), batch_target.to(device)
            output = model(batch_seq)
            loss = criterion(output.squeeze(), batch_target)
            val_loss += loss.item()
    val_loss /= len(val_dataloader)
    
    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
    
    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        early_stop_counter = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print(f'Early stopping at epoch {epoch+1}')
            break

Epoch 1, Train Loss: 3396.2359, Val Loss: 15402.4831
Epoch 2, Train Loss: 3130.6564, Val Loss: 14454.2634
Epoch 3, Train Loss: 2915.3867, Val Loss: 13606.8082
Epoch 4, Train Loss: 2743.5901, Val Loss: 12853.8767
Epoch 5, Train Loss: 2608.7521, Val Loss: 12188.9420
Epoch 6, Train Loss: 2504.8216, Val Loss: 11605.5491
Epoch 7, Train Loss: 2426.2589, Val Loss: 11096.9431
Epoch 8, Train Loss: 2368.0982, Val Loss: 10656.5401
Epoch 9, Train Loss: 2326.0285, Val Loss: 10277.7114
Epoch 10, Train Loss: 2296.3583, Val Loss: 9953.9987
Epoch 11, Train Loss: 2276.0453, Val Loss: 9679.2767
Epoch 12, Train Loss: 2262.6062, Val Loss: 9447.6141
Epoch 13, Train Loss: 2254.1091, Val Loss: 9253.4608
Epoch 14, Train Loss: 2249.0358, Val Loss: 9091.6300
Epoch 15, Train Loss: 2246.3013, Val Loss: 8957.5197
Epoch 16, Train Loss: 2245.0934, Val Loss: 8847.0083
Epoch 17, Train Loss: 2244.8260, Val Loss: 8756.3067
Epoch 18, Train Loss: 2245.1183, Val Loss: 8682.1676
Epoch 19, Train Loss: 2245.7130, Val Loss: 862

KeyboardInterrupt: 