In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.nn.utils.rnn import pad_sequence
import matplotlib.pyplot as plt

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## Предобработка данных

In [3]:
df = pd.read_parquet('train.parquet')
df

Unnamed: 0,id,dates,values,label
0,19114,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[-1.86, 0.79, 1.4, 0.15, 0.0, -1.24, -1.46, 3....",0.0
1,22769,"[2016-05-01, 2016-06-01, 2016-07-01, 2016-08-0...","[-1.04, -3.48, 0.05, -0.13, -0.01, 0.03, 0.27,...",1.0
2,76935,"[2017-03-01, 2017-04-01, 2017-05-01, 2017-06-0...","[0.28, 0.63, 0.06, 0.96, -1.4, -0.3, 1.62, 1.1...",0.0
3,66297,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[-0.33, 0.58, 1.1, -0.56, -0.95, -0.61, -0.7, ...",0.0
4,2191,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[1.31, 0.5, -0.54, 0.95, 0.65, 0.83, -1.55, -0...",0.0
...,...,...,...,...
79995,71474,"[2018-10-01, 2018-11-01, 2018-12-01, 2019-01-0...","[-0.15, -1.13, -0.87, 0.49, 0.87, 1.67, 1.91, ...",0.0
79996,36908,"[2016-07-01, 2016-08-01, 2016-09-01, 2016-10-0...","[-0.35, -1.31, 0.11, 1.46, 1.28, 1.12, 1.78, 1...",1.0
79997,63517,"[2016-03-01, 2016-04-01, 2016-05-01, 2016-06-0...","[-0.78, -2.27, -1.85, -0.67, -1.2, -2.02, 0.12...",0.0
79998,94731,"[2016-02-01, 2016-03-01, 2016-04-01, 2016-05-0...","[-2.29, -2.85, 0.92, -0.83, -1.75, -1.81, -2.2...",0.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      80000 non-null  int64  
 1   dates   80000 non-null  object 
 2   values  80000 non-null  object 
 3   label   80000 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 2.4+ MB


In [5]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0.0,57820
1.0,22180


In [6]:
df['values'] = df['values'].apply(lambda x: np.where(np.isnan(x), 0, x))

In [31]:
# Подготовка данных
values = df['values'].values
labels = df['label'].values

# Приведение всех временных рядов к одной длине (padding до максимальной длины ряда)
max_seq_length = max(len(x) for x in values)
X = pad_sequence([torch.tensor(seq, dtype=torch.float32) for seq in values], batch_first=True)

# Стандартизация значений временных рядов
scaler = StandardScaler()
X = torch.tensor(scaler.fit_transform(X.view(-1, X.shape[-1])).reshape(X.shape), dtype=torch.float32)

# Преобразование меток в тензор
y = torch.tensor(labels, dtype=torch.float32)

# Разделение на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=labels)

In [32]:
# Определение PyTorch Dataset
class TimeSeriesDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

train_dataset = TimeSeriesDataset(X_train, y_train)
test_dataset = TimeSeriesDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

## Обучение модели

In [33]:
# Создание модели LSTM
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        hn = self.dropout(hn[-1])  # Используем последний скрытый слой
        out = self.fc(hn)
        return torch.sigmoid(out)

# Гиперпараметры
input_size = X_train.shape[1]  # Размер входных данных (размер признаков)
hidden_size = 100  # Размер скрытого состояния
num_layers = 1  # Количество слоёв LSTM
output_size = 32  # Количество классов (1 нейрон для бинарной классификации)

model = LSTMClassifier(input_size, hidden_size, num_layers, output_size).to(device)

In [34]:
# Определение функции потерь и оптимизатора
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Обучение модели
num_epochs = 10
model.train()

for epoch in range(num_epochs):
    running_loss = 0.0
    for sequences, labels in train_loader:

        sequences, labels = sequences.to(device), labels.to(device)
        # Обнуление градиентов
        optimizer.zero_grad()

        # Прямой проход (forward pass)
        outputs = model(sequences)
        loss = criterion(outputs, labels)

        # Обратный проход (backward pass) и оптимизация
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}')

# Оценка модели
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for sequences, labels in test_loader:
        sequences, labels = sequences.to(device), labels.to(device)
        outputs = model(sequences)
        predicted = (outputs > 0.5).float()
        total += labels.size(0)
        correct += (predicted.squeeze() == labels).sum().item()

accuracy = 100 * correct / total
print(f'Точность на тестовой выборке: {accuracy:.2f}%')

Epoch [1/10], Loss: 0.5983
Epoch [2/10], Loss: 0.5915
Epoch [3/10], Loss: 0.5873
Epoch [4/10], Loss: 0.5823
Epoch [5/10], Loss: 0.5784
Epoch [6/10], Loss: 0.5738
Epoch [7/10], Loss: 0.5712
Epoch [8/10], Loss: 0.5677
Epoch [9/10], Loss: 0.5658
Epoch [10/10], Loss: 0.5644
Точность на тестовой выборке: 73.25%


## Предсказания

In [35]:
df_test = pd.read_parquet('test.parquet')
df_test.head()

Unnamed: 0,id,dates,values
0,6125,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[1.85, -0.04, 0.19, -0.45, -0.75, -0.95, -2.91..."
1,26781,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[-0.41, 0.39, -0.47, -0.9, -1.46, -0.51, 0.51,..."
2,13333,"[2016-06-01, 2016-07-01, 2016-08-01, 2016-09-0...","[-0.29, -1.26, 0.17, -1.22, 0.45, -0.94, 0.16,..."
3,53218,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[-1.47, 1.55, -0.03, 0.57, -0.57, 0.6, 0.27, 1..."
4,84204,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[2.33, 1.39, -1.03, -2.64, 1.89, 1.77, 1.43, 1..."


In [36]:
df_test['values'] = df_test['values'].apply(lambda x: np.where(np.isnan(x), 0, x))

In [37]:
values_test = df_test['values'].values

# Приведение всех временных рядов к одной длине (padding до максимальной длины ряда)
max_seq_length = max(len(x) for x in values_test)
X_test = pad_sequence([torch.tensor(seq, dtype=torch.float32) for seq in values_test], batch_first=True)

# Стандартизация значений временных рядов
#X_test = torch.tensor(scaler.transform(X_test.view(-1, X_test.shape[-1])).reshape(X_test.shape), dtype=torch.float32)

In [38]:
X_test = torch.stack([torch.tensor(lst) for lst in X_test])

  X_test = torch.stack([torch.tensor(lst) for lst in X_test])


In [39]:
# Создаем DataLoader для тестовых данных (без лейблов)
test_dataset = TensorDataset(X_test)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [49]:
model.eval()
correct = 0
total = 0
all_outs = []

with torch.no_grad():
    for sequences in test_loader:
        sequences = sequences[0].to(device)
        outputs = model(sequences)
        predicted = (outputs > 0.5).float()
        all_outs.extend(outputs)

In [41]:
# Преобразуем список тензоров в один 2D тензор с помощью torch.stack
stacked_tensor = torch.stack([tensor.cpu() for tensor in all_outs])

# Преобразуем 2D тензор в NumPy массив
all_outs = stacked_tensor.numpy()

In [42]:
sub = pd.DataFrame(df_test['id'].copy())
sub['score'] = pd.Series(all_outs)
sub.head()

Unnamed: 0,id,score
0,6125,0.326582
1,26781,0.298763
2,13333,0.371001
3,53218,0.297968
4,84204,0.399359


In [43]:
sub.to_csv('sub4.csv', index = False)

In [44]:
torch.save(model.state_dict(), 'model.pth')