In [13]:
import pandas as pd
import numpy as np
import torch

from tqdm.auto import tqdm

In [2]:
data = pd.read_json("dataset.json", encoding="utf-8")

In [3]:
data.head()

Unnamed: 0,stations,full_timetable
0,"{'Златоуст (1)': ['0', '38', '38', '25', '29',...","{'853': {'route': ['1', '3', '2'], 'free_carri..."
1,"{'Златоуст (1)': ['0', '28', '24', '28', '8', ...","{'896': {'route': ['1', '3', '2'], 'free_carri..."
2,"{'Златоуст (1)': ['0', '25', '15', '6', '15', ...","{'309': {'route': ['1', '3', '2'], 'free_carri..."
3,"{'Златоуст (1)': ['0', '32', '30', '13', '4', ...","{'800': {'route': ['2', '3', '6', '5'], 'free_..."
4,"{'Златоуст (1)': ['0', '10', '38', '16', '1', ...","{'893': {'route': ['1', '3', '6', '5', '7'], '..."


In [5]:
data["stations"].iloc[0]

{'Златоуст (1)': ['0', '38', '38', '25', '29', '7', '10'],
 'Кыштым (2)': ['26', '0', '7', '34', '20', '27', '35'],
 'Миасс (3)': ['15', '5', '0', '27', '16', '31', '24'],
 'Муслюмово (4)': ['12', '39', '1', '0', '2', '14', '20'],
 'Челябинск (5)': ['38', '38', '1', '28', '0', '33', '14'],
 'Полетаево (6)': ['21', '27', '24', '9', '1', '0', '3'],
 'Еманжелинск (7)': ['9', '3', '23', '25', '32', '37', '0']}

In [7]:
def preprocess_stations(stations_data):
    all_stations = list(stations_data.iloc[0].keys())
    
    station_vectors = []
    for key, value in stations_data.items():
        vector = []
        for station, values in value.items():
            vector.extend([int(v) for v in values])
        station_vectors.append(vector)
    
    return np.array(station_vectors), all_stations

In [8]:
station_vectors, all_stations = preprocess_stations(data["stations"])

In [10]:
def preprocess_timetable_updated(timetable_data, all_stations):
    route_vectors = []
    free_carriage_vectors = []
    timetable_vectors = []
    
    for train_num, train_data in timetable_data.items():
        for key, value in train_data.items():
            route = [int(r) for r in value['route']]
            for i in range(len(route)-1):
                route[i] = route[i+1]
            route_vectors.append(route)

            free_carriage = [int(fc) for fc in value['free_carriage']]
            free_carriage_vectors.append(free_carriage)

            timetable = [int(time.split('-')[0].split(':')[0])*60 + int(time.split('-')[0].split(':')[1]) for time in value['timetable']]
            timetable_vectors.append(timetable)
    
    return np.array(route_vectors), np.array(free_carriage_vectors), np.array(timetable_vectors)

In [12]:
route_vectors, free_carriage_vectors, timetable_vectors = preprocess_timetable_updated(data["full_timetable"], all_stations)

  return np.array(route_vectors), np.array(free_carriage_vectors), np.array(timetable_vectors)


In [16]:
route_vectors

array([list([3, 2, 2]), list([3, 2, 2]), list([3, 6, 5, 4, 4]), ...,
       list([6, 5, 5]), list([6, 5, 5]), list([6, 5, 5])], dtype=object)

In [17]:
def pad_sequences(sequences, max_length):
    padded_sequences = []
    
    for seq in sequences:
        if len(seq) < max_length:
            padded_seq = list(seq) + [0] * (max_length - len(seq))
        else:
            padded_seq = list(seq)[:max_length]
        padded_sequences.append(padded_seq)
    
    return np.array(padded_sequences)

In [18]:
max_length_route = max([len(r) for r in route_vectors])
max_length_free_carriage = max([len(fc) for fc in free_carriage_vectors])
max_length_timetable = max([len(t) for t in timetable_vectors])

route_vectors_padded = pad_sequences(route_vectors, max_length_route)
free_carriage_vectors_padded = pad_sequences(free_carriage_vectors, max_length_free_carriage)
timetable_vectors_padded = pad_sequences(timetable_vectors, max_length_timetable)

In [22]:
num_repeats = route_vectors_padded.shape[0] // station_vectors.shape[0]
expanded_station_vectors = np.tile(station_vectors, (num_repeats, 1))

In [23]:
route_vectors_padded = route_vectors_padded[:expanded_station_vectors.shape[0]]
free_carriage_vectors_padded = free_carriage_vectors_padded[:expanded_station_vectors.shape[0]]
timetable_vectors_padded = timetable_vectors_padded[:expanded_station_vectors.shape[0]]

In [24]:
combined_input = np.hstack((expanded_station_vectors, route_vectors_padded, free_carriage_vectors_padded, timetable_vectors_padded))

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(combined_input, expanded_station_vectors, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2080000, 72), (520000, 72), (2080000, 49), (520000, 49))

In [34]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
import json


class TrainNet(nn.Module):
    def __init__(self, input_shape):
        super(TrainNet, self).__init__()
        self.fc1 = nn.Linear(input_shape, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, input_shape_stations)
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Создание модели


input_shape = combined_input.shape[1]
model = TrainNet(input_shape)

# Определение функции потерь и оптимизатора
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Обучение модели
epochs = 100
for epoch in tqdm(range(epochs)):
    # Преобразование данных в тензоры PyTorch
    inputs = torch.tensor(X_train, dtype=torch.float32)
    targets = torch.tensor(y_train, dtype=torch.float32)
    
    # Прямой проход
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    
    # Обратный проход и оптимизация
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # Вывод статистики обучения
    if (epoch+1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

print("Training complete.")


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch [10/100], Loss: 221.0894
Epoch [20/100], Loss: 146.7267
Epoch [30/100], Loss: 129.1243
Epoch [40/100], Loss: 119.9132
Epoch [50/100], Loss: 114.3921
Epoch [60/100], Loss: 111.9769
Epoch [70/100], Loss: 110.4389
Epoch [80/100], Loss: 109.0225
Epoch [90/100], Loss: 107.6813
Epoch [100/100], Loss: 106.1915
Training complete.


In [36]:
model(torch.tensor(X_test, dtype=torch.float32))

tensor([[ 1.9298, 17.9488, 20.0837,  ..., 19.5518, 18.5673,  0.4876],
        [ 0.2424, 20.1483, 19.8802,  ..., 19.3656, 17.7703, -1.0788],
        [-1.0337, 22.3066, 21.0141,  ..., 24.7910, 23.9227, -0.1962],
        ...,
        [ 0.1541, 19.2849, 18.2535,  ..., 17.5948, 19.6196, -1.0559],
        [ 0.3569, 19.9868, 21.9910,  ..., 22.2222, 22.1454,  0.7376],
        [ 2.3790, 17.1191, 18.5860,  ..., 15.7557, 17.0595, -0.5799]],
       grad_fn=<AddmmBackward0>)

In [37]:
torch.tensor(y_test, dtype=torch.float32)

tensor([[ 0.,  1.,  6.,  ..., 13.,  0.,  0.],
        [ 0.,  9., 34.,  ..., 26.,  1.,  0.],
        [ 0., 39.,  5.,  ..., 38., 38.,  0.],
        ...,
        [ 0., 31., 22.,  ...,  0., 14.,  0.],
        [ 0., 26., 39.,  ...,  5., 36.,  0.],
        [ 0., 28., 15.,  ...,  6.,  7.,  0.]])

In [41]:
with torch.no_grad():
    inputs = torch.tensor(X_test, dtype=torch.float32)
    predicted_vectors = model(inputs).numpy()

predicted_schedule = {}
for idx, station_name in enumerate(all_stations):
    predicted_schedule[station_name] = list(map(int, predicted_vectors[idx]))

print(predicted_schedule)

{'Златоуст (1)': [1, 17, 20, 18, 19, 18, 18, 20, -1, 19, 20, 20, 20, 18, 17, 20, 0, 18, 19, 21, 19, 17, 18, 18, 0, 18, 18, 17, 18, 17, 19, 18, 0, 17, 17, 19, 20, 16, 19, 19, 1, 19, 19, 16, 18, 19, 19, 18, 0], 'Кыштым (2)': [0, 20, 19, 18, 17, 17, 18, 19, 0, 21, 17, 19, 18, 17, 19, 21, 0, 20, 18, 17, 19, 20, 18, 19, 0, 20, 20, 18, 19, 18, 19, 17, 1, 19, 18, 20, 20, 17, 19, 18, -1, 20, 18, 19, 18, 18, 19, 17, -1], 'Миасс (3)': [-1, 22, 21, 22, 22, 23, 21, 22, -1, 22, 19, 21, 24, 22, 22, 23, 0, 21, 21, 21, 22, 21, 20, 23, -1, 22, 21, 20, 23, 21, 20, 20, 0, 24, 20, 23, 20, 22, 22, 21, -1, 22, 21, 23, 21, 20, 24, 23, 0], 'Муслюмово (4)': [0, 20, 19, 19, 18, 18, 18, 18, 1, 19, 17, 21, 19, 16, 19, 19, 0, 19, 19, 21, 19, 20, 19, 20, -1, 19, 19, 19, 17, 19, 18, 18, 0, 19, 20, 19, 20, 19, 18, 21, -2, 19, 19, 21, 19, 18, 20, 17, 0], 'Челябинск (5)': [-1, 18, 19, 18, 21, 17, 19, 19, 2, 16, 19, 17, 17, 19, 21, 18, 0, 20, 19, 18, 17, 21, 21, 19, -1, 20, 19, 21, 17, 18, 17, 19, 1, 21, 20, 21, 18, 21,

In [45]:
print(len(predicted_schedule["Златоуст (1)"]))

49


In [48]:
y_test.shape

(520000, 49)

In [51]:
data["stations"].iloc[0]

{'Златоуст (1)': ['0', '38', '38', '25', '29', '7', '10'],
 'Кыштым (2)': ['26', '0', '7', '34', '20', '27', '35'],
 'Миасс (3)': ['15', '5', '0', '27', '16', '31', '24'],
 'Муслюмово (4)': ['12', '39', '1', '0', '2', '14', '20'],
 'Челябинск (5)': ['38', '38', '1', '28', '0', '33', '14'],
 'Полетаево (6)': ['21', '27', '24', '9', '1', '0', '3'],
 'Еманжелинск (7)': ['9', '3', '23', '25', '32', '37', '0']}

In [50]:
data["full_timetable"].iloc[0]["853"]

{'route': ['1', '3', '2'],
 'free_carriage': ['21', '21'],
 'timetable': ['02:00 - 02:38', '04:48 - 06:11', '13:35 - 13:58']}

In [53]:
data.head(20).to_json("small_dataset.json")

In [56]:
data.head(50).to_json("test_dataset.json")

In [57]:
model_scripted = torch.jit.script(model) # Export to TorchScript
model_scripted.save('model.pt') # Save

In [59]:
timetable_vectors

array([list([120, 288, 815]), list([204, 392, 870]),
       list([223, 396, 702, 801, 1002]), ..., list([111, 237, 372]),
       list([132, 273, 400]), list([128, 235, 348])], dtype=object)