In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd

import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.data import TensorDataset, DataLoader

# Parquet inputs, given relative to the notebook's working directory.
# data_path is read into the feature frame and target_path into the target frame.
data_path = 'X_train.parquet'
target_path = 'y_train.parquet'
# Not read by any of the cells visible in this notebook section.
data_test_path = 'X_test.parquet'

In [2]:
# from torcheval.metrics.functional import multiclass_f1_score

In [2]:
def preprocess_data(series):
    '''
    Prepare the history of a single feature for training.

    Missing values are filled by linear interpolation and the result is
    min-max scaled to the range [0, 1].

    Parameters
    ----------
    series : pandas.Series
        History of one feature. Only ``series.values`` is used, so the
        original index is discarded.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(series), 1)`` holding the scaled values.
    '''
    # Re-wrap the raw values so the series carries a default integer index.
    data = pd.Series(series.values)

    # The default (forward-only) interpolation leaves NaNs that precede the
    # first valid observation untouched; filling in both directions covers
    # leading gaps as well as interior and trailing ones.
    inter_data = data.interpolate(limit_direction='both')

    # MinMaxScaler expects a 2-D (n_samples, n_features) array.
    values = np.asarray(inter_data).reshape(-1, 1)

    # Fit the scaler on this feature alone and apply it in one step.
    scaler = MinMaxScaler(feature_range=(0, 1))
    norm_inter_data = scaler.fit_transform(values)

    return norm_inter_data

In [3]:
# Open the feature table through dask; the data is only materialised by
# the .compute() call below.
df_data = dd.read_parquet(data_path, engine="pyarrow")
# Keep the first 16 columns, selected by position.
# NOTE(review): presumably these are the sensors of machine no. 4 - confirm against the file schema.
data4 = df_data.iloc[:,:16]

# Materialise the selected columns as an in-memory pandas DataFrame.
data4 = data4.compute()
# NOTE(review): the message says "saved", but this cell only loads data.
print('data saved')

data saved


In [4]:
# Загрузка таргетов для обучения
# Load the training targets.
target = pd.read_parquet(target_path)

# Target columns at positions 93..115, cast to integers.
# NOTE(review): the positional range is a magic constant; according to the
# message printed below these are the targets of machine no. 4 - confirm.
target4 = target.iloc[:,93:116].astype(int)
print('Кол-во таргетов у машины №4: ', len(target4.columns))

Кол-во таргетов у машины №4:  23


In [5]:
# Keep only the rows whose index compares below '2021-04-01'.
target4 = target4[target4.index < '2021-04-01']

# Mask every cell equal to 1 (it becomes NaN), drop each row that contains at
# least one masked cell, then relabel the remaining 2s as 1.
target4_M3 = (
    target4
    .where(target4 != 1)
    .dropna()
    .replace(2, 1)
)

In [6]:
# Disabled: would add a calendar-date column derived from the index.
# data4["date"] = data4.index.date
# Select the feature rows whose index labels are the ones kept in target4_M3.
data4_M3 = data4.loc[target4_M3.index]
# Replace this column by its absolute values (the sign of the reading is dropped).
data4_M3['ЭКСГАУСТЕР 4. ВИБРАЦИЯ НА ОПОРЕ 4'] = data4_M3['ЭКСГАУСТЕР 4. ВИБРАЦИЯ НА ОПОРЕ 4'].abs()

In [8]:
# data4_M3.shape, target4_M3.shape

In [7]:
list_columns = list(data4_M3.columns)
index_columns = list(data4_M3.index)

# Scale every feature independently and assemble the frame in a single step.
# Building it from a dict of 1-D columns avoids allocating a full-size
# object-dtype placeholder frame that is then overwritten column by column,
# and .ravel() turns the (n, 1) scaler output into a plain column.
df_train = pd.DataFrame(
    {column: preprocess_data(data4_M3[column]).ravel() for column in list_columns},
    index=index_columns,
)

In [8]:
# Scaled features as a 2-D NumPy array: one row per index entry of df_train,
# one column per feature.
X_data = np.array(df_train)
# X_data.shape

In [11]:
# X_data.shape

In [9]:
def absfft(x):
    """Return the magnitude of the discrete Fourier transform of ``x``.

    The transform is computed with ``np.fft.fft`` using its default axis,
    and the absolute value of every complex coefficient is returned.
    """
    spectrum = np.fft.fft(x)
    return np.absolute(spectrum)

# Magnitude spectrum of every row of X_data, computed along axis 1.
# A single vectorised FFT call replaces np.apply_along_axis, which invokes the
# Python callback once per row; the FFT returns a new array, so no explicit
# copy of X_data is needed either.
# NOTE(review): axis 1 runs across the feature columns of one row, not along
# the time index - confirm this is the intended transform.
feat_fft_array = np.abs(np.fft.fft(X_data, axis=1))

In [10]:
# Class balance over all target cells: the distinct values and how often each occurs.
# The 2 -> 1 relabelling is already applied where target4_M3 is built, so the
# frame is not mutated again here.
np.unique(target4_M3.values, return_counts=True)

(array([0., 1.]), array([126014216,  33899632]))

In [11]:
def create_datasets(data, target, train_size, valid_pct=0.1, seed=None):
    """Build PyTorch datasets from NumPy arrays.

    The first ``train_size`` rows are split at random into a training and a
    validation part; all remaining rows form the testing part.

    Parameters
    ----------
    data : tuple
        ``(raw, fft)`` pair of arrays with the same number of rows.
    target : array-like
        Labels, sliced and indexed exactly like ``raw`` and ``fft``.
    train_size : int
        Number of leading rows shared between training and validation.
    valid_pct : float, default 0.1
        Forwarded to ``train_test_split`` as ``test_size``.
    seed : int or None
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple of TensorDataset
        ``(trn_ds, val_ds, tst_ds)``. Every dataset holds three tensors:
        raw features (float), FFT features (float) and targets (long).
    """
    raw, fft = data
    assert len(raw) == len(fft)

    head = slice(None, train_size)
    tail = slice(train_size, None)

    # Random split of the row positions inside the leading block.
    trn_idx, val_idx = train_test_split(
        np.arange(train_size), test_size=valid_pct, random_state=seed)

    def as_dataset(raw_part, fft_part, target_part):
        # Convert one (raw, fft, target) triple into a TensorDataset.
        return TensorDataset(
            torch.tensor(raw_part).float(),
            torch.tensor(fft_part).float(),
            torch.tensor(target_part).long())

    trn_ds = as_dataset(raw[head][trn_idx], fft[head][trn_idx], target[head][trn_idx])
    val_ds = as_dataset(raw[head][val_idx], fft[head][val_idx], target[head][val_idx])
    tst_ds = as_dataset(raw[tail], fft[tail], target[tail])
    return trn_ds, val_ds, tst_ds

In [12]:
def create_loaders(data, bs=128, jobs=0):
    """Wrap the three datasets produced by create_datasets in data loaders.

    Parameters
    ----------
    data : tuple
        ``(trn_ds, val_ds, tst_ds)`` datasets.
    bs : int, default 128
        Batch size used by every loader.
    jobs : int, default 0
        Number of worker processes passed to every loader.

    Returns
    -------
    tuple of DataLoader
        ``(trn_dl, val_dl, tst_dl)``, in the same order as the input.
    """
    trn_ds, val_ds, tst_ds = data

    # All three loaders share the same settings; none of them shuffles,
    # the training loader included.
    loader_kwargs = dict(batch_size=bs, shuffle=False, num_workers=jobs)

    return (
        DataLoader(trn_ds, **loader_kwargs),
        DataLoader(val_ds, **loader_kwargs),
        DataLoader(tst_ds, **loader_kwargs),
    )

In [13]:
# The first 70% of the rows are shared between training and validation;
# create_datasets puts the remaining rows into the testing dataset.
trn_sz = int(X_data.shape[0]*0.7)
# Random state for the train/validation split.
seed = 42
# (raw, fft) feature pair in the order create_datasets unpacks it.
data = (X_data, feat_fft_array)

# Disabled earlier variant; target_new is not defined in the visible cells.
# datasets = create_datasets(data, target_new, trn_sz, seed=seed)

# valid_pct is left at its default of 0.1.
datasets = create_datasets(data, np.array(target4_M3), trn_sz, seed=seed)

In [14]:
# Wrap the three datasets in data loaders with a batch size of 4096.
trn_dl, val_dl, tst_dl = create_loaders(datasets, bs=4096)

In [15]:
def calculate_predict(sigmoid_prob, y_true):
    """Jaccard index of the thresholded predictions, divided by the batch length.

    Probabilities strictly above 0.5 count as positive predictions. The score
    is ``TP / (TP + FP + FN)`` computed over all elements, and the result is
    then divided by ``len(y_pred)`` (the size of the first dimension).

    Parameters
    ----------
    sigmoid_prob : torch.Tensor
        Predicted probabilities.
    y_true : torch.Tensor
        Ground-truth labels; elements equal to 1 are the positives.

    Returns
    -------
    float
        The scaled score. ``0.0`` is returned when there are neither true nor
        predicted positives (including an empty batch), where the ratio would
        otherwise be 0/0 and evaluate to NaN.
    """
    y_pred = torch.as_tensor(sigmoid_prob > 0.5, dtype=torch.int32)
    predicted_pos = y_pred == 1
    actual_pos = y_true == 1

    TP = (predicted_pos & actual_pos).sum()
    FP = (predicted_pos & ~actual_pos).sum()
    FN = (~predicted_pos & actual_pos).sum()

    union = TP + FP + FN
    if union.item() == 0:
        # Nothing to score: avoid propagating NaN into accumulated metrics.
        return 0.0

    J = TP / union
    # NOTE(review): dividing the index by the batch length is kept for
    # compatibility; confirm that callers expect this per-sample scaling.
    J = J / len(y_pred)

    return J.item()

In [16]:
class _SepConv1d(nn.Module):
    """A simple separable convolution implementation.
    
    The separable convlution is a method to reduce number of the parameters 
    in the deep learning network for slight decrease in predictions quality.
    """
    def __init__(self, ni, no, kernel, stride, pad):
        super().__init__()
        self.depthwise = nn.Conv1d(ni, ni, kernel, stride, padding=pad, groups=ni)
        self.pointwise = nn.Conv1d(ni, no, kernel_size=1)

    def forward(self, x):
        return self.pointwise(self.depthwise(x))
    
class SepConv1d(nn.Module):
    """Separable 1-d convolution with optional activation and dropout.

    The layers are applied in this order: separable convolution, the
    activation built by ``activ`` (skipped when ``activ`` is falsy), and a
    dropout layer (skipped when ``drop`` is None).

    ``drop`` must be None or lie strictly between 0 and 1.
    """
    def __init__(self, ni, no, kernel, stride, pad, drop=None,
                 activ=lambda: nn.ReLU(inplace=True)):

        super().__init__()
        assert drop is None or (0.0 < drop < 1.0)

        conv = _SepConv1d(ni, no, kernel, stride, pad)
        activation = [activ()] if activ else []
        dropout = [nn.Dropout(drop)] if drop is not None else []

        self.layers = nn.Sequential(conv, *activation, *dropout)

    def forward(self, x):
        out = self.layers(x)
        return out

class Flatten(nn.Module):
    """Converts an N-dimensional tensor into a 'flat' one.

    With ``keep_batch_dim=True`` (the default) the first dimension is kept
    and all remaining dimensions are merged: ``(N, d1, d2, ...)`` becomes
    ``(N, d1 * d2 * ...)``. With ``keep_batch_dim=False`` the whole tensor is
    collapsed into a single dimension.
    """

    def __init__(self, keep_batch_dim=True):
        super().__init__()
        self.keep_batch_dim = keep_batch_dim

    def forward(self, x):
        # Both branches used to return x.transpose(1, 0), which only swapped
        # the first two axes and ignored keep_batch_dim altogether.
        if self.keep_batch_dim:
            return torch.flatten(x, start_dim=1)
        return torch.flatten(x)

In [17]:
class Classifier(nn.Module):
    """Two-branch network over raw and FFT features.

    Each branch stacks four separable convolutions, flattens the result and
    reduces it to 64 features with two linear layers. The two 64-feature
    outputs are concatenated and mapped to ``no`` outputs by a small
    fully-connected head. No activation is applied to the final outputs.

    Parameters
    ----------
    raw_ni : int
        Number of input channels of the raw branch.
    fft_ni : int
        Number of input channels of the FFT branch.
    no : int
        Number of outputs.
    drop : float, default 0.5
        Dropout probability used throughout both branches.
    """
    def __init__(self, raw_ni, fft_ni, no, drop=.5):
        super().__init__()

        # Sub-modules are created in the order raw -> fft -> head.
        self.raw = self._make_branch(raw_ni, drop)
        self.fft = self._make_branch(fft_ni, drop)

        self.out1 = nn.Sequential(
            nn.Linear(128, 512), nn.ReLU(inplace=True),
            nn.Linear(512, 128), nn.ReLU(inplace=True),
            nn.Linear(128, no))

    @staticmethod
    def _make_branch(ni, drop):
        # Feature extractor shared by both branches; only the input width differs.
        # The first linear layer takes 256 features, so the flattened
        # convolution output must consist of exactly 256 values per sample.
        return nn.Sequential(
            SepConv1d(ni, 32, 7, 2, 3, drop=drop),
            SepConv1d(32, 64, 5, 4, 2, drop=drop),
            SepConv1d(64, 128, 5, 4, 2, drop=drop),
            SepConv1d(128, 256, 5, 4, 2),
            nn.Flatten(),
            nn.Dropout(drop), nn.Linear(256, 128), nn.PReLU(), nn.BatchNorm1d(128),
            nn.Dropout(drop), nn.Linear(128, 64), nn.PReLU(), nn.BatchNorm1d(64))

    def forward(self, t_raw, t_fft):
        # Concatenate the two 64-feature branch outputs along the feature axis.
        features = torch.cat([self.raw(t_raw), self.fft(t_fft)], dim=1)
        return self.out1(features)

In [18]:
# Prefer the GPU this notebook was written for, but stay runnable on hosts
# with fewer GPUs or none at all.
if torch.cuda.is_available():
    device = 'cuda:2' if torch.cuda.device_count() > 2 else 'cuda:0'
else:
    device = 'cpu'

lr = 0.001
n_epochs = 10
# Must match the number of target columns (23 for this machine).
num_classes = 23
best_acc = 0
loss_history = []
acc_history = []

# Channel counts of the two network branches.
raw_feat = X_data.shape[1]
fft_feat = feat_fft_array.shape[1]

# Seed torch so weight initialisation and dropout are repeatable between runs;
# the same seed already drives the train/validation split.
torch.manual_seed(seed)

model = Classifier(raw_feat, fft_feat, num_classes).to(device)
# Multi-label objective: an independent sigmoid/BCE term per output.
criterion = nn.BCEWithLogitsLoss()
opt = optim.Adam(model.parameters(), lr=lr, weight_decay=0.01)

print('Start model training')

Start model training


In [19]:
for epoch in tqdm(range(1, n_epochs + 1)):

    model.train()
    running_loss = 0
    for batch in tqdm(trn_dl):
        x_raw, x_fft, y_batch = (t.to(device) for t in batch)
        opt.zero_grad()

        # Append a trailing length-1 axis: (batch, features) -> (batch, features, 1).
        X_raw = x_raw.unsqueeze(2)
        X_fft = x_fft.unsqueeze(2)

        out = model(X_raw, X_fft)
        loss = criterion(out, y_batch.float())
        loss.backward()
        opt.step()
        running_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    epoch_loss = running_loss / len(trn_dl)
    print( 'Номер эпохи', epoch, 'Train_loss', "%.4f" % epoch_loss)
    loss_history.append(epoch_loss)

    # NOTE: no validation pass runs here; val_dl and acc_history are not
    # touched by this loop.

print('Done!')

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1070 [00:00<?, ?it/s]

Номер эпохи 1 Train_loss 0.2454


  0%|          | 0/1070 [00:00<?, ?it/s]

Номер эпохи 2 Train_loss 0.2460


  0%|          | 0/1070 [00:00<?, ?it/s]

Номер эпохи 3 Train_loss 0.2424


  0%|          | 0/1070 [00:00<?, ?it/s]

Номер эпохи 4 Train_loss 0.2419


  0%|          | 0/1070 [00:00<?, ?it/s]

Номер эпохи 5 Train_loss 0.2419


  0%|          | 0/1070 [00:00<?, ?it/s]

Номер эпохи 6 Train_loss 0.2419


  0%|          | 0/1070 [00:00<?, ?it/s]

Номер эпохи 7 Train_loss 0.2419


  0%|          | 0/1070 [00:00<?, ?it/s]

Номер эпохи 8 Train_loss 0.2419


  0%|          | 0/1070 [00:00<?, ?it/s]

Номер эпохи 9 Train_loss 0.2419


  0%|          | 0/1070 [00:00<?, ?it/s]

Номер эпохи 10 Train_loss 0.2419
Done!


In [20]:
# Persist only the model's state_dict to the file 'Ex4_M3' (path relative to the working directory).
torch.save(model.state_dict(), 'Ex4_M3')