In [None]:
import os
import gc
gc.enable()
import glob
import random
import numpy as np 
import pandas as pd  
import json
from tqdm import tqdm
from sklearn.preprocessing import RobustScaler,StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.optim import lr_scheduler
from tensorflow.keras.utils import pad_sequences
from sklearn.metrics import accuracy_score
import lzma
import pickle
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth', 150)

def set_seeed(seed_value=23, use_cuda=True):
    """Seed every random number generator the notebook relies on.

    Args:
        seed_value: Integer seed applied to Python's ``random``, NumPy and
            PyTorch, and exported as ``PYTHONHASHSEED``.
        use_cuda: When true, also seed the CUDA generators and switch cuDNN
            to deterministic, non-benchmarking mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)

    if not use_cuda:
        return

    # Seed the current device and all devices.
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    # Trade cuDNN autotuning for repeatable kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Fix all RNG seeds up front (default seed, CUDA included) so runs are repeatable.
set_seeed()

# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
class CFG:
    """Notebook-wide switches."""
    # True: build the padded sequence arrays from the raw frames and save them as .npy files.
    # False: load previously saved .npy arrays from the attached dataset directory instead.
    CREATE_DATA = True

In [None]:
def reader(f):
    """Load one CSV file and tag every row with the file's base name.

    Args:
        f: Path to a CSV file.

    Returns:
        A DataFrame with an extra ``ID`` column holding the file name up to
        its first dot (``.../Train53.csv`` -> ``Train53``), or ``None`` when
        the path cannot be read as a CSV.
    """
    try:
        df = pd.read_csv(f)
    except Exception as exc:
        # Loading stays best-effort (callers pass the result straight to
        # pd.concat, which ignores None), but the failure is reported instead
        # of being silently dropped. Catching Exception rather than using a
        # bare except lets KeyboardInterrupt / SystemExit propagate.
        print(f"reader: skipping {f!r}: {exc}")
        return None
    # basename handles whichever path separator the platform uses.
    df['ID'] = os.path.basename(f).split('.')[0]
    return df

# Kaggle input locations: the directories holding the train / test CSV files
# and the sample submission that defines the expected row IDs.
train_files_p = '/kaggle/input/network-traffic-scenario-prediction/Train_data/Train_data'
test_files_p = '/kaggle/input/network-traffic-scenario-prediction/Test_data/Test_data'
ss_p = '/kaggle/input/network-traffic-scenario-prediction/SampleSubmission.csv'

In [None]:
%%time

# Read every file under the training directory and stack the frames into one,
# replacing missing values with 0.
train_files = glob.glob(train_files_p+'/**')
train = pd.concat([reader(path) for path in tqdm(train_files)]).fillna(0)
gc.collect()

# Same procedure for the test directory.
test_files = glob.glob(test_files_p+'/**')
test = pd.concat([reader(path) for path in tqdm(test_files)]).fillna(0)
gc.collect()

# Sample submission: provides the row IDs the final submission is merged onto.
ss = pd.read_csv(ss_p)

display(train, test, ss)

In [None]:
# Number of rows contributed by each training file.
train['ID'].value_counts()

In [None]:
# Number of rows contributed by each test file.
test['ID'].value_counts()

In [None]:
# Number of distinct target classes in the training data.
train['label'].nunique()

In [None]:
class BidirectionalLSTMClassifier(nn.Module):
    """Per-timestep sequence classifier on top of a bidirectional recurrent layer.

    Despite the class and attribute names, the recurrent layer is an
    ``nn.GRU``. The attribute names are kept because they determine the
    ``state_dict`` keys. NOTE: this class is defined a second time, with the
    same body, further down in the notebook; the later definition shadows
    this one.

    Args:
        input_size: Number of features per timestep.
        num_classes: Number of logits produced per timestep.
        hidden_size: Hidden units per direction.
        num_layers: Number of stacked recurrent layers.
    """

    def __init__(self, input_size, num_classes, hidden_size, num_layers):
        super().__init__()

        recurrent = nn.GRU(input_size, hidden_size, num_layers,
                           batch_first=True, bidirectional=True)
        self.Lstm_layer_1 = recurrent

        # Forward and backward states are concatenated: 2 * hidden_size features.
        self.Output = nn.Linear(recurrent.hidden_size * 2, num_classes)

    def forward(self, inputs):
        """Map (batch, seq_len, input_size) inputs to (batch, seq_len, num_classes) logits."""
        per_step_states = self.Lstm_layer_1(inputs)[0]
        return self.Output(per_step_states)

In [None]:
# Order all rows by `time` (across files) and renumber the index from 0.
train = train.sort_values('time').reset_index(drop=True)

In [None]:
# Width of one window along `time`; later cells reuse the same number as the
# padded length (maxlen) of every sequence.
sequence_length = 5000

# Window index of each row (time floor-divided by the window width), as text.
train['sequence_id'] = (train['time'] // sequence_length).astype(str)
# One sequence = one window of one file.
train['sequence_id_ID'] = train['sequence_id'] + '_' + train['ID']
train

In [None]:
# Total number of (window, file) sequences in the training data.
train['sequence_id_ID'].nunique()

In [None]:
# Inspect the rows of a single sequence: window 0 of file Train53.
train[train['sequence_id_ID']=='0_Train53']

In [None]:
# Split at the file (ID) level, so all rows of one file land on the same side;
# 15% of the files are held out for validation.
train_ids = list(train['ID'].unique())
train_ids, val_ids = train_test_split(train_ids, random_state=1, test_size=0.15)
len(train_ids), len(val_ids)

In [None]:
# Distinct class labels in ascending order. `unique()` already de-duplicates,
# so the former list(set(...)) round-trip only left the order up to set
# iteration. Downstream code only uses len(labels).
labels = np.sort(train['label'].unique())
labels

In [None]:
%%time

def _build_padded_sequences(df, feature_cols, maxlen, group_col='sequence_id_ID'):
    """Convert a long-format frame into fixed-length per-sequence arrays.

    Rows are grouped by ``group_col`` (groups kept in first-appearance order,
    i.e. the order of ``df[group_col].unique()``), each group is sorted by
    ``time`` and passed through ``pad_sequences`` with ``padding='post'``.
    Feature rows are padded with -1, label rows with the extra class 12.

    Args:
        df: Frame holding ``group_col``, ``time``, ``label`` and ``feature_cols``.
        feature_cols: List of feature column names.
        maxlen: Length every sequence is padded to.
        group_col: Column identifying the sequence a row belongs to.

    Returns:
        Tuple ``(inputs, targets)`` of arrays with one entry per sequence.
    """
    seq_inputs, seq_targets = [], []
    # A single groupby pass replaces re-filtering the whole frame once per
    # sequence id, which scaled with rows x sequences.
    grouped = df.groupby(group_col, sort=False)
    for _, data in tqdm(grouped, total=grouped.ngroups):
        data = data.sort_values('time')
        inputs = pad_sequences([data[feature_cols].values], maxlen=maxlen, dtype='float', padding='post', value=-1)[0,:,:]
        targets = pad_sequences([data['label'].values], maxlen=maxlen, dtype='int', padding='post', value=12)[0]
        seq_inputs.append(inputs)
        seq_targets.append(targets)
    return np.array(seq_inputs), np.array(seq_targets)


train_df = train.copy()

# Row-level split that follows the file-level split made earlier.
xtrain_df = train_df[train_df['ID'].isin(train_ids)].reset_index(drop=True)
xval_df = train_df[train_df['ID'].isin(val_ids)].reset_index(drop=True)

# Fit the scaler on the training rows only; validation (and later test) rows
# are transformed with the same statistics.
features = ['time','portPktIn','portPktOut','qSize']
scaler = RobustScaler()
xtrain_df[features] = scaler.fit_transform(xtrain_df[features])
xval_df[features] = scaler.transform(xval_df[features])

# Neutralise anything non-finite.
xtrain_df = xtrain_df.fillna(0).replace([np.inf,-np.inf],0)
xval_df = xval_df.fillna(0).replace([np.inf,-np.inf],0)

# The raw frames are no longer needed; free the memory. (Re-running this cell
# therefore requires re-running the loading cells first.)
del train,train_df
gc.collect()

if CFG.CREATE_DATA:
    xtrain_idids = xtrain_df['sequence_id_ID'].unique()
    xval_idids = xval_df['sequence_id_ID'].unique()

    xtrain_data, xtrain_targets = _build_padded_sequences(xtrain_df, features, sequence_length)
    xval_data, xval_targets = _build_padded_sequences(xval_df, features, sequence_length)

    # Persist the arrays so a later run can skip the construction step.
    for npy_name, array in (('xtrain_data.npy', xtrain_data),
                            ('xval_data.npy', xval_data),
                            ('xtrain_targets.npy', xtrain_targets),
                            ('xval_targets.npy', xval_targets)):
        with open(npy_name,'wb') as f:
            np.save(f, array)

else:

    # Load arrays saved by a previous run of the branch above.
    saved_path = '/kaggle/input/network-traffic-classification-lstm-data/'
    with open(saved_path+'xtrain_data.npy','rb') as f:
        xtrain_data = np.load(f)

    with open(saved_path+'xval_data.npy','rb') as f:
        xval_data = np.load(f)

    with open(saved_path+'xtrain_targets.npy','rb') as f:
        xtrain_targets = np.load(f)

    with open(saved_path+'xval_targets.npy','rb') as f:
        xval_targets = np.load(f)

In [None]:
class MySequenceDataset(Dataset):
    """Dataset that builds one padded sequence per ``sequence_id_ID`` on demand.

    Args:
        df: Frame holding ``sequence_id_ID``, ``time``, ``label`` and the
            feature columns.
        features: List of feature column names.
        sequence_length: Length every sequence is padded to.
    """

    def __init__(self, df, features=['time','portPktIn','portPktOut','qSize'], sequence_length=sequence_length):

        self.df = df
        self.features = features
        self.sequence_length = sequence_length
        # One dataset item per distinct sequence key.
        self.seq_id_IDs = df['sequence_id_ID'].unique()

    def __len__(self):
        return len(self.seq_id_IDs)

    def __getitem__(self, idx):
        """Return (float32 inputs, int64 targets) for the idx-th sequence key."""
        wanted_key = self.seq_id_IDs[idx]
        rows = self.df.loc[self.df['sequence_id_ID'] == wanted_key].sort_values('time')

        # Features are post-padded with -1, labels with the extra class 12.
        padded_inputs = pad_sequences([rows[self.features].values], maxlen=self.sequence_length,
                                      dtype='float', padding='post', value=-1)[0,:,:]
        padded_targets = pad_sequences([rows['label'].values], maxlen=self.sequence_length,
                                       dtype='int', padding='post', value=12)[0]

        return (torch.tensor(padded_inputs, dtype=torch.float32),
                torch.tensor(padded_targets, dtype=torch.long))
    
class MySimpleSequenceDataset(Dataset):
    """Dataset over pre-built arrays of padded inputs and matching targets.

    Args:
        data: Indexable collection of input sequences.
        targets: Indexable collection of target sequences, aligned with ``data``.
    """

    def __init__(self, data, targets):
        self.data, self.targets = data, targets

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the idx-th (float32 inputs, int64 targets) pair as tensors."""
        sample = torch.tensor(self.data[idx], dtype=torch.float32)
        sample_targets = torch.tensor(self.targets[idx], dtype=torch.long)
        return sample, sample_targets
    
def save_checkpoint(checkpoint, filename):
    """Write ``checkpoint`` to ``filename`` with ``torch.save`` and report it.

    The printed name is ``filename`` up to its first dot.
    """
    torch.save(checkpoint, filename)
    shown_name = filename.split('.')[0]
    print(f"\n--> Saved checkpoint: {shown_name}")

def load_checkpoint(filename, model):
    """Restore model weights from a checkpoint dict stored with ``torch.save``.

    Args:
        filename: Path to a file containing a dict with a ``'state_dict'`` entry.
        model: Module whose parameters are overwritten in place.

    Returns:
        The same ``model`` instance, with the stored weights loaded.
    """
    # Map the stored tensors onto the device the model already lives on, so a
    # checkpoint written on a GPU can still be restored on a CPU-only machine.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None
    checkpoint = torch.load(filename, map_location=target_device)
    model.load_state_dict(checkpoint['state_dict'])
    return model

In [None]:
# Loaders for the quick forward-pass check below; the training cell rebuilds
# them with its own batch size.
batch_size = 16

train_ds = MySimpleSequenceDataset(xtrain_data, xtrain_targets)
val_ds = MySimpleSequenceDataset(xval_data, xval_targets)

# Shuffle training batches only; validation order stays fixed.
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

# Release cached GPU memory and collect garbage before building a model.
torch.cuda.empty_cache()
gc.collect()

In [None]:
class BidirectionalLSTMClassifier(nn.Module):
    """Per-timestep sequence classifier on top of a bidirectional recurrent layer.

    Despite the class and attribute names, the recurrent layer is an
    ``nn.GRU``. The attribute names are kept because they determine the
    ``state_dict`` keys. NOTE: an identical definition already appears
    earlier in the notebook; this one is the definition in effect when the
    model is built.

    Args:
        input_size: Number of features per timestep.
        num_classes: Number of logits produced per timestep.
        hidden_size: Hidden units per direction.
        num_layers: Number of stacked recurrent layers.
    """

    def __init__(self, input_size, num_classes, hidden_size, num_layers):
        super().__init__()

        recurrent = nn.GRU(input_size, hidden_size, num_layers,
                           batch_first=True, bidirectional=True)
        self.Lstm_layer_1 = recurrent

        # Forward and backward states are concatenated: 2 * hidden_size features.
        self.Output = nn.Linear(recurrent.hidden_size * 2, num_classes)

    def forward(self, inputs):
        """Map (batch, seq_len, input_size) inputs to (batch, seq_len, num_classes) logits."""
        per_step_states = self.Lstm_layer_1(inputs)[0]
        return self.Output(per_step_states)

In [None]:
%%time
# Smoke test: push one batch through a small model to check shapes and the loss call.
# The extra class (+1) is the padding label used in the target arrays.
input_size = len(features)
num_classes = len(labels)+1
hidden_size = 16
num_layers = 1

model = BidirectionalLSTMClassifier(input_size, num_classes, hidden_size, num_layers).to(device)

criterion = nn.CrossEntropyLoss().to(device)

# Take only the first batch from the loader.
x, y = next(iter(train_loader))

output = model(x.to(device))
# Flatten (batch, seq) so every timestep is scored as one classification.
loss = criterion(output.view(-1, num_classes), y.to(device).view(-1))
print(loss)
# Index of the highest logit along the class axis.
predicted = torch.max(output, 2)[1]
print(output.shape)
print(predicted)
print(predicted.shape)

In [None]:
%%time

# ---- hyperparameters ----
epochs = 200
batch_size = 4
hidden_size = 64
num_layers = 3

train_ds = MySimpleSequenceDataset(xtrain_data, xtrain_targets)
val_ds = MySimpleSequenceDataset(xval_data, xval_targets)

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

torch.cuda.empty_cache()
gc.collect()

input_size = len(features)
# One extra class for the padding label (12) present in the target arrays.
num_classes = len(labels)+1
model = BidirectionalLSTMClassifier(input_size, num_classes, hidden_size, num_layers).to(device)

model_filename = 'network_traffic.pth'
# NOTE(review): the loss also scores padded timesteps (target 12); only the
# accuracy below excludes them.
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
# mode='max' because the scheduler is stepped with validation accuracy.
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.7, patience=4, verbose=True, min_lr=1e-6)

best_val_accuracy = 0
best_val_targets = []
best_val_preds = []
epochs_without_improvement = 0
# Early stopping: give up after this many epochs without a new best accuracy.
patience = 10

for epoch in range(epochs):
    # ---- training pass ----
    dataset_size = 0
    running_loss = 0.0
    model.train()
    pbar = tqdm(enumerate(train_loader), total=len(train_loader), desc='Train ')
    for step,(inputs, targets) in pbar:
        inputs,targets = inputs.to(device), targets.to(device)

        # Kept in its own name so the configured `batch_size` is not overwritten.
        n_samples = inputs.size(0)

        outputs = model(inputs)
        loss = criterion(outputs.view(-1, num_classes), targets.view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Sample-weighted running mean of the batch losses.
        running_loss += (loss.item() * n_samples)
        dataset_size += n_samples

        epoch_loss = running_loss / dataset_size

        mem = torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0
        current_lr = optimizer.param_groups[0]['lr']
        pbar.set_postfix(train_loss=f'{epoch_loss:0.4f}',
                         lr=f'{current_lr:0.5f}',
                         gpu_mem=f'{mem:0.2f} GB')

    torch.cuda.empty_cache()
    gc.collect()

    # ---- validation pass ----
    model.eval()
    with torch.no_grad():
        # Per-batch arrays, concatenated once after the loop; this avoids
        # growing a Python list one numpy scalar at a time.
        all_predictions = []
        all_targets = []

        dataset_size = 0
        running_loss = 0.0
        pbar = tqdm(enumerate(val_loader), total=len(val_loader), desc='Valid ')
        for step, (inputs, targets) in pbar:
            inputs,targets = inputs.to(device), targets.to(device)
            n_samples = inputs.size(0)

            outputs = model(inputs)

            loss = criterion(outputs.view(-1, num_classes), targets.view(-1))
            running_loss += (loss.item() * n_samples)
            dataset_size += n_samples

            epoch_loss = running_loss / dataset_size

            mem = torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0
            current_lr = optimizer.param_groups[0]['lr']
            pbar.set_postfix(val_loss=f'{epoch_loss:0.4f}',
                             lr=f'{current_lr:0.5f}',
                             gpu_mem=f'{mem:0.2f} GB')

            _, predicted = torch.max(outputs, 2)
            all_predictions.append(predicted.cpu().numpy().ravel())
            all_targets.append(targets.cpu().numpy().ravel())

    all_targets_ = np.concatenate(all_targets)
    all_predictions_ = np.concatenate(all_predictions)

    # Score real timesteps only: drop every position whose target is the padding label.
    not_padding = all_targets_ != 12
    all_predictions_ = all_predictions_[not_padding]
    all_targets_ = all_targets_[not_padding]

    val_accuracy = accuracy_score(all_targets_, all_predictions_)
    print(f"Epoch {epoch+1}/{epochs}, Accuracy Score: {val_accuracy:.4f}")

    if val_accuracy > best_val_accuracy:
        # New best: remember the predictions and checkpoint the weights.
        best_val_accuracy = val_accuracy
        best_val_targets = all_targets_
        best_val_preds = all_predictions_
        epochs_without_improvement = 0
        checkpoint = {'state_dict': model.state_dict(),'optimizer': optimizer.state_dict()}
        save_checkpoint(checkpoint=checkpoint, filename=model_filename)
    else:
        epochs_without_improvement +=1

    if epochs_without_improvement == patience:
        break

    scheduler.step(val_accuracy)

    torch.cuda.empty_cache()
    gc.collect()

# Training is done; free the arrays. (Re-running this cell requires rebuilding them.)
del xtrain_data, xtrain_targets, xval_data, xval_targets
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Best validation accuracy reached during training, rounded to 5 decimals.
print('BEST VAL ACCURACY: ', np.round(best_val_accuracy,5))

In [None]:
class MySimpleSequenceDatasetInference(Dataset):
    """Target-less dataset over pre-built input sequences, for inference.

    Args:
        data: Indexable collection of input sequences.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the idx-th input sequence as a float32 tensor."""
        return torch.tensor(self.data[idx], dtype=torch.float32)

In [None]:
# Reload the raw test files from disk and stack them into one frame.
test_files = glob.glob(test_files_p+'/**')
test = pd.concat([reader(path) for path in tqdm(test_files)]).fillna(0)
gc.collect()

# Row key for the submission: file ID with every 'T' replaced by 't',
# an underscore, and the integer timestamp.
test['time'] = test['time'].astype(int)
test['ID2'] = test['ID'].str.replace('T','t') +"_"+ test['time'].astype(str)

display(test)

In [None]:
# Work on a copy and derive the same window / sequence keys as for the training data.
test_df = test.copy()
test_df['sequence_id'] = (test_df['time'] // sequence_length).astype(str)
test_df['sequence_id_ID'] = test_df['sequence_id'] + '_' + test_df['ID']
# Apply the scaler fitted on the training rows; the keys above were computed from the unscaled time.
test_df[features] = scaler.transform(test_df[features])
display(test_df)

In [None]:
xtest_idids = test_df['sequence_id_ID'].unique()

xtest_data = []
tmp_data = []
test_id_data = []
# One groupby pass instead of re-filtering the whole frame per sequence id;
# sort=False keeps the groups in first-appearance order, i.e. the order of
# `xtest_idids`.
for _, data in tqdm(test_df.groupby('sequence_id_ID', sort=False), total=len(xtest_idids)):
    data = data.sort_values('time')

    # Model inputs, post-padded with -1 to the common length.
    inputs = data[features].values
    inputs = pad_sequences([inputs], maxlen=sequence_length, dtype='float', padding='post', value=-1)[0,:,:]
    xtest_data.append(inputs)

    # Padded `time` values per sequence (scaled at this point); not read by
    # any later cell visible here.
    tmp_ = data['time'].values
    tmp_ = pad_sequences([tmp_], maxlen=sequence_length, dtype='float', padding='post', value=-1)[0]
    tmp_data.append(tmp_)

    # Row keys aligned with the inputs; padded positions get the placeholder '0'.
    test_id_data_ = data['ID2'].values
    test_id_data_ = pad_sequences([test_id_data_], maxlen=sequence_length, dtype=object, padding='post', value='0')[0]
    test_id_data.append(test_id_data_)

xtest_data = np.array(xtest_data)
tmp_data = np.array(tmp_data)
test_id_data = np.array(test_id_data)

In [None]:
# Release cached GPU memory and collect garbage before inference.
torch.cuda.empty_cache()
gc.collect()

In [None]:
%%time

# Checkpoint file written by the training loop.
model_filename = f'network_traffic.pth'
batch_size = 16

# Keep the order fixed (no shuffling) so predictions line up with the row keys.
test_ds = MySimpleSequenceDatasetInference(xtest_data)
test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

# Same architecture settings as the training cell, so the stored weights fit the rebuilt model.
hidden_size = 64
num_layers = 3
input_size = len(features)  
num_classes = len(labels)+1
model = BidirectionalLSTMClassifier(input_size, num_classes, hidden_size, num_layers).to(device)          
model = load_checkpoint(filename=model_filename, model=model)

torch.cuda.empty_cache()
gc.collect()

In [None]:
# Per-batch prediction arrays, concatenated once after the loop; this avoids
# growing a Python list one numpy scalar at a time.
batch_predictions = []
model.eval()
with torch.no_grad():
    pbar = tqdm(enumerate(test_loader), total=len(test_loader), desc='Inference ')
    for step, inputs in pbar:
        inputs = inputs.to(device)
        outputs = model(inputs)
        mem = torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0
        pbar.set_postfix(gpu_mem=f'{mem:0.2f} GB')

        # Highest-scoring class per timestep.
        _, predicted = torch.max(outputs, 2)
        batch_predictions.append(predicted.cpu().numpy().ravel())

# Flat array with one prediction per (sequence, timestep), in loader order.
all_predictions = (np.concatenate(batch_predictions) if batch_predictions
                   else np.array([], dtype=np.int64))

In [None]:
# Flatten the row keys to one entry per (sequence, timestep) so they line up
# index by index with the flat predictions.
test_id_data_ = test_id_data.ravel()
all_predictions_ = np.array(all_predictions)

# Both lengths are shown for comparison; they must match for the next cell.
len(all_predictions_), len(test_id_data_)

In [None]:
# Pair every flat prediction with the row key it belongs to.
sub = pd.DataFrame({'ID': test_id_data_, 'Target': all_predictions_})

# Padded positions carry the placeholder key '0'; drop them. The explicit copy
# makes `sub` an independent frame, so the assignment below does not write
# into a slice of the unfiltered frame.
sub = sub[sub['ID']!='0'].copy()
# Class 12 is the padding label, not a real class; real rows predicted as 12
# are mapped to class 8 (fallback value chosen in the original notebook).
sub.loc[sub['Target']==12,'Target'] = 8
# Left-join onto the sample submission to get exactly its rows, in its order.
# NOTE(review): IDs without a prediction end up with a NaN Target.
sub_ = pd.merge(ss[['ID']],sub[['ID', 'Target']], how='left', on='ID')
sub_

In [None]:
# Write the submission file without the index column.
sub_.to_csv('sub.csv', index=False)

In [None]:
# Distribution of predicted classes in the submission.
sub_['Target'].value_counts()