In [57]:
import torch

import copy
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from torch import nn
import pickle as pk

# Global seaborn theme: white grid background, muted palette, slightly enlarged fonts.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)

# Custom colour cycle that replaces the 'muted' palette selected above.
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

# Seed both NumPy and PyTorch; the same seed is also passed to train_test_split below.
RANDOM_SEED = 2077
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x20da5ca3250>

In [58]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [59]:
# Load the labelled training requests and add the character length of each request.
# NOTE(review): apply(len) assumes every 'request' value is a string (no NaN) - confirm.
df1 = pd.read_csv('data/train.csv')
df1['length']=df1['request'].apply(len)
df1.describe()

Unnamed: 0,y_true,length
count,11324.0,11324.0
mean,0.029053,777.405864
std,0.167963,336.557775
min,0.0,94.0
25%,0.0,539.0
50%,0.0,739.0
75%,0.0,746.0
max,1.0,2044.0


In [60]:
# Load the test requests and add the same 'length' column as for the training frame.
df_test=pd.read_csv('data/test.csv')
df_test['length']=df_test['request'].apply(len)
# df_short_test=df_test[df_test['length']<df_test['length'].quantile(0.89)]
# df_short_test.describe()

In [61]:
# Keep only requests whose length is strictly below the 0.89 quantile of all training lengths.
# NOTE(review): 0.89 is a hand-picked cut-off; its rationale is not visible in this notebook.
df_short=df1[df1['length']<df1['length'].quantile(0.89)]
df_short.describe()

Unnamed: 0,y_true,length
count,10075.0,10075.0
mean,0.032655,670.735583
std,0.177741,117.536675
min,0.0,94.0
25%,0.0,533.0
50%,0.0,726.0
75%,0.0,742.0
max,1.0,1299.0


In [62]:
# Normal traffic: rows labelled y_true == 0.0, with the label and helper 'length' columns removed.
norm_data=df_short[df_short['y_true']==0.0].drop(['y_true','length'],axis=1)
norm_data.shape

(9746, 1)

In [63]:
# Anomalous traffic: rows labelled y_true == 1.0, with the label and helper 'length' columns removed.
anomaly_data=df_short[df_short['y_true']==1.0].drop(['y_true','length'],axis=1)
anomaly_data.shape

(329, 1)

In [64]:
# Split the normal-only data: 85% for training, 15% held out.
X_train, X_check = train_test_split(norm_data,test_size=0.15,random_state=RANDOM_SEED)
# Split the held-out part again: 33% becomes X_test, the remainder stays in X_check
# (X_check is rebound here and is later used as the validation set during training).
X_check, X_test = train_test_split(X_check,test_size=0.33, random_state=RANDOM_SEED)

In [65]:
# Display the training split (pandas truncates the rendered frame).
X_train

Unnamed: 0,request
6753,"\nThu, 15 Mar 2018 14:45:52 INFO\nPOST /vulnba..."
3633,"\nThu, 15 Mar 2018 14:45:52 INFO\nGET /vulnban..."
2403,"\nThu, 15 Mar 2018 14:45:52 INFO\nGET /vulnban..."
220,"\nThu, 15 Mar 2018 14:45:52 INFO\nPOST /vulnba..."
6593,"\nThu, 15 Mar 2018 14:45:52 INFO\nPOST /vulnba..."
...,...
654,"\nThu, 15 Mar 2018 14:45:52 INFO\nPOST /vulnba..."
5162,"\nThu, 15 Mar 2018 14:45:52 INFO\nPOST /vulnba..."
3696,"\nThu, 15 Mar 2018 14:45:52 INFO\nPOST /vulnba..."
2144,"\nThu, 15 Mar 2018 14:45:52 INFO\nGET /vulnban..."


In [66]:
# Fit TF-IDF on the normal training requests only, then project every split with that same
# fitted vectorizer so all feature matrices share one vocabulary and one set of IDF weights.
tfidf_vectorizer = TfidfVectorizer()
# fit() returns the estimator itself, so tfidf_fit and tfidf_vectorizer are the same object.
tfidf_fit = tfidf_vectorizer.fit(X_train['request'])
# .todense() materialises each sparse matrix in memory before wrapping it in a DataFrame.
X_train_vec = pd.DataFrame(tfidf_vectorizer.transform(X_train['request']).todense())
X_test_norm_vec = pd.DataFrame(tfidf_vectorizer.transform(X_test['request']).todense())
X_check_vec = pd.DataFrame(tfidf_vectorizer.transform(X_check['request']).todense())
X_test_anom_vec = pd.DataFrame(tfidf_vectorizer.transform(anomaly_data['request']).todense())
X_train_vec.shape,X_test_norm_vec.shape,X_test_anom_vec.shape,X_check_vec.shape,type(X_check_vec)

((8284, 1200),
 (483, 1200),
 (329, 1200),
 (979, 1200),
 pandas.core.frame.DataFrame)

In [67]:
# Display the fitted vectorizer's repr.
tfidf_fit

TfidfVectorizer()

In [68]:
# Persist the vectorizer fitted on X_train. The context manager guarantees the file is
# flushed and closed (the previous bare open() left the handle dangling).
with open("tfidf.pkl", "wb") as tfidf_file:
    pk.dump(tfidf_fit, tfidf_file)

In [70]:
# Reload the vectorizer that was fitted on X_train and reuse it unchanged.
# Building a new TfidfVectorizer from only the vocabulary and calling fit_transform on the
# test set would re-learn the IDF weights from test data, so the test features would no
# longer be on the same scale as the features the PCA and the autoencoder were fitted on.
# NOTE: pickle.load executes arbitrary code - only unpickle files produced by this notebook.
with open("tfidf.pkl", "rb") as tfidf_file:
    tfidf_vocab = pk.load(tfidf_file)
# Keep both names bound: later cells call tfidf_vectorizer.transform(...).
tfidf_vectorizer = tfidf_vocab
df_ = pd.DataFrame(tfidf_vectorizer.transform(df_test['request']).todense())
df_.shape

(11764, 1200)

In [53]:
# TF-IDF features for the full test set, computed with whatever `tfidf_vectorizer` is
# currently bound to. For these features to be comparable with the training features it
# must be the vectorizer that was fitted on X_train.
df_test_vec=pd.DataFrame(tfidf_vectorizer.transform(df_test['request']).todense())
df_test_vec.shape

(11764, 1200)

In [71]:
# Reduce the TF-IDF features with PCA fitted on the training split only.
# A float n_components in (0, 1) keeps as many components as needed to explain that
# fraction of the variance.
pca=PCA(n_components=0.99)
# pca_fit holds the projected training data; the projection is recomputed via transform() below.
pca_fit = pca.fit_transform(X_train_vec)
# NOTE: the X_*_vec names are rebound to their PCA projections here, so this cell is not
# idempotent - re-running it without first re-running the TF-IDF cell feeds already
# projected data into PCA.
X_train_vec = pd.DataFrame(pca.transform(X_train_vec))
X_test_norm_vec = pd.DataFrame(pca.transform(X_test_norm_vec))
X_check_vec = pd.DataFrame(pca.transform(X_check_vec))
X_test_anom_vec = pd.DataFrame(pca.transform(X_test_anom_vec))
X_train_vec.shape,X_test_norm_vec.shape,X_test_anom_vec.shape,X_check_vec.shape,type(X_check_vec)

((8284, 585), (483, 585), (329, 585), (979, 585), pandas.core.frame.DataFrame)

In [72]:
# Persist the PCA fitted on the training split. The context manager guarantees the file is
# flushed and closed (the previous bare open() left the handle dangling).
with open("pca.pkl", "wb") as pca_file:
    pk.dump(pca, pca_file)

In [73]:
# Reload the fitted PCA and project the test TF-IDF features into the same component space.
# The context manager closes the file handle that the previous bare open() leaked.
# NOTE: pickle.load executes arbitrary code - only unpickle files produced by this notebook.
with open("pca.pkl", "rb") as pca_file:
    pca = pk.load(pca_file)
# df_ is rebound to its projection, so this cell must not be re-run on its own.
df_ = pd.DataFrame(pca.transform(df_))
df_.shape

(11764, 585)

In [56]:
# Project the test TF-IDF features with the fitted PCA.
# NOTE: df_test_vec is rebound to its own projection, so this cell is not idempotent -
# re-running it alone would pass already projected data to pca.transform().
df_test_vec = pd.DataFrame(pca.transform(df_test_vec))
df_test_vec.shape

(11764, 585)

In [118]:
def create_dataset(df):
    """Turn each row of a numeric DataFrame into a (seq_len, 1) float32 tensor.

    Every row is treated as one univariate sequence: the row's columns are the
    time steps and each step carries a single feature.

    Args:
        df: 2-D numeric pandas DataFrame with at least one row.

    Returns:
        Tuple ``(dataset, seq_len, n_features)`` where ``dataset`` is a list with one
        tensor of shape ``(seq_len, 1)`` per row, ``seq_len`` is the number of
        columns of ``df`` and ``n_features`` is always 1.

    Raises:
        ValueError: if ``df`` is not 2-D or contains no rows.
    """
    values = df.astype(np.float32).to_numpy()
    if values.ndim != 2 or values.shape[0] == 0:
        raise ValueError(
            "create_dataset expects a 2-D frame with at least one row, got shape {}".format(values.shape)
        )

    # Read the dimensions straight from the array instead of stacking every tensor
    # (which copied the whole dataset) just to look at the resulting shape.
    seq_len = values.shape[1]
    n_features = 1  # unsqueeze(1) below always adds exactly one feature channel

    # torch.tensor copies each row, so the tensors are independent of `df` and of each
    # other; building them from the ndarray avoids a round-trip through Python lists.
    dataset = [torch.tensor(row).unsqueeze(1) for row in values]

    return dataset, seq_len, n_features

In [250]:
# Convert every split into a list of per-row tensors. seq_len and n_features are taken
# from the training split only and are later used to size the model.
train_vec_dataset, seq_len, n_features = create_dataset(X_train_vec)
check_vec_dataset, _, _ = create_dataset(X_check_vec)
test_norm_vec_dataset, _, _ = create_dataset(X_test_norm_vec)
test_anom_vec_dataset, _, _ = create_dataset(X_test_anom_vec)

In [251]:
# Tensors for the full test set (TF-IDF + PCA features in df_test_vec).
gen_test_dataset, _, _ = create_dataset(df_test_vec)

In [255]:
# Display the sequence length and per-step feature count returned by create_dataset.
seq_len,n_features

(585, 1)

In [257]:
class Encoder(nn.Module):
    """Two stacked LSTMs that compress one sequence into a single embedding vector.

    Args:
        seq_len: number of time steps in every input sequence.
        n_features: number of values per time step.
        embedding_dim: size of the produced embedding; the first LSTM uses a
            hidden size of ``2 * embedding_dim``.
    """

    def __init__(self, seq_len, n_features, embedding_dim=64):
        super().__init__()

        self.seq_len = seq_len
        self.n_features = n_features
        self.embedding_dim = embedding_dim
        self.hidden_dim = 2 * embedding_dim

        self.lstm1 = nn.LSTM(input_size=n_features, hidden_size=self.hidden_dim, num_layers=1, batch_first=True)
        self.lstm2 = nn.LSTM(input_size=self.hidden_dim, hidden_size=embedding_dim, num_layers=1, batch_first=True)

    def forward(self, x):
        """Encode a single sequence.

        Args:
            x: tensor holding ``seq_len * n_features`` values; it is reshaped to a
                batch of one sequence, ``(1, seq_len, n_features)``.

        Returns:
            Tensor of shape ``(1, embedding_dim)``: the final hidden state of the
            second LSTM.
        """
        x = x.reshape((1, self.seq_len, self.n_features))

        x, _ = self.lstm1(x)
        _, (h_n, _) = self.lstm2(x)
        # h_n is (num_layers=1, batch=1, embedding_dim) regardless of n_features, so the
        # embedding is always one row. Reshaping to (n_features, embedding_dim) only
        # happened to work for n_features == 1 and raised for anything else.
        return h_n.reshape((1, self.embedding_dim))

In [258]:
class Decoder(nn.Module):
    """Two stacked LSTMs plus a linear head that expand an embedding back into a sequence.

    Args:
        seq_len: number of time steps to reconstruct.
        input_dim: size of the embedding received from the encoder; the second
            LSTM uses a hidden size of ``2 * input_dim``.
        n_features: number of values reconstructed per time step.
    """

    def __init__(self, seq_len, input_dim=64, n_features=1):
        super().__init__()

        self.seq_len = seq_len
        self.input_dim = input_dim
        self.hidden_dim = 2 * input_dim
        self.n_features = n_features

        self.lstm1 = nn.LSTM(input_size=input_dim, hidden_size=input_dim, num_layers=1, batch_first=True)
        self.lstm2 = nn.LSTM(input_size=input_dim, hidden_size=self.hidden_dim, num_layers=1, batch_first=True)

        self.output = nn.Linear(self.hidden_dim, n_features)

    def forward(self, x):
        """Decode one embedding.

        Args:
            x: embedding tensor of shape ``(1, input_dim)``.

        Returns:
            Tensor of shape ``(seq_len, n_features)`` with the reconstructed sequence.
        """
        # Feed the same embedding at every time step of a single-sequence batch.
        # Repeating along the feature axis (repeat(seq_len, n_features)) was only
        # correct for n_features == 1; for larger values it produced n_features
        # sequences and the reshape after the LSTMs failed.
        x = x.repeat(self.seq_len, 1)
        x = x.reshape((1, self.seq_len, self.input_dim))

        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        x = x.reshape((self.seq_len, self.hidden_dim))
        # The linear head maps every time step from hidden_dim to n_features.
        return self.output(x)

In [259]:
class LSTMAutoencoder(nn.Module):
    """Sequence autoencoder: an Encoder followed by a Decoder, both placed on DEVICE.

    Args:
        seq_len: number of time steps per sequence.
        n_features: number of values per time step.
        embedding_dim: size of the bottleneck embedding shared by both halves.
    """

    def __init__(self, seq_len, n_features, embedding_dim=64):
        super(LSTMAutoencoder, self).__init__()

        # Build both halves (encoder first, as before) and move each one to DEVICE.
        encoder = Encoder(seq_len, n_features, embedding_dim)
        self.encoder = encoder.to(DEVICE)
        decoder = Decoder(seq_len, embedding_dim, n_features)
        self.decoder = decoder.to(DEVICE)

    def forward(self, x):
        """Return the decoder's reconstruction of the encoder's embedding of ``x``."""
        embedding = self.encoder(x)
        return self.decoder(embedding)

Our Autoencoder passes the input through the Encoder and Decoder. Let's create an instance of it:

In [260]:
# Build the autoencoder with a 256-dimensional embedding (overriding the default of 64),
# sized from the training split's seq_len / n_features.
model = LSTMAutoencoder(seq_len, n_features, 256)
model = model.to(DEVICE)

## Training

Let's write a helper function for our training process:

In [261]:
def train_model(model, train_dataset, check_dataset, epochs):
    """Train ``model`` to reconstruct its inputs and keep the best validation checkpoint.

    Each sequence is fed individually (effective batch size 1) and optimised with
    Adam (lr=1e-3) on the MSE between the input and its reconstruction. After
    every epoch the mean loss on ``check_dataset`` is computed without gradients;
    the weights with the lowest validation loss are restored before returning.
    One progress line per epoch is printed.

    Args:
        model: ``nn.Module`` mapping a sequence tensor to a tensor of the same shape.
        train_dataset: iterable of training tensors.
        check_dataset: iterable of validation tensors.
        epochs: number of passes over ``train_dataset``.

    Returns:
        The same ``model`` object, loaded with the best weights and switched to
        evaluation mode.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss()
    # Follow the device the model actually lives on instead of a notebook-level global,
    # so data and parameters can never end up on different devices.
    device = next(model.parameters()).device

    best_model_w = copy.deepcopy(model.state_dict())
    # Start from +inf: a fixed sentinel such as 10000.0 silently discards every
    # checkpoint whenever the validation loss happens to be larger than the sentinel.
    best_loss = float('inf')

    for epoch in range(epochs):
        model.train()
        train_loss_list = []
        for train_d in train_dataset:
            optimizer.zero_grad()
            train_d = train_d.to(device)
            pred_d = model(train_d)
            loss = criterion(pred_d, train_d)
            loss.backward()
            optimizer.step()
            train_loss_list.append(loss.item())

        model.eval()
        val_loss_list = []
        with torch.no_grad():
            for check_d in check_dataset:
                check_d = check_d.to(device)
                seq_pred = model(check_d)
                loss = criterion(seq_pred, check_d)
                val_loss_list.append(loss.item())

        train_loss = np.mean(train_loss_list)
        val_loss = np.mean(val_loss_list)
        if val_loss < best_loss:
            best_loss = val_loss
            best_model_w = copy.deepcopy(model.state_dict())
        print('Epoch {epoch}: train loss {train_loss} val loss {val_loss}'.format(epoch=epoch, train_loss=train_loss,
                                                                                  val_loss=val_loss))

    model.load_state_dict(best_model_w)
    return model.eval()

In [None]:
# Train for 150 epochs on the normal-only training split; check_vec_dataset is the
# validation set used to pick the best weights.
model = train_model(model, train_vec_dataset,check_vec_dataset,epochs=150)

Epoch 1: train loss 4.379949882604948 val loss 4.081418354494214
Epoch 2: train loss 4.304078599584672 val loss 4.040680107958348
Epoch 3: train loss 4.306429691817782 val loss 4.086635550147301
Epoch 4: train loss 4.303155657549747 val loss 4.051419304875967
Epoch 5: train loss 4.303577233690504 val loss 4.094443501442275
Epoch 6: train loss 4.3055451886196865 val loss 4.045056180398725
Epoch 7: train loss 4.304090342678234 val loss 4.050448175104938
Epoch 8: train loss 4.302714952376084 val loss 4.051096580123512
Epoch 9: train loss 4.303396956434807 val loss 4.052989974815828
Epoch 10: train loss 4.304124182833962 val loss 4.083509454201142
Epoch 11: train loss 4.30641805141344 val loss 4.05071648513942
Epoch 12: train loss 4.303329603992068 val loss 4.050594602347151
Epoch 13: train loss 4.304686496909375 val loss 4.060787711615947
Epoch 14: train loss 4.304070798225992 val loss 4.051714015810677
Epoch 15: train loss 4.306213473824704 val loss 4.073600132204782
Epoch 16: train loss

In [None]:
# Serialise the whole model object (not just its state_dict) to 'model.pt'.
# NOTE(review): loading such a file presumably requires the Encoder / Decoder /
# LSTMAutoencoder classes to be defined first - confirm before reusing it elsewhere.
torch.save(model, 'model.pt')

In [None]:
# model = torch.load('model.pt')
# model = model.to(DEVICE)

In [None]:
def predict(model, dataset):
    """Reconstruct every sequence in ``dataset`` and measure its reconstruction error.

    Args:
        model: ``nn.Module`` mapping a sequence tensor to a tensor of the same shape.
        dataset: iterable of sequence tensors.

    Returns:
        Tuple ``(predictions, loss_list)``: ``predictions`` holds one flattened NumPy
        array per sequence, ``loss_list`` the matching summed absolute error (L1 loss
        with ``reduction='sum'``) as a Python float.
    """
    predictions = []
    loss_list = []
    criterion = nn.L1Loss(reduction='sum')

    # Run on whichever device the model lives on rather than on a notebook-level global,
    # so a model loaded onto another device still works. Parameter-free models take the
    # data as it is.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    with torch.no_grad():
        model = model.eval()
        for seq_true in dataset:
            if device is not None:
                seq_true = seq_true.to(device)
            seq_pred = model(seq_true)
            loss = criterion(seq_pred, seq_true)
            predictions.append(seq_pred.cpu().numpy().flatten())
            loss_list.append(loss.item())
    return predictions, loss_list

In [None]:
# Distribution of reconstruction losses on the training data, used to choose THRESHOLD.
_, loss_list = predict(model, train_vec_dataset)
# sns.distplot is deprecated; histplot with stat="density" draws the same normalised
# histogram with a KDE overlay.
sns.histplot(loss_list, bins=100, kde=True, stat="density")

In [None]:
# Reconstruction-loss cut-off: a loss <= THRESHOLD is treated as normal, anything above
# it as an anomaly.
# NOTE(review): presumably picked by eye from the training-loss distribution - confirm.
THRESHOLD = 4.5

In [None]:
# Reconstruction losses on held-out normal requests.
predictions, pred_norm_loss_list = predict(model, test_norm_vec_dataset)
# sns.distplot is deprecated; histplot with stat="density" draws the same normalised
# histogram with a KDE overlay.
sns.histplot(pred_norm_loss_list, bins=100, kde=True, stat="density");

In [None]:
# A normal request is classified correctly when its loss does not exceed the threshold.
correct = sum(1 for norm_loss in pred_norm_loss_list if norm_loss <= THRESHOLD)
print('Correct normal predictions:', correct,'/',len(test_norm_vec_dataset))

In [None]:
# Use at most as many anomalous samples as there are normal test samples.
anomaly_dataset = test_anom_vec_dataset[:len(test_norm_vec_dataset)]

In [None]:
# Reconstruction losses on the anomalous requests.
predictions, pred_anom_loss_list = predict(model, anomaly_dataset)
# sns.distplot is deprecated; histplot with stat="density" draws the same normalised
# histogram with a KDE overlay.
sns.histplot(pred_anom_loss_list, bins=100, kde=True, stat="density");

In [None]:
# An anomaly is detected correctly when its reconstruction loss EXCEEDS the threshold.
# The count must run over the losses returned by predict(); iterating over
# anomaly_dataset compared the raw input tensors with THRESHOLD, and `<=` counted the
# misses instead of the detections.
correct = sum(l > THRESHOLD for l in pred_anom_loss_list)
print('Correct anomaly predictions:', correct,'/',len(anomaly_dataset))

In [None]:
# Reconstruction losses for every request of the full test set.
predictions, pred_loss_list = predict(model, gen_test_dataset)

In [None]:
# Label each test request: 0 when its loss is within THRESHOLD, 1 (anomaly) otherwise.
# The predicted labels are stored under the column name 'y_true'.
result = pd.DataFrame(
    {'y_true': [int(not (test_loss <= THRESHOLD)) for test_loss in pred_loss_list]}
)
result.describe()