# Эксперименты по магистерской диссертации
Тема - обнаружение инсайдеров  
Датасеты - CERT 4.2, 6.2, в виде БД
### План работы:  
0) Привести набор данных к удобному виду  
1) Предобработка данных (генерация "предложений поведения пользователя")  
2) Предобработка контентных данных. Пока только письма  
3) Обучение трансформера типа Bert поведению пользователей  
4) Обучение классификатора пользователей  
5) Скор аномальности как ошибка классификатора  

In [1]:
import json
import sqlite3
import csv
from datetime import datetime
import os
import csv
import logging
import numpy as np
import itertools
from tqdm import tqdm
from collections import namedtuple
import math

class Device:
#     id,date,user,pc,file_tree,activity
    col = {name:idx for idx, name in enumerate(["id","date","user","pc","file_tree","activity"])}
    conncect = 1
    disconnect = 0
    feature_len = 2
    
class Email:
#     id,date,user,pc,to,cc,bcc,from,activity,size,attachments,content
    col = {name:idx for idx, name in enumerate(["id","date","user","pc","to","cc","bcc","from","activity","size","attachments","content"])}
    view = 0
    sent = 1
    feature_len = 6
    
class File:
#     id,date,user,pc,filename,activity,to_removable_media,from_removable_media,content
    col = {name:idx for idx, name in enumerate(["id","date","user","pc","filename","activity","to_removable_media","from_removable_media","content"])}
    open = 0
    write = 1
    copy = 2
    delete = 3
    feature_len = 6
    
    
class Http:
#     id,date,user,pc,url,activity,content
    col = {name:idx for idx, name in enumerate(["id","date","user","pc","url","activity","content"])}
    visit = 0
    download = 1
    upload = 2
    feature_len = 3
    
    
class Logon:
#     id, date, user, pc, activity
    col = {name:idx for idx, name in enumerate(["id","date","user","pc","activity"])}
    logon = 1
    logout = 0
    feature_len = 2
    
features_num = 0
for c in [Device, Email, File, Http, Logon]:
    c.feature_shift = features_num
    features_num += c.feature_len
    
ds_dir = "/media/alina/Elements/datasets/simple_ds"
fasttext_model_path = "./fast_text_model"
fasttext_vec_len = 64

In [2]:
File.feature_shift
user_computer = {'AAP0352':{6708}}

## 0) Привести набор данных к удобному виду
Пусть каждому пользователю отвечает директория, а файлы data_lig_type в ней отвечают логу пользователя в конкретный день

In [7]:
out_dir = "/media/alina/Elements/datasets/simple_ds"
in_dir = "/media/alina/65FC89D8465C3712/Documents/универ/курсовая_мага/датасеты/r6.2/"
os.makedirs(out_dir, exist_ok=True)

In [27]:
# id,date,user,pc
def parse_file(filename, function):
    log_type = os.path.splitext(os.path.basename(filename))[0]
    with open(filename, "r") as f:
        reader = csv.reader(f, delimiter=',')
        next(reader, None)
        user_records = dict()
        date = ''
        for idx, row in enumerate(reader):
            user = row[2]
            cur_date = datetime.strptime(row[1], '%m/%d/%Y %H:%M:%S').strftime('%Y_%m_%d')
            function(row)
            
            if cur_date != date:
                for u in user_records:
                    with open(os.path.join(out_dir, u, date +"_"+ log_type), "w") as small_f:
                        small_f.write('\n'.join(user_records[u]))
                date = cur_date
                user_records = dict()
                if date[-1] == '0':
                    print(f"{date} parsed")
                
            user_records.setdefault(user, [])
            user_records[user].append(','.join(row))

In [28]:
def device_f(row):
    row[5] = str(1 if row[5] == "Connect" else 0)
    
def email_f(row):
    row[8] = str(1 if row[8] == "Send" else 0)
    row[10] = str(len(row[10].split(';')))
    
def file_f(row):
    activity = File.write
    if row[5] == "File Open":
        activity = File.open
    elif row[5] == "File Copy":
        activity = File.copy
    elif row[5] == "File Delete":
        activity = File.delete
    row[5] = str(activity)
    row[6] = str(1 if row[6] == "True" else 0)
    row[7] = str(1 if row[7] == "True" else 0)
    
def http_f(row):
    activity = Http.visit
    if row[5] == "WWW Visit":
        activity = Http.visit
    elif row[5] == "WWW Upload":
        activity = Http.upload
    elif row[5] == "WWW Download":
        activity = Http.download
    row[5] = str(activity)
    
def logon_f(row):
    row[4] = str(1 if row[4] == "Logon" else 0)

In [None]:
parse_file(os.path.join(in_dir, "email.csv"), email_f)

In [None]:
parse_file(os.path.join(in_dir, "device.csv"), device_f)

In [None]:
parse_file(os.path.join(in_dir, "file.csv"), file_f)

In [None]:
parse_file(os.path.join(in_dir, "http.csv"), http_f)

In [None]:
parse_file(os.path.join(in_dir, "logon.csv"), logon_f)

In [None]:
# загрузим соответствие пользователь - комньютер

## 1) Генерация предложений пользователей (контекст)
Определим список признаков пользователя  
Каждая функция будет возвращать список таплов (дата, номер компьютера, контекст, контент)
#### общие:
- свой/не свой компьютер
- за пределами рабочего дня  
(2 признака)

#### device:
- connect
- disconnet  
(2 признака, контент отсутствует)  

#### email:
- отсылка
- посмотреть письмо
- объем
- количество приложений
- человек не из компании
- контент  
(3 признака + контент)

### file:
- действие (4 варианта)
- to_removable_media
- from_removable_media
(5 вариантов + контент)

### http:
- действие (3 варианта)
(3 варианта + контент)

### logon:
- логон
- логофф  
(2 признака)

In [5]:
#     id,date,user,pc,
def has_content(filename):
    return "_email" in filename or "_file" in filename or "_http" in filename

def get_data_pc(line):
    t = datetime.strptime(line[1], '%m/%d/%Y %H:%M:%S').time()
    assert(line[3][:3] == "PC-")
    return (t.hour * 60 + t.minute) * 60 + t.second, int(line[3][3:])

def device_events(user, date):
    device_features = []
    filename = os.path.join(ds_dir, user, date.strftime('%Y_%m_%d') + "_device")
    if not os.path.exists(filename):
        return []
    with open(filename, "r") as f:
        reader = csv.reader(f, delimiter=',')
        for row in reader:
            features = np.zeros(features_num)
            features[int(row[Device.feature_shift + Device.col["activity"]])] = 1
            device_features.append((*get_data_pc(row), features, None))
    return device_features
Device.events = device_events

def email_events(user, date):
    #     id,date,user,pc,to,cc,bcc,from,activity,size,attachments,content
    email_features = []
    filename = os.path.join(ds_dir, user, date.strftime('%Y_%m_%d') + "_email")
    if not os.path.exists(filename):
        return []
    with open(filename, "r") as f:
        reader = csv.reader(f, delimiter=',')
        for row in reader:
            features = np.zeros(features_num)
            features[int(row[Email.feature_shift + Email.col["activity"]])] = 1 # view or send
            features[Email.feature_shift + 2] = int(row[Email.col["size"]])
            features[Email.feature_shift + 3] = int(row[Email.col["attachments"]])
            features[Email.feature_shift + 4] = 0 # TODO 
            features[Email.feature_shift + 5] = 0 # TODO
            email_features.append((*get_data_pc(row), features, row[Email.col["content"]]))
    return email_features


def file_events(user, date):
    #     id,date,user,pc,filename,activity,to_removable_media,from_removable_media,content
    email_features = []
    filename = os.path.join(ds_dir, user, date.strftime('%Y_%m_%d') + "_file")
    if not os.path.exists(filename):
        return []
    with open(filename, "r") as f:
        reader = csv.reader(f, delimiter=',')
        for row in reader:
            features = np.zeros(features_num)
            features[File.feature_shift + int(row[File.col["activity"]])] = 1 # 0...3
            features[File.feature_shift + 4] = int(row[File.col["to_removable_media"]]) 
            features[File.feature_shift + 5] = int(row[File.col["from_removable_media"]]) 
            email_features.append((*get_data_pc(row), features, row[File.col["content"]]))
    return email_features


def http_events(user, date):
    #     id,date,user,pc,url,activity,content
    http_features = []
    filename = os.path.join(ds_dir, user, date.strftime('%Y_%m_%d') + "_http")
    if not os.path.exists(filename):
        return []
    with open(filename, "r") as f:
        reader = csv.reader(f, delimiter=',')
        for row in reader:
            features = np.zeros(features_num)
            features[Http.feature_shift + int(row[Http.col["activity"]])] = 1
            http_features.append((*get_data_pc(row), features, row[Http.col["url"]].replace("/", " ").replace("_", " ") + " " + row[Http.col["content"]]))
    return http_features


def logon_events(user, date):
    #     id, date, user, pc, activity
    logon_features = []
    filename = os.path.join(ds_dir, user, date.strftime('%Y_%m_%d') + "_logon")
    if not os.path.exists(filename):
        return []
    with open(filename, "r") as f:
        reader = csv.reader(f, delimiter=',')
        for row in reader:
            features = np.zeros(features_num)
            features[Logon.feature_shift + int(row[Logon.col["activity"]])] = 1
            logon_features.append((*get_data_pc(row), features, None))
    return logon_features


def merge_features(user, *args):
    all_data = list(itertools.chain(*args))
    all_data.sort(key=lambda x: x[0])
    my_computer = np.array([int(t[1] in user_computer[user]) for t in all_data])
    return [t[0]//600 for t in all_data], np.hstack((my_computer[..., None], np.vstack([t[2] for t in all_data]))), [t[3] for t in all_data]

In [6]:
user = 'AAP0352'
date = datetime(2010, 1, 5)
features = []
for f in (device_events, email_events, file_events, http_events, logon_events):
    features.append(f(user, date))
    print(features[-1][:3])
    print("---")
merged = merge_features(user, *features)
merged[1].shape

[(29531, 6708, array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.]), None), (31754, 6708, array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.]), None), (55824, 6708, array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.]), None)]
---
[(39165, 6708, array([0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00, 2.8293e+04,
       1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00]), 'It was expected that the building would serve a dual purpose as a museum and as the presidential meeting place for state visitors. Besides architecture'), (39489, 6708, array([0.000000e+00, 0.000000e+00, 0.000000e+00, 1.000000e+00,
       3.475206e+06, 3.000000e+00, 0.000000e+00, 0.000000e+00,
       0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
       0.000000e+00, 0.0

(102, 20)

## Предобработка контента
Используется FastText

### Обучение модели

In [None]:
from gensim.models import FastText
from gensim.parsing.preprocessing  import preprocess_string, preprocess_documents

def sent_gen(max_count=-1):
    for d, _, files in tqdm(os.walk(ds_dir)):
        for filename in filter(has_content, files):
            with open(os.path.join(d, filename), "r") as f:
                reader = csv.reader(f, delimiter=',')
                sents = preprocess_documents((row[-1].replace("/", " ").replace("_", " ") for row in reader))
                for s in sents:
                    yield s
        max_count -= 1
        if max_count == 0:
            break 
# for idx, t in enumerate(sent_gen()):
#     print(t)
#     if idx == 10:
#         break
                    
model4 = FastText(size=fasttext_vec_len, window=3, min_count=1)
model4.build_vocab(sentences = sent_gen(max_count = 2))
model4.save(fasttext_model_path)
total_examples = model4.corpus_count
model4.train(sentences=sent_gen(), total_examples=total_examples, epochs=5)

model4.save(fasttext_model_path)

In [None]:
model4.save(fasttext_model_path)

## Модель 

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

NetConfig = namedtuple('NetConfig', ['operation_len', 'content_len', 'lin0_size', 'encoder_dropout_rate', 'pos_dropout_rate', 'nhead', 'nlayers', 'nhid'])
default_config = NetConfig(
    operation_len = merged[1].shape[1],
    content_len = fasttext_vec_len, 
    lin0_size = 32,
    encoder_dropout_rate = 0.4, 
    pos_dropout_rate = 0.1,
    nhead = 2, 
    dlayers = 4, 
    nhid = 32
)


class MyBERTModel(nn.Module):
    def __init__(self, config):#, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        self._linear0 = nn.Linear(config.operation_len + config.content_len, config.lin0_size)
        self._pos_encoder = PositionalEncoding(config.lin0_size, config.dropout_rate)
        self._layer_nm = nn.LayerNorm(config.lin0_size)
        encoder_layers = nn.TransformerEncoderLayer(config.lin0_size, config.nhead, config.nhid, dropout_rate)
        self._transformer_encoder = nn.TransformerEncoder(encoder_layers, nlayers)
        self.init_weights()

    def forward(self, operations, content, src_mask, timestamps=None):
        x = torch.cat((operations, operations), 1)
        x = self._linear0(x)
        x = self._pos_encoder(x)
        output = self.transformer_encoder(x, src_mask)
        return output
    
    
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, config, max_len=1000):
        super(PositionalEncoding, self).__init__()
        self._dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self._dropout(x)
    
    
class ClassifyModel(nn.Module):
    def __init__(self, config):
        super(TransformerModel, self).__init__()
        self._BERT = BERTmodel(config)
        self._ln = nn.Linear(config.lin0_size, 2)
        self._softmax = nn.Softmax(dim=1)        
    
    def forward(*args):
        x = self._BERT(*args)
        x = self._ln(x)
        x = self._softmax(x)
        return x

    
classify_criterion = nn.CrossEntropyLoss()