<a href="https://colab.research.google.com/github/MahjabeenTahir/MIDAS/blob/master/midas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so later cells can read files stored
# there (Colab-only: google.colab is unavailable outside that environment).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Put the pyMIDAS folder on Drive on the import path.
# NOTE(review): presumably this is where the `midas` package imported below
# lives -- confirm the folder layout.
import sys
sys.path.append('/content/drive/MyDrive/thesis/pyMIDAS/')

In [9]:
from midas import midas
import pandas as pd
import numpy as np

In [11]:
# Load the edge list and label its columns 'src', 'dst', 'timestamp'.
# Because names= is given without header=, the first line of the file is read
# as data rather than as a header.
# NOTE(review): assumes the CSV holds exactly these three columns, in this
# order -- confirm against the file.
data = pd.read_csv("/content/drive/MyDrive/thesis/pyMIDAS/UNSW-NB15_1.csv", names=['src', 'dst', 'timestamp'])


  exec(code_obj, self.user_global_ns, self.user_ns)


In [12]:
from midas.anom import midas, midasR

In [14]:
import math

import numpy as np
from tqdm import tqdm

from midas.edgehash import Edgehash
from midas.nodehash import Nodehash

# Names exported by `from <module> import *`: the two scoring entry points.
__all__ = [
    'midas',
    'midasR',
]

def counts_to_anom(tot, cur, cur_t):
    """Score how far the current count exceeds its historical per-tick mean.

    The expected count per tick is ``tot / cur_t``.  Only an excess over that
    mean contributes: a current count at or below the mean scores 0.  The
    squared excess is divided by the mean and, in a second term, by the mean
    times the number of earlier ticks (at least 1).

    Args:
        tot: Count accumulated over all ticks so far.
        cur: Count for the current tick.
        cur_t: Current tick number, used directly as the divisor of the mean.

    Returns:
        The anomaly score.  Returns 0.0 when ``tot`` or ``cur_t`` is not
        positive, because there is then no positive mean to compare against
        and the formula would divide by zero.
    """
    # Without this guard a zero total (or tick 0) produces NaN/inf or a
    # ZeroDivisionError, depending on whether NumPy or Python scalars arrive.
    if tot <= 0 or cur_t <= 0:
        return 0.0

    cur_mean = tot / cur_t
    excess = max(0, cur - cur_mean)
    sqerr = excess * excess
    return sqerr / cur_mean + sqerr / (cur_mean * max(1, cur_t - 1))

def getRowInfo(row, anom_score, cur_count, total_count, cur_t):
    """Record one edge in both counters and append its anomaly score.

    Args:
        row: Mapping-like record; ``row["src"]`` and ``row["dst"]`` are
            converted with ``int``.
        anom_score: List that receives the score for this edge (mutated).
        cur_count: Edge counter for the current tick (mutated).
        total_count: Edge counter over all ticks (mutated).
        cur_t: Current tick number; divides the total to get the mean.

    Returns:
        None.  The score is appended to ``anom_score``.
    """
    edge = (int(row["src"]), int(row["dst"]))

    # Count the edge first so the score includes the edge being scored.
    for counter in (cur_count, total_count):
        counter.insert(*edge, 1)

    expected = total_count.get_count(*edge) / cur_t
    deviation_sq = np.power(cur_count.get_count(*edge) - expected, 2)

    if cur_t == 1:
        # On the first tick there is no history to deviate from.
        score = 0
    else:
        score = deviation_sq / expected + deviation_sq / (expected * (cur_t - 1))

    # A NaN score (e.g. from 0/0) is reported as "not anomalous".
    if math.isnan(score):
        score = 0
    anom_score.append(score)

def midas(df, num_rows, num_buckets):
    """Compute a per-edge anomaly score for every row of ``df``.

    Rows are processed one timestamp group at a time, in the key order that
    ``DataFrame.groupby`` yields (sorted by default), and the per-tick edge
    counter is reset after each group.  Scores are therefore returned in that
    grouped order, which matches the row order of ``df`` only when ``df`` is
    already sorted by ``timestamp``.

    Args:
        df: DataFrame with ``src``, ``dst`` and ``timestamp`` columns.  The
            timestamp value itself is used as the tick number in the score.
        num_rows: Number of hash rows for each edge counter.
        num_buckets: Number of buckets per hash row.

    Returns:
        list: One score per processed row.
    """
    m = df.src.max()
    cur_count = Edgehash(num_rows, num_buckets, m)
    total_count = Edgehash(num_rows, num_buckets, m)
    anom_score = []

    # Group by the bare column name rather than a one-element list: with a
    # list, newer pandas versions hand back 1-tuples as group keys, which
    # would turn cur_t into a tuple and break the arithmetic in getRowInfo.
    for cur_t, curr_df in tqdm(df.groupby("timestamp")):
        # Plain loop instead of DataFrame.apply(axis=1): the call is made only
        # for its side effects, and apply builds a Series per row (and in old
        # pandas could invoke the function twice on the first row).
        for src, dst in zip(curr_df["src"], curr_df["dst"]):
            getRowInfo(
                {"src": src, "dst": dst},
                anom_score,
                cur_count,
                total_count,
                cur_t,
            )
        cur_count.clear()
    return anom_score


def midasR(src, dst, times, num_rows, num_buckets, factor):
    """Score a stream of edges using decayed edge and node counts.

    For each edge three scores are computed with ``counts_to_anom`` -- one for
    the edge, one for its source node and one for its destination node -- and
    the largest is reported as ``log(1 + score)``.  Whenever the timestamp
    advances, the "current" counters are multiplied by ``factor`` instead of
    being cleared; the total counters are never decayed.

    Args:
        src: Array of source node ids (needs ``.shape`` and integer indexing).
        dst: Array of destination node ids, same length as ``src``.
        times: Array of timestamps, same length as ``src``.
            NOTE(review): assumed non-decreasing; the tick only moves forward
            when ``times[i]`` exceeds the current tick -- confirm with caller.
        num_rows: Number of hash rows for every counter.
        num_buckets: Number of buckets per hash row.
        factor: Multiplier applied to the current counters on each new tick.

    Returns:
        numpy.ndarray: One score per edge, in input order.  Empty input gives
        an empty array.
    """
    num_entries = src.shape[0]
    anom_score = np.zeros(num_entries)
    if num_entries == 0:
        # np.max raises on a zero-size array; nothing to score anyway.
        return anom_score

    m = np.max(src)
    cur_count = Edgehash(num_rows, num_buckets, m)
    total_count = Edgehash(num_rows, num_buckets, m)
    src_score = Nodehash(num_rows, num_buckets)
    dst_score = Nodehash(num_rows, num_buckets)
    src_total = Nodehash(num_rows, num_buckets)
    dst_total = Nodehash(num_rows, num_buckets)
    cur_t = 1

    for i in range(num_entries):
        if i == 0 or times[i] > cur_t:
            # New tick: decay (rather than reset) the current counters.
            cur_count.lower(factor)
            src_score.lower(factor)
            dst_score.lower(factor)
            cur_t = times[i]

        cur_src = src[i]
        cur_dst = dst[i]
        # Count the edge before scoring so the score includes it.
        cur_count.insert(cur_src, cur_dst, 1)
        total_count.insert(cur_src, cur_dst, 1)
        src_score.insert(cur_src, 1)
        dst_score.insert(cur_dst, 1)
        src_total.insert(cur_src, 1)
        dst_total.insert(cur_dst, 1)

        cur_score = counts_to_anom(
            total_count.get_count(cur_src, cur_dst),
            cur_count.get_count(cur_src, cur_dst),
            cur_t,
        )
        cur_score_src = counts_to_anom(
            src_total.get_count(cur_src), src_score.get_count(cur_src), cur_t
        )
        cur_score_dst = counts_to_anom(
            dst_total.get_count(cur_dst), dst_score.get_count(cur_dst), cur_t
        )
        combined_score = max(cur_score_src, cur_score_dst, cur_score)
        anom_score[i] = np.log(1 + combined_score)

    return anom_score

In [15]:
import numpy as np


class Edgehash:
    """Count-min-sketch counter keyed by (source, destination) pairs.

    The table has ``r`` rows of ``b`` buckets.  Each row has its own random
    linear hash; an insert adds the weight to one bucket per row, and a query
    returns the minimum of those buckets.

    Attributes are plain and public: ``num_rows``, ``num_buckets``, ``m``
    (multiplier that folds a pair into a single key), ``hash_a`` / ``hash_b``
    (per-row hash coefficients) and ``count`` (the float table).
    """

    def __init__(self, r, b, m0):
        """Create an empty table with ``r`` rows, ``b`` buckets and pair multiplier ``m0``."""
        self.num_rows = r
        self.num_buckets = b
        self.m = m0
        # Coefficients come from NumPy's global RNG: multipliers in [1, b),
        # offsets in [0, b).
        self.hash_a = np.random.randint(low=1, high=b, size=r)
        self.hash_b = np.random.randint(low=0, high=b, size=r)
        self.count = np.zeros((self.num_rows, self.num_buckets))

    def hash(self, a, b, i):
        """Return the bucket index in ``[0, num_buckets)`` for pair ``(a, b)`` in row ``i``.

        Everything is converted to Python ``int`` first (truncating floats), so
        the arithmetic is exact: NumPy int64 products could silently overflow
        for large ids, and a float ``m`` or id would give a float bucket that
        cannot be used as an index.  Python's ``%`` with a positive modulus is
        never negative, so no sign correction is needed.
        """
        key = int(a) + int(self.m) * int(b)
        return (key * int(self.hash_a[i]) + int(self.hash_b[i])) % int(self.num_buckets)

    def insert(self, a, b, weight):
        """Add ``weight`` to the bucket of pair ``(a, b)`` in every row."""
        for i in range(self.num_rows):
            self.count[i, self.hash(a, b, i)] += weight

    def get_count(self, a, b):
        """Return the smallest bucket value for pair ``(a, b)`` across all rows."""
        return min(self.count[i, self.hash(a, b, i)] for i in range(self.num_rows))

    def clear(self):
        """Replace the table with a fresh all-zero one."""
        self.count = np.zeros((self.num_rows, self.num_buckets))

    def lower(self, factor):
        """Multiply every bucket by ``factor``."""
        self.count = self.count * factor
In [16]:
import numpy as np


class Nodehash:
    """Count-min-sketch counter keyed by a single node id.

    The table has ``r`` rows of ``b`` buckets.  Each row has its own random
    linear hash; an insert adds the weight to one bucket per row, and a query
    returns the minimum of those buckets.

    Attributes are plain and public: ``num_rows``, ``num_buckets``,
    ``hash_a`` / ``hash_b`` (per-row hash coefficients) and ``count`` (the
    float table).
    """

    def __init__(self, r, b):
        """Create an empty table with ``r`` rows and ``b`` buckets."""
        self.num_rows = r
        self.num_buckets = b
        # Coefficients come from NumPy's global RNG: multipliers in [1, b),
        # offsets in [0, b).
        self.hash_a = np.random.randint(low=1, high=b, size=r)
        self.hash_b = np.random.randint(low=0, high=b, size=r)
        self.count = np.zeros((self.num_rows, self.num_buckets))

    def hash(self, a, i):
        """Return the bucket index in ``[0, num_buckets)`` for node ``a`` in row ``i``.

        Operands are converted to Python ``int`` first (truncating floats), so
        a float id no longer yields a float bucket that cannot index the
        table, and large ids cannot overflow int64.  Python's ``%`` with a
        positive modulus is never negative, so no sign correction is needed.
        """
        return (int(a) * int(self.hash_a[i]) + int(self.hash_b[i])) % int(self.num_buckets)

    def insert(self, a, weight):
        """Add ``weight`` to the bucket of node ``a`` in every row."""
        for i in range(self.num_rows):
            self.count[i, self.hash(a, i)] += weight

    def get_count(self, a):
        """Return the smallest bucket value for node ``a`` across all rows."""
        return min(self.count[i, self.hash(a, i)] for i in range(self.num_rows))

    def clear(self):
        """Replace the table with a fresh all-zero one."""
        self.count = np.zeros((self.num_rows, self.num_buckets))

    def lower(self, factor):
        """Multiply every bucket by ``factor``."""
        self.count = self.count * factor