#Online Learning

In [None]:
#Handles creme installation
!pip install creme

In [None]:
#Takes care of basic imports
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
import sklearn
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers.core import Dense, Activation
from tensorflow.python.keras.callbacks import EarlyStopping
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.models import load_model

def encode_text_dummy(df, name):
    """One-hot encode column ``name`` of ``df`` in place.

    One indicator column is appended per distinct value, labelled
    ``"<name>-<value>"``, and the source column is then removed.
    Nothing is returned; ``df`` itself is modified.
    """
    indicators = pd.get_dummies(df[name])
    for category, indicator in indicators.items():
        df["{}-{}".format(name, category)] = indicator
    df.drop(columns=name, inplace=True)

def encode_text_index(df, name):
    """Replace column ``name`` of ``df`` with integer class codes, in place.

    A fresh ``LabelEncoder`` is fitted on the column and its output is
    written back over the column.

    Returns:
        The encoder's ``classes_`` array (the original value behind each
        integer code).
    """
    encoder = preprocessing.LabelEncoder()
    codes = encoder.fit_transform(df[name])
    df[name] = codes
    return encoder.classes_

def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Standardise column ``name`` of ``df`` in place as ``(x - mean) / sd``.

    Args:
        df: DataFrame to modify.
        name: Column to standardise.
        mean: Centre to subtract; the column mean is used when ``None``.
        sd: Scale to divide by; the column standard deviation
            (``Series.std``) is used when ``None``.
    """
    column = df[name]
    center = column.mean() if mean is None else mean
    spread = column.std() if sd is None else sd
    df[name] = (column - center) / spread

def missing_median(df, name):
    """Fill missing values of column ``name`` with that column's median, in place."""
    df[name] = df[name].fillna(df[name].median())

def missing_default(df, name, default_value):
    """Fill missing values of column ``name`` with ``default_value``, in place."""
    filled = df[name].fillna(default_value)
    df[name] = filled

def remove_outliers(df, name, sd):
    """Drop, in place, the rows of ``df`` whose ``name`` value is an outlier.

    A row is dropped when its value lies at least ``sd`` standard
    deviations away from the column mean.
    """
    column = df[name]
    distance = (column - column.mean()).abs()
    is_outlier = distance >= (sd * column.std())
    df.drop(index=df.index[is_outlier], inplace=True)

def to_xy(df, target):
    """Split ``df`` into float32 feature and target arrays.

    Every column other than ``target`` becomes a feature. When the target
    column has an int64/int32 dtype it is one-hot encoded (one column per
    distinct value); otherwise it is returned as a flat array.

    Returns:
        Tuple ``(features, labels)``, both NumPy arrays of dtype float32.
    """
    feature_names = [column for column in df.columns if column != target]
    features = df[feature_names].values

    target_type = df[target].dtypes
    if hasattr(target_type, '__iter__'):
        target_type = target_type[0]

    if target_type in (np.int64, np.int32):
        # Integer target: one indicator column per distinct value.
        labels = pd.get_dummies(df[target]).to_numpy()
    else:
        # Any other dtype: the column is used as a single output.
        labels = df[target].to_numpy()
    return features.astype(np.float32), labels.astype(np.float32)


def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1, data_low=None, data_high=None):
    """Rescale column ``name`` of ``df`` linearly into a target range, in place.

    Values are mapped so that ``data_low`` becomes ``normalized_low`` and
    ``data_high`` becomes ``normalized_high``.

    Args:
        df: DataFrame to modify.
        name: Column to rescale.
        normalized_low: Lower bound of the output range.
        normalized_high: Upper bound of the output range.
        data_low: Input value mapped to ``normalized_low``; the column
            minimum is used when ``None``.
        data_high: Input value mapped to ``normalized_high``; the column
            maximum is used when ``None``.
    """
    # Each bound is defaulted independently. Previously data_high was only
    # filled in when data_low was None, so passing data_low alone failed
    # on ``None - data_low`` and passing data_high alone was overwritten.
    if data_low is None:
        data_low = df[name].min()
    if data_high is None:
        data_high = df[name].max()

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low

ENCODING = 'utf-8'

def expand_categories(values):
    """Format the value frequencies of ``values`` as a bracketed string.

    Each distinct value is rendered as ``"<value>:<percent>%"`` with the
    percentage rounded to two decimals, in ``value_counts`` order, and the
    entries are comma-joined inside square brackets.
    """
    counts = values.value_counts()
    total = float(len(values))
    entries = [
        "{}:{}%".format(category, round(100 * (counts[category] / total), 2))
        for category in counts.index
    ]
    return "[{}]".format(",".join(entries))

def analyze(filename):
    """Print a per-column summary of the CSV file ``filename``.

    The file is read with the module-level ``ENCODING``. For each column,
    either the number of distinct values and that number as a percentage
    of the row count is printed (more than 100 distinct values), or the
    full frequency breakdown from ``expand_categories`` is printed.
    """
    print()
    print("Analyzing: {}".format(filename))
    df = pd.read_csv(filename, encoding=ENCODING, low_memory=False)
    total = float(len(df))

    print("{} rows".format(int(total)))
    for name in df.columns:
        unique_count = len(df[name].unique())
        if unique_count > 100:
            print("--> {}: {} ({}%)".format(name, unique_count, int((unique_count / total) * 100)))
        else:
            # The breakdown is computed once; the original computed it a
            # second time and discarded the result.
            print("--> {}: {}".format(name, expand_categories(df[name])))

def analyze_df(dataframe):
    """Print a per-column summary of ``dataframe``.

    For each column, either the number of distinct values and that number
    as a percentage of the row count is printed (more than 100 distinct
    values), or the full frequency breakdown from ``expand_categories``.
    """
    print()
    df = dataframe
    total = float(len(df))
    print("{} rows".format(int(total)))
    # Label each line with the frame's own column name. The original looked
    # the label up by position in the global ``col_names`` (clamped at index
    # 48), which mislabels any frame whose columns differ from that list and
    # fails when ``col_names`` has not been defined yet.
    for name in df.columns:
        unique_count = len(df[name].unique())
        if unique_count > 100:
            print("--> {}: {} ({}%)".format(name, unique_count, int((unique_count / total) * 100)))
        else:
            print("--> {}: {}".format(name, expand_categories(df[name])))



##ONLINE LEARNING: UNSW-NB15 (ALL FEATURES)

##Preprocessing

In [None]:
# Column labels applied to every UNSW-NB15 CSV file. Note that
# 'ct_src_ ltm' contains a space; later cells must use that exact label.
col_names = [
    'srcip', 'sport', 'dstip', 'dsport','proto','state','dur','sbytes',
    'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'Sload', 'Dload',
    'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz',
    'trans_depth', 'res_bdy_len', 'Sjit', 'Djit', 'Stime', 'Ltime', 'Sintpkt',
    'Dintpkt', 'tcprtt', 'synack', 'ackdat', 'is_sm_ips_ports', 'ct_state_ttl',
    'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst',
    'ct_dst_ltm', 'ct_src_ ltm', 'ct_src_dport_ltm','ct_dst_sport_ltm',
    'ct_dst_src_ltm','attack_cat', 'label'
]

#Imports the dataset
# The four parts are concatenated directly so the per-file frames are not
# kept alive next to the combined frame.
# NOTE(review): header=0 makes pandas treat the first row of each file as a
# header and replace it with col_names - confirm the files really start with
# a header row, otherwise one record per file is discarded.
df = pd.concat(
    [
        pd.read_csv(csv_path, header=0, names=col_names)
        for csv_path in (
            'UNSW-NB15_1.csv',
            'UNSW-NB15_2.csv',
            'UNSW-NB15_3.csv',
            'UNSW-NB15_4.csv',
        )
    ],
    ignore_index=True,
)

# Trim surrounding whitespace from every text column.
df = df.apply(lambda column: column.str.strip() if column.dtype == "object" else column)

#Corrects the dataset
# Ports: '-' means "missing" and some values are hexadecimal strings
# ('0x...'). Results are assigned back to the frame instead of calling
# replace(..., inplace=True) on a column selection, which is a chained
# assignment and does not reliably modify ``df``.
for port_col in ('sport', 'dsport'):
    ports = df[port_col].replace('-', np.nan)
    df[port_col] = ports.map(
        lambda value: int(value, 16) if isinstance(value, str) and '0x' in value else value
    )
df['service'] = df['service'].replace('-', np.nan)

df['sport'] = pd.to_numeric(df['sport'])
df['dsport'] = pd.to_numeric(df['dsport'])
df['ct_ftp_cmd'] = pd.to_numeric(df['ct_ftp_cmd'])

# Missing text values become an explicit "None" category; rows without an
# attack category are labelled "Not an Attack".
for text_col in ('srcip', 'dstip', 'proto', 'state', 'service'):
    df[text_col] = df[text_col].fillna("None")
df['attack_cat'] = df['attack_cat'].fillna("Not an Attack")

#Time to encode
# Shuffle the rows and renumber the index from zero.
df = df.reindex(np.random.permutation(df.index)).reset_index(drop=True)

# One-hot encoded text columns. Order matters: the indicator columns are
# appended to the frame in this order.
for feature in ('srcip', 'dstip', 'proto', 'state', 'service'):
    encode_text_dummy(df, feature)

# Columns replaced by integer class codes.
for feature in ('sport', 'dsport', 'ct_ftp_cmd'):
    encode_text_index(df, feature)

# Columns standardised to z-scores. 'Stime', 'Ltime', 'is_sm_ips_ports'
# and 'is_ftp_login' are not encoded in this cell.
for feature in (
    'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss',
    'Sload', 'Dload', 'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb',
    'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len', 'Sjit', 'Djit',
    'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat',
    'ct_state_ttl', 'ct_flw_http_mthd',
    'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
    'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
):
    encode_numeric_zscore(df, feature)

# Binary task: the attack category is not used, only 'label'.
df.drop(columns='attack_cat', inplace=True)
# For now, just drop NA's (rows with missing values)
df.dropna(axis=0, inplace=True)
x_columns = df.columns.drop('label')
x = df[x_columns]
y = df['label']

##Training

In [None]:
# NOTE: this rebinds the name `metrics`, which the imports cell bound to
# sklearn's module (`from sklearn import metrics`); from here on `metrics`
# refers to creme's metrics module.
from creme import stream, datasets, compose, linear_model, metrics
# Creates the sequential stream for online learning from the feature frame
# `x` and the label series `y`. The training cell unpacks each item of
# `dataset` as a (features, label) pair.
dataset = stream.iter_pandas(x, y)

In [None]:
# Test-then-train evaluation of an online logistic regression: every sample
# is first predicted, then used to update the model, and the prediction is
# fed to each running metric.
acc = metrics.Accuracy()
conf = metrics.ConfusionMatrix()
rec = metrics.Recall()
roc = metrics.ROCAUC()
f1 = metrics.F1()
pre = metrics.Precision()
classifier = linear_model.LogisticRegression()

# The loop uses its own names: `for x, y in dataset` would overwrite the
# feature frame `x` and label series `y` that the stream was built from, so
# the stream cell could not be re-run afterwards.
# NOTE(review): `roc` is updated with the hard prediction from predict_one;
# ROC AUC normally expects a score or probability - confirm against the
# creme documentation.
for n, (features, target) in enumerate(dataset):
    y_pred = classifier.predict_one(features)
    classifier.fit_one(features, target)
    for metric in (acc, conf, rec, roc, f1, pre):
        metric.update(target, y_pred)
    if n % 10000 == 0: #Limits output on console to avoid unexpected crashes
        print("[INFO] update {} - {}".format(n, acc))

print(acc)
print(conf)
print(rec)
print(roc)
print(f1)
print(pre)

# Plot the confusion matrix, indexed as conf[true label][predicted label].
labels = [0,1]
matrix = [[conf[actual][predicted] for predicted in labels] for actual in labels]
df_cm = pd.DataFrame(matrix, index=labels, columns=labels)
plt.figure(figsize = (5,3))
sn.heatmap(df_cm, annot=True, fmt='g')

##ONLINE LEARNING USING THE RELEVANT FEATURES IN UNSW-NB15

##Preprocessing

In [None]:
# Column labels applied to every UNSW-NB15 CSV file. Note that
# 'ct_src_ ltm' contains a space; later cells must use that exact label.
col_names = [
    'srcip', 'sport', 'dstip', 'dsport','proto','state','dur','sbytes',
    'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'Sload', 'Dload',
    'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz',
    'trans_depth', 'res_bdy_len', 'Sjit', 'Djit', 'Stime', 'Ltime', 'Sintpkt',
    'Dintpkt', 'tcprtt', 'synack', 'ackdat', 'is_sm_ips_ports', 'ct_state_ttl',
    'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst',
    'ct_dst_ltm', 'ct_src_ ltm', 'ct_src_dport_ltm','ct_dst_sport_ltm',
    'ct_dst_src_ltm','attack_cat', 'label'
]
#Imports the dataset
# The four parts are concatenated directly so the per-file frames are not
# kept alive next to the combined frame.
# NOTE(review): header=0 makes pandas treat the first row of each file as a
# header and replace it with col_names - confirm the files really start with
# a header row, otherwise one record per file is discarded.
df = pd.concat(
    [
        pd.read_csv(csv_path, header=0, names=col_names)
        for csv_path in (
            'UNSW-NB15_1.csv',
            'UNSW-NB15_2.csv',
            'UNSW-NB15_3.csv',
            'UNSW-NB15_4.csv',
        )
    ],
    ignore_index=True,
)

# Trim surrounding whitespace from every text column.
df = df.apply(lambda column: column.str.strip() if column.dtype == "object" else column)

# Ports: '-' means "missing" and some values are hexadecimal strings
# ('0x...'). Results are assigned back to the frame instead of calling
# replace(..., inplace=True) on a column selection, which is a chained
# assignment and does not reliably modify ``df``.
for port_col in ('sport', 'dsport'):
    ports = df[port_col].replace('-', np.nan)
    df[port_col] = ports.map(
        lambda value: int(value, 16) if isinstance(value, str) and '0x' in value else value
    )

df['sport'] = pd.to_numeric(df['sport'])
df['dsport'] = pd.to_numeric(df['dsport'])
df['ct_ftp_cmd'] = pd.to_numeric(df['ct_ftp_cmd'])

In [None]:
#Encoding the dataset
# Shuffle the rows and renumber the index from zero.
df = df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)

# Address and port columns are not used in this experiment.
df.drop(['srcip', 'sport', 'dstip', 'dsport'], axis=1, inplace=True)

# Columns replaced by integer class codes.
for feature in ('proto', 'state', 'service', 'is_ftp_login', 'ct_ftp_cmd'):
    encode_text_index(df, feature)

# Columns standardised to z-scores.
# 'ct_src_ ltm' is spelled with the space used in col_names; the original
# call used 'ct_src_ltm', which is not a column and raised KeyError.
for feature in (
    'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss',
    'Sload', 'Dload', 'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb',
    'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len', 'Sjit', 'Djit',
    'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat',
    'ct_state_ttl', 'ct_flw_http_mthd',
    'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
    'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
):
    encode_numeric_zscore(df, feature)

# Binary task: the attack category is not used, only 'label'.
df.drop('attack_cat', axis=1, inplace=True)
df.dropna(inplace=True, axis=0) # For now, just drop NA's (rows with missing values)

In [None]:
# Feature subset used for the "relevant features" experiment.
# The original cell first called to_xy(df, 'label') and immediately
# overwrote both results, so that full-frame conversion was dropped.
# 'ct_src_ ltm' was also listed twice; the duplicate is removed.
# NOTE(review): the second entry may have been meant to be a different
# feature - confirm the intended list.
x_columns = ['sttl', 'Dload', 'Spkts', 'sloss', 'dloss', 'ct_src_ ltm',
             'ct_srv_dst']
x = df[x_columns]
y = df['label']

## Training

In [None]:
# NOTE: this binds `metrics` to creme's metrics module, replacing the
# sklearn module bound by `from sklearn import metrics` in the imports cell.
from creme import stream, datasets, compose, linear_model, metrics

# Creates the needed sequential stream from the selected feature frame `x`
# and label series `y`. The training cell unpacks each item of `dataset`
# as a (features, label) pair.
dataset = stream.iter_pandas(x, y)

In [None]:
from creme import multiclass
#Trains the model, gets metrics and confusion matrix
# Test-then-train evaluation: every sample is first predicted, then used to
# update the model, and the prediction is fed to each running metric.
acc = metrics.Accuracy()
conf = metrics.ConfusionMatrix()
rec = metrics.Recall()
roc = metrics.ROCAUC()
f1 = metrics.F1()
pre = metrics.Precision()
classifier = linear_model.LogisticRegression()

# The loop uses its own names: `for x, y in dataset` would overwrite the
# feature frame `x` and label series `y` that the stream was built from, so
# the stream cell could not be re-run afterwards.
# NOTE(review): `roc` is updated with the hard prediction from predict_one;
# ROC AUC normally expects a score or probability - confirm against the
# creme documentation.
for n, (features, target) in enumerate(dataset):
    y_pred = classifier.predict_one(features)
    classifier.fit_one(features, target)
    for metric in (acc, conf, rec, roc, f1, pre):
        metric.update(target, y_pred)
    if n % 10000 == 0: #Limits output on screen so as to not crash the shell
        print("INFO update {} - {} - {} - {}".format(n, acc, f1, roc))

print(acc)
print(conf)
print(rec)
print(roc)
print(f1)
print(pre)

# Plot the confusion matrix, indexed as conf[true label][predicted label].
labels = [0,1]
matrix = [[conf[actual][predicted] for predicted in labels] for actual in labels]
df_cm = pd.DataFrame(matrix, index=labels, columns=labels)
plt.figure(figsize = (5,3))
sn.heatmap(df_cm, annot=True, fmt='g')

## Multiclass Online Learning UNSW-NB15

##Preprocessing

In [None]:
# Column labels applied to every UNSW-NB15 CSV file. Note that
# 'ct_src_ ltm' contains a space; later cells must use that exact label.
col_names = [
    'srcip', 'sport', 'dstip', 'dsport','proto','state','dur','sbytes',
    'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'Sload', 'Dload',
    'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz',
    'trans_depth', 'res_bdy_len', 'Sjit', 'Djit', 'Stime', 'Ltime', 'Sintpkt',
    'Dintpkt', 'tcprtt', 'synack', 'ackdat', 'is_sm_ips_ports', 'ct_state_ttl',
    'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst',
    'ct_dst_ltm', 'ct_src_ ltm', 'ct_src_dport_ltm','ct_dst_sport_ltm',
    'ct_dst_src_ltm','attack_cat', 'label'
]
#Imports the dataset
# The four parts are concatenated directly so the per-file frames are not
# kept alive next to the combined frame.
# NOTE(review): header=0 makes pandas treat the first row of each file as a
# header and replace it with col_names - confirm the files really start with
# a header row, otherwise one record per file is discarded.
df = pd.concat(
    [
        pd.read_csv(csv_path, header=0, names=col_names)
        for csv_path in (
            'UNSW-NB15_1.csv',
            'UNSW-NB15_2.csv',
            'UNSW-NB15_3.csv',
            'UNSW-NB15_4.csv',
        )
    ],
    ignore_index=True,
)

# Trim surrounding whitespace from every text column.
df = df.apply(lambda column: column.str.strip() if column.dtype == "object" else column)

# Ports: '-' means "missing" and some values are hexadecimal strings
# ('0x...'). Results are assigned back to the frame instead of calling
# replace(..., inplace=True) on a column selection, which is a chained
# assignment and does not reliably modify ``df``.
for port_col in ('sport', 'dsport'):
    ports = df[port_col].replace('-', np.nan)
    df[port_col] = ports.map(
        lambda value: int(value, 16) if isinstance(value, str) and '0x' in value else value
    )

df['sport'] = pd.to_numeric(df['sport'])
df['dsport'] = pd.to_numeric(df['dsport'])
df['ct_ftp_cmd'] = pd.to_numeric(df['ct_ftp_cmd'])
print("done")

#Time to encode
# Shuffle the rows and renumber the index from zero.
df = df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)

# Address and port columns are not used in this experiment.
df.drop(['srcip', 'sport', 'dstip', 'dsport'], axis=1, inplace=True)

# Columns replaced by integer class codes.
for feature in ('proto', 'state', 'service', 'is_ftp_login', 'ct_ftp_cmd'):
    encode_text_index(df, feature)

# Columns standardised to z-scores.
# 'ct_src_ ltm' is spelled with the space used in col_names; the original
# call used 'ct_src_ltm', which is not a column and raised KeyError.
for feature in (
    'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss',
    'Sload', 'Dload', 'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb',
    'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len', 'Sjit', 'Djit',
    'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat',
    'ct_state_ttl', 'ct_flw_http_mthd',
    'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
    'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
):
    encode_numeric_zscore(df, feature)

# Multiclass task: rows without an attack category get an explicit class.
# The result is assigned back rather than using fillna(inplace=True) on a
# column selection, which does not reliably modify ``df``.
df['attack_cat'] = df['attack_cat'].fillna("Not an Attack")
# The binary label is not used when predicting the attack category.
df.drop('label', axis=1, inplace=True)


In [None]:
# Feature subset for the multiclass experiment.
# The column is labelled 'ct_src_ ltm' (with a space) in col_names; the
# original list used 'ct_src_ltm', which is not a column, and listed it
# twice. The unused LabelEncoder instance was dropped.
# NOTE(review): the duplicated entry may have been meant to be a different
# feature - confirm the intended list.
x_columns = ['sttl', 'Dload', 'Spkts', 'sloss', 'dloss', 'ct_src_ ltm',
             'ct_srv_dst']
df['attack_cat'] = df['attack_cat'].astype('category')
x = df[x_columns]
y = df['attack_cat']

In [None]:
# NOTE: this binds `metrics` to creme's metrics module, replacing the
# sklearn module bound by `from sklearn import metrics` in the imports cell.
from creme import stream, datasets, compose, linear_model, metrics

# Creates the sequential stream needed, from the selected feature frame `x`
# and the attack-category series `y`. The training cell unpacks each item
# of `dataset` as a (features, label) pair.
dataset = stream.iter_pandas(x, y)

## Training

In [None]:
from creme import multiclass
# Test-then-train evaluation of a one-vs-rest logistic regression on the
# attack category: every sample is first predicted, then used to update the
# model. The unused ROCAUC instance of the original cell was dropped.
# NOTE(review): Recall, F1 and Precision are the same metric classes used
# for the binary experiments; confirm in the creme documentation that they
# are meaningful for string class labels or switch to multiclass variants.
acc = metrics.Accuracy()
conf = metrics.ConfusionMatrix()
rec = metrics.Recall()
f1 = metrics.F1()
pre = metrics.Precision()
classifier = multiclass.OneVsRestClassifier(linear_model.LogisticRegression())

# The loop uses its own names: `for x, y in dataset` would overwrite the
# feature frame `x` and label series `y` that the stream was built from, so
# the stream cell could not be re-run afterwards.
for n, (features, target) in enumerate(dataset):
    # The prediction is converted to a string before being compared with
    # the category label.
    y_pred = str(classifier.predict_one(features))
    classifier.fit_one(features, target)

    for metric in (acc, conf, rec, f1, pre):
        metric.update(target, y_pred)
    if n % 10000 == 0:
        print("INFO update {} - {} - {}".format(n, acc, f1))

print(acc)
print(rec)
print(f1)
print(pre)
print(conf)

## ONLINE LEARNING KDD-99



##Preprocessing

In [None]:
#Importing the dataset
# The file is read without a header row from a hard-coded path.
df = pd.read_csv(
    "/content/drive/My Drive/TFG/kddcup.data.corrected",
    header=None,
)
# Label the 42 columns; the last one, 'outcome', is the target.
df.columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files',
    'num_outbound_cmds', 'is_host_login', 'is_guest_login',
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
    'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate', 'outcome',
]

# Show the first five rows, then drop every column that contains a NaN.
display(df.head(5))
df.dropna(axis=1, inplace=True)

In [None]:
#Encoding the dataset
# One-hot encoded columns. Order matters: the indicator columns are
# appended to the frame in this order.
for feature in (
    'protocol_type', 'service', 'flag', 'land', 'logged_in',
    'is_host_login', 'is_guest_login',
):
    encode_text_dummy(df, feature)

# Columns standardised to z-scores.
for feature in (
    'duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot',
    'num_failed_logins', 'num_compromised', 'root_shell', 'su_attempted',
    'num_root', 'num_file_creations', 'num_shells', 'num_access_files',
    'num_outbound_cmds', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
):
    encode_numeric_zscore(df, feature)

# Drop every column that contains a NaN after encoding (presumably the
# columns whose standard deviation is zero - verify on the data).
df.dropna(axis=1, inplace=True)

#Ensures the use of a binary label
# 'normal.' becomes 0; every other outcome becomes 1.
df['outcome'] = np.where(df['outcome']=='normal.', 0, 1)
df['outcome'] = pd.to_numeric(df['outcome'])

x_columns = df.columns.drop('outcome')
x = df[x_columns]
y = df['outcome']

In [None]:
# NOTE: this binds `metrics` to creme's metrics module, replacing the
# sklearn module bound by `from sklearn import metrics` in the imports cell.
from creme import stream, datasets, compose, linear_model, metrics

# Creates the stream of data from the encoded feature frame `x` and the
# binary label series `y`. The training cell unpacks each item of `dataset`
# as a (features, label) pair.
dataset = stream.iter_pandas(x, y)

##Training

In [None]:
from creme import multiclass
# Test-then-train evaluation of an online logistic regression: every sample
# is first predicted, then used to update the model, and the prediction is
# fed to each running metric.
acc = metrics.Accuracy()
conf = metrics.ConfusionMatrix()
rec = metrics.Recall()
f1 = metrics.F1()
pre = metrics.Precision()
classifier = linear_model.LogisticRegression()

# The loop uses its own names: `for x, y in dataset` would overwrite the
# feature frame `x` and label series `y` that the stream was built from, so
# the stream cell could not be re-run afterwards.
for n, (features, target) in enumerate(dataset):
    y_pred = classifier.predict_one(features)
    classifier.fit_one(features, target)
    #Metrics
    for metric in (acc, conf, rec, f1, pre):
        metric.update(target, y_pred)
    if n % 10000 == 0:
        # The opening bracket was missing from this prefix ("INFO] update").
        print("[INFO] update {} - {}".format(n, acc))

print(acc)
print(conf)
print(rec)
print(f1)
print(pre)

##Multiclass Online Learning with KDD-99

##Preprocessing

In [None]:
#---------------------------------------------------------- MULTICLASS CLASSIFICATION FOR KDD-99 -------------------------------------------
# The file is read without a header row from a hard-coded path.
df = pd.read_csv(
    "/content/drive/My Drive/TFG/kddcup.data.corrected",
    header=None,
)
# Label the 42 columns; the last one, 'outcome', is the target.
df.columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files',
    'num_outbound_cmds', 'is_host_login', 'is_guest_login',
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
    'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate', 'outcome',
]

# Drop every column that contains a NaN.
df.dropna(axis=1, inplace=True)

# One-hot encoded columns. Order matters: the indicator columns are
# appended to the frame in this order.
for feature in (
    'protocol_type', 'service', 'flag', 'land', 'logged_in',
    'is_host_login', 'is_guest_login',
):
    encode_text_dummy(df, feature)

# Columns standardised to z-scores.
for feature in (
    'duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot',
    'num_failed_logins', 'num_compromised', 'root_shell', 'su_attempted',
    'num_root', 'num_file_creations', 'num_shells', 'num_access_files',
    'num_outbound_cmds', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
):
    encode_numeric_zscore(df, feature)

# Drop every column that contains a NaN after encoding (presumably the
# columns whose standard deviation is zero - verify on the data).
df.dropna(axis=1, inplace=True)

In [None]:
# Every column except 'outcome' is a feature; 'outcome' itself is kept
# unmodified as the multiclass target.
x_columns = df.columns.drop('outcome')
x = df[x_columns]
y = df['outcome']

In [None]:
# NOTE: this binds `metrics` to creme's metrics module, replacing the
# sklearn module bound by `from sklearn import metrics` in the imports cell.
from creme import stream, datasets, compose, linear_model, metrics

# Creates the necessary sequential stream from the feature frame `x` and
# the outcome series `y`. The training cell unpacks each item of `dataset`
# as a (features, label) pair.
dataset = stream.iter_pandas(x, y)

##Training

In [None]:
from creme import multiclass
#NOTE: The full KDD-99 dataset is huge. This will iterate through its entirety, but it's recommended to limit the for loop to a chosen number of iterations.
# Test-then-train evaluation of a one-vs-rest logistic regression on the
# outcome class: every sample is first predicted, then used to update the
# model.
# NOTE(review): Recall, F1 and Precision are the same metric classes used
# for the binary experiments; confirm in the creme documentation that they
# are meaningful for string class labels or switch to multiclass variants.
acc = metrics.Accuracy()
conf = metrics.ConfusionMatrix()
rec = metrics.Recall()
f1 = metrics.F1()
pre = metrics.Precision()
classifier = multiclass.OneVsRestClassifier(linear_model.LogisticRegression())

# The loop uses its own names: `for x, y in dataset` would overwrite the
# feature frame `x` and label series `y` that the stream was built from, so
# the stream cell could not be re-run afterwards.
for n, (features, target) in enumerate(dataset):
    # The prediction is converted to a string before being compared with
    # the outcome label.
    y_pred = str(classifier.predict_one(features))
    classifier.fit_one(features, target)
    for metric in (acc, conf, rec, f1, pre):
        metric.update(target, y_pred)
    if n % 10000 == 0:
        print("INFO update {} - {} - {}".format(n, acc, f1))

print(acc)
print(conf)
print(rec)
print(f1)
print(pre)