# TRANSFER LEARNING

In [None]:
#Run this cell first: it performs every import the rest of the notebook needs.
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
import io
import requests
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers.core import Dense, Activation, Reshape
from tensorflow.python.keras.callbacks import EarlyStopping
from tensorflow.python.keras.models import Sequential


def expand_categories(values):
    """Describe the relative frequency of each distinct value in a Series.

    Args:
        values: pandas Series to summarise.

    Returns:
        A string such as ``"[a:75.0%,b:25.0%]"``: one ``value:percent%`` entry
        per distinct value, in ``value_counts()`` order, with each percentage
        rounded to two decimals and taken relative to ``len(values)``.
    """
    counts = values.value_counts()
    total = float(len(values))
    # Pair every label with its count positionally; the counts stay NumPy
    # scalars so the rounding matches a label-based lookup exactly.
    entries = [
        "{}:{}%".format(label, round(100 * (count / total), 2))
        for label, count in zip(counts.index, counts.to_numpy())
    ]
    return "[{}]".format(",".join(entries))
        
def analyze(df):
    """Print a per-column cardinality report for a DataFrame.

    Prints a blank line, the row count, and then one line per column:
    columns with more than 100 distinct values show the distinct count and
    its share of the row count; all other columns show the full value
    distribution produced by ``expand_categories``.

    Args:
        df: pandas DataFrame to describe.

    Returns:
        None. The report is written to standard output.
    """
    print()
    total = float(len(df))
    print("{} rows".format(int(total)))
    for col in df.columns.values:
        unique_count = len(df[col].unique())
        if unique_count > 100:
            share = int((unique_count / total) * 100)
            print("** {}:{} ({}%)".format(col, unique_count, share))
        else:
            # The distribution is computed once; the original evaluated it a
            # second time and threw the result away.
            print("** {}:{}".format(col, expand_categories(df[col])))

def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Replace column ``name`` of ``df`` with its z-score, in place.

    Args:
        df: DataFrame that owns the column; it is modified in place.
        name: label of the numeric column to standardise.
        mean: centre to subtract; defaults to the column mean.
        sd: scale to divide by; defaults to the column standard deviation
            (``Series.std()``).

    Returns:
        None.

    Note:
        A zero standard deviation is not guarded against, so a constant
        column turns into NaN values.
    """
    column = df[name]
    center = column.mean() if mean is None else mean
    spread = column.std() if sd is None else sd
    df[name] = (column - center) / spread
    
def encode_text_dummy(df, name):
    """One-hot encode column ``name`` of ``df`` in place.

    One indicator column named ``"<name>-<value>"`` is appended to ``df`` for
    every distinct value reported by ``pd.get_dummies``; the source column is
    then removed.

    Args:
        df: DataFrame to modify in place.
        name: label of the column to expand.

    Returns:
        None.
    """
    indicators = pd.get_dummies(df[name])
    for category, flags in indicators.items():
        df[f"{name}-{category}"] = flags
    df.drop(columns=name, inplace=True)

def encode_text_index(df, name):
    """Replace column ``name`` of ``df`` with integer class codes, in place.

    Args:
        df: DataFrame to modify in place.
        name: label of the column to encode.

    Returns:
        The ``classes_`` array of the fitted ``LabelEncoder``; the code stored
        for a value is its position in that array.
    """
    encoder = preprocessing.LabelEncoder()
    codes = encoder.fit_transform(df[name])
    df[name] = codes
    return encoder.classes_

def to_xy(df, target):
    """Split ``df`` into float32 feature and target arrays.

    Args:
        df: DataFrame holding the predictors and the target column.
        target: label of the target column.

    Returns:
        A tuple ``(x, y)`` of float32 NumPy arrays. ``x`` holds every column
        except ``target``. When the target dtype is int64 or int32, ``y`` is
        the one-hot matrix from ``pd.get_dummies``; otherwise ``y`` is the
        target column itself.
    """
    feature_names = [column for column in df.columns if column != target]
    features = df[feature_names].values

    # ``dtypes`` is iterable when the selection yields several columns; in
    # that case the first entry decides.
    kind = df[target].dtypes
    if hasattr(kind, '__iter__'):
        kind = kind[0]

    if kind == np.int64 or kind == np.int32:
        labels = pd.get_dummies(df[target])
    else:
        labels = df[target]
    return features.astype(np.float32), labels.to_numpy().astype(np.float32)

# TRANSFER LEARNING: KDD-99 TO UNSW-NB15

In [None]:
#----------------------------------------------------------IMPORTING THE KDD99 DATASET-------------------------------------------
# Reads the KDD-99 file from a fixed Google Drive path. The file is read
# without a header row, so the 42 column names are assigned by hand below.
df = pd.read_csv("/content/drive/My Drive/TFG/kddcup.data.corrected", header=None)
df.columns = [
    'duration', 'protocol_type', 'service','flag','src_bytes','dst_bytes',
    'land','wrong_fragment','urgent','hot','num_failed_logins','logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login',
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'outcome'
]

# axis=1 removes every COLUMN that contains at least one NaN (rows are kept).
df.dropna(inplace=True, axis=1)
# `display` is not imported in this file; presumably the IPython/Jupyter built-in.
display(df[0:5])

##Preprocessing

In [None]:
#----------------------------------------------------------DATA PREPROCESSING-------------------------------------------
# Numeric columns: standardised in place (column position is unchanged).
kdd_zscore_columns = [
    'duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot',
    'num_failed_logins', 'num_compromised', 'root_shell', 'su_attempted',
    'num_root', 'num_file_creations', 'num_shells', 'num_access_files',
    'num_outbound_cmds', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
]
# Categorical / flag columns: one-hot encoded. Each call appends its indicator
# columns to the end of df, so this order fixes the final feature layout.
kdd_dummy_columns = [
    'protocol_type', 'service', 'flag', 'land', 'logged_in',
    'is_host_login', 'is_guest_login',
]
for column in kdd_zscore_columns:
    encode_numeric_zscore(df, column)
for column in kdd_dummy_columns:
    encode_text_dummy(df, column)

# Binary target: 0 for 'normal.' traffic, 1 for every other outcome.
# np.where already yields an integer array, so no further conversion is needed.
df['outcome'] = np.where(df['outcome']=='normal.', 0, 1)

# This column-wise dropna is required even though it does not look like it:
# a column with zero standard deviation becomes all-NaN in
# encode_numeric_zscore (division by zero) and has to be removed here.
df.dropna(inplace=True, axis=1)

x_columns = df.columns.drop('outcome')
# Build x / y through to_xy so both arrays are float32. Taking `.values` of a
# frame that mixes float columns with boolean indicator columns produces an
# object-dtype array on recent pandas, which Keras cannot convert to a tensor.
# to_xy keeps the same column order and one-hot encodes the integer target.
x, y = to_xy(df, 'outcome')
print("Done")


##Training

In [None]:
# Hold out 25% of the rows; the same hold-out doubles as the validation set
# watched by early stopping below.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42)

# Stop once val_loss has failed to improve by at least 1e-3 for 5 epochs.
monitor = EarlyStopping(
    monitor='val_loss',
    min_delta=1e-3,
    patience=5,
    verbose=1,
    mode='auto',
)

# Create neural net
# NOTE(review): the single-unit linear layer funnels everything through one
# value before the softmax. Later cells read model.layers[0..4], so the layer
# count must stay at five if this is revisited.
model = Sequential([
    Dense(10, input_dim=x.shape[1], activation='relu'),
    Dense(50, activation='relu'),
    Dense(10, activation='relu'),
    Dense(1, kernel_initializer='normal'),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    callbacks=[monitor],
    verbose=1,
    epochs=1000,
)

In [None]:
# Obtains measurements for `model` on the held-out split.
proba = model.predict(x_test)          # softmax output, one column per class
pred = np.argmax(proba,axis=1)         # predicted class index
y_eval = np.argmax(y_test,axis=1)      # true class index from the one-hot target
score = metrics.accuracy_score(y_eval, pred)
# ROC AUC is a ranking metric: feed it the positive-class score (column 1),
# not the thresholded labels, which would collapse the curve to one point.
roc = metrics.roc_auc_score(y_eval, proba[:, 1])
f1 = metrics.f1_score(y_eval, pred)
recall = metrics.recall_score(y_eval, pred)
conf = metrics.confusion_matrix(y_eval, pred)
model.summary()
print("Validation score: {}".format(score))
print(roc)
print(f1)
print(recall)
# Confusion matrix as a heatmap: rows and columns are both labelled [0, 1].
labels = [0,1]
df_cm = pd.DataFrame(conf, index = [i for i in labels],
                  columns = [i for i in labels])
plt.figure(figsize = (5,3))
sn.heatmap(df_cm, annot=True, fmt='g')



In [None]:
# Builds a second Sequential model from the layers of the trained `model`.
# NOTE(review): the very same layer objects are added, so model2 shares its
# weights (and `trainable` flags) with `model` -- it is not an independent
# backup copy. Confirm whether a real copy was intended.
model2 = Sequential()
for layer in model.layers:
    model2.add(layer)
# Sanity check: predict on the whole of x (the array that was split into
# train and test earlier) and compare against y.
pred = model2.predict(x)
predict_classes = np.argmax(pred,axis=1)
expected_classes = np.argmax(y,axis=1)
correct = accuracy_score(expected_classes,predict_classes)
model2.summary()
print(f"Training Accuracy: {correct}")


##Preprocessing

In [None]:
#------------------------------------------------------Working with the UNSW15 Dataset---------------------------------------------
col_names = [
    'srcip', 'sport', 'dstip', 'dsport','proto','state','dur','sbytes',
    'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'Sload', 'Dload',
    'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz',
    'trans_depth', 'res_bdy_len', 'Sjit', 'Djit', 'Stime', 'Ltime', 'Sintpkt',
    'Dintpkt', 'tcprtt', 'synack', 'ackdat', 'is_sm_ips_ports', 'ct_state_ttl',
    'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst',
    'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport_ltm','ct_dst_sport_ltm',
    'ct_dst_src_ltm','attack_cat', 'label'
]
#Imports the dataset
# NOTE(review): header=0 treats the first line of every file as a header and
# replaces it with col_names. If the CSVs have no header row, one record per
# file is silently lost -- confirm against the data files.
df1 = pd.read_csv('UNSW-NB15_1.csv', header=0, names=col_names)
df2 = pd.read_csv('UNSW-NB15_2.csv', header=0, names=col_names)
df3 = pd.read_csv('UNSW-NB15_3.csv', header=0, names=col_names)
df4 = pd.read_csv('UNSW-NB15_4.csv', header=0, names=col_names)
df = pd.concat([df1,df2,df3,df4], ignore_index = True)

# Trim surrounding whitespace in every text column.
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

#Correcting the data

# Empty and blank cells become NaN. The result is assigned back instead of
# calling replace(..., inplace=True) on df[name]: an in-place call on a
# selected column is not guaranteed to reach df on recent pandas.
df = df.replace(['', ' '], np.nan)

for name in ('sport', 'dsport'):
    # '-' marks a missing port.
    df[name] = df[name].replace('-', np.nan)
    # Some ports are written in hexadecimal ('0x...'); convert them to ints in
    # one vectorised pass so pd.to_numeric below can parse the column.
    is_hex = df[name].map(lambda v: isinstance(v, str) and '0x' in v).astype(bool)
    if is_hex.any():
        df.loc[is_hex, name] = df.loc[is_hex, name].map(lambda v: int(v, 16))

df['sport'] = pd.to_numeric(df['sport'])
df['dsport'] = pd.to_numeric(df['dsport'])
df['ct_ftp_cmd'] = pd.to_numeric(df['ct_ftp_cmd'])

# Missing categorical values get an explicit "None" category.
for name in ('srcip', 'dstip', 'proto', 'state', 'service'):
    df[name] = df[name].fillna("None")

#Encoding the data
# Shuffle the rows (uses NumPy's global random state) and renumber them.
df = df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)

# One-hot encoded columns. Each call appends its indicator columns to the end
# of df, so this order fixes the final feature layout.
for name in ('srcip', 'dstip', 'proto', 'state', 'service',
             'is_sm_ips_ports', 'is_ftp_login'):
    encode_text_dummy(df, name)

# Columns replaced by integer class codes.
for name in ('sport', 'dsport', 'ct_ftp_cmd'):
    encode_text_index(df, name)

# Numeric columns standardised in place ('Stime' and 'Ltime' are left as-is).
for name in (
    'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss',
    'Sload', 'Dload', 'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb',
    'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len', 'Sjit', 'Djit',
    'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat',
    'ct_state_ttl', 'ct_flw_http_mthd',
    'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
    'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
):
    encode_numeric_zscore(df, name)

# attack_cat is the multi-class form of the target. Keeping its one-hot
# indicators among the predictors would hand the binary label to the model,
# so the column is dropped, as the other UNSW-NB15 cells of this notebook do.
df.drop('attack_cat', axis=1, inplace=True)
outcomes = encode_text_index(df, 'label')
num_classes = len(outcomes)

df.dropna(inplace=True, axis=0) # For now, just drop NA's (rows with missing values)

# Break into X (predictors) & y (prediction)
x, y = to_xy(df,'label')
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42)

##Transferring and training

In [None]:
# Transfer KDD-99 -> UNSW-NB15 (all features): a new trainable adapter layer
# feeds the five frozen layers of the KDD-trained `model`, followed by a new
# softmax output.
model3 = Sequential()
# The adapter maps the UNSW feature vector onto the input width that `model`
# was built with. The width is read from the model instead of hard-coding
# 125, so the cell keeps working if the KDD feature count changes.
adapter_units = model.input_shape[-1]
model3.add(Dense(adapter_units, input_dim=x.shape[1], activation='relu'))
for i in range(5):
    layer = model.layers[i]
    # Freezing acts on the shared layer object, so it also applies to `model`.
    layer.trainable = False
    model3.add(layer)
#Now we add the output layer
model3.add(Dense(2,activation='softmax'))
model3.compile(loss='categorical_crossentropy', optimizer='adam')
model3.fit(x_train,y_train,validation_data=(x_test,y_test), callbacks=[monitor],verbose=1,epochs=1000)


In [None]:
#Get metrics from the model and plots the confusion matrix
proba = model3.predict(x_test)         # softmax output, one column per class
pred = np.argmax(proba,axis=1)         # predicted class index
y_eval = np.argmax(y_test,axis=1)      # true class index from the one-hot target
score = metrics.accuracy_score(y_eval, pred)
# ROC AUC is a ranking metric: feed it the positive-class score (column 1),
# not the thresholded labels, which would collapse the curve to one point.
roc = metrics.roc_auc_score(y_eval, proba[:, 1])
f1 = metrics.f1_score(y_eval, pred)
recall = metrics.recall_score(y_eval, pred)
conf = metrics.confusion_matrix(y_eval, pred)
# Summarise the model that was just evaluated (the original printed `model`).
model3.summary()

print("Validation score: {}".format(score))
print(roc)
print(f1)
print(recall)

# Confusion matrix as a heatmap: rows and columns are both labelled [0, 1].
labels = [0,1]
df_cm = pd.DataFrame(conf, index = [i for i in labels],
                  columns = [i for i in labels])
plt.figure(figsize = (5,3))
sn.heatmap(df_cm, annot=True, fmt='g')


##Preprocessing

In [None]:
#-------------------------------------------------UNSW-NB15 using only the relevant features
col_names = [
    'srcip', 'sport', 'dstip', 'dsport','proto','state','dur','sbytes',
    'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'Sload', 'Dload',
    'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz',
    'trans_depth', 'res_bdy_len', 'Sjit', 'Djit', 'Stime', 'Ltime', 'Sintpkt',
    'Dintpkt', 'tcprtt', 'synack', 'ackdat', 'is_sm_ips_ports', 'ct_state_ttl',
    'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst',
    'ct_dst_ltm', 'ct_src_ ltm', 'ct_src_dport_ltm','ct_dst_sport_ltm',
    'ct_dst_src_ltm','attack_cat', 'label'
]
#Imports the dataset
df1 = pd.read_csv('UNSW-NB15_1.csv', header=0, names=col_names)
df2 = pd.read_csv('UNSW-NB15_2.csv', header=0, names=col_names)
df3 = pd.read_csv('UNSW-NB15_3.csv', header=0, names=col_names)
df4 = pd.read_csv('UNSW-NB15_4.csv', header=0, names=col_names)
df = pd.concat([df1,df2,df3,df4], ignore_index = True)

# Trim surrounding whitespace in every text column.
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

#Corrects the dataset

for name in ('sport', 'dsport'):
    # '-' marks a missing port. The result is assigned back instead of calling
    # replace(..., inplace=True) on df[name]: an in-place call on a selected
    # column is not guaranteed to reach df on recent pandas.
    df[name] = df[name].replace('-', np.nan)
    # Some ports are written in hexadecimal ('0x...'); convert them to ints in
    # one vectorised pass so pd.to_numeric below can parse the column.
    is_hex = df[name].map(lambda v: isinstance(v, str) and '0x' in v).astype(bool)
    if is_hex.any():
        df.loc[is_hex, name] = df.loc[is_hex, name].map(lambda v: int(v, 16))

df['sport'] = pd.to_numeric(df['sport'])
df['dsport'] = pd.to_numeric(df['dsport'])
df['ct_ftp_cmd'] = pd.to_numeric(df['ct_ftp_cmd'])
print("done")

#Time to encode
# Shuffle the rows (uses NumPy's global random state) and renumber them.
df = df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)

# Endpoint identifiers are not used as features in this experiment.
for name in ('srcip', 'sport', 'dstip', 'dsport'):
    df.drop(name, axis=1, inplace=True)

# Columns replaced by integer class codes.
for name in ('proto', 'state', 'service', 'is_ftp_login', 'ct_ftp_cmd'):
    encode_text_index(df, name)

# Numeric columns standardised in place.
for name in (
    'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss',
    'Sload', 'Dload', 'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb',
    'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len', 'Sjit', 'Djit',
    'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat',
    'ct_state_ttl', 'ct_flw_http_mthd',
    'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
    'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
):
    encode_numeric_zscore(df, name)
#Gets rid of the multiclass label
df.drop('attack_cat', axis=1, inplace=True)

# to_xy supplies the one-hot target; x is then narrowed to the chosen features.
x, y = to_xy(df,'label')
# The column is named 'ct_src_ ltm' (with a space) in col_names above; the
# spelling without the space raised KeyError when selecting the features.
# NOTE(review): the name is listed twice, so the same column enters the model
# twice -- possibly a different feature was meant for one of the slots.
x_columns = ['sttl', 'Dload', 'Spkts', 'sloss', 'dloss', 'ct_src_ ltm', 'ct_src_ ltm',
             'ct_srv_dst']
x = df[x_columns]
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42)
print(x_train)
print(y_train)

## Transferring and training

In [None]:
#Model for the relevant features
# A fresh, trainable 10-unit input layer takes the place of the first layer of
# the KDD-trained `model`; layers 1..4 of `model` are reused frozen and a new
# softmax output is stacked on top.
model4 = Sequential()
model4.add(Dense(10, input_dim=x.shape[1], activation='relu'))

for frozen_layer in model.layers[1:5]:
    # Freezing acts on the shared layer object, so it also applies to `model`.
    frozen_layer.trainable = False
    model4.add(frozen_layer)

#Now we add the output
model4.add(Dense(y.shape[1], activation='softmax'))
model4.compile(loss='categorical_crossentropy', optimizer='adam')
model4.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    callbacks=[monitor],
    verbose=1,
    epochs=1000,
)

In [None]:
#Get metrics from the model and plots the confusion matrix
proba = model4.predict(x_test)         # softmax output, one column per class
pred = np.argmax(proba,axis=1)         # predicted class index
y_eval = np.argmax(y_test,axis=1)      # true class index from the one-hot target
score = metrics.accuracy_score(y_eval, pred)
# ROC AUC is a ranking metric: feed it the positive-class score (column 1),
# not the thresholded labels, which would collapse the curve to one point.
roc = metrics.roc_auc_score(y_eval, proba[:, 1])
f1 = metrics.f1_score(y_eval, pred)
recall = metrics.recall_score(y_eval, pred)
conf = metrics.confusion_matrix(y_eval, pred)
# Summarise the model that was just evaluated (the original printed `model`).
model4.summary()
print("Validation score: {}".format(score))
print(roc)
print(f1)
print(recall)
# Confusion matrix as a heatmap: rows and columns are both labelled [0, 1].
labels = [0,1]
df_cm = pd.DataFrame(conf, index = [i for i in labels],
                  columns = [i for i in labels])
plt.figure(figsize = (5,3))
sn.heatmap(df_cm, annot=True, fmt='g')

#TRANSFER LEARNING: UNSW-NB15 TO KDD-99

In [None]:
# Column names applied to the four UNSW-NB15 part files (note that
# 'ct_src_ ltm' contains a space).
col_names = [
    'srcip', 'sport', 'dstip', 'dsport','proto','state','dur','sbytes',
    'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'Sload', 'Dload',
    'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz',
    'trans_depth', 'res_bdy_len', 'Sjit', 'Djit', 'Stime', 'Ltime', 'Sintpkt',
    'Dintpkt', 'tcprtt', 'synack', 'ackdat', 'is_sm_ips_ports', 'ct_state_ttl',
    'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst',
    'ct_dst_ltm', 'ct_src_ ltm', 'ct_src_dport_ltm','ct_dst_sport_ltm',
    'ct_dst_src_ltm','attack_cat', 'label'
]
#Imports the dataset
unsw_files = [
    'UNSW-NB15_1.csv',
    'UNSW-NB15_2.csv',
    'UNSW-NB15_3.csv',
    'UNSW-NB15_4.csv',
]
df1, df2, df3, df4 = [
    pd.read_csv(path, header=0, names=col_names) for path in unsw_files
]
# Stack the four parts into one frame with a fresh 0..n-1 index.
df = pd.concat([df1, df2, df3, df4], ignore_index=True)

display(df[0:5])

# Trim surrounding whitespace in every text column; other columns pass through.
df = df.apply(
    lambda column: column.str.strip() if column.dtype == "object" else column
)



In [None]:
#Corrects the dataset
for name in ('sport', 'dsport'):
    # '-' marks a missing port. The result is assigned back instead of calling
    # replace(..., inplace=True) on df[name]: an in-place call on a selected
    # column is not guaranteed to reach df on recent pandas.
    df[name] = df[name].replace('-', np.nan)
    # Some ports are written in hexadecimal ('0x...'); convert them to ints in
    # one vectorised pass (instead of a Python loop over every row) so that
    # pd.to_numeric below can parse the column.
    is_hex = df[name].map(lambda v: isinstance(v, str) and '0x' in v).astype(bool)
    if is_hex.any():
        df.loc[is_hex, name] = df.loc[is_hex, name].map(lambda v: int(v, 16))

df['sport'] = pd.to_numeric(df['sport'])
df['dsport'] = pd.to_numeric(df['dsport'])
df['ct_ftp_cmd'] = pd.to_numeric(df['ct_ftp_cmd'])
print("done")

###Preprocessing

In [None]:
#Encoding the dataset
# Shuffle the rows (uses NumPy's global random state) and renumber them.
df = df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)

# Endpoint identifiers are removed before encoding.
for dropped in ('srcip', 'sport', 'dstip', 'dsport'):
    df.drop(dropped, axis=1, inplace=True)

# Columns replaced in place by integer class codes.
for indexed in ('proto', 'state', 'service', 'is_ftp_login', 'ct_ftp_cmd'):
    encode_text_index(df, indexed)

# Numeric columns standardised in place to z-scores.
for scaled in (
    'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss',
    'Sload', 'Dload', 'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb',
    'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len', 'Sjit', 'Djit',
    'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat',
    'ct_state_ttl', 'ct_flw_http_mthd',
    'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
    'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
):
    encode_numeric_zscore(df, scaled)

# The multi-class attack category is not used; only the binary label remains.
df.drop('attack_cat', axis=1, inplace=True)

In [None]:
# Remove incomplete rows first so that x and y below describe the same rows.
df.dropna(inplace=True, axis=0) # For now, just drop NA's (rows with missing values)
# Break into X (predictors) & y (prediction)
# to_xy is called for its y output; the x it returns is replaced just below.
x, y = to_xy(df,'label')
# NOTE(review): 'ct_src_ ltm' is listed twice, so the same column is fed to
# the model twice -- possibly a different feature was meant for one slot.
x_columns = ['sttl', 'Dload', 'Spkts', 'sloss', 'dloss', 'ct_src_ ltm', 'ct_src_ ltm',
             'ct_srv_dst']
# x stays a DataFrame here, while y is the NumPy array returned by to_xy.
x = df[x_columns]
# 75% / 25% split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42)


##Training

In [None]:
print("TRAIN UNSW15 FIRST-----------------------------------------------------------------------------------------------------------------------m02_R4_G4_tanh")
# Source model for the UNSW-NB15 -> KDD-99 transfer: relu input layer, two
# tanh hidden layers and a softmax output (four layers in total; the
# single-unit linear layer used in the earlier network is left out here).
model11 = Sequential([
    Dense(47, input_dim=x.shape[1], activation='relu'),
    Dense(42, activation='tanh'),
    Dense(10, activation='tanh'),
    Dense(y.shape[1], activation='softmax'),
])
model11.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Stop once val_loss has failed to improve by at least 1e-3 for 5 epochs.
monitor = EarlyStopping(
    monitor='val_loss',
    min_delta=1e-3,
    patience=5,
    verbose=1,
    mode='auto',
)
model11.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    callbacks=[monitor],
    verbose=1,
    epochs=1000,
)

In [None]:
#Gets the metrics and prints the confusion matrix
proba = model11.predict(x_test)        # softmax output, one column per class
pred = np.argmax(proba,axis=1)         # predicted class index
y_eval = np.argmax(y_test,axis=1)      # true class index from the one-hot target
score = metrics.accuracy_score(y_eval, pred)
# ROC AUC is a ranking metric: feed it the positive-class score (column 1),
# not the thresholded labels, which would collapse the curve to one point.
roc = metrics.roc_auc_score(y_eval, proba[:, 1])
f1 = metrics.f1_score(y_eval, pred)
recall = metrics.recall_score(y_eval, pred)
conf = metrics.confusion_matrix(y_eval, pred)
model11.summary()
print("Validation score: {}".format(score))
print(roc)
print(f1)
print(recall)
# Confusion matrix as a heatmap: rows and columns are both labelled [0, 1].
labels = [0,1]
df_cm = pd.DataFrame(conf, index = [i for i in labels],
                  columns = [i for i in labels])
plt.figure(figsize = (5,3))
sn.heatmap(df_cm, annot=True, fmt='g')


In [None]:
# Builds a second Sequential model from the layers of the trained model11 and
# checks that it predicts.
# NOTE(review): the very same layer objects are added, so modelTemp shares its
# weights (and `trainable` flags) with model11 -- it is not an independent
# backup copy. Confirm whether a real copy was intended.
modelTemp = Sequential()
for layer in model11.layers:
    modelTemp.add(layer)
# Sanity check: predict on the whole of x (the frame that was split into
# train and test earlier) and compare against y.
pred = modelTemp.predict(x)
predict_classes = np.argmax(pred,axis=1)
expected_classes = np.argmax(y,axis=1)
correct = accuracy_score(expected_classes,predict_classes)
modelTemp.summary()
print(f"Training Accuracy: {correct}")

##Preprocessing

In [None]:
#----------------------------------------------------------Handling KDD99 data just like before-------------------------------------------
df = pd.read_csv("/content/drive/My Drive/TFG/kddcup.data.corrected", header=None)
df.columns = [
    'duration', 'protocol_type', 'service','flag','src_bytes','dst_bytes',
    'land','wrong_fragment','urgent','hot','num_failed_logins','logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login',
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'outcome'
]

# Drop every column that contains a NaN.
df.dropna(axis=1, inplace=True)

# Numeric columns: standardised in place (column position is unchanged).
for numeric_column in (
    'duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot',
    'num_failed_logins', 'num_compromised', 'root_shell', 'su_attempted',
    'num_root', 'num_file_creations', 'num_shells', 'num_access_files',
    'num_outbound_cmds', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
):
    encode_numeric_zscore(df, numeric_column)

# Categorical / flag columns: one-hot encoded. Each call appends its indicator
# columns to the end of df, so this order fixes the final feature layout.
for categorical_column in (
    'protocol_type', 'service', 'flag', 'land', 'logged_in',
    'is_host_login', 'is_guest_login',
):
    encode_text_dummy(df, categorical_column)

# Standardising can leave columns containing NaN; remove those columns.
df.dropna(axis=1, inplace=True)
# Binary target: 0 for 'normal.' traffic, 1 for every other outcome.
is_normal = df['outcome'] == 'normal.'
df['outcome'] = np.where(is_normal, 0, 1)
df['outcome'] = pd.to_numeric(df['outcome'])

x, y = to_xy(df, 'outcome')
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42)

##Transferring and training

In [None]:
# Transfer UNSW-NB15 -> KDD-99: a fresh, trainable 47-unit input layer sized
# for the KDD features, followed by layers 1..3 of modelTemp reused frozen and
# a new softmax output.
model5 = Sequential()
model5.add(Dense(47, input_dim=x.shape[1], activation='relu'))
for frozen_layer in modelTemp.layers[1:4]:
    # Freezing acts on the shared layer object, so it also applies to the
    # model the layer came from.
    frozen_layer.trainable = False
    model5.add(frozen_layer)
#Now we add the output
model5.add(Dense(y.shape[1], activation='softmax'))
model5.compile(loss='categorical_crossentropy', optimizer='adam')
model5.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    callbacks=[monitor],
    verbose=1,
    epochs=1000,
)

In [None]:
#Gets metrics for the model and plots the confusion matrix
proba = model5.predict(x_test)         # softmax output, one column per class
pred = np.argmax(proba,axis=1)         # predicted class index
y_eval = np.argmax(y_test,axis=1)      # true class index from the one-hot target
score = metrics.accuracy_score(y_eval, pred)
# ROC AUC is a ranking metric: feed it the positive-class score (column 1),
# not the thresholded labels, which would collapse the curve to one point.
roc = metrics.roc_auc_score(y_eval, proba[:, 1])
f1 = metrics.f1_score(y_eval, pred)
recall = metrics.recall_score(y_eval, pred)
conf = metrics.confusion_matrix(y_eval, pred)
# Summarise the model that was just evaluated (the original printed `model`).
model5.summary()
print("Validation score: {}".format(score))
print(roc)
print(f1)
print(recall)
# Confusion matrix as a heatmap: rows and columns are both labelled [0, 1].
labels = [0,1]
df_cm = pd.DataFrame(conf, index = [i for i in labels],
                  columns = [i for i in labels])
plt.figure(figsize = (5,3))
sn.heatmap(df_cm, annot=True, fmt='g')