<a href="https://colab.research.google.com/github/McUsuf/SSL_traffic_classification/blob/main/SSL_TLS_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [63]:
!pip install tensorflow_decision_forests
!pip install wurlitzer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [64]:
import tensorflow as tf
import tensorflow_decision_forests as tfdf
import pandas as pd
import numpy as np

In [65]:
def next_batch(i, num, data, labels):
	"""Return the ``i``-th sequential mini-batch of ``data`` and ``labels``.

	Batches are numbered from 1: batch ``i`` holds rows ``(i - 1) * num``
	up to, but not including, ``i * num``. The previous implementation
	sliced ``[:i * num]`` and therefore returned a growing prefix of
	``i * num`` rows instead of a batch of ``num`` rows.

	Args:
		i: 1-based batch number. Values below 1 yield an empty batch.
		num: number of rows per batch.
		data: array supporting integer-array indexing (e.g. ``np.ndarray``).
		labels: array aligned row-by-row with ``data``.

	Returns:
		Tuple ``(batch_data, batch_labels)``. The last batch may be shorter
		than ``num``; batches past the end of ``data`` are empty.
	"""
	# Clamp to zero so a non-positive batch number gives an empty slice
	# instead of wrapping around through negative indices.
	start = max(i - 1, 0) * num
	stop = max(i, 0) * num
	idx = np.arange(0, len(data))[start:stop]
	return data[idx], labels[idx]

def next_random_batch(num, data, labels):
	"""Draw a random mini-batch of at most ``num`` aligned rows.

	A shuffled index over all rows of ``data`` is built with NumPy's global
	random state, and its first ``num`` entries select the rows, so the
	sample is without replacement and ``data``/``labels`` stay aligned.

	Args:
		num: requested batch size.
		data: array supporting integer-array indexing.
		labels: array aligned row-by-row with ``data``.

	Returns:
		Tuple ``(batch_data, batch_labels)``.
	"""
	order = np.arange(0, len(data))
	np.random.shuffle(order)
	chosen = order[:num]
	return data[chosen], labels[chosen]

def del_bad_columns(df, n_cols=15):
  """Remove the first ``n_cols`` columns of ``df`` in place and return it.

  Args:
    df: DataFrame to trim; it is modified in place.
    n_cols: non-negative number of leading columns to remove. Defaults to
      15, the value that used to be hard-coded here. If the frame has fewer
      columns than ``n_cols``, all of them are removed.

  Returns:
    The same ``df`` object, without its leading columns.
  """
  leading = df.columns[:n_cols]
  # A single in-place drop keeps the original mutate-and-return contract.
  df.drop(columns=leading, inplace=True)
  return df

def fix_columns_names(df):
  """Return a copy of ``df`` whose column names are sanitized.

  In every column name ``%`` becomes ``cent``, and each space, comma and
  period becomes an underscore. The input frame is left unchanged.

  Args:
    df: DataFrame with string column names.

  Returns:
    A new DataFrame with the renamed columns.
  """
  # No replacement text contains a character that is itself replaced, so a
  # single translate pass gives the same result as chained replacements.
  table = str.maketrans({"%": "cent", " ": "_", ",": "_", ".": "_"})
  renamed = {col: col.translate(table) for col in df.columns.tolist()}
  return df.rename(columns=renamed)
  

In [66]:
def split_dataset(dataset, test_ratio=0.30, seed=None):
  """Randomly split ``dataset`` into a training part and a test part.

  Each row is assigned to the test part independently with probability
  ``test_ratio``, so the realized test share only approximates the ratio.

  Args:
    dataset: DataFrame (or any object supporting boolean-mask indexing).
    test_ratio: probability for a row to land in the test part.
    seed: optional integer seed. When given, a dedicated random generator
      is used so the split is reproducible; when ``None`` (the default)
      NumPy's global random state is used, as before.

  Returns:
    Tuple ``(train, test)``.
  """
  rng = np.random if seed is None else np.random.RandomState(seed)
  test_indices = rng.rand(len(dataset)) < test_ratio
  return dataset[~test_indices], dataset[test_indices]

In [67]:
def df_prepare(csv_path='/content/drive/MyDrive/For_SSL_Classification/results1_1.csv',
               label="Server_Name", min_count=25, test_ratio=0.30):
  """Load the flow statistics CSV and return train/test frames plus classes.

  Steps: read the CSV, drop its leading columns, sanitize the column names,
  discard rare classes, fill missing values with 0.0, replace each label
  value by an integer class id, and split into training and test parts.

  Args:
    csv_path: path of the CSV file to read.
    label: name of the label column *after* column-name sanitizing.
    min_count: classes with fewer than this many rows are removed entirely.
    test_ratio: share of rows passed to ``split_dataset`` for the test part.

  Returns:
    Tuple ``(df_train, df_test, classes)``. ``classes`` lists the label
    values in order of first appearance; the label column of both frames
    holds each row's position in that list.
  """
  # Main dataframe
  df = pd.read_csv(csv_path)
  df = del_bad_columns(df)
  df = fix_columns_names(df)

  # If a server name occurs fewer than `min_count` times, exclude every row
  # carrying that server name. Rows with a missing label are never counted
  # by value_counts, so they are kept here and filled below.
  counts = df[label].value_counts()
  rare = counts[counts < min_count].index
  df = df[~df[label].isin(rare)]

  df = df.fillna(0.0)

  # Number the server names: class id = position in order of first appearance.
  classes = df[label].unique().tolist()
  print(f"Classes num: {len(classes)}")
  class_ids = {name: position for position, name in enumerate(classes)}
  df[label] = df[label].map(class_ids)

  # Split into training and test parts
  df_train, df_test = split_dataset(df, test_ratio)

  return df_train, df_test, classes

In [68]:
# Load the prepared frames and report their sizes and columns.
df_train, df_test, classes = df_prepare()
print(f"{len(df_train)} examples in training, {len(df_test)} examples for testing.")
print(f"{df_train.columns}")

# Convert both frames (training first, then test) to TensorFlow datasets
# with identical settings. Column names were already sanitized, so the
# library's own feature-name fixing is switched off.
ds_train, ds_test = (
    tfdf.keras.pd_dataframe_to_tf_dataset(
        frame,
        label='Server_Name',
        max_num_classes=len(classes),
        fix_feature_names=False,
    )
    for frame in (df_train, df_test)
)

Classes num: 82
6327 examples in training, 2653 examples for testing.
Index(['Enc__Payload_Client-Server_Avg_size',
       'Enc__Payload_Client-Server_Variance',
       'Enc__Payload_Client-Server_Std__Dev',
       'Enc__Payload_Client-Server_Max_package',
       'Enc__Payload_Client-Server_25th_cent_sum',
       'Enc__Payload_Client-Server_50th_cent_sum',
       'Enc__Payload_Client-Server_75th_cent_sum',
       'Enc__Payload_Server-Client_Avg_size',
       'Enc__Payload_Server-Client_Variance',
       'Enc__Payload_Server-Client_Std__Dev',
       'Enc__Payload_Server-Client_Max_package',
       'Enc__Payload_Server-Client_25th_cent_sum',
       'Enc__Payload_Server-Client_50th_cent_sum',
       'Enc__Payload_Server-Client_75th_cent_sum', 'Client-Server_Avg_size',
       'Client-Server_Variance', 'Client-Server_Std__Dev',
       'Client-Server_Max_package', 'Client-Server_25th_cent_sum',
       'Client-Server_50th_cent_sum', 'Client-Server_75th_cent_sum',
       'Client-Server_Total_c

  features_dataframe = dataframe.drop(label, 1)


In [69]:
# Hyperparameters. Only num_trees and max_nodes are passed to the model
# below; num_steps and batch_size are not referenced by this cell.
num_steps, batch_size = 1000, 16
num_trees, max_nodes = 100, 5000

# Random forest grown best-first with a cap on the number of nodes per tree.
model_1 = tfdf.keras.RandomForestModel(
    num_trees=num_trees,
    max_num_nodes=max_nodes,
    growing_strategy='BEST_FIRST_GLOBAL',
    verbose=2,
)
model_1.fit(ds_train)

Use 2 thread(s) for training
Use /tmp/tmpu7n0cspj as temporary training directory
Reading training dataset...
Training tensor examples:
Features: {'Enc__Payload_Client-Server_Avg_size': <tf.Tensor 'data_24:0' shape=(None,) dtype=float64>, 'Enc__Payload_Client-Server_Variance': <tf.Tensor 'data_27:0' shape=(None,) dtype=float64>, 'Enc__Payload_Client-Server_Std__Dev': <tf.Tensor 'data_26:0' shape=(None,) dtype=float64>, 'Enc__Payload_Client-Server_Max_package': <tf.Tensor 'data_25:0' shape=(None,) dtype=int64>, 'Enc__Payload_Client-Server_25th_cent_sum': <tf.Tensor 'data_21:0' shape=(None,) dtype=int64>, 'Enc__Payload_Client-Server_50th_cent_sum': <tf.Tensor 'data_22:0' shape=(None,) dtype=int64>, 'Enc__Payload_Client-Server_75th_cent_sum': <tf.Tensor 'data_23:0' shape=(None,) dtype=int64>, 'Enc__Payload_Server-Client_Avg_size': <tf.Tensor 'data_31:0' shape=(None,) dtype=float64>, 'Enc__Payload_Server-Client_Variance': <tf.Tensor 'data_34:0' shape=(None,) dtype=float64>, 'Enc__Payload_S



Training dataset read in 0:00:01.198595. Found 6327 examples.
Training model...


[INFO kernel.cc:813] Start Yggdrasil model training
[INFO kernel.cc:814] Collect training examples
[INFO kernel.cc:422] Number of batches: 7
[INFO kernel.cc:423] Number of examples: 6327
[INFO kernel.cc:836] Training dataset:
Number of records: 6327
Number of columns: 54

Number of columns by type:
	NUMERICAL: 53 (98.1481%)
	CATEGORICAL: 1 (1.85185%)

Columns:

NUMERICAL: 53 (98.1481%)
	0: "25th_cent_Inter_arrival" NUMERICAL mean:6.10675 min:0 max:2303.17 sd:65.4061
	1: "25th_cent_sum" NUMERICAL mean:273195 min:0 max:1.66569e+08 sd:3.87469e+06
	2: "50th_cent_Inter_arrival" NUMERICAL mean:14.9598 min:0 max:5033.98 sd:129.88
	3: "50th_cent_sum" NUMERICAL mean:518236 min:0 max:3.45673e+08 sd:7.79336e+06
	4: "75th_cent_Inter_arrival" NUMERICAL mean:26.4591 min:0 max:7788.29 sd:200.99
	5: "75th_cent_sum" NUMERICAL mean:753203 min:0 max:4.8313e+08 sd:1.11722e+07
	6: "Avg_Inter_arrival" NUMERICAL mean:3.13764 min:0 max:116.023 sd:7.62227
	7: "Avg_size" NUMERICAL mean:1628.72 min:88.392 max:16

Model trained in 0:00:21.174712
Compiling model...
Model compiled.


<keras.callbacks.History at 0x7f5185c2ecd0>

In [70]:
# Attach the accuracy metric, then score the model on the held-out test set.
model_1.compile(metrics=["accuracy"])
evaluation = model_1.evaluate(ds_test, return_dict=True)
print()

# One "metric: value" line per reported metric, four decimals each.
for name in evaluation:
  value = evaluation[name]
  print(f"{name}: {value:.4f}")


loss: 0.0000
accuracy: 0.9077


In [71]:
# Render the tree at index 0 of the trained model inline, with the plot
# depth limited via max_depth=4.
tfdf.model_plotter.plot_model_in_colab(model_1, tree_idx=0, max_depth=4)

In [72]:
# Display the training logs recorded by the model's inspector: one entry per
# logged tree count, each carrying an evaluation (accuracy, loss).
model_1.make_inspector().training_logs()

[TrainLog(num_trees=1, evaluation=Evaluation(num_examples=2332, accuracy=0.7941680960548885, loss=7.418933430012462, rmse=None, ndcg=None, aucs=None, auuc=None, qini=None)),
 TrainLog(num_trees=11, evaluation=Evaluation(num_examples=6282, accuracy=0.8451130213307864, loss=3.358436251487921, rmse=None, ndcg=None, aucs=None, auuc=None, qini=None)),
 TrainLog(num_trees=21, evaluation=Evaluation(num_examples=6327, accuracy=0.873241662715347, loss=2.0202884829183767, rmse=None, ndcg=None, aucs=None, auuc=None, qini=None)),
 TrainLog(num_trees=31, evaluation=Evaluation(num_examples=6327, accuracy=0.8858858858858859, loss=1.5946991488246842, rmse=None, ndcg=None, aucs=None, auuc=None, qini=None)),
 TrainLog(num_trees=41, evaluation=Evaluation(num_examples=6327, accuracy=0.890627469574838, loss=1.3356444465047388, rmse=None, ndcg=None, aucs=None, auuc=None, qini=None)),
 TrainLog(num_trees=51, evaluation=Evaluation(num_examples=6327, accuracy=0.8933143669985776, loss=1.1966930800121232, rmse=N

In [74]:
# Persist the trained model to the given path.
# NOTE(review): the path presumably points into a mounted Google Drive; no
# mount call is visible in this notebook — confirm the drive is mounted first.
model_1.save("/content/drive/MyDrive/For_SSL_Classification/my_saved_model")

