<a href="https://colab.research.google.com/github/McUsuf/Networks/blob/main/SSL_TLS_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow_decision_forests
!pip install wurlitzer
!pip install keras-tuner -U -qq

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import tensorflow as tf
import tensorflow_decision_forests as tfdf
import pandas as pd
import numpy as np
import keras_tuner as kt

In [None]:
def next_batch(i, num, data, labels):
	"""Return the ``i``-th consecutive mini-batch of ``num`` samples.

	The previous implementation sliced ``idx[:i * num]``, which produced a
	growing prefix of the data (and nothing at all for ``i == 0``) rather than
	one batch. Batches are now the windows ``[i * num, (i + 1) * num)``.

	Args:
		i: Zero-based batch index.
		num: Number of samples per batch.
		data: Samples; must support ``len()`` and integer-array indexing
			(e.g. a NumPy array).
		labels: Labels aligned with ``data``; indexed the same way.

	Returns:
		A ``(batch_data, batch_labels)`` tuple. The final batch may hold fewer
		than ``num`` samples, and a window past the end of ``data`` is empty.
	"""
	start = i * num
	# Index with an integer array (as before) so the result is a copy, not a view.
	idx = np.arange(0, len(data))[start:start + num]
	batch_data = data[idx]
	batch_labels = labels[idx]
	return batch_data, batch_labels

def next_random_batch(num, data, labels):
	"""Draw a random batch of at most ``num`` aligned samples and labels.

	All positions of ``data`` are shuffled with NumPy's global random state
	and the first ``num`` of them are used to index both ``data`` and
	``labels``, so the same rows are picked from each and no row repeats.

	Args:
		num: Maximum number of samples to return.
		data: Samples; must support ``len()`` and integer-array indexing.
		labels: Labels aligned with ``data``.

	Returns:
		A ``(data_batch, labels_batch)`` tuple.
	"""
	order = np.arange(len(data))
	np.random.shuffle(order)
	picked = order[:num]
	return data[picked], labels[picked]

def del_bad_columns(df, n_cols=15):
  """Remove the leading ``n_cols`` columns of ``df`` in place and return it.

  Args:
    df: DataFrame to trim. It is modified in place, as before, so existing
      callers that ignore the return value keep working.
    n_cols: How many columns to remove, counted from the left. Defaults to
      15, the value that used to be hard-coded. If ``df`` has fewer columns
      than this, all of them are removed.

  Returns:
    The same ``df`` object, minus its leading columns.

  Raises:
    ValueError: If ``n_cols`` is negative (a negative slice bound would
      silently remove columns counted from the wrong end).
  """
  if n_cols < 0:
    raise ValueError(f"n_cols must be non-negative, got {n_cols}")
  # One drop call instead of deleting the columns one by one.
  df.drop(columns=df.columns[:n_cols], inplace=True)
  return df

def fix_columns_names(df):
  """Return ``df`` renamed so that column names avoid ``%``, space, ``,`` and ``.``.

  ``%`` becomes ``cent``; a space, comma or dot becomes ``_``. The input
  frame's own column labels are left as they were; the renamed frame is the
  result of ``DataFrame.rename``.

  Args:
    df: DataFrame whose column names are strings.

  Returns:
    The DataFrame produced by ``df.rename`` with the sanitised names.
  """
  # None of the replacement texts contains a character that is itself
  # replaced, so a single translation pass matches sequential replaces.
  substitutions = str.maketrans({"%": "cent", " ": "_", ",": "_", ".": "_"})
  renamed = {
      column: column.translate(substitutions)
      for column in df.columns.tolist()
  }
  return df.rename(columns=renamed)
  

In [None]:
def split_dataset(dataset, test_ratio=0.30, seed=None):
  """Randomly split ``dataset`` into a training part and a test part.

  Each row is independently sent to the test part with probability
  ``test_ratio``, so the realised split sizes vary around that ratio.

  Args:
    dataset: Object supporting ``len()`` and boolean-mask indexing
      (e.g. a pandas DataFrame or a NumPy array).
    test_ratio: Per-row probability of landing in the test part.
    seed: Optional seed for a reproducible split. When ``None`` (the
      default) NumPy's global random state is used, exactly as before.

  Returns:
    A ``(train, test)`` tuple of the same type of object as ``dataset``.
  """
  if seed is None:
    draws = np.random.rand(len(dataset))
  else:
    # A dedicated generator keeps the split reproducible without touching
    # the global random state.
    draws = np.random.default_rng(seed).random(len(dataset))
  test_indices = draws < test_ratio
  return dataset[~test_indices], dataset[test_indices]

In [None]:
def df_prepare(csv_path='/content/drive/MyDrive/For_SSL_Classification/results1_1.csv',
               label="Server_Name",
               min_count=25):
  """Load the CSV, clean it, integer-encode the label column and split it.

  Steps, in order:
    1. Read ``csv_path`` with ``pd.read_csv``.
    2. Pass the frame through ``del_bad_columns`` and ``fix_columns_names``.
    3. Drop every row whose ``label`` value occurs fewer than ``min_count``
       times.
    4. Replace all remaining NaN values with 0.0.
    5. Replace each ``label`` value with its position in ``classes``.
    6. Split the result with ``split_dataset``.

  Args:
    csv_path: Location of the CSV file to load.
    label: Name of the label column *after* ``fix_columns_names`` has run.
    min_count: Minimum number of rows a label value needs in order to be kept.

  Returns:
    A ``(df_train, df_test, classes)`` tuple, where ``classes`` is the list
    of original label values and ``classes[k]`` is the value encoded as ``k``.
  """
  # Main dataframe.
  df = pd.read_csv(csv_path)
  df = del_bad_columns(df)
  df = fix_columns_names(df)

  # Drop every row whose label occurs fewer than `min_count` times. Counting
  # once and masking avoids rescanning the whole frame for each label value.
  label_counts = df[label].value_counts()
  rare_labels = label_counts[label_counts < min_count].index
  df = df[~df[label].isin(rare_labels)]

  # NOTE(review): rows with a missing label are not counted above, so they
  # survive the filter and the fill below turns their label into 0.0 (its own
  # class). This matches the previous behaviour -- confirm it is intended.
  df = df.fillna(0.0)

  classes = df[label].unique().tolist()
  print(f"Classes num: {len(classes)}")

  # Encode each label as its index in `classes`; a dict lookup replaces a
  # linear `list.index` search per row. `setdefault` keeps the first index
  # should two entries ever compare equal, like `list.index` would.
  class_to_id = {}
  for position, class_name in enumerate(classes):
    class_to_id.setdefault(class_name, position)
  df[label] = df[label].map(class_to_id)

  # Split into training and test parts.
  df_train, df_test = split_dataset(df)
  return df_train, df_test, classes

In [None]:
def build_model(hp):
  """Build and compile one random-forest candidate for the tuner.

  Args:
    hp: Hyper-parameter container supplied by Keras Tuner; three choices are
      declared on it (``categorical_algorithm``, ``max_num_nodes``,
      ``num_trees``).

  Returns:
    A ``tfdf.keras.RandomForestModel`` compiled with the ``accuracy`` metric.
  """
  # Declared in the same order as the original keyword arguments so the
  # search space is registered identically.
  # (A max_depth choice over [4, 5, 6, 7] was sketched here but is not enabled.)
  categorical_algorithm = hp.Choice("categorical_algorithm", ["CART", "RANDOM"])
  max_num_nodes = hp.Choice("max_num_nodes", list(range(1000, 10000, 250)))
  num_trees = hp.Choice("num_trees", list(range(50, 1000, 50)))

  forest = tfdf.keras.RandomForestModel(
      categorical_algorithm=categorical_algorithm,
      max_num_nodes=max_num_nodes,
      num_trees=num_trees,
      verbose=1,
      check_dataset=True,
      growing_strategy='BEST_FIRST_GLOBAL',
  )

  # Register accuracy so it is reported as a metric for the model.
  forest.compile(metrics=["accuracy"])
  return forest

In [None]:
# Load and split the data, then report the size of each part.
df_train, df_test, classes = df_prepare()
print(f"{len(df_train)} examples in training, {len(df_test)} examples for testing.")

# Convert both parts with identical settings (train first, then test). The
# generator's loop variable stays local, so no extra notebook global appears.
ds_train, ds_test = (
    tfdf.keras.pd_dataframe_to_tf_dataset(
        frame,
        label='Server_Name',
        max_num_classes=len(classes),
        fix_feature_names=False)
    for frame in (df_train, df_test)
)

Classes num: 82
6342 examples in training, 2638 examples for testing.


In [None]:
# Random search (50 trials) over the hyper-parameters declared in build_model.
# NOTE(review): the objective is "accuracy" and search() receives no
# validation data, so trials are presumably not ranked on held-out examples --
# confirm this is the intended selection criterion.
keras_tuner = kt.RandomSearch(
    build_model,
    objective="accuracy",
    max_trials=50,
    overwrite=True,
    directory="/tmp/keras_tuning")

keras_tuner.search(ds_train)

# The cells below use `model_1`, which no visible cell defines, so a fresh
# "Restart & Run All" stopped with a NameError. Rebuild the model from the
# best hyper-parameters found by the search and fit it on the training data.
# NOTE(review): the original construction of `model_1` is not visible --
# confirm this matches how it was produced.
best_hyper_parameters = keras_tuner.get_best_hyperparameters(num_trials=1)[0]
model_1 = build_model(best_hyper_parameters)
model_1.fit(ds_train)

Trial 50 Complete [00h 01m 14s]
accuracy: 0.9105960264900662

Best accuracy So Far: 0.913118889940082
Total elapsed time: 01h 27m 56s


In [None]:
# Score model_1 on the test dataset; return_dict=True yields {metric name: value}.
evaluation = model_1.evaluate(ds_test, return_dict=True)
print()

# Print one "metric: value" line per entry, rounded to four decimals.
for name in evaluation:
  value = evaluation[name]
  print(f"{name}: {value:.4f}")


loss: 0.0000
accuracy: 0.9121


In [None]:
# Plot the first tree of model_1 (tree_idx=0), with max_depth=4 passed to the plotter.
tfdf.model_plotter.plot_model_in_colab(model_1, tree_idx=0, max_depth=4)

In [None]:
# Display the training logs exposed by model_1's inspector; the cell output
# lists one TrainLog entry per logged number of trees.
model_1.make_inspector().training_logs()

[TrainLog(num_trees=1, evaluation=Evaluation(num_examples=2301, accuracy=0.7944372012168622, loss=7.4092338973156835, rmse=None, ndcg=None, aucs=None, auuc=None, qini=None)),
 TrainLog(num_trees=11, evaluation=Evaluation(num_examples=6185, accuracy=0.8412287793047696, loss=3.3103506679147507, rmse=None, ndcg=None, aucs=None, auuc=None, qini=None)),
 TrainLog(num_trees=21, evaluation=Evaluation(num_examples=6229, accuracy=0.8709263124096965, loss=2.1521320121055107, rmse=None, ndcg=None, aucs=None, auuc=None, qini=None)),
 TrainLog(num_trees=31, evaluation=Evaluation(num_examples=6230, accuracy=0.8812199036918138, loss=1.6935632247101056, rmse=None, ndcg=None, aucs=None, auuc=None, qini=None)),
 TrainLog(num_trees=41, evaluation=Evaluation(num_examples=6230, accuracy=0.8882825040128411, loss=1.4563867134366477, rmse=None, ndcg=None, aucs=None, auuc=None, qini=None)),
 TrainLog(num_trees=51, evaluation=Evaluation(num_examples=6230, accuracy=0.8934189406099519, loss=1.2532193094294632, rm

In [None]:
# Save model_1 under the Google Drive folder used by this notebook.
# NOTE(review): assumes Drive is already mounted at /content/drive; no mount
# call is visible in this notebook -- confirm before running on a fresh runtime.
model_1.save("/content/drive/MyDrive/For_SSL_Classification/my_saved_model")

