In [1]:
import numpy as np
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from tqdm.auto import tqdm

from utils import *

  from pandas import MultiIndex, Int64Index


In [2]:
# Build the three dataset splits. `FlowDataset` is not imported by name in this
# notebook -- presumably it is provided by `from utils import *` (TODO confirm).
# Per the cell output, the three constructors report reading ./data/train,
# ./data/valid and ./data/project2_test, in that order.
dataset_train = FlowDataset(train=True)
dataset_valid = FlowDataset(train=False)
dataset_test = FlowDataset(train=False, test=True)

reading ./data/train
reading ./data/valid
reading ./data/project2_test


In [3]:
# Extract an (x, y) pair from each split via the dataset's own get_xy() helper.
x_train, y_train = dataset_train.get_xy()
# NOTE(review): x_valid / y_valid are not referenced by any later cell visible here.
x_valid, y_valid = dataset_valid.get_xy()
# The test split is queried with inference=True and its second return value is
# discarded (presumably no labels are available for it -- TODO confirm in utils).
x_test, _ = dataset_test.get_xy(inference=True)

In [4]:
# Classifier used for the rest of the notebook. Only these three arguments are
# set explicitly; every other hyper-parameter is left to the library.
model = XGBClassifier(
    use_label_encoder=False,
    random_state=GLOBAL_SEED,
    n_jobs=-1,
)

In [5]:
# Fit on the training split only. The call is the cell's last expression, so the
# fitted estimator's repr is what the notebook displays as this cell's output.
model.fit(x_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=-1,
              num_parallel_tree=1, predictor='auto', random_state=755,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [6]:
# Quick sanity check: display the dimensions of the test inputs before predicting.
x_test.shape

(325963, 50)

In [7]:
# Class probabilities for the test inputs. Later cells read columns 0 and 1 of
# this array and store them as `prob_0` and `prob_1`.
pred = model.predict_proba(x_test)

In [8]:
# Work on a private copy of the test frame. The following cell adds probability
# columns to `df`; without the copy `df` is just another name for
# `dataset_test.df`, so those columns would be written into the dataset object
# itself and could leak into anything that later reads `dataset_test` again
# (e.g. a re-run of the get_xy() cell), making results depend on execution order.
df = dataset_test.df.copy()

In [9]:
# Store each predicted class probability as its own column on the test frame:
# column i of `pred` goes into the i-th name listed below.
for class_idx, column in enumerate(('prob_0', 'prob_1')):
    df[column] = pred[:, class_idx]


In [10]:
# Group the scored rows by destination IP and keep only the two probability
# columns. `as_index=False` keeps `dst_ip` available as a regular column once
# the groups are aggregated, which the next cell relies on.
prob_columns = ['prob_0', 'prob_1']
raw = df.groupby('dst_ip', as_index=False)[prob_columns]

In [11]:
# Cut-off applied to the per-IP mean of `prob_1`.
# NOTE(review): how 0.608 was chosen is not shown in this notebook.
DETECTION_THRESHOLD = 0.608

# Mean probability per destination IP (one row per `dst_ip`).
temp = raw.mean()

# Every destination IP that occurs in the test frame.
whole = set(df['dst_ip'])

# An IP counts as detected when its mean `prob_1` reaches the threshold;
# everything else in `whole` is treated as not detected.
is_detected = temp['prob_1'] >= DETECTION_THRESHOLD
detected = set(temp.loc[is_detected, 'dst_ip'])
not_detected = whole.difference(detected)

In [12]:
# Report the size of each group of destination IPs (detected first).
n_detected, n_not_detected = len(detected), len(not_detected)
print(n_detected, n_not_detected)

466 581


In [13]:
import pickle

In [14]:
# Persist both IP sets next to the notebook, one pickle file per set.
for pkl_path, ip_set in (
    ('proba_detected.pkl', detected),
    ('proba_not_detected.pkl', not_detected),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(ip_set, f)

In [15]:
# Detected destination IPs that also appear in the collection stored in
# ./data/outer_ip_set.pkl. The bare expression on the last line is displayed
# in full as the cell output.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
outer_ip_set = pd.read_pickle('./data/outer_ip_set.pkl')
detected & outer_ip_set

{'104.130.4.129',
 '104.199.140.161',
 '104.205.143.7',
 '104.210.119.150',
 '104.211.252.81',
 '104.230.111.8',
 '106.139.5.161',
 '106.171.187.18',
 '106.206.134.35',
 '106.241.198.65',
 '106.242.84.53',
 '106.245.69.85',
 '107.142.27.240',
 '107.170.244.190',
 '107.173.251.105',
 '107.174.115.152',
 '107.175.87.45',
 '107.177.217.183',
 '107.178.152.0',
 '107.189.6.17',
 '109.208.147.88',
 '109.226.87.126',
 '11.127.208.136',
 '11.140.215.84',
 '11.99.74.200',
 '110.216.26.21',
 '111.50.194.12',
 '111.90.101.124',
 '112.22.125.5',
 '113.130.58.89',
 '113.167.167.227',
 '113.57.45.14',
 '113.95.71.249',
 '114.112.17.171',
 '114.51.220.74',
 '115.249.158.49',
 '116.175.103.164',
 '117.13.21.216',
 '117.154.162.49',
 '117.17.58.51',
 '117.18.196.152',
 '117.251.102.173',
 '117.83.47.182',
 '119.157.109.49',
 '120.173.25.183',
 '120.41.93.227',
 '121.116.175.191',
 '121.152.40.24',
 '121.154.85.12',
 '121.192.245.188',
 '121.211.142.234',
 '122.143.7.2',
 '123.12.187.71',
 '123.30.187.2

In [16]:
# Destination IPs that were NOT detected but do appear in the collection stored
# in ./data/outer_ip_set.pkl. The bare expression on the last line is displayed
# in full as the cell output.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
outer_ip_set = pd.read_pickle('./data/outer_ip_set.pkl')
not_detected & outer_ip_set

{'0.85.204.112',
 '0.90.71.87',
 '1.128.101.201',
 '1.192.185.61',
 '1.70.247.22',
 '1.71.143.132',
 '100.131.35.240',
 '100.144.85.150',
 '102.215.171.4',
 '102.32.110.112',
 '104.147.116.140',
 '104.17.46.230',
 '104.234.9.112',
 '104.236.9.242',
 '104.30.172.154',
 '104.61.216.146',
 '106.207.75.223',
 '106.240.215.201',
 '106.52.121.70',
 '106.53.79.243',
 '106.57.7.188',
 '106.6.32.44',
 '107.12.22.26',
 '107.27.109.55',
 '108.118.205.50',
 '108.119.194.143',
 '108.72.145.34',
 '109.198.246.133',
 '109.225.246.4',
 '109.231.119.56',
 '11.127.245.99',
 '111.164.125.217',
 '112.18.202.123',
 '112.19.131.44',
 '112.232.235.79',
 '112.24.43.107',
 '112.242.90.229',
 '112.25.138.121',
 '113.165.88.185',
 '113.202.52.41',
 '113.208.47.92',
 '113.81.142.32',
 '113.87.229.171',
 '113.90.50.118',
 '113.92.180.104',
 '113.95.119.41',
 '114.108.81.15',
 '114.27.200.191',
 '114.32.92.214',
 '114.39.207.75',
 '114.45.7.144',
 '114.51.40.99',
 '116.107.2.7',
 '116.147.240.244',
 '116.22.210.158