# Inicio

In [None]:
!pip install category_encoders
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.utils import resample
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [None]:
# Mount Google Drive so the files under /content/drive/MyDrive used by later cells are reachable.
# NOTE(review): this import only exists on Colab, presumably why it is not in the imports cell — confirm.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Raw test columns to load; attacker_ip_enum is kept so each prediction can be tied to its IP later.
columnas = ["attack_time", "watcher_country", "watcher_as_name", "attacker_country", "attacker_as_name", "attack_type", "attacker_ip_enum"]

# Read the test set, keep a single row per attacker IP and order the rows by IP.
X_test = (
    pd.read_parquet("/content/drive/MyDrive/Colab Notebooks/test.parq", columns = columnas)
    .drop_duplicates(subset = ["attacker_ip_enum"])
    .sort_values(by = "attacker_ip_enum")
)
X_test

Unnamed: 0,attack_time,watcher_country,watcher_as_name,attacker_country,attacker_as_name,attack_type,attacker_ip_enum
21502,2023-08-05 01:06:05+00:00,US,GOOGLE,KH,ANGKOR DATA COMMUNICATION,http:bruteforce,5
191,2023-07-30 07:48:36+00:00,LU,PONYNET,CA,OVH SAS,ssh:bruteforce,7
665517,2023-08-24 22:34:25+00:00,DE,ENTEGA Medianet GmbH,DE,Vodafone GmbH,http:scan,21
72758,2023-07-31 15:06:25+00:00,US,A2HOSTING,IN,Threesa Infoway Pvt.Ltd.,http:spam,29
37390,2023-07-31 12:51:27+00:00,GB,DIGITALOCEAN-ASN,CZ,"Seznam.cz, a.s.",http:spam,33
...,...,...,...,...,...,...,...
17869691,2023-08-02 19:20:11+00:00,US,CONTABO,BR,GOOGLE,unknown:unknown,199947
17977618,2023-08-29 01:54:37+00:00,DE,Hetzner Online GmbH,ZA,Afrihost,ssh:bruteforce,199949
17862033,2023-08-02 01:09:27+00:00,US,GOOGLE-CLOUD-PLATFORM,GB,Constantine Cybersecurity Ltd.,http:spam,199962
17857346,2023-08-03 00:41:02+00:00,HK,AMAZON-02,BH,Kalaam Telecom Bahrain B.S.C.,http:spam,199964


In [None]:
# Keep the IP identifiers aside: the column is dropped from X_test before modelling
# and is needed again to build the predictions table.
X_test_ip = X_test.loc[:, ["attacker_ip_enum"]]
X_test_ip

Unnamed: 0,attacker_ip_enum
21502,5
191,7
665517,21
72758,29
37390,33
...,...
17869691,199947
17977618,199949
17862033,199962
17857346,199964


Para test y train, creo varias features: hora, rango de horas, día de la semana, divido tipo de ataque en protocolo y ataque, y país del atacante junto a la hora del ataque. Para las dos columnas de países, había diez que aparecían más veces, así que los que sobraban los agrupé en Otros países. Para la imputación de nulos de las dos columnas de nombres, tomé en cuenta al nombre que aparecía más veces en el país respectivo y le asigné ese. También cambié los tipos de todas las columnas para ahorrar memoria.

In [None]:
# Time-derived features from the attack timestamp.
X_test["hour"] = X_test["attack_time"].dt.hour.astype("category")
# Bins are right-closed: hours 0-6, 7-12, 13-18 and 19-23.
X_test["hour_range"] = pd.cut(X_test["hour"], bins = [-1, 6, 12, 18, 23], labels = ["(0-6)","(6-12)", "(12-18)", "(18-0)"]).astype("category")
X_test["day"] = X_test["attack_time"].dt.day_name().astype("category")
# attack_type has the form "<protocol>:<attack>" (e.g. "http:spam").
X_test[["protocol", "attack"]] = X_test["attack_type"].str.split(":", expand = True).astype("category")

# Keep the ten listed countries per column and bucket every other value into "Otros países".
# Both lists are reused by the train feature-engineering cell.
paises_attacker = ["US", "CA", "RU", "SG", "DE", "AU", "CN", "IN", "FR", "NL"]
paises_watcher = ["US", "DE", "FR", "GB", "NL", "PL", "CA", "SG", "RU", "BE"]
X_test["attacker_country"] = np.where(X_test["attacker_country"].isin(paises_attacker), X_test["attacker_country"], "Otros países")
X_test["attacker_country"] = X_test["attacker_country"].astype("category")
X_test["watcher_country"] = np.where(X_test["watcher_country"].isin(paises_watcher), X_test["watcher_country"], "Otros países")
X_test["watcher_country"] = X_test["watcher_country"].astype("category")

# Fill missing AS names with the most frequent name inside the same country bucket.
# A bucket whose names are all missing has no mode, so it is left untouched instead of raising IndexError.
X_test["attacker_as_name"] = X_test.groupby("attacker_country")["attacker_as_name"].transform(
    lambda x: x.fillna(x.mode().iloc[0]) if x.notna().any() else x
)
X_test["watcher_as_name"] = X_test.groupby("watcher_country")["watcher_as_name"].transform(
    lambda x: x.fillna(x.mode().iloc[0]) if x.notna().any() else x
)

# Interaction feature "<attacker country>/<hour>". The deduplicated test set is small,
# so the strings are built in one vectorized step rather than in batches.
X_test["attacker_country_hour"] = (
    X_test["attacker_country"].astype(str) + "/" + X_test["hour"].astype(str)
).astype("category")

# Drop the raw columns already turned into features, plus the IP identifier (kept separately in X_test_ip).
X_test = X_test.drop(["attack_type", "attack_time", "attacker_ip_enum"], axis = 1)
X_test

Unnamed: 0,watcher_country,watcher_as_name,attacker_country,attacker_as_name,hour,hour_range,day,protocol,attack,attacker_country_hour
21502,US,GOOGLE,Otros países,ANGKOR DATA COMMUNICATION,1,(0-6),Saturday,http,bruteforce,Otros países/1
191,Otros países,PONYNET,CA,OVH SAS,7,(6-12),Sunday,ssh,bruteforce,CA/7
665517,DE,ENTEGA Medianet GmbH,DE,Vodafone GmbH,22,(18-0),Thursday,http,scan,DE/22
72758,US,A2HOSTING,IN,Threesa Infoway Pvt.Ltd.,15,(12-18),Monday,http,spam,IN/15
37390,GB,DIGITALOCEAN-ASN,Otros países,"Seznam.cz, a.s.",12,(6-12),Monday,http,spam,Otros países/12
...,...,...,...,...,...,...,...,...,...,...
17869691,US,CONTABO,Otros países,GOOGLE,19,(18-0),Wednesday,unknown,unknown,Otros países/19
17977618,DE,Hetzner Online GmbH,Otros países,Afrihost,1,(0-6),Tuesday,ssh,bruteforce,Otros países/1
17862033,US,GOOGLE-CLOUD-PLATFORM,Otros países,Constantine Cybersecurity Ltd.,1,(0-6),Wednesday,http,spam,Otros países/1
17857346,Otros países,AMAZON-02,Otros países,Kalaam Telecom Bahrain B.S.C.,0,(0-6),Thursday,http,spam,Otros países/0


In [None]:
# Same raw columns as the test set plus the target column "label".
columnas = ["attack_time", "watcher_country", "watcher_as_name", "attacker_country", "attacker_as_name", "attack_type", "attacker_ip_enum", "label"]
train = pd.read_parquet("/content/drive/MyDrive/Colab Notebooks/train.parq", columns = columnas)
train

Unnamed: 0,attack_time,watcher_country,watcher_as_name,attacker_country,attacker_as_name,attack_type,attacker_ip_enum,label
0,2023-07-31 07:17:51+00:00,DE,Host Europe GmbH,TR,Murat Aktas,http:exploit,6466,0
1,2023-07-31 07:17:51+00:00,DE,Host Europe GmbH,TR,Murat Aktas,http:spam,6466,0
2,2023-07-31 07:17:49+00:00,DE,bn:t Blatzheim Networks Telecom GmbH,DE,Contabo GmbH,http:bruteforce,4637,0
3,2023-07-31 07:17:49+00:00,DE,bn:t Blatzheim Networks Telecom GmbH,DE,Contabo GmbH,http:spam,4637,0
4,2023-07-31 07:17:49+00:00,DE,bn:t Blatzheim Networks Telecom GmbH,DE,Contabo GmbH,http:exploit,4637,0
...,...,...,...,...,...,...,...,...
61629680,2023-08-24 21:14:52+00:00,DE,ENTEGA Medianet GmbH,DE,Vodafone GmbH,http:scan,190784,0
61629681,2023-08-24 21:14:46+00:00,US,NAMECHEAP-NET,US,WOW,http:scan,193446,0
61629682,2023-08-24 21:14:46+00:00,US,NAMECHEAP-NET,US,WOW,http:spam,193446,0
61629683,2023-08-24 21:14:46+00:00,US,NAMECHEAP-NET,US,WOW,http:spam,193446,0


Como voy a splitear el train para conseguir el set de validación, dropeo los duplicados para que no queden repartidos entre sets distintos y no termine validando con datos ya vistos en el entrenamiento.

In [None]:
# Remove fully duplicated rows and take an explicit copy before new columns are added to the frame.
train = train.drop_duplicates().copy()
train

Unnamed: 0,attack_time,watcher_country,watcher_as_name,attacker_country,attacker_as_name,attack_type,attacker_ip_enum,label
0,2023-07-31 07:17:51+00:00,DE,Host Europe GmbH,TR,Murat Aktas,http:exploit,6466,0
1,2023-07-31 07:17:51+00:00,DE,Host Europe GmbH,TR,Murat Aktas,http:spam,6466,0
2,2023-07-31 07:17:49+00:00,DE,bn:t Blatzheim Networks Telecom GmbH,DE,Contabo GmbH,http:bruteforce,4637,0
3,2023-07-31 07:17:49+00:00,DE,bn:t Blatzheim Networks Telecom GmbH,DE,Contabo GmbH,http:spam,4637,0
4,2023-07-31 07:17:49+00:00,DE,bn:t Blatzheim Networks Telecom GmbH,DE,Contabo GmbH,http:exploit,4637,0
...,...,...,...,...,...,...,...,...
61629671,2023-08-24 21:14:59+00:00,US,INMOTION,US,ATT-INTERNET4,http:bruteforce,191439,0
61629673,2023-08-24 21:14:59+00:00,US,INMOTION,US,ATT-INTERNET4,http:scan,191439,0
61629674,2023-08-24 21:14:59+00:00,US,INMOTION,US,ATT-INTERNET4,http:exploit,191439,0
61629681,2023-08-24 21:14:46+00:00,US,NAMECHEAP-NET,US,WOW,http:scan,193446,0


# Creación de Features e Imputación de Nulos

In [None]:
# Time-derived features from the attack timestamp.
train["hour"] = train["attack_time"].dt.hour.astype("category")
# Bins are right-closed: hours 0-6, 7-12, 13-18 and 19-23.
train["hour_range"] = pd.cut(train["hour"], bins = [-1, 6, 12, 18, 23], labels = ["(0-6)","(6-12)", "(12-18)", "(18-0)"]).astype("category")
train["day"] = train["attack_time"].dt.day_name().astype("category")
# attack_type has the form "<protocol>:<attack>" (e.g. "http:spam").
train[["protocol", "attack"]] = train["attack_type"].str.split(":", expand = True).astype("category")

# Keep the ten listed countries per column (lists defined in the test cell) and bucket the rest into "Otros países".
train["attacker_country"] = np.where(train["attacker_country"].isin(paises_attacker), train["attacker_country"], "Otros países")
train["attacker_country"] = train["attacker_country"].astype("category")
train["watcher_country"] = np.where(train["watcher_country"].isin(paises_watcher), train["watcher_country"], "Otros países")
train["watcher_country"] = train["watcher_country"].astype("category")

# Fill missing AS names with the most frequent name inside the same country bucket.
# A bucket whose names are all missing has no mode, so it is left untouched instead of raising IndexError.
train["attacker_as_name"] = train.groupby("attacker_country")["attacker_as_name"].transform(
    lambda x: x.fillna(x.mode().iloc[0]) if x.notna().any() else x
)
train["watcher_as_name"] = train.groupby("watcher_country")["watcher_as_name"].transform(
    lambda x: x.fillna(x.mode().iloc[0]) if x.notna().any() else x
)

# Interaction feature "<attacker country>/<hour>", built in 100 row chunks so the intermediate
# string columns stay small. Chunks are positional iloc slices (np.array_split on a DataFrame
# relies on the deprecated DataFrame.swapaxes) and assign() avoids writing into a slice of train.
cortes = np.linspace(0, len(train), num = 101, dtype = int)
batches = [train.iloc[inicio:fin] for inicio, fin in zip(cortes[:-1], cortes[1:])]
batches_juntos = []
for batch in batches:
    batch = batch.assign(attacker_country_hour = batch["attacker_country"].astype(str) + "/" + batch["hour"].astype(str))
    batches_juntos.append(batch)
train = pd.concat(batches_juntos)
train["attacker_country_hour"] = train["attacker_country_hour"].astype("category")

# Drop the raw columns already turned into features; attacker_ip_enum stays for the IP-based split.
train = train.drop(["attack_type", "attack_time"], axis = 1)
train

Unnamed: 0,watcher_country,watcher_as_name,attacker_country,attacker_as_name,attacker_ip_enum,label,hour,hour_range,day,protocol,attack,attacker_country_hour
0,DE,Host Europe GmbH,Otros países,Murat Aktas,6466,0,7,(6-12),Monday,http,exploit,Otros países/7
1,DE,Host Europe GmbH,Otros países,Murat Aktas,6466,0,7,(6-12),Monday,http,spam,Otros países/7
2,DE,bn:t Blatzheim Networks Telecom GmbH,DE,Contabo GmbH,4637,0,7,(6-12),Monday,http,bruteforce,DE/7
3,DE,bn:t Blatzheim Networks Telecom GmbH,DE,Contabo GmbH,4637,0,7,(6-12),Monday,http,spam,DE/7
4,DE,bn:t Blatzheim Networks Telecom GmbH,DE,Contabo GmbH,4637,0,7,(6-12),Monday,http,exploit,DE/7
...,...,...,...,...,...,...,...,...,...,...,...,...
61629671,US,INMOTION,US,ATT-INTERNET4,191439,0,21,(18-0),Thursday,http,bruteforce,US/21
61629673,US,INMOTION,US,ATT-INTERNET4,191439,0,21,(18-0),Thursday,http,scan,US/21
61629674,US,INMOTION,US,ATT-INTERNET4,191439,0,21,(18-0),Thursday,http,exploit,US/21
61629681,US,NAMECHEAP-NET,US,WOW,193446,0,21,(18-0),Thursday,http,scan,US/21


In [None]:
# Check column dtypes and memory usage after feature engineering.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40238870 entries, 0 to 61629682
Data columns (total 12 columns):
 #   Column                 Dtype   
---  ------                 -----   
 0   watcher_country        category
 1   watcher_as_name        category
 2   attacker_country       category
 3   attacker_as_name       category
 4   attacker_ip_enum       int32   
 5   label                  int8    
 6   hour                   category
 7   hour_range             category
 8   day                    category
 9   protocol               category
 10  attack                 category
 11  attacker_country_hour  category
dtypes: category(10), int32(1), int8(1)
memory usage: 998.2 MB


# División de Train y Validation

¿Cómo conviene elegir los datos de validación respecto de los de train? Conviene que los IPs que estén en train no estén en el de validación, porque, si no, el modelo solo estaría memorizando lo aprendido.

In [None]:
# Split at the attacker-IP level: 80% of the unique IPs go to train and 20% to validation,
# so no IP contributes rows to both sets.
train_set, validation_set = train_test_split(train["attacker_ip_enum"].unique(), test_size = 0.2, random_state = 18)

# Compute the validation mask once and reuse it for both the features and the target.
en_validacion = train["attacker_ip_enum"].isin(validation_set)

X_train = train[train["attacker_ip_enum"].isin(train_set)].drop("attacker_ip_enum", axis = 1)
X_validation = train[en_validacion].drop(["attacker_ip_enum", "label"], axis = 1)
y_validation = train.loc[en_validacion, "label"]

# Undersample the negative class. X_train still carries "label" at this point.
label_zero = X_train.loc[X_train["label"] == 0]
label_one = X_train.loc[X_train["label"] == 1]

# Number of negatives kept on top of one-per-positive.
NEGATIVOS_EXTRA = 2900000
# Sampling without replacement cannot return more rows than exist, so cap the request
# at the number of available negatives instead of letting resample raise ValueError.
n_negativos = min(len(label_zero), len(label_one) + NEGATIVOS_EXTRA)
label_zero = resample(label_zero, replace = False, n_samples = n_negativos, random_state = 18)

# Rebuild the training set (sampled negatives followed by all positives) and separate the target.
X_train = pd.concat([label_zero, label_one])
y_train = X_train["label"]
X_train = X_train.drop("label", axis = 1)

In [None]:
# Release the full training frame and the batching leftovers; only the split sets are used from here on.
del train, batch, batches, batches_juntos

In [None]:
# Display the undersampled training features.
X_train

Unnamed: 0,watcher_country,watcher_as_name,attacker_country,attacker_as_name,hour,hour_range,day,protocol,attack,attacker_country_hour
46466580,US,HVC-AS,CN,CT-HangZhou-IDC,12,(6-12),Monday,http,scan,CN/12
35466168,DE,ENTEGA Medianet GmbH,DE,1&1 Versatel Deutschland GmbH,4,(0-6),Saturday,http,scan,DE/4
32971698,Otros países,Datacamp Limited,CN,Chinanet,12,(6-12),Monday,ssh,bruteforce,CN/12
14102806,US,GOOGLE-CLOUD-PLATFORM,Otros países,StarNet Solutii SRL,14,(12-18),Friday,http,exploit,Otros países/14
38449210,DE,ENTEGA Medianet GmbH,DE,Deutsche Telekom AG,11,(6-12),Friday,http,scan,DE/11
...,...,...,...,...,...,...,...,...,...,...
61629208,FR,Cyllene Lille Sasu,CA,AS-COLOCROSSING,9,(6-12),Wednesday,unknown,unknown,CA/9
61629247,DE,Strato AG,SG,DIGITALOCEAN-ASN,21,(18-0),Thursday,http,spam,SG/21
61629250,DE,Strato AG,SG,DIGITALOCEAN-ASN,21,(18-0),Thursday,http,scan,SG/21
61629389,Otros países,One.com A/S,SG,DIGITALOCEAN-ASN,21,(18-0),Thursday,http,scan,SG/21


In [None]:
# Display the validation features (rows of the held-out IPs).
X_validation

Unnamed: 0,watcher_country,watcher_as_name,attacker_country,attacker_as_name,hour,hour_range,day,protocol,attack,attacker_country_hour
57,Otros países,Buuldy Bilisim,Otros países,VIETNAM POSTS AND TELECOMMUNICATIONS GROUP,7,(6-12),Monday,tcp,scan,Otros países/7
66,US,NETWORK-SOLUTIONS-HOSTING,Otros países,Telecomunicacoes de Mocambique (TDM),7,(6-12),Monday,http,scan,Otros países/7
67,US,NETWORK-SOLUTIONS-HOSTING,Otros países,Telecomunicacoes de Mocambique (TDM),7,(6-12),Monday,http,spam,Otros países/7
68,US,NETWORK-SOLUTIONS-HOSTING,Otros países,Telecomunicacoes de Mocambique (TDM),7,(6-12),Monday,http,bruteforce,Otros países/7
82,US,NETWORK-SOLUTIONS-HOSTING,Otros países,Telecomunicacoes de Mocambique (TDM),7,(6-12),Monday,http,exploit,Otros países/7
...,...,...,...,...,...,...,...,...,...,...
61629630,US,DIGITALOCEAN-ASN,Otros países,Tunisie-Telecom,21,(18-0),Thursday,ssh,bruteforce,Otros países/21
61629654,Otros países,One Albania Sh.a.,Otros países,M247 Europe SRL,21,(18-0),Thursday,tcp,scan,Otros países/21
61629657,FR,OVH SAS,DE,Host Europe GmbH,21,(18-0),Thursday,http,spam,DE/21
61629658,FR,OVH SAS,DE,Host Europe GmbH,21,(18-0),Thursday,http,exploit,DE/21


Para todos los encodings decidí cambiarle el tipo a cada columna para ahorrar memoria, ya que algunas quedaban con int64 o float64, dependiendo de la biblioteca.

# Watcher Country

Uso Binary Encoding para watcher_country y attacker_country porque no son pocos países para usar One Hot Encoding, pero tampoco muchos para Mean Encoding.

In [None]:
# Binary-encode watcher_country: the encoder is fitted on the training rows and reused for validation and test.
encoder = ce.BinaryEncoder(cols = ["watcher_country"])
X_train = encoder.fit_transform(X_train)
X_validation, X_test = encoder.transform(X_validation), encoder.transform(X_test)

# Downcast the four generated bit columns to int8 to save memory.
columnas = [f"watcher_country_{bit}" for bit in range(4)]
for frame in (X_train, X_validation, X_test):
    frame[columnas] = frame[columnas].astype("int8")

X_train

Unnamed: 0,watcher_country_0,watcher_country_1,watcher_country_2,watcher_country_3,watcher_as_name,attacker_country,attacker_as_name,hour,hour_range,day,protocol,attack,attacker_country_hour
46466580,0,0,0,1,HVC-AS,CN,CT-HangZhou-IDC,12,(6-12),Monday,http,scan,CN/12
35466168,0,0,1,0,ENTEGA Medianet GmbH,DE,1&1 Versatel Deutschland GmbH,4,(0-6),Saturday,http,scan,DE/4
32971698,0,0,1,1,Datacamp Limited,CN,Chinanet,12,(6-12),Monday,ssh,bruteforce,CN/12
14102806,0,0,0,1,GOOGLE-CLOUD-PLATFORM,Otros países,StarNet Solutii SRL,14,(12-18),Friday,http,exploit,Otros países/14
38449210,0,0,1,0,ENTEGA Medianet GmbH,DE,Deutsche Telekom AG,11,(6-12),Friday,http,scan,DE/11
...,...,...,...,...,...,...,...,...,...,...,...,...,...
61629208,0,1,0,0,Cyllene Lille Sasu,CA,AS-COLOCROSSING,9,(6-12),Wednesday,unknown,unknown,CA/9
61629247,0,0,1,0,Strato AG,SG,DIGITALOCEAN-ASN,21,(18-0),Thursday,http,spam,SG/21
61629250,0,0,1,0,Strato AG,SG,DIGITALOCEAN-ASN,21,(18-0),Thursday,http,scan,SG/21
61629389,0,0,1,1,One.com A/S,SG,DIGITALOCEAN-ASN,21,(18-0),Thursday,http,scan,SG/21


# Watcher AS Name

Uso Mean Encoding para watcher_as_name y attacker_as_name porque hay muchos nombres diferentes en cada columna.

In [None]:
# Target-encode watcher_as_name: fitted against y_train, then applied unchanged to validation and test.
encoder = ce.TargetEncoder(cols = ["watcher_as_name"])
X_train = encoder.fit_transform(X_train, y_train)
X_validation, X_test = encoder.transform(X_validation), encoder.transform(X_test)

# Downcast the encoded column to float32 to save memory.
for frame in (X_train, X_validation, X_test):
    frame["watcher_as_name"] = frame["watcher_as_name"].astype("float32")
X_train

Unnamed: 0,watcher_country_0,watcher_country_1,watcher_country_2,watcher_country_3,watcher_as_name,attacker_country,attacker_as_name,hour,hour_range,day,protocol,attack,attacker_country_hour
46466580,0,0,0,1,0.147452,CN,CT-HangZhou-IDC,12,(6-12),Monday,http,scan,CN/12
35466168,0,0,1,0,0.029596,DE,1&1 Versatel Deutschland GmbH,4,(0-6),Saturday,http,scan,DE/4
32971698,0,0,1,1,0.141247,CN,Chinanet,12,(6-12),Monday,ssh,bruteforce,CN/12
14102806,0,0,0,1,0.177792,Otros países,StarNet Solutii SRL,14,(12-18),Friday,http,exploit,Otros países/14
38449210,0,0,1,0,0.029596,DE,Deutsche Telekom AG,11,(6-12),Friday,http,scan,DE/11
...,...,...,...,...,...,...,...,...,...,...,...,...,...
61629208,0,1,0,0,0.433012,CA,AS-COLOCROSSING,9,(6-12),Wednesday,unknown,unknown,CA/9
61629247,0,0,1,0,0.192596,SG,DIGITALOCEAN-ASN,21,(18-0),Thursday,http,spam,SG/21
61629250,0,0,1,0,0.192596,SG,DIGITALOCEAN-ASN,21,(18-0),Thursday,http,scan,SG/21
61629389,0,0,1,1,0.128138,SG,DIGITALOCEAN-ASN,21,(18-0),Thursday,http,scan,SG/21


# Attacker Country

In [None]:
# Binary-encode attacker_country: the encoder is fitted on the training rows and reused for validation and test.
encoder = ce.BinaryEncoder(cols = ["attacker_country"])
X_train = encoder.fit_transform(X_train)
X_validation, X_test = encoder.transform(X_validation), encoder.transform(X_test)

# Downcast the four generated bit columns to int8 to save memory.
columnas = [f"attacker_country_{bit}" for bit in range(4)]
for frame in (X_train, X_validation, X_test):
    frame[columnas] = frame[columnas].astype("int8")

X_train

Unnamed: 0,watcher_country_0,watcher_country_1,watcher_country_2,watcher_country_3,watcher_as_name,attacker_country_0,attacker_country_1,attacker_country_2,attacker_country_3,attacker_as_name,hour,hour_range,day,protocol,attack,attacker_country_hour
46466580,0,0,0,1,0.147452,0,0,0,1,CT-HangZhou-IDC,12,(6-12),Monday,http,scan,CN/12
35466168,0,0,1,0,0.029596,0,0,1,0,1&1 Versatel Deutschland GmbH,4,(0-6),Saturday,http,scan,DE/4
32971698,0,0,1,1,0.141247,0,0,0,1,Chinanet,12,(6-12),Monday,ssh,bruteforce,CN/12
14102806,0,0,0,1,0.177792,0,0,1,1,StarNet Solutii SRL,14,(12-18),Friday,http,exploit,Otros países/14
38449210,0,0,1,0,0.029596,0,0,1,0,Deutsche Telekom AG,11,(6-12),Friday,http,scan,DE/11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61629208,0,1,0,0,0.433012,0,1,0,0,AS-COLOCROSSING,9,(6-12),Wednesday,unknown,unknown,CA/9
61629247,0,0,1,0,0.192596,1,0,1,0,DIGITALOCEAN-ASN,21,(18-0),Thursday,http,spam,SG/21
61629250,0,0,1,0,0.192596,1,0,1,0,DIGITALOCEAN-ASN,21,(18-0),Thursday,http,scan,SG/21
61629389,0,0,1,1,0.128138,1,0,1,0,DIGITALOCEAN-ASN,21,(18-0),Thursday,http,scan,SG/21


# Attacker AS Name

In [None]:
# Target-encode attacker_as_name: fitted against y_train, then applied unchanged to validation and test.
encoder = ce.TargetEncoder(cols = ["attacker_as_name"])
X_train = encoder.fit_transform(X_train, y_train)
X_validation, X_test = encoder.transform(X_validation), encoder.transform(X_test)

# Downcast the encoded column to float32 to save memory.
for frame in (X_train, X_validation, X_test):
    frame["attacker_as_name"] = frame["attacker_as_name"].astype("float32")
X_train

Unnamed: 0,watcher_country_0,watcher_country_1,watcher_country_2,watcher_country_3,watcher_as_name,attacker_country_0,attacker_country_1,attacker_country_2,attacker_country_3,attacker_as_name,hour,hour_range,day,protocol,attack,attacker_country_hour
46466580,0,0,0,1,0.147452,0,0,0,1,0.000000e+00,12,(6-12),Monday,http,scan,CN/12
35466168,0,0,1,0,0.029596,0,0,1,0,0.000000e+00,4,(0-6),Saturday,http,scan,DE/4
32971698,0,0,1,1,0.141247,0,0,0,1,6.621183e-02,12,(6-12),Monday,ssh,bruteforce,CN/12
14102806,0,0,0,1,0.177792,0,0,1,1,3.085171e-10,14,(12-18),Friday,http,exploit,Otros países/14
38449210,0,0,1,0,0.029596,0,0,1,0,0.000000e+00,11,(6-12),Friday,http,scan,DE/11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61629208,0,1,0,0,0.433012,0,1,0,0,6.070111e-01,9,(6-12),Wednesday,unknown,unknown,CA/9
61629247,0,0,1,0,0.192596,1,0,1,0,2.770404e-01,21,(18-0),Thursday,http,spam,SG/21
61629250,0,0,1,0,0.192596,1,0,1,0,2.770404e-01,21,(18-0),Thursday,http,scan,SG/21
61629389,0,0,1,1,0.128138,1,0,1,0,2.770404e-01,21,(18-0),Thursday,http,scan,SG/21


# Hour Range

Uso One Hot Encoding para hour_range, ya que solo hay cuatro valores en la columna.

In [None]:
# One-hot encode hour_range; use_cat_names makes the new columns carry the category label.
encoder = ce.OneHotEncoder(cols = ["hour_range"], use_cat_names = True)
X_train = encoder.fit_transform(X_train)
X_validation, X_test = encoder.transform(X_validation), encoder.transform(X_test)

# Downcast the four indicator columns to int8 to save memory.
columnas = ["hour_range_" + etiqueta for etiqueta in ("(0-6)", "(6-12)", "(12-18)", "(18-0)")]
for frame in (X_train, X_validation, X_test):
    frame[columnas] = frame[columnas].astype("int8")

X_train

Unnamed: 0,watcher_country_0,watcher_country_1,watcher_country_2,watcher_country_3,watcher_as_name,attacker_country_0,attacker_country_1,attacker_country_2,attacker_country_3,attacker_as_name,hour,hour_range_(0-6),hour_range_(6-12),hour_range_(12-18),hour_range_(18-0),day,protocol,attack,attacker_country_hour
46466580,0,0,0,1,0.147452,0,0,0,1,0.000000e+00,12,0,1,0,0,Monday,http,scan,CN/12
35466168,0,0,1,0,0.029596,0,0,1,0,0.000000e+00,4,1,0,0,0,Saturday,http,scan,DE/4
32971698,0,0,1,1,0.141247,0,0,0,1,6.621183e-02,12,0,1,0,0,Monday,ssh,bruteforce,CN/12
14102806,0,0,0,1,0.177792,0,0,1,1,3.085171e-10,14,0,0,1,0,Friday,http,exploit,Otros países/14
38449210,0,0,1,0,0.029596,0,0,1,0,0.000000e+00,11,0,1,0,0,Friday,http,scan,DE/11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61629208,0,1,0,0,0.433012,0,1,0,0,6.070111e-01,9,0,1,0,0,Wednesday,unknown,unknown,CA/9
61629247,0,0,1,0,0.192596,1,0,1,0,2.770404e-01,21,0,0,0,1,Thursday,http,spam,SG/21
61629250,0,0,1,0,0.192596,1,0,1,0,2.770404e-01,21,0,0,0,1,Thursday,http,scan,SG/21
61629389,0,0,1,1,0.128138,1,0,1,0,2.770404e-01,21,0,0,0,1,Thursday,http,scan,SG/21


# Hour

Para la columna hour decidí normalizar la columna entre 0 y 1 porque hay 24 valores numéricos diferentes con mucha varianza entre sí.

In [None]:
# Rescale "hour" with a MinMaxScaler fitted on the training rows only;
# the same fitted scaler is then applied to validation and test.
scaler = MinMaxScaler()

# The scaler expects a 2-D input, hence the reshape of the single column to (n_rows, 1).
X_train["hour"] = scaler.fit_transform(X_train["hour"].values.reshape(-1, 1))
X_validation["hour"] = scaler.transform(X_validation["hour"].values.reshape(-1, 1))
X_test["hour"] = scaler.transform(X_test["hour"].values.reshape(-1, 1))

# Downcast the scaled column to float32 to save memory.
X_train["hour"] = X_train["hour"].astype("float32")
X_validation["hour"] = X_validation["hour"].astype("float32")
X_test["hour"] = X_test["hour"].astype("float32")
X_train

Unnamed: 0,watcher_country_0,watcher_country_1,watcher_country_2,watcher_country_3,watcher_as_name,attacker_country_0,attacker_country_1,attacker_country_2,attacker_country_3,attacker_as_name,hour,hour_range_(0-6),hour_range_(6-12),hour_range_(12-18),hour_range_(18-0),day,protocol,attack,attacker_country_hour
46466580,0,0,0,1,0.147452,0,0,0,1,0.000000e+00,0.521739,0,1,0,0,Monday,http,scan,CN/12
35466168,0,0,1,0,0.029596,0,0,1,0,0.000000e+00,0.173913,1,0,0,0,Saturday,http,scan,DE/4
32971698,0,0,1,1,0.141247,0,0,0,1,6.621183e-02,0.521739,0,1,0,0,Monday,ssh,bruteforce,CN/12
14102806,0,0,0,1,0.177792,0,0,1,1,3.085171e-10,0.608696,0,0,1,0,Friday,http,exploit,Otros países/14
38449210,0,0,1,0,0.029596,0,0,1,0,0.000000e+00,0.478261,0,1,0,0,Friday,http,scan,DE/11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61629208,0,1,0,0,0.433012,0,1,0,0,6.070111e-01,0.391304,0,1,0,0,Wednesday,unknown,unknown,CA/9
61629247,0,0,1,0,0.192596,1,0,1,0,2.770404e-01,0.913043,0,0,0,1,Thursday,http,spam,SG/21
61629250,0,0,1,0,0.192596,1,0,1,0,2.770404e-01,0.913043,0,0,0,1,Thursday,http,scan,SG/21
61629389,0,0,1,1,0.128138,1,0,1,0,2.770404e-01,0.913043,0,0,0,1,Thursday,http,scan,SG/21


# Day

Uso Binary Encoder para la columna day porque son los siete días de la semana.

In [None]:
# Binary-encode the day name: the encoder is fitted on the training rows and reused for validation and test.
encoder = ce.BinaryEncoder(cols = ["day"])
X_train = encoder.fit_transform(X_train)
X_validation, X_test = encoder.transform(X_validation), encoder.transform(X_test)

# Downcast the three generated bit columns to int8 to save memory.
columnas = [f"day_{bit}" for bit in range(3)]
for frame in (X_train, X_validation, X_test):
    frame[columnas] = frame[columnas].astype("int8")

X_train

Unnamed: 0,watcher_country_0,watcher_country_1,watcher_country_2,watcher_country_3,watcher_as_name,attacker_country_0,attacker_country_1,attacker_country_2,attacker_country_3,attacker_as_name,...,hour_range_(0-6),hour_range_(6-12),hour_range_(12-18),hour_range_(18-0),day_0,day_1,day_2,protocol,attack,attacker_country_hour
46466580,0,0,0,1,0.147452,0,0,0,1,0.000000e+00,...,0,1,0,0,0,0,1,http,scan,CN/12
35466168,0,0,1,0,0.029596,0,0,1,0,0.000000e+00,...,1,0,0,0,0,1,0,http,scan,DE/4
32971698,0,0,1,1,0.141247,0,0,0,1,6.621183e-02,...,0,1,0,0,0,0,1,ssh,bruteforce,CN/12
14102806,0,0,0,1,0.177792,0,0,1,1,3.085171e-10,...,0,0,1,0,0,1,1,http,exploit,Otros países/14
38449210,0,0,1,0,0.029596,0,0,1,0,0.000000e+00,...,0,1,0,0,0,1,1,http,scan,DE/11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61629208,0,1,0,0,0.433012,0,1,0,0,6.070111e-01,...,0,1,0,0,1,0,0,unknown,unknown,CA/9
61629247,0,0,1,0,0.192596,1,0,1,0,2.770404e-01,...,0,0,0,1,1,0,1,http,spam,SG/21
61629250,0,0,1,0,0.192596,1,0,1,0,2.770404e-01,...,0,0,0,1,1,0,1,http,scan,SG/21
61629389,0,0,1,1,0.128138,1,0,1,0,2.770404e-01,...,0,0,0,1,1,0,1,http,scan,SG/21


# Protocol

En protocol y attack uso Mean Encoding porque me da mejor score.

In [None]:
# Target-encode protocol: fitted against y_train, then applied unchanged to validation and test.
encoder = ce.TargetEncoder(cols = ["protocol"])
X_train = encoder.fit_transform(X_train, y_train)
X_validation, X_test = encoder.transform(X_validation), encoder.transform(X_test)

# Downcast the encoded column to float32 to save memory.
for frame in (X_train, X_validation, X_test):
    frame["protocol"] = frame["protocol"].astype("float32")
X_train

Unnamed: 0,watcher_country_0,watcher_country_1,watcher_country_2,watcher_country_3,watcher_as_name,attacker_country_0,attacker_country_1,attacker_country_2,attacker_country_3,attacker_as_name,...,hour_range_(0-6),hour_range_(6-12),hour_range_(12-18),hour_range_(18-0),day_0,day_1,day_2,protocol,attack,attacker_country_hour
46466580,0,0,0,1,0.147452,0,0,0,1,0.000000e+00,...,0,1,0,0,0,0,1,0.137509,scan,CN/12
35466168,0,0,1,0,0.029596,0,0,1,0,0.000000e+00,...,1,0,0,0,0,1,0,0.137509,scan,DE/4
32971698,0,0,1,1,0.141247,0,0,0,1,6.621183e-02,...,0,1,0,0,0,0,1,0.149341,bruteforce,CN/12
14102806,0,0,0,1,0.177792,0,0,1,1,3.085171e-10,...,0,0,1,0,0,1,1,0.137509,exploit,Otros países/14
38449210,0,0,1,0,0.029596,0,0,1,0,0.000000e+00,...,0,1,0,0,0,1,1,0.137509,scan,DE/11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61629208,0,1,0,0,0.433012,0,1,0,0,6.070111e-01,...,0,1,0,0,1,0,0,0.068001,unknown,CA/9
61629247,0,0,1,0,0.192596,1,0,1,0,2.770404e-01,...,0,0,0,1,1,0,1,0.137509,spam,SG/21
61629250,0,0,1,0,0.192596,1,0,1,0,2.770404e-01,...,0,0,0,1,1,0,1,0.137509,scan,SG/21
61629389,0,0,1,1,0.128138,1,0,1,0,2.770404e-01,...,0,0,0,1,1,0,1,0.137509,scan,SG/21


# Attack

In [None]:
# Target-encode attack: fitted against y_train, then applied unchanged to validation and test.
encoder = ce.TargetEncoder(cols = ["attack"])
X_train = encoder.fit_transform(X_train, y_train)
X_validation, X_test = encoder.transform(X_validation), encoder.transform(X_test)

# Downcast the encoded column to float32 to save memory.
for frame in (X_train, X_validation, X_test):
    frame["attack"] = frame["attack"].astype("float32")
X_train

Unnamed: 0,watcher_country_0,watcher_country_1,watcher_country_2,watcher_country_3,watcher_as_name,attacker_country_0,attacker_country_1,attacker_country_2,attacker_country_3,attacker_as_name,...,hour_range_(0-6),hour_range_(6-12),hour_range_(12-18),hour_range_(18-0),day_0,day_1,day_2,protocol,attack,attacker_country_hour
46466580,0,0,0,1,0.147452,0,0,0,1,0.000000e+00,...,0,1,0,0,0,0,1,0.137509,0.134369,CN/12
35466168,0,0,1,0,0.029596,0,0,1,0,0.000000e+00,...,1,0,0,0,0,1,0,0.137509,0.134369,DE/4
32971698,0,0,1,1,0.141247,0,0,0,1,6.621183e-02,...,0,1,0,0,0,0,1,0.149341,0.120577,CN/12
14102806,0,0,0,1,0.177792,0,0,1,1,3.085171e-10,...,0,0,1,0,0,1,1,0.137509,0.139693,Otros países/14
38449210,0,0,1,0,0.029596,0,0,1,0,0.000000e+00,...,0,1,0,0,0,1,1,0.137509,0.134369,DE/11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61629208,0,1,0,0,0.433012,0,1,0,0,6.070111e-01,...,0,1,0,0,1,0,0,0.068001,0.068001,CA/9
61629247,0,0,1,0,0.192596,1,0,1,0,2.770404e-01,...,0,0,0,1,1,0,1,0.137509,0.145446,SG/21
61629250,0,0,1,0,0.192596,1,0,1,0,2.770404e-01,...,0,0,0,1,1,0,1,0.137509,0.134369,SG/21
61629389,0,0,1,1,0.128138,1,0,1,0,2.770404e-01,...,0,0,0,1,1,0,1,0.137509,0.134369,SG/21


# Attacker Country Hour

Uso Mean Encoding para attacker_country_hour porque hay muchos valores diferentes.

In [None]:
# Target-encode attacker_country_hour: fitted against y_train, then applied unchanged to validation and test.
encoder = ce.TargetEncoder(cols = ["attacker_country_hour"])
X_train = encoder.fit_transform(X_train, y_train)
X_validation, X_test = encoder.transform(X_validation), encoder.transform(X_test)

# Downcast the encoded column to float32 to save memory.
for frame in (X_train, X_validation, X_test):
    frame["attacker_country_hour"] = frame["attacker_country_hour"].astype("float32")
X_train

Unnamed: 0,watcher_country_0,watcher_country_1,watcher_country_2,watcher_country_3,watcher_as_name,attacker_country_0,attacker_country_1,attacker_country_2,attacker_country_3,attacker_as_name,...,hour_range_(0-6),hour_range_(6-12),hour_range_(12-18),hour_range_(18-0),day_0,day_1,day_2,protocol,attack,attacker_country_hour
46466580,0,0,0,1,0.147452,0,0,0,1,0.000000e+00,...,0,1,0,0,0,0,1,0.137509,0.134369,0.062926
35466168,0,0,1,0,0.029596,0,0,1,0,0.000000e+00,...,1,0,0,0,0,1,0,0.137509,0.134369,0.144663
32971698,0,0,1,1,0.141247,0,0,0,1,6.621183e-02,...,0,1,0,0,0,0,1,0.149341,0.120577,0.062926
14102806,0,0,0,1,0.177792,0,0,1,1,3.085171e-10,...,0,0,1,0,0,1,1,0.137509,0.139693,0.073777
38449210,0,0,1,0,0.029596,0,0,1,0,0.000000e+00,...,0,1,0,0,0,1,1,0.137509,0.134369,0.121517
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61629208,0,1,0,0,0.433012,0,1,0,0,6.070111e-01,...,0,1,0,0,1,0,0,0.068001,0.068001,0.139834
61629247,0,0,1,0,0.192596,1,0,1,0,2.770404e-01,...,0,0,0,1,1,0,1,0.137509,0.145446,0.177384
61629250,0,0,1,0,0.192596,1,0,1,0,2.770404e-01,...,0,0,0,1,1,0,1,0.137509,0.134369,0.177384
61629389,0,0,1,1,0.128138,1,0,1,0,2.770404e-01,...,0,0,0,1,1,0,1,0.137509,0.134369,0.177384


In [None]:
# Confirm every feature is numeric (int8 / float32) after encoding and check the memory footprint.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3977372 entries, 46466580 to 61629390
Data columns (total 21 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   watcher_country_0      int8   
 1   watcher_country_1      int8   
 2   watcher_country_2      int8   
 3   watcher_country_3      int8   
 4   watcher_as_name        float32
 5   attacker_country_0     int8   
 6   attacker_country_1     int8   
 7   attacker_country_2     int8   
 8   attacker_country_3     int8   
 9   attacker_as_name       float32
 10  hour                   float32
 11  hour_range_(0-6)       int8   
 12  hour_range_(6-12)      int8   
 13  hour_range_(12-18)     int8   
 14  hour_range_(18-0)      int8   
 15  day_0                  int8   
 16  day_1                  int8   
 17  day_2                  int8   
 18  protocol               float32
 19  attack                 float32
 20  attacker_country_hour  float32
dtypes: float32(6), int8(15)
memory usage: 178.3 MB

# Decision Tree

In [None]:
# Base estimator for the hyperparameter search; the fixed seed keeps runs repeatable.
decision_tree = DecisionTreeClassifier(random_state = 18)

# Random Search

Elijo estos hiperparámetros porque son los que consideré para obtener mejor score en el modelo.

In [None]:
# Search space for the decision tree.
parametros = dict(
    max_depth = [1, 5, 10, 15, 20, None],
    min_samples_split = [2, 5, 10, 15, 20],
    min_samples_leaf = [1, 2, 5, 10],
    max_features = ["sqrt", "log2", None],
    criterion = ["gini", "entropy"],
    splitter = ["best", "random"],
    class_weight = [None, "balanced"],
)

# Try 10 random combinations, each scored by F1 with 5-fold cross-validation on the training set.
# NOTE(review): the folds are built from rows (attacker_ip_enum is no longer in X_train),
# so presumably the same IP can land in several folds — confirm this is acceptable.
random_search = RandomizedSearchCV(
    estimator = decision_tree,
    param_distributions = parametros,
    n_iter = 10,
    cv = 5,
    scoring = "f1",
    n_jobs = -1,
    random_state = 18,
)
random_search.fit(X_train, y_train)

# Keep the winning configuration and predict the held-out validation rows with the best estimator.
hiperparametros = random_search.best_params_
decision_tree_optimizado = random_search.best_estimator_
y_prediction = decision_tree_optimizado.predict(X_validation)

hiperparametros

{'splitter': 'random',
 'min_samples_split': 5,
 'min_samples_leaf': 10,
 'max_features': 'log2',
 'max_depth': 5,
 'criterion': 'entropy',
 'class_weight': None}

In [None]:
# F1 on the held-out validation IPs, computed over every row of X_validation.
f1_score(y_validation, y_prediction)

0.6236808281067976

¿Cuál es el mejor score en la competencia? (Guardar el csv con predicciones para entregarlo después)

In [None]:
# Predict the test rows with the tuned tree (this overwrites the validation predictions held in y_prediction).
y_prediction = decision_tree_optimizado.predict(X_test)
y_prediction

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [None]:
# Pair each predicted label with its attacker IP by position.
# NOTE(review): this relies on X_test_ip and the encoded X_test still sharing the same row order — confirm.
X_test_ip = X_test_ip.reset_index(drop = True)
predictions = pd.DataFrame({"label": y_prediction})

predicciones = pd.concat([X_test_ip, predictions], axis = 1)
predicciones

Unnamed: 0,attacker_ip_enum,label
0,5,0
1,7,0
2,21,0
3,29,0
4,33,0
...,...,...
49415,199947,0
49416,199949,0
49417,199962,0
49418,199964,0


El score obtenido es: 0.44054

In [None]:
# Write the predictions table (attacker_ip_enum, label) to Drive without the index column.
predicciones.to_csv("/content/drive/MyDrive/Colab Notebooks/predicciones_decision_tree_no_batches.csv", index = False)

In [None]:
# Show only the IPs predicted as positive (label == 1).
predicciones.loc[predicciones["label"] == 1]

Unnamed: 0,attacker_ip_enum,label
13,99,1
59,299,1
93,386,1
115,458,1
128,520,1
...,...,...
49363,199756,1
49376,199793,1
49395,199861,1
49402,199877,1


https://drive.google.com/file/d/14UJmwVVp2f7FkP9PtGdG3R15bAp4R4Ni/view?usp=drive_link