In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score, precision_score


In [None]:
# Read the checkpoint-2 training split from its parquet file.
train_path = '/content/drive/MyDrive/OrgaDeDatos/train_checkpoint2.parq'
train_data = pd.read_parquet(train_path)

In [None]:
# Undersample the majority class: keep every row with label == 1 and draw a
# seeded random subset of the label == 0 rows so that positives make up 50%
# of the resulting training frame.
is_positive = train_data["label"] == 1
is_negative = train_data["label"] == 0
label_df = train_data[is_positive]
not_label_df = train_data[is_negative]

# Rule of three: if the positives are 50% of the balanced set, this is 100%.
not_label_ammount = round(len(label_df) * 100 / 50)
negatives_needed = not_label_ammount - len(label_df)
random_not_label_df = not_label_df.sample(n=negatives_needed, random_state=73)

# Drop the full frame before concatenating, then rebind the name to the
# balanced frame and release the per-class intermediates.
del train_data
train_data = pd.concat([label_df, random_not_label_df], axis=0)
del not_label_df, label_df

In [None]:
# Separate the target column from the features, then release the combined frame.
y_train = train_data["label"]
X_train = train_data.drop(columns=["label"])
del train_data

In [None]:
# Read the checkpoint-2 validation split from its parquet file.
val_path = '/content/drive/MyDrive/OrgaDeDatos/val_checkpoint2.parq'
val_data = pd.read_parquet(val_path)

In [None]:
# Separate the target column from the features for the validation split.
y_val = val_data["label"]
X_val = val_data.drop(columns=["label"])

# DecisionTree

In [None]:
# Features excluded from the decision-tree model. They are removed from both
# splits so training and validation keep identical columns.
dropped_features = ["mean_attack_hour_by_attacker", "attacks_before_12_hours"]
X_train = X_train.drop(columns=dropped_features)
X_val = X_val.drop(columns=dropped_features)

In [None]:
# Randomized hyper-parameter search for a single decision tree, scored by F1
# with 3-fold cross-validation.
#
# The base estimator is seeded: DecisionTreeClassifier draws random numbers
# (random splitter, feature sub-sampling through max_features, and the feature
# permutation used when evaluating splits), so with an unseeded tree two runs
# of this cell can report different "best" hyper-parameters.
model = DecisionTreeClassifier(random_state=0)

# Candidate values sampled by the search (100 combinations are tried).
param_dist = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 3, 5, 10],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10],
    'max_features': [None, 'sqrt', 'log2'],
    'max_leaf_nodes': [None, 10, 20, 30, 40, 50, 60],
    'min_impurity_decrease': [0.0, 0.1, 0.2],
    'min_weight_fraction_leaf': [0.0, 0.1, 0.2],
    'class_weight': [None, 'balanced', {0: 1, 1: 2}],
    'ccp_alpha': [0.0, 0.1, 0.2]
}

random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    random_state=0,  # seeds which candidates are sampled from param_dist
    n_iter=100,
    scoring='f1',
    cv=3,
    verbose=3,
    n_jobs=-1,
)

random_search.fit(X_train, y_train)

print("Mejores hiperparámetros encontrados:")
print(random_search.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Mejores hiperparámetros encontrados:
{'splitter': 'random', 'min_weight_fraction_leaf': 0.0, 'min_samples_split': 10, 'min_samples_leaf': 10, 'min_impurity_decrease': 0.0, 'max_leaf_nodes': 30, 'max_features': 'sqrt', 'max_depth': 20, 'criterion': 'entropy', 'class_weight': None, 'ccp_alpha': 0.0}


In [None]:
# Row-level predictions for both splits from the best estimator found by the search.
best_tree = random_search.best_estimator_
train_pred = best_tree.predict(X_train)
pred = best_tree.predict(X_val)

In [None]:
# Parameter set that was pasted into this cell as a bare expression (it had no
# effect when executed); presumably the result of an earlier search run:
# {'splitter': 'random', 'min_weight_fraction_leaf': 0.0, 'min_samples_split': 3, 'min_samples_leaf': 6, 'min_impurity_decrease': 0.0, 'max_leaf_nodes': 60, 'max_features': 'log2', 'max_depth': 10, 'criterion': 'gini', 'class_weight': 'balanced', 'ccp_alpha': 0.0}

# Row-level F1 on the training and validation splits.
y_train_true = y_train.reset_index(drop=True)
y_val_true = y_val.reset_index(drop=True)
train_score = f1_score(y_train_true, train_pred)
score = f1_score(y_val_true, pred)
print(f"Score para train: {train_score}")
print(f"Score para val: {score}")

Score para train: 0.7168700467023367
Score para val: 0.6112111523266812


# Test y submission

In [None]:
# Hyper-parameters of the tree used for the submission.
# NOTE(review): these values are hard-coded and differ from the best_params_
# printed by the randomized search in this notebook — confirm they are the
# intended ones.
params = {'splitter': 'best', 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 5, 'criterion': 'entropy', 'class_weight': 'balanced'}

# Fixed seed: the tree permutes features randomly while searching for splits
# even with splitter='best', so an unseeded refit is not guaranteed to
# reproduce the same model (and therefore the same submission).
dt = DecisionTreeClassifier(**params, random_state=0)

In [None]:
# Train the submission tree on the balanced training split.
dt.fit(X=X_train, y=y_train)

In [None]:
# Row-level predictions of the fitted tree on both splits.
train_pred = dt.predict(X=X_train)
pred = dt.predict(X=X_val)

In [None]:
# Read the checkpoint-1 validation frame (it carries attacker_ip_enum and label).
val_checkpoint1_path = '/content/drive/MyDrive/OrgaDeDatos/val_checkpoint1.parq'
val = pd.read_parquet(val_checkpoint1_path)

In [None]:
# Ground truth per attacker: the maximum label over that attacker's rows.
real_val = val.groupby("attacker_ip_enum")["label"].max()

In [None]:
# Attach the row-level predictions to a COPY of the validation frame. Writing
# them into val["label"] directly would overwrite the ground truth, so
# re-running the cell that builds real_val afterwards would silently score the
# predictions against themselves.
# NOTE(review): assumes the rows of val_checkpoint1 are in the same order as
# the frame `pred` was computed from — confirm.
val_pred = val.assign(label=pred)

# Per attacker: number of rows predicted positive ("label") and total number
# of rows ("attacker_ip_enum"). A per-attacker "min" rule was tried here
# previously as an alternative.
a = val_pred.groupby("attacker_ip_enum").agg({"label": "sum", "attacker_ip_enum": "count"})

In [None]:
# Fraction of each attacker's rows that were predicted positive.
b = a["label"].div(a["attacker_ip_enum"])

In [None]:
def _majority_flag(fraction):
    """Return 1 when strictly more than half of the rows were flagged, else 0."""
    return 1 if fraction > 0.5 else 0


# Attacker-level prediction by majority vote, scored against the ground truth.
c = b.apply(_majority_flag)
f1_score(real_val, c)

0.3784639746634996

In [None]:
# Two-column frame (attacker id, predicted label) with compact integer dtypes.
d = c.reset_index()
d.columns = ["attacker_ip_enum", "label"]
d = d.astype({"attacker_ip_enum": "int32", "label": "int8"})

In [None]:
# Row-level F1 of the submission tree on the training and validation splits.
train_score = f1_score(y_true=y_train.reset_index(drop=True), y_pred=train_pred)
score = f1_score(y_true=y_val.reset_index(drop=True), y_pred=pred)
print(f"Score para train: {train_score}")
print(f"Score para val: {score}")

Score para train: 0.7169962415869131
Score para val: 0.7500740358018235


In [None]:
# Read the checkpoint-2 test features from their parquet file.
test_features_path = '/content/drive/MyDrive/OrgaDeDatos/test_checkpoint2.parq'
test_data = pd.read_parquet(test_features_path)

In [None]:
# Predict using exactly the columns (and column order) the tree was trained
# on. Two features were dropped from X_train / X_val before fitting but never
# from test_data, so passing the raw test frame would hand the model columns
# it was not fitted with.
test_pred = dt.predict(test_data[X_train.columns])

In [None]:
# Rebind test_data to the checkpoint-1 test frame (used below for its attacker_ip_enum column).
test_checkpoint1_path = '/content/drive/MyDrive/OrgaDeDatos/test_checkpoint1.parq'
test_data = pd.read_parquet(test_checkpoint1_path)

In [None]:
# Store the row-level predictions as the "label" column of the checkpoint-1 frame.
# NOTE(review): assumes test_checkpoint1 rows are in the same order as the
# frame test_pred was computed from — confirm.
test_data["label"] = test_pred

In [None]:
a = test_data.groupby("attacker_ip_enum").agg({"label":"min"})

In [None]:
b = a.label / a.attacker_ip_enum

In [None]:
# m = b.mean()
c = b.apply(lambda x: 1 if x > 0.5 else 0)

In [None]:
d = c.reset_index()
d.columns = ["attacker_ip_enum", "label"]
d["attacker_ip_enum"] = d.loc[:, "attacker_ip_enum"].astype("int32")
d["label"] = d.loc[:, "label"].astype("int8")

In [None]:
a.reset_index().to_csv("/content/drive/MyDrive/OrgaDeDatos/decision_tree_3rd.csv", index=False)