In [10]:
import pandas as pd
import numpy as np

from sklearn import tree
from sklearn.model_selection import train_test_split

In [3]:
# Root data directory, given relative to the notebook's working directory.
__DATA_PATH__ = "../data"
# The "interim" subfolder of the data directory; the input CSV is read from here.
__INTERIM_DATA_PATH__ = f"{__DATA_PATH__}/interim"

In [12]:
# Read the prepared CSV and discard its "Unnamed: 0" column without mutating
# the freshly read frame in place.
df = pd.read_csv(f"{__INTERIM_DATA_PATH__}/easy_money_prep_1.csv").drop(
    columns="Unnamed: 0"
)
# Parse the partition column as datetimes so it can be compared against dates.
df["pk_partition"] = pd.to_datetime(df["pk_partition"])
df.shape

(5962838, 75)

In [16]:
# Name of the target column; used as the default `target` of dev_val_split.
TARGET = "em_acount"

In [24]:
def dev_val_split(dataframe: pd.DataFrame, split_condition, exclude_columns, target: str = TARGET):
    """Split ``dataframe`` into development and validation feature/target sets.

    Rows where ``split_condition`` is true form the development set; all other
    rows form the validation set. ``exclude_columns`` are dropped from both
    sets before the target column is separated from the features.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Source data. It is only indexed and dropped from (no in-place changes).
    split_condition
        Boolean mask usable to index ``dataframe`` and that supports ``~``
        (e.g. a boolean Series).
    exclude_columns
        Column label(s) removed from both sets.
    target : str
        Name of the target column. Defaults to the module-level ``TARGET``.

    Returns
    -------
    tuple
        ``(dev_df_X, dev_df_y, val_df_X, val_df_y)``. The ``*_y`` items are
        single-column DataFrames (selected with ``[[target]]``), not Series.
    """
    dev_df = dataframe[split_condition].drop(exclude_columns, axis=1)
    val_df = dataframe[~split_condition].drop(exclude_columns, axis=1)

    # Always use the `target` argument (not the global TARGET) so the column
    # removed from X is the same one returned as y, for both sets.
    dev_df_X = dev_df.drop(target, axis=1)
    dev_df_y = dev_df[[target]]

    val_df_X = val_df.drop(target, axis=1)
    val_df_y = val_df[[target]]

    print(f"""
    dev_df_X.shape: {dev_df_X.shape}
    dev_df_y.shape: {dev_df_y.shape}

    val_df_X.shape: {val_df_X.shape}
    val_df_y.shape: {val_df_y.shape}
    """)

    return dev_df_X, dev_df_y, val_df_X, val_df_y


# Date-based split: rows with pk_partition strictly before 2019-03-28 go to
# the development set, all remaining rows go to the validation set.
# The columns listed in exclude_columns are dropped from both sets.
dev_df_X, dev_df_y, val_df_X, val_df_y = dev_val_split(
    dataframe=df,
    split_condition=(df["pk_partition"] < "2019-03-28"),
    exclude_columns=["pk_cid", "pk_partition", "entry_date"],
    target=TARGET
)

dev_df.shape: 4644039,72
val_df.shape: 1318799,72

    dev_df_X.shape: (4644039, 71)
    dev_df_y.shape: (4644039, 1)

    val_df_X.shape: (1318799, 71)
    val_df_y.shape: (1318799, 1)
    


In [27]:
# Hold out 20% of the development rows as a test set. The split is stratified
# on the target and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dev_df_X, dev_df_y, test_size=0.2, random_state=13, stratify=dev_df_y
)

# Report the size of each resulting partition.
print(f"""
X_train.shape: {X_train.shape}
X_test.shape: {X_test.shape}
y_train.shape: {y_train.shape}
y_test.shape: {y_test.shape}
""")


X_train.shape: (3715231, 71)
X_test.shape: (928808, 71)
y_train.shape: (3715231, 1)
y_test.shape: (928808, 1)



In [28]:
import mlflow
from sklearn.linear_model import LogisticRegression

# NOTE(review): the tracking server address is hard-coded (plain http, fixed IP);
# confirm it is still the intended endpoint before re-running.
mlflow.set_tracking_uri('http://3.249.188.239:5000')

# Create the experiment if it does not exist yet.
# NOTE(review): the experiment name mentions "malware-windows" while this
# notebook loads easy_money_prep_1.csv; confirm this is the intended experiment.
experiment_name = "malware-windows-prediction-aamv"

if not mlflow.get_experiment_by_name(experiment_name):
    mlflow.create_experiment(name=experiment_name)

experiment = mlflow.get_experiment_by_name(experiment_name)

# Enable MLflow's automatic logging for scikit-learn estimators.
mlflow.sklearn.autolog()

# Train inside a tracked run and score on the validation set.
with mlflow.start_run(experiment_id=experiment.experiment_id):
    # Seed the tree (same seed as the train/test split) so the fitted model and
    # the reported accuracy are reproducible across runs.
    tree_one = tree.DecisionTreeClassifier(random_state=13)
    tree_one.fit(X_train, y_train)
    # Mean accuracy on the validation rows, rounded to 4 decimals.
    tree_one_accuracy = round(tree_one.score(val_df_X, val_df_y), 4)
    print('Accuracy: %0.4f' % (tree_one_accuracy))

Accuracy: 0.9539
