# Start


In [None]:
import os

import numpy as np
import pandas as pd
from graphviz import Source
from IPython.core.magic import register_cell_magic
from IPython.display import HTML, SVG
from matplotlib import pyplot as plt
from scipy.stats import zscore
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    auc,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import (
    GridSearchCV,
    ParameterGrid,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    learning_curve,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler
from sklearn.tree import DecisionTreeClassifier, export_graphviz

## Process data


### Loading data


In [None]:
# Raw tab-separated exports: one CSV per table, all in the same directory.
file_path: str = "../data/raw"
files: tuple[str, ...] = ("connections", "devices", "processes", "profiles")

# Read each table and drop exact duplicate rows in the same step.
dataset: dict[str, pd.DataFrame] = {}
for file in files:
    dataset[file] = pd.read_csv(f"{file_path}/{file}.csv", sep="\t").drop_duplicates()

# Join connections with processes on the imei, ts and mwra columns, then parse the timestamps.
df = dataset["connections"].merge(dataset["processes"], on=["imei", "ts", "mwra"])
df["ts"] = pd.to_datetime(df["ts"])

# Seeded 80/20 split; both parts get a fresh 0..n-1 index.
train_data, test_data = (
    part.reset_index(drop=True) for part in train_test_split(df, test_size=0.2, random_state=42)
)

### Cleaning data


In [None]:
# First pass: drop every row whose |z-score| is >= 3 in any column from the fourth one onwards
# (p.android.vending included). The first three columns are skipped
# (presumably imei, ts and mwra — confirm against the merged column order).
train_data = train_data[(np.abs(zscore(train_data.iloc[:, 3:])) < 3).all(axis=1)]

# Later passes use all of those columns except p.android.vending for outlier detection
columns_for_zscore = train_data.iloc[:, 3:].columns.difference(["p.android.vending"])
max_iterations = 10
iteration = 0

# Removing rows changes each column's mean and standard deviation, so new outliers can appear.
# Repeat until a pass finds none, or until max_iterations passes have been made.
# The inlier mask is computed once per pass and reused for both the count and the filter.
inlier_mask = (np.abs(zscore(train_data[columns_for_zscore])) < 3).all(axis=1)
outliers_count = (~inlier_mask).sum()
while outliers_count > 0 and iteration < max_iterations:
    train_data = train_data[inlier_mask]
    inlier_mask = (np.abs(zscore(train_data[columns_for_zscore])) < 3).all(axis=1)
    outliers_count = (~inlier_mask).sum()
    iteration += 1
del inlier_mask  # helper only; keeps the notebook namespace as it was

train_data = train_data.reset_index(drop=True)

### Export cleaned data


In [None]:
# Exporting cleaned data
os.makedirs("../data/clean", exist_ok=True)

train_data.to_csv("../data/clean/train_data.csv", index=False)
test_data.to_csv("../data/clean/test_data.csv", index=False)

### Import cleaned data


In [None]:
# Reload the cleaned splits from the files written by the export cell.
# No parse_dates argument is passed, so "ts" is not converted back to datetime here.
train_data = pd.read_csv("../data/clean/train_data.csv")
test_data = pd.read_csv("../data/clean/test_data.csv")

### Define columns


In [None]:
# Feature columns: everything except mwra (used as the target below), ts and imei.
all_columns = train_data.drop(columns=["mwra", "ts", "imei"]).columns
# Columns sent to the quantile-transformer branch of the preprocessor
# (presumably chosen because their distributions are not Gaussian — confirm against the EDA).
non_gaussian_columns = [
    "c.android.vending",
    "c.UCMobile.x86",
    "c.updateassist",
    "c.UCMobile.intl",
    "p.android.vending",
    "p.dogalize",
    "p.olauncher",
    "p.simulator",
    "p.inputmethod.latin",
    "p.android.gms",
    "p.notifier",
    "p.katana",
    "p.gms.persistent",
]
# Every feature column that is not listed above.
gaussian_columns = all_columns[~all_columns.isin(non_gaussian_columns)]
# Column names for the transformed data: the gaussian block first, then the non-gaussian block.
transformed_feature_order = pd.Series(gaussian_columns.tolist() + non_gaussian_columns)

### Define pipelines


In [None]:
# Branch for gaussian_columns: standardise, then apply a Yeo-Johnson power transform.
general_pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("power_transformer", PowerTransformer(method="yeo-johnson")),
    ]
)

# Branch for non_gaussian_columns: quantile transform with a normal output distribution.
# Despite its name, this pipeline is applied to every column in non_gaussian_columns,
# not only to the *.android.vending ones.
vending_pipeline = Pipeline(
    [
        ("quantile_transformer", QuantileTransformer(output_distribution="normal", random_state=42)),
    ]
)

# Create column transformer
# transformed_feature_order (defined with the column lists) assumes the output keeps this
# order: the "general" block first, then the "vending" block.
preprocessor = ColumnTransformer(
    transformers=[
        ("general", general_pipe, gaussian_columns),
        ("vending", vending_pipeline, non_gaussian_columns),
    ],
    remainder="passthrough",
)

# Create complete pipeline (currently a single step wrapping the preprocessor)
complete_pipeline = Pipeline([("preprocessor", preprocessor)])

### Transform data


In [None]:
# Fit the preprocessing on the training features only, then apply the fitted transform to the test features
train_data_processed = complete_pipeline.fit_transform(train_data[all_columns], train_data["mwra"])
test_data_processed = complete_pipeline.transform(test_data[all_columns])

# Wrap the transformed arrays in DataFrames, naming the columns by transformed_feature_order
# (no feature selection happens in this pipeline)
train_data_processed = pd.DataFrame(train_data_processed, columns=transformed_feature_order)
test_data_processed = pd.DataFrame(test_data_processed, columns=transformed_feature_order)

# Re-attach the target column
train_data_processed["mwra"] = train_data["mwra"]
test_data_processed["mwra"] = test_data["mwra"]

### Export processed data


In [None]:
os.makedirs("../data/processed", exist_ok=True)

train_data_processed.to_csv("../data/processed/train_data.csv", index=False)
test_data_processed.to_csv("../data/processed/test_data.csv", index=False)

### Cleanup


In [None]:
del (
    file_path,
    files,
    file,
    df,
    train_data,
    test_data,
    columns_for_zscore,
    outliers_count,
    max_iterations,
    iteration,
    all_columns,
    non_gaussian_columns,
    gaussian_columns,
    transformed_feature_order,
    general_pipe,
    vending_pipeline,
    preprocessor,
    complete_pipeline,
    train_data_processed,
    test_data_processed,
    dataset,
)

## Helper Stuff


In [None]:
train_data_processed = pd.read_csv("../data/processed/train_data.csv")
test_data_processed = pd.read_csv("../data/processed/test_data.csv")

# Train data without feature selection
X_train = train_data_processed.drop(columns=["mwra"])
y_train = train_data_processed["mwra"]

# Test data without feature selection
X_test = test_data_processed.drop(columns=["mwra"])
y_test = test_data_processed["mwra"]

# Feature selection: keep the 7 features with the highest f_classif score.
# The selector is fitted on the training split only, and the same columns are then taken
# from both splits. Only the support mask is needed, so fit() replaces fit_transform().
selector = SelectKBest(f_classif, k=7).fit(X_train, y_train)
selected_features = X_train.columns[selector.get_support()]
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

del selector, train_data_processed, test_data_processed, selected_features

In [None]:
def get_scores(model, model_name, X_train, y_train, X_test, y_test):
    """Score a fitted binary classifier on the train and test splits.

    Parameters
    ----------
    model : fitted estimator
        Must implement ``predict``. ``predict_proba`` or ``decision_function`` is used
        for ROC AUC when available.
    model_name : str
        Label used as the top level of the returned column MultiIndex.
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels.

    Returns
    -------
    pandas.DataFrame
        Rows ``accuracy``, ``precision``, ``recall``, ``roc_auc``; columns
        ``(model_name, "Train")`` and ``(model_name, "Test")``.

    Notes
    -----
    ROC AUC is computed from continuous scores for the positive class
    (``predict_proba(X)[:, 1]`` or ``decision_function(X)``). Feeding thresholded
    ``predict`` labels into ``roc_auc_score`` collapses the ROC curve to a single
    operating point and understates the ranking quality; hard labels are used only
    as a last resort when the model exposes neither method.
    """

    def _positive_class_scores(X):
        # Prefer probabilities, then raw decision values, then hard labels.
        if hasattr(model, "predict_proba"):
            return model.predict_proba(X)[:, 1]
        if hasattr(model, "decision_function"):
            return model.decision_function(X)
        return model.predict(X)

    df = pd.DataFrame(
        index=["accuracy", "precision", "recall", "roc_auc"],
        columns=pd.MultiIndex.from_product([[model_name], ["Train", "Test"]]),
    )

    for split, X, y in (("Train", X_train, y_train), ("Test", X_test, y_test)):
        y_pred = model.predict(X)
        column = (model_name, split)

        df.loc["accuracy", column] = accuracy_score(y, y_pred)
        df.loc["precision", column] = precision_score(y, y_pred)
        df.loc["recall", column] = recall_score(y, y_pred)
        df.loc["roc_auc", column] = roc_auc_score(y, _positive_class_scores(X))

    return df

In [None]:
def print_scores_cv(model, model_name, X, y, cv):
    """Print cross-validated accuracy, precision, recall and ROC AUC for ``model``.

    All four metrics are collected in a single ``cross_validate`` run, so the model
    is fitted once per fold instead of once per fold and metric.

    Parameters
    ----------
    model : unfitted estimator to cross-validate.
    model_name : str
        Accepted for symmetry with ``get_scores``; not used in the printed output.
    X, y : features and labels.
    cv : cross-validation specification, passed straight to ``cross_validate``.

    Returns
    -------
    None. For each metric the per-fold scores, their mean and their standard
    deviation are printed.
    """
    # Metric name -> heading line; insertion order defines the print order.
    headings = {
        "accuracy": "Accuracy scores:",
        "precision": "\nPrecision scores:",
        "recall": "\nRecall scores:",
        "roc_auc": "\nROC AUC scores:",
    }

    results = cross_validate(model, X, y, cv=cv, scoring=list(headings))

    for metric, heading in headings.items():
        scores = results[f"test_{metric}"]
        print(heading)
        print(scores)
        print(f"Mean: {scores.mean():.5f}")
        print(f"Standard Deviation: {scores.std():.5f}")

In [None]:
@register_cell_magic
def ignore(line, cell):
    """Cell magic ``%%ignore``: receive the cell's text and deliberately do nothing with it.

    Putting ``%%ignore`` on the first line of a cell keeps its code in the notebook
    without running it.
    """
    return None

# 3.1


-   Jednoduchý klasifikátor na základe závislosti v dátach.


## A


Naimplementujte jednoduchý ID3 klasifikátor s hĺbkou min 2 (vrátane root/koreň).


### Not using feature selection


In [None]:
# Decision tree standing in for the task's ID3 classifier.
# NOTE(review): ID3 is usually described with an entropy / information-gain split criterion,
# while this tree uses criterion="gini" — confirm this is intended.
dtc_nofs = DecisionTreeClassifier(
    criterion="gini", max_depth=15, min_samples_split=10, min_samples_leaf=1, ccp_alpha=0.001, random_state=42
)

# Fit the classifier to the training data (all features)
dtc_nofs.fit(X_train, y_train)

In [None]:
graph = Source(
    export_graphviz(dtc_nofs, out_file=None, class_names=["no", "yes"], filled=True, feature_names=X_train.columns)
)

display(SVG(graph.pipe(format="svg")))

style = "<style>svg{width:100%;height:70%;}</style>"
HTML(style)

### Using Feature selection


In [None]:
# Decision tree standing in for the task's ID3 classifier.
# NOTE(review): ID3 is usually described with an entropy / information-gain split criterion,
# while this tree uses criterion="gini" — confirm this is intended.
dtc_fs = DecisionTreeClassifier(
    criterion="gini", max_depth=15, min_samples_split=10, min_samples_leaf=1, ccp_alpha=0.001, random_state=42
)

# Fit the classifier to the training data (selected features only)
dtc_fs.fit(X_train_selected, y_train)

In [None]:
# Render the fitted feature-selection tree as an SVG
graph = Source(
    export_graphviz(
        dtc_fs, out_file=None, class_names=["no", "yes"], filled=True, feature_names=X_train_selected.columns
    )
)

display(SVG(graph.pipe(format="svg")))

# A bare `HTML(style)` is only rendered when it is the last expression of a cell.
# Here `del` follows it, so the style has to be displayed explicitly.
style = "<style>svg{width:100%;height:70%;}</style>"
display(HTML(style))

del style, graph

-   There are slight differences between using and not using feature selection.
-   We are going to use no feature selection for now.


## B


Vyhodnoťte Váš ID3 klasifikátor pomocou metrík accuracy, precision a recall.


In [None]:
# Score both fitted trees and put the two result tables side by side
df = pd.concat(
    [
        get_scores(dtc_nofs, "DecisionTreeClassifier - No feature selection", X_train, y_train, X_test, y_test),
        get_scores(
            dtc_fs,
            "DecisionTreeClassifier - Feature selection",
            X_train_selected,
            y_train,
            X_test_selected,
            y_test,
        ),
    ],
    axis=1,
)
df

-   We see that using feature selection gives worse results. We will look at it in more detail later (3.4).


## C


Zistite, či Váš ID3 klasifikátor má overfit.


-   Since train and test metrics are close, the model is likely not overfitting.
-   But let's also look at learning curves.


### Learning curve - train sizes


In [None]:
# Initialize the model
dtc_nofs = DecisionTreeClassifier(
    criterion="gini", max_depth=15, min_samples_split=10, min_samples_leaf=1, ccp_alpha=0.001, random_state=42
)

# Define the training sizes
train_sizes = np.linspace(0.1, 1.0, 50)


# Function to plot learning curves
def plot_learning_curve(estimator, X, y, train_sizes, cv, scoring, title):
    """Plot training and cross-validation error against the training-set size.

    ``learning_curve`` is run with the given ``cv`` and ``scoring``. The scores are
    averaged over the folds and plotted as errors (``1 - score``), so lower is better.

    Parameters
    ----------
    estimator : unfitted estimator passed to ``learning_curve``.
    X, y : features and labels.
    train_sizes : training-set sizes to evaluate, passed to ``learning_curve``.
    cv : cross-validation specification.
    scoring : str
        Scorer name; also used in the y-axis label.
    title : str
        Figure title.
    """
    sizes_used, train_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, train_sizes=train_sizes, scoring=scoring, n_jobs=-1
    )
    # Average over folds and convert scores to errors.
    train_error = 1 - train_scores.mean(axis=1)
    cv_error = 1 - cv_scores.mean(axis=1)

    plt.figure()
    # The curves are errors, so the legend says so (it previously said "score").
    plt.plot(sizes_used, train_error, label="Training error")
    plt.plot(sizes_used, cv_error, label="Cross-validation error")
    plt.xlabel("Training Size")
    plt.ylabel(f"{scoring} Error")
    plt.title(title)
    plt.legend()
    plt.show()


# Plot learning curve for accuracy
plot_learning_curve(
    dtc_nofs,
    X_train,
    y_train,
    train_sizes,
    cv=5,
    scoring="accuracy",
    title="Learning Curve - Train size (Accuracy)",
)

# Plot learning curve for precision
plot_learning_curve(
    dtc_nofs,
    X_train,
    y_train,
    train_sizes,
    cv=5,
    scoring="precision",
    title="Learning Curve - Train size (Precision)",
)

# Plot learning curve for recall
plot_learning_curve(
    dtc_nofs, X_train, y_train, train_sizes, cv=5, scoring="recall", title="Learning Curve - Train size (Recall)"
)

-   We don't see a big gap in metrics between train and test data (looking at the last point, since that is the training size we used in the previous steps).
-   However, we see that the model starts off badly (as expected) and then improves with more data. Around 50% of the training data seems to be enough.


### Learning curve - model complexity


In [None]:
# Define the range of max_depth values
max_depth_range = range(1, 30)


def plot_model_complexity_curve(model, X_train, y_train, X_test, y_test, max_depth_range):
    """Plot train and held-out error (1 - accuracy) as ``max_depth`` grows.

    For every depth in ``max_depth_range`` a fresh clone of ``model`` is created with
    only ``max_depth`` changed, fitted on the training data and scored on both data
    sets. The estimator passed in is never fitted or modified.

    Parameters
    ----------
    model : estimator with a ``max_depth`` parameter
        Template whose remaining hyperparameters are reused for every depth. It was
        previously ignored in favour of a hard-coded ``DecisionTreeClassifier``.
    X_train, y_train : data the clones are fitted on.
    X_test, y_test : held-out data used for the second curve.
    max_depth_range : iterable of int
        Depths to evaluate; also used as the x-axis values.
    """
    train_errors = []
    val_errors = []

    for max_depth in max_depth_range:
        # Unfitted copy of the template with only the depth changed
        candidate = clone(model).set_params(max_depth=max_depth)
        candidate.fit(X_train, y_train)

        # Error = 1 - accuracy on each data set
        train_errors.append(1 - accuracy_score(y_train, candidate.predict(X_train)))
        val_errors.append(1 - accuracy_score(y_test, candidate.predict(X_test)))

    # Plot the learning curve for model complexity
    plt.figure()
    plt.plot(max_depth_range, train_errors, label="Training Error")
    plt.plot(max_depth_range, val_errors, label="Validation Error")
    plt.xlabel("Max Depth")
    plt.ylabel("Error")
    plt.title("Learning Curve (Model Complexity)")
    plt.legend()
    plt.show()


# Plot model complexity curve
plot_model_complexity_curve(dtc_nofs, X_train, y_train, X_test, y_test, max_depth_range)

-   This also shows no overfitting for our model, as there is no significant gap (looking at max_depth=15, as this is what we used in previous steps).
-   We can also see that the model starts off very badly and is underfitting.
-   Around max_depth=5 the training and testing errors stabilize.


### ROC


In [None]:
dtc_nofs = DecisionTreeClassifier(
    criterion="gini", max_depth=15, min_samples_split=10, min_samples_leaf=1, ccp_alpha=0.001, random_state=42
)

dtc_nofs.fit(X_train, y_train)

# Compute ROC curve and ROC area for train data
fpr_train, tpr_train, _ = roc_curve(y_train, dtc_nofs.predict_proba(X_train)[:, 1])
roc_auc_train = auc(fpr_train, tpr_train)

# Compute ROC curve and ROC area for test data
fpr_test, tpr_test, _ = roc_curve(y_test, dtc_nofs.predict_proba(X_test)[:, 1])
roc_auc_test = auc(fpr_test, tpr_test)

# Plot ROC curve
plt.figure()
plt.plot(fpr_train, tpr_train, color="blue", lw=2, label=f"Train ROC curve (area = {roc_auc_train:.5f})")
plt.plot(fpr_test, tpr_test, color="red", lw=2, label=f"Test ROC curve (area = {roc_auc_test:.5f})")
plt.plot([0, 1], [0, 1], color="gray", lw=2, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.show()

-   ROC also shows no significant gap between train and test data. So we can conclude that model is not overfitting.


## Cleanup


In [None]:
del (
    dtc_fs,
    dtc_nofs,
    fpr_train,
    tpr_train,
    fpr_test,
    tpr_test,
    train_sizes,
    max_depth_range,
    roc_auc_test,
    roc_auc_train,
)

# 3.2


Trénovanie a vyhodnotenie klasifikátorov strojového učenia.


## A


Na trénovanie využite jeden stromový algoritmus v scikit-learn.


In [None]:
# Initialize the RandomForestClassifier
rf_classifier = RandomForestClassifier(
    criterion="gini", max_depth=15, min_samples_split=10, min_samples_leaf=1, ccp_alpha=0.001, random_state=42
)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Print the scores
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

## B


Porovnajte s jedným iným nestromovým algoritmom v scikit-learn.


In [None]:
# Initialize the LogisticRegression classifier
# NOTE(review): max_iter=100 here, while the comparison cells in sections C and E build
# LogisticRegression with max_iter=1000 — confirm which value is intended.
log_reg = LogisticRegression(max_iter=100, random_state=42)

# Fit the classifier to the training data
log_reg.fit(X_train, y_train)

# Compute the train/test scores and show them as the cell output
df = get_scores(log_reg, "LogisticRegression", X_train, y_train, X_test, y_test)
df

-   LogisticRegression with default parameters gives slightly worse results than RandomForestClassifier with best parameters found in 3.3.


## C


Porovnajte výsledky s ID3 z prvého kroku.


In [None]:
# Build and fit the three classifiers, in the same order as before
dtc_nofs = DecisionTreeClassifier(
    criterion="gini", max_depth=15, min_samples_split=10, min_samples_leaf=1, ccp_alpha=0.001, random_state=42
)
dtc_nofs.fit(X_train, y_train)

rf_classifier = RandomForestClassifier(
    criterion="gini", max_depth=15, min_samples_split=10, min_samples_leaf=1, ccp_alpha=0.001, random_state=42
)
rf_classifier.fit(X_train, y_train)

log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train)

# One score table per model, joined column-wise into a single comparison table
df = pd.concat(
    [
        get_scores(dtc_nofs, "DecisionTreeClassifier - No feature selection", X_train, y_train, X_test, y_test),
        get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test),
        get_scores(log_reg, "LogisticRegression", X_train, y_train, X_test, y_test),
    ],
    axis=1,
)
df

-   RandomForestClassifier and LogisticRegression perform better than the ID3 classifier.


## D


Vizualizujte natrénované pravidlá minimálne pre jeden Vami vybraný algoritmus.


In [None]:
# Initialize RandomForestClassifier
rf_classifier = RandomForestClassifier(
    criterion="gini", max_depth=15, min_samples_split=10, min_samples_leaf=1, ccp_alpha=0.001, random_state=42
)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Get tree from random forest
estimator = rf_classifier.estimators_[0]

# Generate graph
graph = Source(
    export_graphviz(
        estimator, out_file=None, class_names=["no", "yes"], filled=True, feature_names=X_train.columns
    )
)

# Display graph
display(SVG(graph.pipe(format="svg")))

style = "<style>svg{width:100%;height:70%;}</style>"
HTML(style)

## E


Vyhodnoťte natrénované modely pomocou metrík accuracy, precision a recall.


In [None]:
# Build and fit the three classifiers, in the same order as before
dtc_nofs = DecisionTreeClassifier(
    criterion="gini", max_depth=15, min_samples_split=10, min_samples_leaf=1, ccp_alpha=0.001, random_state=42
)
dtc_nofs.fit(X_train, y_train)

rf_classifier = RandomForestClassifier(
    criterion="gini", max_depth=15, min_samples_split=10, min_samples_leaf=1, ccp_alpha=0.001, random_state=42
)
rf_classifier.fit(X_train, y_train)

log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train)

# One score table per model, joined column-wise into a single comparison table
df = pd.concat(
    [
        get_scores(dtc_nofs, "DecisionTreeClassifier - No feature selection", X_train, y_train, X_test, y_test),
        get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test),
        get_scores(log_reg, "LogisticRegression", X_train, y_train, X_test, y_test),
    ],
    axis=1,
)
df

-   We see that RandomForestClassifier (with best parameters found in 3.3) gives the best results.
-   LogisticRegression with default parameters gives slightly worse results.
-   ID3 classifier gives the worst results.
-   None of these models are overfitting because train and test metrics are close.


## Cleanup


In [None]:
del dtc_nofs, rf_classifier, log_reg, graph, style, estimator

# 3.3


Optimalizácia alias hyperparameter tuning.


## A


Vyskúšajte rôzne nastavenie hyperparametrov (tuning) pre zvolený algoritmus tak,
aby ste optimalizovali výkonnosť (bez underfitingu).

We will test parameters for RandomForestClassifier.


### First, find baseline parameters


#### n_estimators


In [None]:
# Initialize the RandomForestClassifier (n_estimators=100).
# The seed is fixed so that this run and the n_estimators=1000 run differ only in the
# number of trees, not in random noise.
rf_classifier = RandomForestClassifier(
    n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_depth=20, ccp_alpha=0.001, random_state=42
)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Print the scores
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

In [None]:
# Initialize the RandomForestClassifier (n_estimators=1000).
# The seed is fixed so that this run and the n_estimators=100 run differ only in the
# number of trees, not in random noise.
rf_classifier = RandomForestClassifier(
    n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_depth=20, ccp_alpha=0.001, random_state=42
)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Print the scores
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

-   We see that performance is not increasing with n_estimators.


#### ccp_alpha


In [None]:
# Initialize the RandomForestClassifier
rf_classifier = RandomForestClassifier(criterion="entropy", n_estimators=100, random_state=42, ccp_alpha=0.002)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Print the scores
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

In [None]:
# Initialize the RandomForestClassifier
rf_classifier = RandomForestClassifier(criterion="entropy", n_estimators=100, random_state=42, ccp_alpha=0.005)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Print the scores
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

-   We see ccp_alpha 0.002 is better considering ROC AUC factor.


#### max_features


In [None]:
# Initialize the RandomForestClassifier
rf_classifier = RandomForestClassifier(
    criterion="entropy",
    n_estimators=100,
    random_state=42,
    ccp_alpha=0.002,
    max_features="sqrt",
    max_depth=7,
)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Print the scores
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

In [None]:
# Initialize the RandomForestClassifier
rf_classifier = RandomForestClassifier(
    criterion="entropy",
    n_estimators=100,
    random_state=42,
    ccp_alpha=0.002,
    max_features="log2",
    max_depth=7,
)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Print the scores
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

-   When also using max_depth, there is difference in ROC AUC, sqrt is better.


#### criterion


In [None]:
# Initialize the RandomForestClassifier
rf_classifier = RandomForestClassifier(
    criterion="entropy",
    n_estimators=100,
    random_state=42,
    ccp_alpha=0.002,
)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Print the scores
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

In [None]:
# Initialize the RandomForestClassifier
rf_classifier = RandomForestClassifier(
    criterion="gini",
    n_estimators=100,
    random_state=42,
    ccp_alpha=0.002,
)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Print the scores
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

-   Entropy is better considering ROC AUC factor.


### RandomizedSearchCV


-   In previous step, we found that n_estimators=100, ccp_alpha=0.002, max_features=sqrt, criterion=entropy are better.
-   However, we are not going to look at max_features and criterion in the first tuning method; we will look at them at the end, once we have the best primary hyperparameters.
-   Not using max_features and criterion in first tuning gives more priority to other hyperparameters.
-   Our primary hyperparameters are n_estimators, max_depth, min_samples_split, min_samples_leaf, ccp_alpha.


In [None]:
%%ignore
# Define the broad parameter grid
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [5, 10, 15, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "ccp_alpha": [0.001, 0.002, 0.003],
}

# Initialize the RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

# Initialize RandomizedSearchCV: evaluates n_iter sampled combinations from param_grid
random_search = RandomizedSearchCV(
    estimator=rf_classifier,
    param_distributions=param_grid,
    n_iter=100,
    n_jobs=-1,
    verbose=1,
    cv=10,  # 10-fold cross-validation for more reliable results
    scoring="roc_auc",
    random_state=42,
)

# Fit the classifier to the training data
random_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = random_search.best_params_
best_score = random_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score:.5f}")

-   Best Parameters: {'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 15, 'ccp_alpha': 0.002}
-   Best Score: 0.91579


### GridSearchCV


#### First iteration


In [None]:
%%ignore
# Best parameters from RandomizedSearchCV
best_params = {
    "n_estimators": 300,
    "max_depth": 15,
    "min_samples_split": 10,
    "min_samples_leaf": 2,
    "ccp_alpha": 0.002,
}

# Define the refined parameter grid for GridSearchCV
param_grid_refined = {
    "n_estimators": [
        best_params["n_estimators"] - 100,
        best_params["n_estimators"],
        best_params["n_estimators"] + 100,
    ],
    "max_depth": [
        best_params["max_depth"] - 5,
        best_params["max_depth"],
        best_params["max_depth"] + 5,
    ],
    "min_samples_split": [
        best_params["min_samples_split"] - 5,
        best_params["min_samples_split"],
        best_params["min_samples_split"] + 5,
    ],
    "min_samples_leaf": [
        best_params["min_samples_leaf"],
        best_params["min_samples_leaf"] + 1,
    ],
    "ccp_alpha": [
        best_params["ccp_alpha"],
        best_params["ccp_alpha"] + 0.001,
        best_params["ccp_alpha"] + 0.002,
    ],
}

# Base estimator with only the seed fixed; the hyperparameters under test are supplied
# by GridSearchCV from param_grid_refined
rf_refined = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search_refined = GridSearchCV(
    estimator=rf_refined,
    param_grid=param_grid_refined,
    verbose=1,
    n_jobs=-1,
    cv=10,  # 10-fold cross-validation for more reliable results
    scoring="roc_auc",
)

# Fit the classifier to the training data
grid_search_refined.fit(X_train, y_train)

# Get the best parameters and best score
best_params_refined = grid_search_refined.best_params_
best_score_refined = grid_search_refined.best_score_

print(f"Best Parameters after Refining: {best_params_refined}")
print(f"Best Score after Refining: {best_score_refined:.5f}")

-   Best Parameters after Refining: {'ccp_alpha': 0.002, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 15, 'n_estimators': 400}
-   Best Score after Refining: 0.91606
-   The trend for n_estimators is that bigger is better (up to some point), so in the next iteration we will ignore this parameter.


#### Second iteration


My colleague found different parameters to be better, so we will look at them in this step.


In [None]:
%%ignore
# Define the refined parameter grid for GridSearchCV
param_grid_refined = {
    "n_estimators": [
        200
    ],
    "max_depth": [
        15
    ],
    "min_samples_split": [
        5,10
    ],
    "min_samples_leaf": [
        1,2
    ],
    "ccp_alpha": [
        0.001, 0.002
    ],
}

# Base estimator with only the seed fixed; the hyperparameters under test are supplied
# by GridSearchCV from param_grid_refined
rf_refined = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search_refined = GridSearchCV(
    estimator=rf_refined,
    param_grid=param_grid_refined,
    verbose=1,
    n_jobs=-1,
    cv=10,  # 10-fold cross-validation for more reliable results
    scoring="roc_auc",
)

# Fit the classifier to the training data
grid_search_refined.fit(X_train, y_train)

# Get the best parameters and best score
best_params_refined = grid_search_refined.best_params_
best_score_refined = grid_search_refined.best_score_

print(f"Best Parameters after Refining: {best_params_refined}")
print(f"Best Score after Refining: {best_score_refined:.5f}")

-   Best Parameters after Refining: {'ccp_alpha': 0.001, 'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
-   Best Score after Refining: 0.91596
-   From these observations:
    -   We take max_depth=~15 as the best: it won in RandomizedSearchCV and was kept fixed at 15 in the second iteration, although the first GridSearchCV iteration preferred 10.
    -   We can conclude that ccp_alpha=~0.001 or ccp_alpha=~0.002 is the best.
    -   We can conclude that min_samples_leaf=~1 or a little higher is the best.
    -   We can conclude that min_samples_split=~10 is the best.
    -   We can conclude that higher n_estimators is better (at some value it will be worse).


#### Third iteration


Let's look at ccp_alpha and max_depth in more detail.


In [None]:
%%ignore
# Define the refined parameter grid for GridSearchCV
# Only max_depth and ccp_alpha vary; the other parameters are pinned to single values.
# NOTE(review): the ccp_alpha candidates are centred on 0.01, whereas the surrounding notes
# discuss and conclude ccp_alpha=0.001 — confirm the grid is not off by a factor of 10.
param_grid_refined = {
    "n_estimators": [300],
    "max_depth": [14, 15, 16],
    "min_samples_split": [10],
    "min_samples_leaf": [1],
    "ccp_alpha": [0.0095, 0.01, 0.0105],
}

# Base estimator with only the seed fixed; the hyperparameters under test are supplied
# by GridSearchCV from param_grid_refined
rf_refined = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search_refined = GridSearchCV(
    estimator=rf_refined,
    param_grid=param_grid_refined,
    verbose=1,
    n_jobs=-1,
    cv=10,  # 10-fold cross-validation for more reliable results
    scoring="roc_auc",
)

# Fit the classifier to the training data
grid_search_refined.fit(X_train, y_train)

# Get the best parameters and best score
best_params_refined = grid_search_refined.best_params_
best_score_refined = grid_search_refined.best_score_

print(f"Best Parameters after Refining: {best_params_refined}")
print(f"Best Score after Refining: {best_score_refined:.5f}")

-   ccp_alpha=0.001 is best
-   max_depth=15 is best


#### Fourth iteration


Now let's look at best n_estimators.


In [None]:
%%ignore
# Define the refined parameter grid for GridSearchCV
param_grid_refined = {
    "n_estimators": [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    "max_depth": [15],
    "min_samples_split": [10],
    "min_samples_leaf": [1],
    "ccp_alpha": [0.001],
}

# Base estimator with only the seed fixed; the hyperparameters under test are supplied
# by GridSearchCV from param_grid_refined
rf_refined = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search_refined = GridSearchCV(
    estimator=rf_refined,
    param_grid=param_grid_refined,
    verbose=1,
    n_jobs=-1,
    cv=10,  # 10-fold cross-validation for more reliable results
    scoring="roc_auc",
)

# Fit the classifier to the training data
grid_search_refined.fit(X_train, y_train)

# Get the best parameters and best score
best_params_refined = grid_search_refined.best_params_
best_score_refined = grid_search_refined.best_score_

print(f"Best Parameters after Refining: {best_params_refined}")
print(f"Best Score after Refining: {best_score_refined:.5f}")

-   n_estimators=300 is best.


#### Final iteration


Our final parameters are:

-   n_estimators=300
-   max_depth=15
-   min_samples_split=10
-   min_samples_leaf=1
-   ccp_alpha=0.001


As discussed earlier, we will also look at max_features and criterion.


In [None]:
%%ignore
# Define the refined parameter grid for GridSearchCV
param_grid_refined = {
    "n_estimators": [300],
    "max_depth": [15],
    "min_samples_split": [10],
    "min_samples_leaf": [1],
    "ccp_alpha": [0.001],
    "max_features": ["sqrt", "log2"],
    "criterion": ["entropy", "gini"],
}

# Base estimator with only the seed fixed; the hyperparameters under test are supplied
# by GridSearchCV from param_grid_refined
rf_refined = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search_refined = GridSearchCV(
    estimator=rf_refined,
    param_grid=param_grid_refined,
    verbose=1,
    n_jobs=-1,
    cv=10,  # 10-fold cross-validation for more reliable results
    scoring="roc_auc",
)

# Fit the classifier to the training data
grid_search_refined.fit(X_train, y_train)

# Get the best parameters and best score
best_params_refined = grid_search_refined.best_params_
best_score_refined = grid_search_refined.best_score_

print(f"Best Parameters after Refining: {best_params_refined}")
print(f"Best Score after Refining: {best_score_refined:.5f}")

-   Best Parameters after Refining: {'ccp_alpha': 0.001, 'criterion': 'gini', 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}
-   Best Score after Refining: 0.91632
-   Note that max_features="sqrt" and criterion="gini" are the scikit-learn default values.


Our final parameters are:

-   n_estimators=300
-   max_depth=15
-   min_samples_split=10
-   min_samples_leaf=1
-   ccp_alpha=0.001
-   max_features="sqrt"
-   criterion="gini"


## B


Vyskúšajte kombinácie modelov (ensemble) pre zvolený algoritmus tak, aby ste
optimalizovali výkonnosť (bez underfitingu).


## C


Využite krížovú validáciu (cross validation) na trénovacej množine.


In [None]:
# Initialize the model
rf_classifier = RandomForestClassifier(
    n_estimators=300, max_depth=15, min_samples_split=10, min_samples_leaf=1, ccp_alpha=0.001, random_state=42
)

# Print the cross-validation scores
print_scores_cv(rf_classifier, "RandomForestClassifier", X_train, y_train, cv=5)

## D


Dokážte že Váš nastavený najlepší model je bez overfitingu.


In [None]:
# Initialize the model
rf_classifier = RandomForestClassifier(
    n_estimators=300, max_depth=15, min_samples_split=10, min_samples_leaf=1, ccp_alpha=0.001, random_state=42
)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Print the scores
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

-   Since train metrics and test metrics are close, the model is not overfitting.


In [None]:
# Initialize the model
rf_classifier = RandomForestClassifier(
    n_estimators=300, max_depth=15, min_samples_split=10, min_samples_leaf=1, ccp_alpha=0.001, random_state=42
)

# Define the training sizes
train_sizes = np.linspace(0.1, 1.0, 10)


# Function to plot learning curves
def plot_learning_curve(estimator, X, y, train_sizes, cv, scoring, title):
    """Plot mean training and cross-validation scores against training-set size.

    Args:
        estimator: Unfitted scikit-learn estimator handed to ``learning_curve``.
        X: Feature matrix.
        y: Target vector.
        train_sizes: Training-set sizes (fractions or absolute counts) to evaluate.
        cv: Cross-validation strategy / number of folds.
        scoring: Name of the scoring metric; also used as the y-axis label.
        title: Figure title.

    The curves show the metric itself (higher is better), so they agree with
    the "score" legend entries and the metric name on the y-axis.
    """
    sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, train_sizes=train_sizes, scoring=scoring, n_jobs=-1
    )
    # Average across the CV folds (axis=1) to get one value per training size.
    # The raw score is plotted rather than ``1 - score``, which would be an
    # error rate and contradict the labels.
    train_scores_mean = train_scores.mean(axis=1)
    test_scores_mean = test_scores.mean(axis=1)

    plt.figure()
    plt.plot(sizes, train_scores_mean, label="Training score")
    plt.plot(sizes, test_scores_mean, label="Cross-Validation Score")
    plt.xlabel("Training Size")
    plt.ylabel(scoring)
    plt.title(title)
    plt.legend()
    plt.show()


# One learning-curve figure per metric, all with 5-fold cross-validation on the training set

# Accuracy
plot_learning_curve(
    estimator=rf_classifier,
    X=X_train,
    y=y_train,
    train_sizes=train_sizes,
    cv=5,
    scoring="accuracy",
    title="Learning Curve (Accuracy)",
)

# Precision
plot_learning_curve(
    estimator=rf_classifier,
    X=X_train,
    y=y_train,
    train_sizes=train_sizes,
    cv=5,
    scoring="precision",
    title="Learning Curve (Precision)",
)

# Recall
plot_learning_curve(
    estimator=rf_classifier,
    X=X_train,
    y=y_train,
    train_sizes=train_sizes,
    cv=5,
    scoring="recall",
    title="Learning Curve (Recall)",
)

-   We can see that early stopping could also help, as there is a slight decline in performance after some point.


# 3.4


Vyhodnotenie vplyvu zvolenej stratégie riešenia na klasifikáciu.


## A


Stratégie riešenia chýbajúcich hodnôt a outlierov.


### Definitions


#### Define columns


In [None]:
# Read the raw tab-separated tables and drop exact duplicate rows
file_path: str = "../data/raw"
files: tuple[str, ...] = ("connections", "devices", "processes", "profiles")

dataset: dict[str, pd.DataFrame] = {}
for file in files:
    dataset[file] = pd.read_csv(f"{file_path}/{file}.csv", sep="\t").drop_duplicates()

# Join connections with processes on the shared imei / ts / mwra columns
df = dataset["connections"].merge(dataset["processes"], on=["imei", "ts", "mwra"])
df["ts"] = pd.to_datetime(df["ts"])

# Seeded 80/20 split; each part gets a fresh 0..n-1 index
train_data, test_data = (
    part.reset_index(drop=True) for part in train_test_split(df, test_size=0.2, random_state=42)
)

# Feature columns: every column except mwra, ts and imei
all_columns = train_data.columns.drop(["mwra", "ts", "imei"])

# Columns handled by the quantile-transformer pipeline (treated as non-Gaussian)
non_gaussian_columns = [
    "c.android.vending",
    "c.UCMobile.x86",
    "c.updateassist",
    "c.UCMobile.intl",
    "p.android.vending",
    "p.dogalize",
    "p.olauncher",
    "p.simulator",
    "p.inputmethod.latin",
    "p.android.gms",
    "p.notifier",
    "p.katana",
    "p.gms.persistent",
]

# All remaining feature columns, kept in their original order
is_non_gaussian = all_columns.isin(non_gaussian_columns)
gaussian_columns = all_columns[~is_non_gaussian]

# Column names in the order gaussian first, non-gaussian second
transformed_feature_order = pd.Series([*gaussian_columns.tolist(), *non_gaussian_columns])

#### Define pipelines


In [None]:
# Standardise, then apply a Yeo-Johnson power transform
general_pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("power_transformer", PowerTransformer(method="yeo-johnson")),
    ]
)

# Quantile transform with a normal output distribution
vending_pipeline = Pipeline(
    steps=[("quantile_transformer", QuantileTransformer(output_distribution="normal", random_state=42))]
)

# Route each column group through its own pipeline; any other column is passed through unchanged
preprocessor = ColumnTransformer(
    transformers=[
        ("general", general_pipe, gaussian_columns),
        ("vending", vending_pipeline, non_gaussian_columns),
    ],
    remainder="passthrough",
)

# Single-step wrapper pipeline around the column transformer
complete_pipeline = Pipeline(steps=[("preprocessor", preprocessor)])

#### Visualization


In [None]:
# Empty summary table: one column per cleaning method, filled in by the cleaning cells
number_of_outliers = pd.DataFrame(
    columns=["None-iterative IQR", "Iterative IQR", "None-iterative Z-score", "Iterative Z-score"],
    index=["Number of Outliers", "Percentage of Outliers"],
)

### Cleaning methods


#### None-iterative IQR


##### Load data


In [None]:
# Reload the raw tables so this cleaning method starts from untouched data
file_path: str = "../data/raw"
files: tuple[str, ...] = ("connections", "devices", "processes", "profiles")

dataset: dict[str, pd.DataFrame] = {}
for file in files:
    dataset[file] = pd.read_csv(f"{file_path}/{file}.csv", sep="\t").drop_duplicates()

# Join connections with processes on the shared imei / ts / mwra columns
df = dataset["connections"].merge(dataset["processes"], on=["imei", "ts", "mwra"])
df["ts"] = pd.to_datetime(df["ts"])

# Seeded 80/20 split; each part gets a fresh 0..n-1 index
train_data, test_data = (
    part.reset_index(drop=True) for part in train_test_split(df, test_size=0.2, random_state=42)
)

##### Cleaning


In [None]:
# Training-set row count before cleaning; used below to report how many rows were removed
number_of_rows_before_outliers = train_data.shape[0]


def remove_outliers_iqr(data, columns):
    """Return the rows of ``data`` whose values in ``columns`` all lie within the IQR fences.

    For every column the fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``
    (inclusive). A row is kept only if each of its values in ``columns``
    falls inside the corresponding fences. The original index is preserved.
    """
    subset = data[columns]
    first_quartile, third_quartile = subset.quantile(0.25), subset.quantile(0.75)
    spread = third_quartile - first_quartile
    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread
    within_fences = ((subset >= low_fence) & (subset <= high_fence)).all(axis=1)
    return data[within_fences]


# Single IQR pass over every column after the first three
train_data = remove_outliers_iqr(train_data, train_data.iloc[:, 3:].columns)

train_data = train_data.reset_index(drop=True)

# Compute the removal statistics once and reuse them for printing and for the summary table
removed_rows = number_of_rows_before_outliers - train_data.shape[0]
removed_percentage = (removed_rows / number_of_rows_before_outliers) * 100

print(f"Number of outliers removed: {removed_rows}")
# Label worded like the other three cleaning cells ("outliers", not "rows")
print(f"Percentage of outliers removed: {removed_percentage:.2f}%")

number_of_outliers.loc["Number of Outliers", "None-iterative IQR"] = removed_rows
number_of_outliers.loc["Percentage of Outliers", "None-iterative IQR"] = round(removed_percentage, 2)

##### Use pipelines


In [None]:
# Fit the preprocessing pipeline on the cleaned training rows and wrap the result in a named DataFrame
train_data_processed = pd.DataFrame(
    complete_pipeline.fit_transform(train_data[all_columns], train_data["mwra"]),
    columns=transformed_feature_order,
)

# The test rows are only transformed with the already fitted pipeline
test_data_processed = pd.DataFrame(
    complete_pipeline.transform(test_data[all_columns]),
    columns=transformed_feature_order,
)

# Re-attach the mwra label column to both frames
train_data_processed["mwra"] = train_data["mwra"]
test_data_processed["mwra"] = test_data["mwra"]

##### Export cleaned data


In [None]:
# Make sure the output folder exists, then write both processed frames without the index column
os.makedirs("../data/clean_methods", exist_ok=True)

train_data_processed.to_csv(path_or_buf="../data/clean_methods/train_1iqr.csv", index=False)
test_data_processed.to_csv(path_or_buf="../data/clean_methods/test_1iqr.csv", index=False)

#### Iterative IQR


##### Load Dataset


In [None]:
# Reload the raw tables so this cleaning method starts from untouched data
file_path: str = "../data/raw"
files: tuple[str, ...] = ("connections", "devices", "processes", "profiles")

dataset: dict[str, pd.DataFrame] = {}
for file in files:
    dataset[file] = pd.read_csv(f"{file_path}/{file}.csv", sep="\t").drop_duplicates()

# Join connections with processes on the shared imei / ts / mwra columns
df = dataset["connections"].merge(dataset["processes"], on=["imei", "ts", "mwra"])
df["ts"] = pd.to_datetime(df["ts"])

# Seeded 80/20 split; each part gets a fresh 0..n-1 index
train_data, test_data = (
    part.reset_index(drop=True) for part in train_test_split(df, test_size=0.2, random_state=42)
)

##### Cleaning


In [None]:
# Training-set row count before cleaning; used below to report how many rows were removed
number_of_rows_before_outliers = train_data.shape[0]

# Columns checked in the iterative passes: every column after the first three, minus p.android.vending
columns_for_iqr = train_data.iloc[:, 3:].columns.difference(["p.android.vending"])


def remove_outliers_iqr(data, columns):
    """Filter ``data`` down to rows with no IQR outlier in any of ``columns``.

    A value counts as an outlier when it is below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR`` of its column. Rows keep their original index labels.
    """
    values = data[columns]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    not_too_low = values.ge(q1 - 1.5 * iqr).all(axis=1)
    not_too_high = values.le(q3 + 1.5 * iqr).all(axis=1)
    return data.loc[not_too_low & not_too_high]


# First pass: every column after the first three, including p.android.vending
train_data = remove_outliers_iqr(train_data, train_data.iloc[:, 3:].columns)

# Rows that still violate the IQR fences of columns_for_iqr. This equals the
# number of rows remove_outliers_iqr would drop, so the helper is reused
# instead of spelling out the quantile expression again.
outliers_count = len(train_data) - len(remove_outliers_iqr(train_data, columns_for_iqr))

max_iterations = 10
iteration = 0

# Removing rows moves the quartiles, which can expose new outliers; repeat
# until none remain or the iteration cap is reached.
while outliers_count > 0 and iteration < max_iterations:
    train_data = remove_outliers_iqr(train_data, columns_for_iqr)
    outliers_count = len(train_data) - len(remove_outliers_iqr(train_data, columns_for_iqr))
    iteration += 1

train_data = train_data.reset_index(drop=True)

# Compute the removal statistics once and reuse them for printing and for the summary table
removed_rows = number_of_rows_before_outliers - train_data.shape[0]
removed_percentage = (removed_rows / number_of_rows_before_outliers) * 100

print(f"Number of outliers removed: {removed_rows}")
print(f"Percentage of outliers removed: {removed_percentage:.2f}%")

number_of_outliers.loc["Number of Outliers", "Iterative IQR"] = removed_rows
number_of_outliers.loc["Percentage of Outliers", "Iterative IQR"] = round(removed_percentage, 2)

##### Use pipelines


In [None]:
# Fit the preprocessing pipeline on the cleaned training rows and wrap the result in a named DataFrame
train_data_processed = pd.DataFrame(
    complete_pipeline.fit_transform(train_data[all_columns], train_data["mwra"]),
    columns=transformed_feature_order,
)

# The test rows are only transformed with the already fitted pipeline
test_data_processed = pd.DataFrame(
    complete_pipeline.transform(test_data[all_columns]),
    columns=transformed_feature_order,
)

# Re-attach the mwra label column to both frames
train_data_processed["mwra"] = train_data["mwra"]
test_data_processed["mwra"] = test_data["mwra"]

##### Export processed data


In [None]:
# Make sure the output folder exists, then write both processed frames without the index column
os.makedirs("../data/clean_methods", exist_ok=True)

train_data_processed.to_csv(path_or_buf="../data/clean_methods/train_itiqr.csv", index=False)
test_data_processed.to_csv(path_or_buf="../data/clean_methods/test_itiqr.csv", index=False)

#### None-iterative Z-score


##### Load Dataset


In [None]:
# Reload the raw tables so this cleaning method starts from untouched data
file_path: str = "../data/raw"
files: tuple[str, ...] = ("connections", "devices", "processes", "profiles")

dataset: dict[str, pd.DataFrame] = {}
for file in files:
    dataset[file] = pd.read_csv(f"{file_path}/{file}.csv", sep="\t").drop_duplicates()

# Join connections with processes on the shared imei / ts / mwra columns
df = dataset["connections"].merge(dataset["processes"], on=["imei", "ts", "mwra"])
df["ts"] = pd.to_datetime(df["ts"])

# Seeded 80/20 split; each part gets a fresh 0..n-1 index
train_data, test_data = (
    part.reset_index(drop=True) for part in train_test_split(df, test_size=0.2, random_state=42)
)

##### Cleaning


In [None]:
# Training-set row count before cleaning, for the removal statistics below
number_of_rows_before_outliers = len(train_data)

# Keep only rows where every column after the first three has |z-score| < 3, then renumber the index
train_data = train_data.loc[(np.abs(zscore(train_data.iloc[:, 3:])) < 3).all(axis=1)].reset_index(drop=True)

print(f"Number of outliers removed: {number_of_rows_before_outliers - train_data.shape[0]}")
print(
    f"Percentage of outliers removed: {((number_of_rows_before_outliers - train_data.shape[0]) / number_of_rows_before_outliers) * 100:.2f}%"
)

# Record the count and the rounded percentage in the summary table
number_of_outliers.loc["Number of Outliers", "None-iterative Z-score"] = (
    number_of_rows_before_outliers - len(train_data)
)
number_of_outliers.loc["Percentage of Outliers", "None-iterative Z-score"] = round(
    ((number_of_rows_before_outliers - len(train_data)) / number_of_rows_before_outliers) * 100, 2
)

##### Use pipelines


In [None]:
# Fit the preprocessing pipeline on the cleaned training rows and wrap the result in a named DataFrame
train_data_processed = pd.DataFrame(
    complete_pipeline.fit_transform(train_data[all_columns], train_data["mwra"]),
    columns=transformed_feature_order,
)

# The test rows are only transformed with the already fitted pipeline
test_data_processed = pd.DataFrame(
    complete_pipeline.transform(test_data[all_columns]),
    columns=transformed_feature_order,
)

# Re-attach the mwra label column to both frames
train_data_processed["mwra"] = train_data["mwra"]
test_data_processed["mwra"] = test_data["mwra"]

##### Export processed data


In [None]:
# Make sure the output folder exists, then write both processed frames without the index column
os.makedirs("../data/clean_methods", exist_ok=True)

train_data_processed.to_csv(path_or_buf="../data/clean_methods/train_1zscore.csv", index=False)
test_data_processed.to_csv(path_or_buf="../data/clean_methods/test_1zscore.csv", index=False)

#### Iterative Z-score


##### Load Dataset


In [None]:
# Reload the raw tables so this cleaning method starts from untouched data
file_path: str = "../data/raw"
files: tuple[str, ...] = ("connections", "devices", "processes", "profiles")

dataset: dict[str, pd.DataFrame] = {}
for file in files:
    dataset[file] = pd.read_csv(f"{file_path}/{file}.csv", sep="\t").drop_duplicates()

# Join connections with processes on the shared imei / ts / mwra columns
df = dataset["connections"].merge(dataset["processes"], on=["imei", "ts", "mwra"])
df["ts"] = pd.to_datetime(df["ts"])

# Seeded 80/20 split; each part gets a fresh 0..n-1 index
train_data, test_data = (
    part.reset_index(drop=True) for part in train_test_split(df, test_size=0.2, random_state=42)
)

##### Cleaning


In [None]:
# Row count before cleaning. Captured in this cell so the statistics below do
# not depend on a value left over from an earlier cell.
number_of_rows_before_outliers = train_data.shape[0]

# First pass: every column after the first three, including p.android.vending
train_data = train_data[(np.abs(zscore(train_data.iloc[:, 3:])) < 3).all(axis=1)]

# Later passes check every column after the first three except p.android.vending
columns_for_zscore = train_data.iloc[:, 3:].columns.difference(["p.android.vending"])
outliers_count = (~(np.abs(zscore(train_data[columns_for_zscore])) < 3).all(axis=1)).sum()
max_iterations = 10
iteration = 0

# Removing rows changes each column's mean and standard deviation, which can
# expose new outliers; repeat until none remain or the iteration cap is reached.
while outliers_count > 0 and iteration < max_iterations:
    train_data = train_data[(np.abs(zscore(train_data[columns_for_zscore])) < 3).all(axis=1)]
    outliers_count = (~(np.abs(zscore(train_data[columns_for_zscore])) < 3).all(axis=1)).sum()
    iteration += 1

train_data = train_data.reset_index(drop=True)

# Compute the removal statistics once and reuse them for printing and for the summary table
removed_rows = number_of_rows_before_outliers - train_data.shape[0]
removed_percentage = (removed_rows / number_of_rows_before_outliers) * 100

print(f"Number of outliers removed: {removed_rows}")
print(f"Percentage of outliers removed: {removed_percentage:.2f}%")

number_of_outliers.loc["Number of Outliers", "Iterative Z-score"] = removed_rows
number_of_outliers.loc["Percentage of Outliers", "Iterative Z-score"] = round(removed_percentage, 2)

##### Use pipelines


In [None]:
# Fit the preprocessing pipeline on the cleaned training rows and wrap the result in a named DataFrame
train_data_processed = pd.DataFrame(
    complete_pipeline.fit_transform(train_data[all_columns], train_data["mwra"]),
    columns=transformed_feature_order,
)

# The test rows are only transformed with the already fitted pipeline
test_data_processed = pd.DataFrame(
    complete_pipeline.transform(test_data[all_columns]),
    columns=transformed_feature_order,
)

# Re-attach the mwra label column to both frames
train_data_processed["mwra"] = train_data["mwra"]
test_data_processed["mwra"] = test_data["mwra"]

##### Export processed data


In [None]:
# Make sure the output folder exists, then write both processed frames without the index column
os.makedirs("../data/clean_methods", exist_ok=True)

train_data_processed.to_csv(path_or_buf="../data/clean_methods/train_itzscore.csv", index=False)
test_data_processed.to_csv(path_or_buf="../data/clean_methods/test_itzscore.csv", index=False)

### Show comparison


In [None]:
# Display the outlier counts and percentages recorded by the four cleaning cells
number_of_outliers

### Using on Models


#### None-iterative IQR


In [None]:
# Reload the split exported by the non-iterative IQR cleaning
train_data_processed = pd.read_csv("../data/clean_methods/train_1iqr.csv")
test_data_processed = pd.read_csv("../data/clean_methods/test_1iqr.csv")

# Training labels and features (all feature columns kept, no feature selection)
y_train = train_data_processed["mwra"]
X_train = train_data_processed.drop("mwra", axis="columns")

# Test labels and features (all feature columns kept, no feature selection)
y_test = test_data_processed["mwra"]
X_test = test_data_processed.drop("mwra", axis="columns")

In [None]:
# Random forest with the final parameters, fitted on the non-iterative IQR training data
rf_classifier = RandomForestClassifier(
    random_state=42,
    n_estimators=300,
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=1,
    ccp_alpha=0.001,
).fit(X_train, y_train)

# Keep the train/test metrics for the final comparison table
scores_1iqr = get_scores(
    rf_classifier, "RandomForestClassifier - 1 IQR", X_train, y_train, X_test, y_test
)

#### Iterative IQR


In [None]:
# Reload the split exported by the iterative IQR cleaning
train_data_processed = pd.read_csv("../data/clean_methods/train_itiqr.csv")
test_data_processed = pd.read_csv("../data/clean_methods/test_itiqr.csv")

# Training labels and features (all feature columns kept, no feature selection)
y_train = train_data_processed["mwra"]
X_train = train_data_processed.drop("mwra", axis="columns")

# Test labels and features (all feature columns kept, no feature selection)
y_test = test_data_processed["mwra"]
X_test = test_data_processed.drop("mwra", axis="columns")

In [None]:
# Random forest with the final parameters, fitted on the iterative IQR training data
rf_classifier = RandomForestClassifier(
    random_state=42,
    n_estimators=300,
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=1,
    ccp_alpha=0.001,
).fit(X_train, y_train)

# Keep the train/test metrics for the final comparison table
scores_itiqr = get_scores(
    rf_classifier, "RandomForestClassifier - Iterative IQR", X_train, y_train, X_test, y_test
)

#### None-iterative Z-score


In [None]:
# Reload the split exported by the non-iterative Z-score cleaning
train_data_processed = pd.read_csv("../data/clean_methods/train_1zscore.csv")
test_data_processed = pd.read_csv("../data/clean_methods/test_1zscore.csv")

# Training labels and features (all feature columns kept, no feature selection)
y_train = train_data_processed["mwra"]
X_train = train_data_processed.drop("mwra", axis="columns")

# Test labels and features (all feature columns kept, no feature selection)
y_test = test_data_processed["mwra"]
X_test = test_data_processed.drop("mwra", axis="columns")

In [None]:
# Random forest with the final parameters, fitted on the non-iterative Z-score training data
rf_classifier = RandomForestClassifier(
    random_state=42,
    n_estimators=300,
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=1,
    ccp_alpha=0.001,
).fit(X_train, y_train)

# Keep the train/test metrics for the final comparison table
scores_1zscore = get_scores(
    rf_classifier, "RandomForestClassifier - 1 Z-score", X_train, y_train, X_test, y_test
)

#### Iterative Z-score


In [None]:
# Reload the split exported by the iterative Z-score cleaning
train_data_processed = pd.read_csv("../data/clean_methods/train_itzscore.csv")
test_data_processed = pd.read_csv("../data/clean_methods/test_itzscore.csv")

# Training labels and features (all feature columns kept, no feature selection)
y_train = train_data_processed["mwra"]
X_train = train_data_processed.drop("mwra", axis="columns")

# Test labels and features (all feature columns kept, no feature selection)
y_test = test_data_processed["mwra"]
X_test = test_data_processed.drop("mwra", axis="columns")

In [None]:
# Random forest with the final parameters, fitted on the iterative Z-score training data
rf_classifier = RandomForestClassifier(
    random_state=42,
    n_estimators=300,
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=1,
    ccp_alpha=0.001,
).fit(X_train, y_train)

# Keep the train/test metrics for the final comparison table
scores_itzscore = get_scores(
    rf_classifier, "RandomForestClassifier - Iterative Z-score", X_train, y_train, X_test, y_test
)

### Show comparison


In [None]:
# Place the per-method score tables next to each other and display the result
scores = pd.concat(
    objs=[scores_1iqr, scores_itiqr, scores_1zscore, scores_itzscore],
    axis="columns",
)
scores

## B


Dátová transformácia (scaling, transformer, ...).


## C


Výber atribútov, výber algoritmov, hyperparameter tuning, ensemble learning.


## D


Ktorý model je Váš najlepší model pre nasadenie (deployment)?


## E


Aký je data pipeline pre jeho vybudovanie na základe Vášho datasetu v produkcii?


# Test


In [None]:
def get_scores(model, model_name, X_train, y_train, X_test, y_test):
    """Evaluate a fitted binary classifier on the training and test sets.

    Args:
        model: Fitted estimator exposing ``predict`` (and optionally
            ``predict_proba`` or ``decision_function``).
        model_name: Label used as the top level of the result's column index.
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.

    Returns:
        DataFrame indexed by ``accuracy``, ``precision``, ``recall`` and
        ``roc_auc`` with MultiIndex columns ``(model_name, "Train")`` and
        ``(model_name, "Test")``.

    Accuracy, precision and recall use the hard predictions. ROC AUC is a
    ranking metric, so it is computed from continuous scores whenever the
    model provides them, matching the ``scoring="roc_auc"`` grid searches in
    this notebook. Hard labels are used only as a last resort.
    """

    def _auc_inputs(X, y_pred):
        # Prefer the positive-class probability, then the decision function
        if hasattr(model, "predict_proba"):
            return model.predict_proba(X)[:, 1]
        if hasattr(model, "decision_function"):
            return model.decision_function(X)
        return y_pred

    df = pd.DataFrame(
        index=["accuracy", "precision", "recall", "roc_auc"],
        columns=pd.MultiIndex.from_product([[model_name], ["Train", "Test"]]),
    )

    # Same four metrics for both splits
    for split, X, y_true in (("Train", X_train, y_train), ("Test", X_test, y_test)):
        y_pred = model.predict(X)
        df.loc["accuracy", (model_name, split)] = accuracy_score(y_true, y_pred)
        df.loc["precision", (model_name, split)] = precision_score(y_true, y_pred)
        df.loc["recall", (model_name, split)] = recall_score(y_true, y_pred)
        df.loc["roc_auc", (model_name, split)] = roc_auc_score(y_true, _auc_inputs(X, y_pred))

    return df

In [None]:
# Logistic regression with the lbfgs solver; penalty, C and max_iter are searched below
log_reg = LogisticRegression(solver="lbfgs", random_state=42)

parameters = dict(
    penalty=[None, "l2"],
    C=[0.001, 0.005, 0.01, 0.05],
    max_iter=[100, 150, 200, 250, 300],
)

# Exhaustive search scored by ROC AUC with 10-fold cross-validation on the training data
grid_search = GridSearchCV(estimator=log_reg, param_grid=parameters, scoring="roc_auc", n_jobs=-1, cv=10)
grid_search.fit(X_train, y_train)

best_params, best_score = grid_search.best_params_, grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score:.5f}")

In [None]:
# Logistic regression with the liblinear solver; penalty, C and max_iter are searched below
log_reg = LogisticRegression(solver="liblinear", random_state=42)

parameters = dict(
    penalty=["l1", "l2"],
    C=[0.001, 0.005, 0.01, 0.05],
    max_iter=[100, 150, 200, 250, 300],
)

# Exhaustive search scored by ROC AUC with 10-fold cross-validation on the training data
grid_search = GridSearchCV(estimator=log_reg, param_grid=parameters, scoring="roc_auc", n_jobs=-1, cv=10)
grid_search.fit(X_train, y_train)

best_params, best_score = grid_search.best_params_, grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score:.5f}")

In [None]:
# Same regularisation settings with the lbfgs solver
log_reg = LogisticRegression(
    solver="lbfgs",
    penalty="l2",
    C=0.005,
    max_iter=100,
    random_state=42,
).fit(X_train, y_train)
df_1 = get_scores(log_reg, "LogisticRegression - lbfgs", X_train, y_train, X_test, y_test)

# Same regularisation settings with the liblinear solver
log_reg = LogisticRegression(
    solver="liblinear",
    penalty="l2",
    C=0.005,
    max_iter=100,
    random_state=42,
).fit(X_train, y_train)
df_2 = get_scores(log_reg, "LogisticRegression - liblinear", X_train, y_train, X_test, y_test)

# Put both score tables side by side and display them
df = pd.concat(objs=[df_1, df_2], axis="columns")

df