# Start


In [None]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
from graphviz import Source
from IPython.core.magic import register_cell_magic
from IPython.display import HTML, SVG
from matplotlib import pyplot as plt
from scipy.stats import zscore
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    auc,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import (
    GridSearchCV,
    ParameterGrid,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    learning_curve,
    train_test_split,
)
from sklearn.pipeline import FunctionTransformer, Pipeline
from sklearn.preprocessing import (
    MinMaxScaler,
    PowerTransformer,
    QuantileTransformer,
    RobustScaler,
    StandardScaler,
)
from sklearn.tree import DecisionTreeClassifier, export_graphviz

## Process data


### Loading data


In [None]:
# Directory with the raw tab-separated exports and the base names of the files to load.
file_path: str = "../data/raw"
files: tuple[str, ...] = ("connections", "devices", "processes", "profiles")

# Load every raw file and drop exact duplicate rows.
dataset: dict[str, pd.DataFrame] = {}
for file in files:
    dataset[file] = pd.read_csv(f"{file_path}/{file}.csv", sep="\t")
    dataset[file] = dataset[file].drop_duplicates()

# Only "connections" and "processes" are merged (pandas' default inner join on the shared keys);
# "devices" and "profiles" are loaded but not used in this cell.
df = pd.merge(dataset["connections"], dataset["processes"], on=["imei", "ts", "mwra"])
df["ts"] = pd.to_datetime(df.ts)

# Hold out 20 % of the rows as the test set; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

### Cleaning data


In [None]:
# First pass: drop every training row with an outlier (|z-score| >= 3) in any column
# from the fourth one onwards, p.android.vending included.
# NOTE(review): assumes the first three columns are the keys (imei / ts / mwra) — confirm.
train_data = train_data[(np.abs(zscore(train_data.iloc[:, 3:])) < 3).all(axis=1)]

# Later passes use all of those columns except p.android.vending for outlier detection.
columns_for_zscore = train_data.iloc[:, 3:].columns.difference(["p.android.vending"])
outliers_count = (~(np.abs(zscore(train_data[columns_for_zscore])) < 3).all(axis=1)).sum()
max_iterations = 10
iteration = 0

# Removing rows shifts the mean/std and can expose new outliers, so repeat until none
# are left or the iteration cap is reached.
while outliers_count > 0 and iteration < max_iterations:
    train_data = train_data[(np.abs(zscore(train_data[columns_for_zscore])) < 3).all(axis=1)]
    outliers_count = (~(np.abs(zscore(train_data[columns_for_zscore])) < 3).all(axis=1)).sum()
    iteration += 1

train_data = train_data.reset_index(drop=True)

### Export cleaned data


In [None]:
# Persist the cleaned train split and the untouched test split so they can be reloaded from disk.
os.makedirs("../data/clean", exist_ok=True)

train_data.to_csv("../data/clean/train_data.csv", index=False)
test_data.to_csv("../data/clean/test_data.csv", index=False)

### Import cleaned data


In [None]:
# Reload the cleaned splits from disk (read_csv gives both frames a fresh default index).
train_data = pd.read_csv("../data/clean/train_data.csv")
test_data = pd.read_csv("../data/clean/test_data.csv")

### Define columns


In [None]:
# Feature columns: everything except the target ("mwra") and the timestamp / device-id columns.
all_columns = train_data.drop(columns=["mwra", "ts", "imei"]).columns
# Columns routed to the quantile-transformer branch of the preprocessor.
# NOTE(review): presumably selected because their distributions are not Gaussian — confirm against the EDA.
non_gaussian_columns = [
    "c.android.vending",
    "c.UCMobile.x86",
    "c.updateassist",
    "c.UCMobile.intl",
    "p.android.vending",
    "p.dogalize",
    "p.olauncher",
    "p.simulator",
    "p.inputmethod.latin",
    "p.android.gms",
    "p.notifier",
    "p.katana",
    "p.gms.persistent",
]
# All remaining feature columns, in their original order.
gaussian_columns = all_columns[~all_columns.isin(non_gaussian_columns)]
# Feature order after preprocessing: the gaussian block first, then the non-gaussian block.
transformed_feature_order = pd.Series(gaussian_columns.tolist() + non_gaussian_columns)

### Define pipelines


In [None]:
# Branch for the "gaussian" columns: standardize, then apply a Yeo-Johnson power transform.
general_pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("power_transformer", PowerTransformer(method="yeo-johnson")),
    ]
)

# Branch for the non-gaussian columns (all of them, not only the *.android.vending ones):
# map each column onto a normal distribution via its quantiles.
vending_pipeline = Pipeline(
    [
        ("quantile_transformer", QuantileTransformer(output_distribution="normal", random_state=42)),
    ]
)

# Create column transformer; its output is ordered "general" columns, then "vending" columns,
# then any column not listed in either group (passed through unchanged).
preprocessor = ColumnTransformer(
    transformers=[
        ("general", general_pipe, gaussian_columns),
        ("vending", vending_pipeline, non_gaussian_columns),
    ],
    remainder="passthrough",
)

# Create complete pipeline: preprocessing followed by selection of the 10 features with the
# highest ANOVA F-score against the target.
complete_pipeline = Pipeline([("preprocessor", preprocessor), ("selector", SelectKBest(f_classif, k=10))])

### Transform data


In [None]:
# Fit the pipeline on the training data only, then apply the fitted pipeline to the test data
train_data_processed = complete_pipeline.fit_transform(train_data[all_columns], train_data["mwra"])
test_data_processed = complete_pipeline.transform(test_data[all_columns])

# Get selected features immediately after fitting
feature_mask = complete_pipeline.named_steps["selector"].get_support()
# The mask is positional over the preprocessor output, whose column order is transformed_feature_order
selected_features = transformed_feature_order[feature_mask]  # order of features is preserved

# Create DataFrames with selected feature names
train_data_processed = pd.DataFrame(train_data_processed, columns=selected_features)
test_data_processed = pd.DataFrame(test_data_processed, columns=selected_features)

# Re-attach the target; the assignment aligns on the row index, which is the default 0..n-1
# index on both sides (fresh DataFrame here, read_csv result for train_data / test_data)
train_data_processed["mwra"] = train_data["mwra"]
test_data_processed["mwra"] = test_data["mwra"]

### Export processed data


In [None]:
# Persist the processed (transformed + feature-selected) splits, target column included.
os.makedirs("../data/processed", exist_ok=True)

train_data_processed.to_csv("../data/processed/train_data.csv", index=False)
test_data_processed.to_csv("../data/processed/test_data.csv", index=False)

### Cleanup


In [None]:
# Drop the intermediate objects of the data-processing section from the notebook namespace;
# the processed CSV files written above are the hand-over to the following sections.
del (
    file_path,
    files,
    file,
    df,
    train_data,
    test_data,
    columns_for_zscore,
    outliers_count,
    max_iterations,
    iteration,
    all_columns,
    non_gaussian_columns,
    gaussian_columns,
    transformed_feature_order,
    general_pipe,
    vending_pipeline,
    preprocessor,
    complete_pipeline,
    train_data_processed,
    test_data_processed,
    dataset,
    selected_features,
)

## Helper Stuff


In [None]:
# Reload the processed splits written by the "Export processed data" cell.
train_data_processed = pd.read_csv("../data/processed/train_data.csv")
test_data_processed = pd.read_csv("../data/processed/test_data.csv")

# Train features / target. The processed files hold only the columns kept by the
# SelectKBest step, so feature selection is already applied here.
X_train = train_data_processed.drop(columns=["mwra"])
y_train = train_data_processed["mwra"]

# Test features / target (same selected columns as the training data)
X_test = test_data_processed.drop(columns=["mwra"])
y_test = test_data_processed["mwra"]

del train_data_processed, test_data_processed

In [None]:
def get_scores(model, model_name, X_train, y_train, X_test, y_test):
    """Score an already fitted binary classifier on the train and the test set.

    Args:
        model: Fitted classifier exposing ``predict``. ``predict_proba`` or
            ``decision_function`` is used for ROC AUC when available.
        model_name: Label used as the top level of the result's column index.
        X_train, y_train: Training features and true labels.
        X_test, y_test: Test features and true labels.

    Returns:
        DataFrame indexed by ``accuracy``, ``precision``, ``recall``,
        ``f1-score`` and ``roc_auc`` with the columns
        ``(model_name, "Train" | "Test" | "Difference")``, where
        ``Difference`` is train minus test.
    """

    def _roc_auc_input(X):
        # ROC AUC ranks samples by a continuous score; hard labels collapse the curve to a
        # single operating point, so prefer probabilities / decision values when the model
        # offers them and fall back to the predicted labels otherwise.
        if hasattr(model, "predict_proba"):
            return np.asarray(model.predict_proba(X))[:, 1]
        if hasattr(model, "decision_function"):
            return model.decision_function(X)
        return model.predict(X)

    label_metrics = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1-score": f1_score,
    }

    def _evaluate(X, y_true):
        # One prediction pass per data set, shared by all label-based metrics.
        y_pred = model.predict(X)
        scores = {name: metric(y_true, y_pred) for name, metric in label_metrics.items()}
        scores["roc_auc"] = roc_auc_score(y_true, _roc_auc_input(X))
        return pd.Series(scores, dtype=float)

    train_scores = _evaluate(X_train, y_train)
    test_scores = _evaluate(X_test, y_test)

    df = pd.DataFrame(
        {"Train": train_scores, "Test": test_scores, "Difference": train_scores - test_scores}
    )
    df.columns = pd.MultiIndex.from_product([[model_name], df.columns])
    return df

In [None]:
def get_scores_cv(model, model_name, X, y, cv):
    """Cross-validate ``model`` and summarise train/test scores per metric.

    Returns a DataFrame whose rows are ``("Train" | "Test" | "Difference", metric)``
    and whose columns are ``(model_name, "Mean" | "Std")``. "Difference" is the
    fold-wise train score minus test score.
    """
    metric_names = ["accuracy", "precision", "recall", "f1", "roc_auc"]

    # Run the cross-validation once for all metrics, keeping the training scores as well
    cv_results = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring={name: name for name in metric_names},
        return_train_score=True,
    )

    # Empty result table, filled cell by cell below
    table = pd.DataFrame(
        index=pd.MultiIndex.from_product([["Train", "Test", "Difference"], metric_names]),
        columns=pd.MultiIndex.from_product([[model_name], ["Mean", "Std"]]),
    )
    mean_col = (model_name, "Mean")
    std_col = (model_name, "Std")

    for name in metric_names:
        fold_train = cv_results[f"train_{name}"]
        fold_test = cv_results[f"test_{name}"]
        per_row = (
            ("Train", fold_train),
            ("Test", fold_test),
            ("Difference", fold_train - fold_test),
        )
        for row_label, fold_values in per_row:
            table.loc[(row_label, name), mean_col] = fold_values.mean()
            table.loc[(row_label, name), std_col] = fold_values.std()

    return table

In [None]:
def plot_learning_curve(estimator, X, y, train_sizes, cv, scoring, title):
    """Plot the cross-validated learning curve of ``estimator`` as an error curve.

    Args:
        estimator: Unfitted estimator handed to ``learning_curve``.
        X, y: Features and labels the curve is computed on.
        train_sizes: Training-set sizes (fractions or absolute counts) to evaluate.
        cv: Cross-validation strategy passed through to ``learning_curve``.
        scoring: Name of the score to compute; the plot shows ``1 - score``.
        title: Title of the figure.
    """
    # learning_curve reports the sizes it actually used; keep them separate from the request.
    sizes_used, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, train_sizes=train_sizes, scoring=scoring, n_jobs=-1
    )
    # Average over the folds and turn the score into an error.
    train_error_mean = 1 - train_scores.mean(axis=1)
    cv_error_mean = 1 - test_scores.mean(axis=1)

    plt.figure()
    # The curves are errors (1 - score), so the legend says so, matching the y-axis label.
    plt.plot(sizes_used, train_error_mean, label="Training Error")
    plt.plot(sizes_used, cv_error_mean, label="Cross-Validation Error")
    plt.xlabel("Training Size")
    plt.ylabel(f"{scoring} Error")
    plt.title(title)
    plt.legend()
    plt.show()

In [None]:
def plot_model_complexity_curve(model, X_train, y_train, X_test, y_test, max_depth_range):
    """Plot train and validation error of ``model`` for a range of ``max_depth`` values.

    Args:
        model: scikit-learn estimator with a ``max_depth`` parameter. It is cloned for
            every depth, so the passed instance is never fitted or modified.
        X_train, y_train: Data each clone is fitted on.
        X_test, y_test: Held-out data used for the validation error.
        max_depth_range: Iterable of ``max_depth`` values to evaluate.
    """
    from sklearn.base import clone  # local import keeps this helper self-contained

    # Errors (1 - accuracy) per evaluated depth
    train_errors = []
    val_errors = []

    for max_depth in max_depth_range:
        # Use the caller's configuration and vary only the depth
        candidate = clone(model).set_params(max_depth=max_depth)
        candidate.fit(X_train, y_train)

        train_errors.append(1 - accuracy_score(y_train, candidate.predict(X_train)))
        val_errors.append(1 - accuracy_score(y_test, candidate.predict(X_test)))

    # Plot the learning curve for model complexity
    plt.figure()
    plt.plot(max_depth_range, train_errors, label="Training Error")
    plt.plot(max_depth_range, val_errors, label="Validation Error")
    plt.xlabel("Max Depth")
    plt.ylabel("Error")
    plt.title("Learning Curve (Model Complexity)")
    plt.legend()
    plt.show()

In [None]:
def compare_confusion_matrix(models, X_train, y_train, X_test, y_test):
    """Fit each model and draw the test-set confusion matrices side by side.

    Args:
        models: Iterable of ``(name, estimator)`` pairs; every estimator is (re)fitted
            on the training data.
        X_train, y_train: Data the estimators are fitted on.
        X_test, y_test: Data the confusion matrices are computed on.

    Raises:
        ValueError: If ``models`` is empty.
    """
    models = list(models)
    if not models:
        raise ValueError("models must contain at least one (name, estimator) pair")

    # squeeze=False always yields a 2-D array of axes, so a single model works as well
    _, axes = plt.subplots(1, len(models), figsize=(15, 5), squeeze=False)

    for ax, (model_name, model) in zip(axes[0], models):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        conf_matrix = confusion_matrix(y_test, y_pred)
        sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
        ax.set_title(model_name)

    plt.show()

In [None]:
def show_confusion_matrix(model, model_name, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and show its test-set confusion matrix as a heatmap."""
    heatmap_options = {"annot": True, "fmt": "d", "cmap": "Blues", "cbar": False}

    model.fit(X_train, y_train)
    matrix = confusion_matrix(y_test, model.predict(X_test))

    sns.heatmap(matrix, **heatmap_options)
    plt.title(model_name)
    plt.show()

In [None]:
# Fixed hyper-parameter sets, one dict per model family.
# NOTE(review): presumably the tuned values found in section 3.3, meant to be unpacked into
# the matching scikit-learn estimators — confirm against the cells that use them.

# Random forest
rf_best_params = {
    "n_estimators": 300,
    "max_depth": 10,
    "min_samples_split": 5,
    "min_samples_leaf": 1,
    "ccp_alpha": 0.001,
    "random_state": 42,
}

# Logistic regression
lgr_best_params = {"C": 0.01, "max_iter": 100, "penalty": "l2", "solver": "lbfgs", "tol": 0.001}

# Gradient boosting
gbg_best_params = {
    "subsample": 0.8,
    "n_estimators": 100,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "max_depth": 4,
    "learning_rate": 0.01,
    "random_state": 42,
}

# Histogram-based gradient boosting
hbg_best_params = {
    "l2_regularization": 2.0,
    "learning_rate": 0.1,
    "max_depth": 7,
    "max_iter": 200,
    "min_samples_leaf": 20,
    "random_state": 42,
}

In [None]:
@register_cell_magic
def ignore(line, cell):
    """Cell magic ``%%ignore``: accept the cell's text and do nothing with it."""

# 3.1


-   Jednoduchý klasifikátor na základe závislosti v dátach.


## A


Naimplementujte jednoduchý ID3 klasifikátor s hĺbkou min 2 (vrátane root/koreň).


In [None]:
class Node:
    """Single node of the ID3 decision tree.

    A decision node carries ``feature``, ``threshold``, ``left`` and ``right``
    (and optionally ``info_gain``); a leaf carries only ``value``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None, info_gain=None):
        # Split definition: column index and cut point
        self.feature, self.threshold = feature, threshold
        # Child subtrees
        self.left, self.right = left, right
        # Leaf prediction and split quality
        self.value, self.info_gain = value, info_gain

In [None]:
class ID3Classifier:
    """Decision tree grown greedily by entropy-based information gain.

    Every feature is treated as numeric: a split sends the samples with
    ``x[feature] <= threshold`` to the left child and those with
    ``x[feature] > threshold`` to the right child.

    A node at depth ``d`` (the root has depth 0) is split while
    ``d <= max_depth``, so leaves can sit at depth ``max_depth + 1``.
    """

    def __init__(self, max_depth: int = 2):
        self.max_depth = max_depth
        self.tree = None  # root Node, set by fit()

    def build_tree(self, X, y, curr_depth: int = 0):
        """Recursively grow and return the subtree for the samples ``X`` / ``y``.

        ``X`` must be a 2-D NumPy array and ``y`` a 1-D NumPy array of equal length.
        """
        _, num_features = X.shape

        # A pure node can never produce a positive information gain, so skip the
        # expensive split search for it and fall through to the leaf.
        if curr_depth <= self.max_depth and len(np.unique(y)) > 1:
            best_split = self.get_best_split(X, y, num_features)
            # best_split is empty when there was nothing to split on (e.g. no features)
            if best_split.get("info_gain", 0) > 0:
                left_subtree = self.build_tree(best_split["X_left"], best_split["y_left"], curr_depth + 1)
                right_subtree = self.build_tree(best_split["X_right"], best_split["y_right"], curr_depth + 1)

                return Node(
                    feature=best_split["feature"],
                    threshold=best_split["threshold"],
                    left=left_subtree,
                    right=right_subtree,
                    info_gain=best_split["info_gain"],
                )

        return Node(value=self.calculate_leaf_value(y))

    def get_best_split(self, X, y, num_features):
        """Return the split with the highest information gain.

        Every unique value of every feature is tried as a threshold. The result is a
        dict with the keys ``feature``, ``threshold``, ``X_left``, ``X_right``,
        ``y_left``, ``y_right`` and ``info_gain``; it is empty when no candidate exists.
        """
        best_split = {}
        max_info_gain = -float("inf")

        for feature in range(num_features):
            for threshold in np.unique(X[:, feature]):
                X_left, X_right, y_left, y_right = self.split(X, y, feature, threshold)
                curr_info_gain = self.information_gain(y, y_left, y_right)

                # Strict comparison: on ties the first candidate found is kept
                if curr_info_gain > max_info_gain:
                    best_split = {
                        "feature": feature,
                        "threshold": threshold,
                        "X_left": X_left,
                        "X_right": X_right,
                        "y_left": y_left,
                        "y_right": y_right,
                        "info_gain": curr_info_gain,
                    }
                    max_info_gain = curr_info_gain

        return best_split

    @staticmethod
    def split(X, y, feature, threshold):
        """Partition ``X`` / ``y`` into the rows with ``X[:, feature] <= threshold`` and the rest above it."""
        column = X[:, feature]
        left_mask = column <= threshold
        # Kept as an explicit ">" (not ~left_mask) so NaN values end up in neither side, as before
        right_mask = column > threshold

        return X[left_mask], X[right_mask], y[left_mask], y[right_mask]

    def information_gain(self, parent, l_child, r_child):
        """Return the entropy of ``parent`` minus the size-weighted entropy of the two children."""
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)

        return self.entropy(parent) - (weight_l * self.entropy(l_child) + weight_r * self.entropy(r_child))

    @staticmethod
    def entropy(y):
        """Return the Shannon entropy (base 2) of the labels in ``y``; 0 for an empty ``y``."""
        ent = 0

        for cls in np.unique(y):
            p = np.sum(y == cls) / len(y)
            ent += p * np.log2(p)

        return -ent

    @staticmethod
    def calculate_leaf_value(y):
        """Return the most frequent label in ``y``; ties go to the label that occurs first."""
        from collections import Counter  # local import keeps the class self-contained

        # One counting pass instead of calling list.count once per element (quadratic)
        counts = Counter(y)
        return max(counts, key=counts.get)

    def fit(self, X, y):
        """Grow the tree from features ``X`` (2-D array-like) and labels ``y`` (1-D array-like).

        Raises:
            ValueError: If ``X`` is not 2-D, ``X`` and ``y`` differ in length, or no samples are given.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if len(X) != len(y):
            raise ValueError(f"X and y must have the same length, got {len(X)} and {len(y)}")
        if len(y) == 0:
            raise ValueError("cannot fit ID3Classifier on an empty data set")

        self.tree = self.build_tree(X, y)

    def predict(self, X):
        """Return a list with the predicted label for every row of ``X`` (2-D array-like).

        Raises:
            RuntimeError: If the classifier has not been fitted yet.
        """
        if self.tree is None:
            raise RuntimeError("ID3Classifier is not fitted yet; call fit() first")

        return [self.traverse_tree(x, self.tree) for x in np.asarray(X)]

    def traverse_tree(self, x, node):
        """Follow the sample ``x`` from ``node`` down to a leaf and return the leaf's value."""
        # Decision nodes have value None; leaves carry the predicted label
        while node.value is None:
            node = node.left if x[node.feature] <= node.threshold else node.right

        return node.value

## B


Vyhodnoťte Váš ID3 klasifikátor pomocou metrík accuracy, precision a recall.


In [None]:
# Train the hand-written ID3 tree; it is fed plain NumPy arrays (.values), not DataFrames.
id3_classifier = ID3Classifier(max_depth=5)
id3_classifier.fit(X_train.values, y_train.values)

In [None]:
def get_scores_id3(model, model_name, X_train, y_train, X_test, y_test):
    """Score the hand-written ID3 model on the train and the test set.

    Thin wrapper around ``get_scores``: the only difference is that the feature
    frames are handed over as NumPy arrays (``.values``).

    Args:
        model: Fitted classifier exposing ``predict`` for NumPy arrays.
        model_name: Label used as the top level of the result's column index.
        X_train, X_test: Feature DataFrames.
        y_train, y_test: True labels.

    Returns:
        The metrics table produced by ``get_scores``.
    """
    return get_scores(model, model_name, X_train.values, y_train, X_test.values, y_test)


# Score the fitted ID3 classifier on train and test data and display the table
scores_id3 = get_scores_id3(id3_classifier, "ID3", X_train, y_train, X_test, y_test)
scores_id3

Unnamed: 0_level_0,ID3,ID3,ID3
Unnamed: 0_level_1,Train,Test,Difference
accuracy,0.914271,0.90288,0.011391
precision,0.91727,0.903275,0.013995
recall,0.950271,0.945874,0.004397
f1-score,0.933479,0.924084,0.009395
roc_auc,0.901226,0.888562,0.012664


-   We see that using feature selection gives worse results. We will look at it in more detail later (3.4).


## C


Zistite, či Váš ID3 klasifikátor má overfit.


-   Since train and test metrics are close, the model is likely not overfitting.
-   But let's also look at learning curves.


### Learning curve - train sizes


In [None]:
# Initialize the model
# NOTE(review): the curves below are computed with scikit-learn's DecisionTreeClassifier,
# not with the ID3Classifier defined above — confirm this stand-in is intended.
dtc_nofs = DecisionTreeClassifier(
    criterion="gini", max_depth=15, min_samples_split=10, min_samples_leaf=1, ccp_alpha=0.001, random_state=42
)

# Define the training sizes: 50 evenly spaced fractions from 10 % to 100 %
train_sizes = np.linspace(0.1, 1.0, 50)

# Plot learning curve for accuracy
plot_learning_curve(
    dtc_nofs,
    X_train,
    y_train,
    train_sizes,
    cv=5,
    scoring="accuracy",
    title="Learning Curve - Train size (Accuracy)",
)

# Plot learning curve for precision
plot_learning_curve(
    dtc_nofs,
    X_train,
    y_train,
    train_sizes,
    cv=5,
    scoring="precision",
    title="Learning Curve - Train size (Precision)",
)

# Plot learning curve for recall
plot_learning_curve(
    dtc_nofs, X_train, y_train, train_sizes, cv=5, scoring="recall", title="Learning Curve - Train size (Recall)"
)

-   We don't see a big gap in metrics between train and test data (Looking at last point since that is what we used in previous steps).
-   However, we see that the model starts off badly (as expected) and then improves with more data. Around 50% of the training data seems to be enough.


### Learning curve - model complexity


In [None]:
# Define the range of max_depth values (1 to 29 inclusive)
max_depth_range = range(1, 30)

# Plot model complexity curve (train error vs. error on the held-out test set)
plot_model_complexity_curve(dtc_nofs, X_train, y_train, X_test, y_test, max_depth_range)

-   This also shows no overfitting for our graph as there is no significant gap (looking at max_depth=15, as this is what we used in previous steps).
-   We can also see that the model starts off very badly and is underfitting.
-   Around max_depth=5 training and testing error stabilizes.


### ROC


In [None]:
# Fresh decision tree with the same settings as in the learning-curve cells
dtc_nofs = DecisionTreeClassifier(
    criterion="gini", max_depth=15, min_samples_split=10, min_samples_leaf=1, ccp_alpha=0.001, random_state=42
)

dtc_nofs.fit(X_train, y_train)

# Compute ROC curve and ROC area for train data (score = predicted probability, second column of predict_proba)
fpr_train, tpr_train, _ = roc_curve(y_train, dtc_nofs.predict_proba(X_train)[:, 1])
roc_auc_train = auc(fpr_train, tpr_train)

# Compute ROC curve and ROC area for test data
fpr_test, tpr_test, _ = roc_curve(y_test, dtc_nofs.predict_proba(X_test)[:, 1])
roc_auc_test = auc(fpr_test, tpr_test)

# Plot ROC curve; the dashed diagonal is the chance-level reference
plt.figure()
plt.plot(fpr_train, tpr_train, color="blue", lw=2, label=f"Train ROC curve (area = {roc_auc_train:.5f})")
plt.plot(fpr_test, tpr_test, color="red", lw=2, label=f"Test ROC curve (area = {roc_auc_test:.5f})")
plt.plot([0, 1], [0, 1], color="gray", lw=2, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.show()

-   ROC also shows no significant gap between train and test data. So we can conclude that model is not overfitting.


## Cleanup


In [None]:
# Drop the temporaries of section 3.1. Only names this notebook actually creates are
# listed: deleting an undefined name (the former `dtc_fs`) raises NameError.
del (
    dtc_nofs,
    fpr_train,
    tpr_train,
    fpr_test,
    tpr_test,
    train_sizes,
    max_depth_range,
    roc_auc_test,
    roc_auc_train,
)

# 3.2


Trénovanie a vyhodnotenie klasifikátorov strojového učenia.


## A


Na trénovanie využite jeden stromový algoritmus v scikit-learn.


### Initialize DecisionTreeClassifier


In [None]:
# Use some basic parameters (depth limit + cost-complexity pruning) to avoid overfitting
dst_classifier = DecisionTreeClassifier(max_depth=15, ccp_alpha=0.001, random_state=42)
dst_classifier.fit(X_train, y_train)

### Metrics


In [None]:
# 5-fold cross-validated metrics of the decision tree on the training data
df_1 = get_scores_cv(dst_classifier, "DecisionTreeClassifier", X_train, y_train, cv=5)
df_1

In [None]:
# dst_classifier is a DecisionTreeClassifier, so the plot title has to say so
show_confusion_matrix(dst_classifier, "DecisionTreeClassifier", X_train, y_train, X_test, y_test)

-   Even quite simple model gives good results without overfitting.


## B


Porovnajte s jedným iným nestromovým algoritmom v scikit-learn.


### Initialize LogisticRegression


In [None]:
# Logistic regression with scikit-learn's default settings and a fixed seed
log_reg = LogisticRegression(random_state=42)

### Metrics


In [None]:
# 5-fold cross-validated metrics of the logistic regression on the training data
df_2 = get_scores_cv(log_reg, "LogisticRegression", X_train, y_train, cv=5)
df_2

In [None]:
# Fits log_reg on the training data and plots its confusion matrix on the test data
show_confusion_matrix(log_reg, "LogisticRegression", X_train, y_train, X_test, y_test)

-   Basic logistic regression gives quite good results without overfitting as well


### Comparison


In [None]:
# Put both score tables side by side (one column group per model), then drop the originals
df = pd.concat([df_1, df_2], axis=1)
del df_1, df_2
df

In [None]:
# dst_classifier is a DecisionTreeClassifier, so its subplot title has to say so
compare_confusion_matrix(
    [("DecisionTreeClassifier", dst_classifier), ("LogisticRegression", log_reg)], X_train, y_train, X_test, y_test
)

-   Basic Logistic Regression gives slightly better results than basic Decision Tree Classifier.


## C


Porovnajte výsledky s ID3 z prvého kroku.


In [None]:
# TODO: Finish

## D


Vizualizujte natrénované pravidlá minimálne pre jeden Vami vybraný algoritmus.


In [None]:
# Initialize DecisionTreeClassifier
dst_classifier = DecisionTreeClassifier(max_depth=15, ccp_alpha=0.001, random_state=42)

# Fit the classifier to the training data
dst_classifier.fit(X_train, y_train)

# Generate graph: export the fitted tree as Graphviz DOT text and wrap it in a graphviz Source
graph = Source(
    export_graphviz(
        dst_classifier, out_file=None, class_names=["no", "yes"], filled=True, feature_names=X_train.columns
    )
)

# Display graph
display(SVG(graph.pipe(format="svg")))

# Inject a stylesheet so the rendered SVG is scaled to the width of the output area
style = "<style>svg{width:100%;height:70%;}</style>"
HTML(style)

## E


Vyhodnoťte natrénované modely pomocou metrík accuracy, precision a recall.


-   We already did this but let's do it again.


### Initialize classifiers


In [None]:
# Initialize classifiers (fresh, unfitted instances with the same settings as in parts A and B)
dst_classifier = DecisionTreeClassifier(max_depth=15, ccp_alpha=0.001, random_state=42)
log_reg = LogisticRegression(random_state=42)

### Compare metrics


In [None]:
# Compute the 5-fold cross-validated scores of both models
df_1 = get_scores_cv(dst_classifier, "DecisionTreeClassifier", X_train, y_train, cv=5)
df_2 = get_scores_cv(log_reg, "LogisticRegression", X_train, y_train, cv=5)

# Show them side by side, one column group per model
df = pd.concat([df_1, df_2], axis=1)
del df_1, df_2
df

### Compare matrices


In [None]:
# Fit both models on the training data and plot their test-set confusion matrices side by side
compare_confusion_matrix(
    [
        ("DecisionTreeClassifier", dst_classifier),
        ("LogisticRegression", log_reg),
    ],
    X_train,
    y_train,
    X_test,
    y_test,
)

### Comparison


-   As we already said, Logistic Regression gives slightly better results than Decision Tree Classifier.
-   However, both models are quite good and don't overfit.


## Cleanup


In [None]:
# Drop the models and visualisation helpers created in section 3.2
del dst_classifier, log_reg, graph, style

# 3.3


Optimalizácia alias hyperparameter tuning.


## A


Vyskúšajte rôzne nastavenie hyperparametrov (tuning) pre zvolený algoritmus tak,
aby ste optimalizovali výkonnosť (bez underfitingu).

We will test parameters for RandomForestClassifier as we know this will be better than DecisionTreeClassifier.


### Decision Tree vs Random Forest


In [None]:
%%ignore
# Define the parameter grid (4 * 3 * 3 * 4 * 4 = 576 combinations)
param_grid = {
    "max_depth": [5, 10, 15, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "ccp_alpha": [0.0, 0.0005, 0.001, 0.0015],
    "min_impurity_decrease": [0.0, 0.0005, 0.001, 0.0015],
}

# Initialize the DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier(random_state=42)

# Initialize GridSearchCV: exhaustive search, 5-fold CV, ranked by accuracy, all cores
grid_search = GridSearchCV(
    estimator=dt_classifier, param_grid=param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=1
)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

-   Best Parameters: {'ccp_alpha': 0.0005, 'max_depth': 10, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 4, 'min_samples_split': 2}
-   Best Score: 0.9026635738931255


In [None]:
# Decision tree with the best parameters reported by the grid search above
dt_classifier = DecisionTreeClassifier(
    max_depth=10, min_samples_split=2, min_samples_leaf=4, ccp_alpha=0.0005, random_state=42
)
# Untuned random forest with basic depth / pruning limits
rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=15, ccp_alpha=0.001, random_state=42)

# 5-fold cross-validated scores of both models
df_1 = get_scores_cv(dt_classifier, "DecisionTreeClassifier", X_train, y_train, cv=5)
df_2 = get_scores_cv(rf_classifier, "RandomForestClassifier", X_train, y_train, cv=5)

# Show them side by side, one column group per model
df = pd.concat([df_1, df_2], axis=1)
del df_1, df_2
df

-   This shows the difference between an optimized Decision Tree and a basic Random Forest (with basic parameters to avoid overfitting).
-   We can clearly see that Random Forest is better and therefore we are gonna use it for further steps.


### First, find baseline parameters


#### n_estimators


In [None]:
# Initialize the RandomForestClassifier. The seed is fixed so that the comparison with the
# n_estimators=1000 run reflects the parameter change and not run-to-run randomness.
rf_classifier = RandomForestClassifier(
    n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_depth=20, ccp_alpha=0.001, random_state=42
)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Print the scores
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

In [None]:
# Initialize the RandomForestClassifier. The seed is fixed so that the comparison with the
# n_estimators=100 run reflects the parameter change and not run-to-run randomness.
rf_classifier = RandomForestClassifier(
    n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_depth=20, ccp_alpha=0.001, random_state=42
)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Print the scores
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

-   We see that performance is not increasing with n_estimators.


#### ccp_alpha


In [None]:
# Initialize the RandomForestClassifier (ccp_alpha probe, first value: 0.002)
rf_classifier = RandomForestClassifier(criterion="entropy", n_estimators=100, random_state=42, ccp_alpha=0.002)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Compute and display the train / test scores
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

In [None]:
# Initialize the RandomForestClassifier (ccp_alpha probe, second value: 0.005)
rf_classifier = RandomForestClassifier(criterion="entropy", n_estimators=100, random_state=42, ccp_alpha=0.005)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Compute and display the train / test scores
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

-   We see ccp_alpha 0.002 is better considering ROC AUC factor.


#### max_features


In [None]:
# Initialize the RandomForestClassifier (max_features probe: "sqrt", with max_depth fixed to 7)
rf_classifier = RandomForestClassifier(
    criterion="entropy",
    n_estimators=100,
    random_state=42,
    ccp_alpha=0.002,
    max_features="sqrt",
    max_depth=7,
)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Compute and display the train / test scores
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

In [None]:
# Initialize the RandomForestClassifier (max_features probe: "log2", with max_depth fixed to 7)
rf_classifier = RandomForestClassifier(
    criterion="entropy",
    n_estimators=100,
    random_state=42,
    ccp_alpha=0.002,
    max_features="log2",
    max_depth=7,
)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Compute and display the train / test scores
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

-   When also using max_depth, there is difference in ROC AUC, sqrt is better.


#### criterion


In [None]:
# Initialize the RandomForestClassifier (criterion probe: "entropy"; same settings as the
# ccp_alpha=0.002 run above)
rf_classifier = RandomForestClassifier(
    criterion="entropy",
    n_estimators=100,
    random_state=42,
    ccp_alpha=0.002,
)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Compute and display the train / test scores
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

In [None]:
# Initialize the RandomForestClassifier (criterion probe: "gini", everything else unchanged)
rf_classifier = RandomForestClassifier(
    criterion="gini",
    n_estimators=100,
    random_state=42,
    ccp_alpha=0.002,
)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Compute and display the train / test scores
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

-   Entropy is better considering ROC AUC factor.


### RandomizedSearchCV


-   In previous step, we found that n_estimators=100, ccp_alpha=0.002, max_features=sqrt, criterion=entropy are better.
-   However we are not gonna look at max_features and criterion in first tuning method as we will look at them at the end when we have best primary hyperparameters.
-   Not using max_features and criterion in first tuning gives more priority to other hyperparameters.
-   Our primary hyperparameters are n_estimators, max_depth, min_samples_split, min_samples_leaf, ccp_alpha.


In [None]:
%%ignore
# Define the broad parameter grid
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [5, 10, 15, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "ccp_alpha": [0.001, 0.002, 0.003],
}

# Initialize the RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
random_search = RandomizedSearchCV(
    estimator=rf_classifier,
    param_distributions=param_grid,
    n_iter=100,
    n_jobs=-1,
    verbose=1,
    cv=10,  # 10-fold cross-validation for more reliable results
    scoring="roc_auc",
    random_state=42,
)

# Fit the classifier to the training data
random_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = random_search.best_params_
best_score = random_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score:.5f}")

-   Best Parameters: {'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 15, 'ccp_alpha': 0.002}
-   Best Score: 0.91579


### GridSearchCV


#### First iteration


In [None]:
%%ignore
# Best parameters from RandomizedSearchCV
best_params = {
    "n_estimators": 300,
    "max_depth": 15,
    "min_samples_split": 10,
    "min_samples_leaf": 2,
    "ccp_alpha": 0.002,
}

# Define the refined parameter grid for GridSearchCV
param_grid_refined = {
    "n_estimators": [
        best_params["n_estimators"] - 100,
        best_params["n_estimators"],
        best_params["n_estimators"] + 100,
    ],
    "max_depth": [
        best_params["max_depth"] - 5,
        best_params["max_depth"],
        best_params["max_depth"] + 5,
    ],
    "min_samples_split": [
        best_params["min_samples_split"] - 5,
        best_params["min_samples_split"],
        best_params["min_samples_split"] + 5,
    ],
    "min_samples_leaf": [
        best_params["min_samples_leaf"],
        best_params["min_samples_leaf"] + 1,
    ],
    "ccp_alpha": [
        best_params["ccp_alpha"],
        best_params["ccp_alpha"] + 0.001,
        best_params["ccp_alpha"] + 0.002,
    ],
}

# Initialize the RandomForestClassifier with the best parameters from RandomizedSearchCV
rf_refined = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search_refined = GridSearchCV(
    estimator=rf_refined,
    param_grid=param_grid_refined,
    verbose=1,
    n_jobs=-1,
    cv=10,  # 10-fold cross-validation for more reliable results
    scoring="roc_auc",
)

# Fit the classifier to the training data
grid_search_refined.fit(X_train, y_train)

# Get the best parameters and best score
best_params_refined = grid_search_refined.best_params_
best_score_refined = grid_search_refined.best_score_

print(f"Best Parameters after Refining: {best_params_refined}")
print(f"Best Score after Refining: {best_score_refined:.5f}")

-   Best Parameters after Refining: {'ccp_alpha': 0.002, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 15, 'n_estimators': 400}
-   Best Score after Refining: 0.91606
-   The trend for n_estimators is, bigger is better (until some point). So in next iteration we will ignore this parameter.


#### Second iteration


My colleague found different parameters to be better, so we will look at them in this step.


In [None]:
%%ignore
# Define the refined parameter grid for GridSearchCV
param_grid_refined = {
    "n_estimators": [
        200
    ],
    "max_depth": [
        15
    ],
    "min_samples_split": [
        5,10
    ],
    "min_samples_leaf": [
        1,2
    ],
    "ccp_alpha": [
        0.001, 0.002
    ],
}

# Initialize the RandomForestClassifier with the best parameters from RandomizedSearchCV
rf_refined = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search_refined = GridSearchCV(
    estimator=rf_refined,
    param_grid=param_grid_refined,
    verbose=1,
    n_jobs=-1,
    cv=10,  # 10-fold cross-validation for more reliable results
    scoring="roc_auc",
)

# Fit the classifier to the training data
grid_search_refined.fit(X_train, y_train)

# Get the best parameters and best score
best_params_refined = grid_search_refined.best_params_
best_score_refined = grid_search_refined.best_score_

print(f"Best Parameters after Refining: {best_params_refined}")
print(f"Best Score after Refining: {best_score_refined:.5f}")

-   Best Parameters after Refining: {'ccp_alpha': 0.001, 'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
-   Best Score after Refining: 0.91596
-   From these observations:
    -   We can conclude that max_depth=~15 is the best, as it was 15 in every iteration.
    -   We can conclude that ccp_alpha=~0.001 or ccp_alpha=~0.002 is the best.
    -   We can conclude that min_samples_leaf=~1 or a little higher is the best.
    -   We can conclude that min_samples_split=~10 is the best.
    -   We can conclude that higher n_estimators is better (at some value it will be worse).


#### Third iteration


Let's look at ccp_alpha and max_depth in more detail.


In [None]:
%%ignore
# Define the refined parameter grid for GridSearchCV
param_grid_refined = {
    "n_estimators": [300],
    "max_depth": [14, 15, 16],
    "min_samples_split": [10],
    "min_samples_leaf": [1],
    "ccp_alpha": [0.0095, 0.01, 0.0105],
}

# Initialize the RandomForestClassifier with the best parameters from RandomizedSearchCV
rf_refined = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search_refined = GridSearchCV(
    estimator=rf_refined,
    param_grid=param_grid_refined,
    verbose=1,
    n_jobs=-1,
    cv=10,  # 10-fold cross-validation for more reliable results
    scoring="roc_auc",
)

# Fit the classifier to the training data
grid_search_refined.fit(X_train, y_train)

# Get the best parameters and best score
best_params_refined = grid_search_refined.best_params_
best_score_refined = grid_search_refined.best_score_

print(f"Best Parameters after Refining: {best_params_refined}")
print(f"Best Score after Refining: {best_score_refined:.5f}")

-   ccp_alpha=0.001 is best
-   max_depth=15 is best


#### Fourth iteration


Now let's look at best n_estimators.


In [None]:
%%ignore
# Define the refined parameter grid for GridSearchCV
param_grid_refined = {
    "n_estimators": [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    "max_depth": [15],
    "min_samples_split": [10],
    "min_samples_leaf": [1],
    "ccp_alpha": [0.001],
}

# Initialize the RandomForestClassifier with the best parameters from RandomizedSearchCV
rf_refined = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search_refined = GridSearchCV(
    estimator=rf_refined,
    param_grid=param_grid_refined,
    verbose=1,
    n_jobs=-1,
    cv=10,  # 10-fold cross-validation for more reliable results
    scoring="roc_auc",
)

# Fit the classifier to the training data
grid_search_refined.fit(X_train, y_train)

# Get the best parameters and best score
best_params_refined = grid_search_refined.best_params_
best_score_refined = grid_search_refined.best_score_

print(f"Best Parameters after Refining: {best_params_refined}")
print(f"Best Score after Refining: {best_score_refined:.5f}")

-   n_estimators=300 is best.


#### Final iteration


As discussed earlier, we will also look at max_features and criterion.


In [None]:
%%ignore
# Define the refined parameter grid for GridSearchCV
param_grid_refined = {
    "n_estimators": [300],
    "max_depth": [15],
    "min_samples_split": [10],
    "min_samples_leaf": [1],
    "ccp_alpha": [0.001],
    "max_features": ["sqrt", "log2"],
    "criterion": ["entropy", "gini"],
}

# Initialize the RandomForestClassifier with the best parameters from RandomizedSearchCV
rf_refined = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search_refined = GridSearchCV(
    estimator=rf_refined,
    param_grid=param_grid_refined,
    verbose=1,
    n_jobs=-1,
    cv=10,  # 10-fold cross-validation for more reliable results
    scoring="roc_auc",
)

# Fit the classifier to the training data
grid_search_refined.fit(X_train, y_train)

# Get the best parameters and best score
best_params_refined = grid_search_refined.best_params_
best_score_refined = grid_search_refined.best_score_

print(f"Best Parameters after Refining: {best_params_refined}")
print(f"Best Score after Refining: {best_score_refined:.5f}")

-   Best Parameters after Refining: {'ccp_alpha': 0.001, 'criterion': 'gini', 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}
-   Best Score after Refining: 0.91632
-   We can see that max_features=sqrt and criterion=gini are best and they are also default values.


Our final parameters are:

-   n_estimators=300
-   max_depth=15
-   min_samples_split=10
-   min_samples_leaf=1
-   ccp_alpha=0.001
-   max_features="sqrt"
-   criterion="gini"


Note that we ran these tests without feature selection in the pipeline, as we were getting better results without it. (This will be discussed further in 3.4.)

Since we discovered that it is best practice to always use feature selection, we will use it in the next step and check whether the parameters change.


In [None]:
# Re-check of the tuned region: n_estimators is pinned, the other four parameters
# get two candidates each (2 * 2 * 2 * 2 = 16 combinations)
param_grid_refined = {
    "n_estimators": [300],
    "max_depth": [10, 15],
    "min_samples_split": [5, 10],
    "min_samples_leaf": [1, 2],
    "ccp_alpha": [0.001, 0.0015],
}

# Base estimator with only the seed fixed; the hyperparameters are supplied by the grid
rf_refined = RandomForestClassifier(random_state=42)

# Exhaustive search over the grid, scored by 10-fold cross-validated ROC AUC
grid_search_refined = GridSearchCV(
    estimator=rf_refined,
    param_grid=param_grid_refined,
    scoring="roc_auc",
    cv=10,
    n_jobs=-1,
    verbose=1,
)
grid_search_refined.fit(X_train, y_train)

# Keep the winning configuration and its mean CV score for the report below
best_params_refined, best_score_refined = (
    grid_search_refined.best_params_,
    grid_search_refined.best_score_,
)

print(f"Best Parameters after Refining: {best_params_refined}")
print(f"Best Score after Refining: {best_score_refined:.5f}")

-   Best Parameters after Refining: {'ccp_alpha': 0.001, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
-   Best Score after Refining: 0.91591
-   Best score without feature selection was 0.91632 vs 0.91591 with feature selection.
-   As hinted before, using feature selection gives slightly worse results. But we will look at it in more detail in 3.4.
-   We can also see that parameters did change and therefore it is always crucial to try different parameters when changing pipeline.


Our final parameters with feature selection are:

-   n_estimators=300
-   max_depth=10
-   min_samples_split=5
-   min_samples_leaf=1
-   ccp_alpha=0.001
-   max_features="sqrt"
-   criterion="gini"


### GridSearchCV - Logistic Regression


-   Let's also look at Logistic Regression.


In [None]:
%%ignore
log_reg = LogisticRegression(solver="lbfgs", random_state=42)

parameters = {
    "penalty": [None, "l2"],
    "C": [0.001, 0.005, 0.01, 0.05],
    "max_iter": [100, 150, 200, 250, 300],
    "tol": [0.0001, 0.0005, 0.001, 0.005],
}

grid_search = GridSearchCV(
    log_reg,
    param_grid=parameters,
    scoring="roc_auc",
    n_jobs=-1,
    cv=10,
)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score:.5f}")

-   Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'tol': 0.0005}
-   Best Score: 0.91217


In [None]:
%%ignore
log_reg = LogisticRegression(solver="liblinear", random_state=42)

parameters = {
    "penalty": ["l1", "l2"],
    "C": [0.001, 0.005, 0.01, 0.05],
    "max_iter": [100, 150, 200, 250, 300],
    "tol": [0.0001, 0.0005, 0.001, 0.005],
}

grid_search = GridSearchCV(
    log_reg,
    param_grid=parameters,
    scoring="roc_auc",
    n_jobs=-1,
    cv=10,
)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score:.5f}")

-   Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'tol': 0.001}
-   Best Score: 0.91243


In [None]:
# Side-by-side fit of the two solvers with shared C, max_iter and tol:
# lbfgs is paired with the l2 penalty, liblinear with the l1 penalty.
# NOTE(review): the grid-search results recorded in this notebook list penalty="l2" for
# liblinear and tol=0.0005 for lbfgs; the values used here differ — confirm this is intended.
log_reg = LogisticRegression(
    solver="lbfgs",
    penalty="l2",
    C=0.01,
    tol=0.001,
    max_iter=100,
    random_state=42,
)
log_reg.fit(X_train, y_train)
df_1 = get_scores(log_reg, "LogisticRegression - lbfgs", X_train, y_train, X_test, y_test)

log_reg = LogisticRegression(
    solver="liblinear",
    penalty="l1",
    C=0.01,
    tol=0.001,
    max_iter=100,
    random_state=42,
)
log_reg.fit(X_train, y_train)
df_2 = get_scores(log_reg, "LogisticRegression - liblinear", X_train, y_train, X_test, y_test)

# Put the two score tables next to each other
df = pd.concat((df_1, df_2), axis="columns")
df

-   Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs', 'tol': 0.001}


## B


Vyskúšajte kombinácie modelov (ensemble) pre zvolený algoritmus tak, aby ste
optimalizovali výkonnosť (bez underfitingu).


We already used RandomForestClassifier in previous step, but we will now explore more ensemble methods.


### Basic Ensemble Models


#### RandomForestClassifier


In [None]:
# Random forest built from the tuned parameter set (rf_best_params is defined elsewhere in the notebook)
rf_classifier = RandomForestClassifier(**rf_best_params)

# Cross-validated scores (5 folds) on the training split
df_rf = get_scores_cv(
    rf_classifier,
    "RandomForestClassifier",
    X_train,
    y_train,
    cv=5,
)
df_rf

#### GradientBoostingClassifier


In [None]:
# Gradient boosting with library defaults; only the seed is fixed
gb_classifier = GradientBoostingClassifier(random_state=42)

# Cross-validated scores (5 folds) on the training split
df_gb = get_scores_cv(
    gb_classifier,
    "GradientBoostingClassifier",
    X_train,
    y_train,
    cv=5,
)
df_gb

#### HistGradientBoostingClassifier


In [None]:
# Histogram-based gradient boosting with library defaults; only the seed is fixed
hgb_classifier = HistGradientBoostingClassifier(random_state=42)

# Cross-validated scores (5 folds) on the training split
df_hgb = get_scores_cv(
    hgb_classifier,
    "HistGradientBoostingClassifier",
    X_train,
    y_train,
    cv=5,
)
df_hgb

#### Comparison


In [None]:
# One column block per model, side by side
df = pd.concat((df_rf, df_gb, df_hgb), axis="columns")
df

-   We can see that RandomForestClassifier is the best model for our data as we used hyperparameter tuning on it.
-   HistGradientBoostingClassifier shows promising results but overfits more than RandomForestClassifier or GradientBoostingClassifier


#### Hyperparameter tuning for GradientBoostingClassifier


In [None]:
%%ignore
# Define the parameter grid
param_grid = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.1, 0.2],
    "max_depth": [3, 4, 5],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "subsample": [0.8, 0.9, 1.0],
}

# Initialize the GradientBoostingClassifier
gb_classifier = GradientBoostingClassifier(random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=gb_classifier,
    param_distributions=param_grid,
    n_iter=100,
    cv=5,
    verbose=1,
    n_jobs=-1,
    scoring="roc_auc", # ROC_AUC for lowering overfitting
    random_state=42,
)

# Fit the random search to the data
random_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = random_search.best_params_
best_score = random_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score:.5f}")

-   Best Parameters: {'subsample': 0.8, 'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 4, 'learning_rate': 0.01}
-   Best Score: 0.91491


In [None]:
# Gradient boosting rebuilt from the tuned parameter set (gbg_best_params is defined elsewhere in the notebook)
gb_classifier = GradientBoostingClassifier(**gbg_best_params)

# Cross-validated scores on the training split. This call used to pass X_test and y_test
# positionally, which is the argument list of get_scores; every other get_scores_cv call in
# this notebook uses (model, name, X_train, y_train, cv=5), and df_gb is later concatenated
# with tables produced that way.
# NOTE(review): get_scores_cv is defined outside this section — confirm its signature.
df_gb = get_scores_cv(gb_classifier, "GradientBoostingClassifier", X_train, y_train, cv=5)
df_gb

#### Hyperparameter tuning for HistGradientBoostingClassifier


In [None]:
%%ignore
# Parameter grid
param_grid = {
    "learning_rate": [0.01, 0.1, 0.3],
    "max_depth": [3, 5, 7],
    "max_iter": [100, 200],
    "min_samples_leaf": [20, 50],
    "l2_regularization": [0, 1.0, 2.0],
}

# Base classifier
hgb = HistGradientBoostingClassifier(random_state=42)

# Grid search with 5-fold CV
grid_search = GridSearchCV(
    estimator=hgb,
    param_grid=param_grid,
    cv=5,
    scoring="roc_auc", # ROC_AUC for lowering overfitting
    n_jobs=-1,
    verbose=1,
)

# Assuming X and y are your features and target
grid_search.fit(X_train, y_train)

# Get best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Create optimized model with best parameters
best_model = HistGradientBoostingClassifier(**grid_search.best_params_, random_state=42)

-   Best parameters: {'l2_regularization': 0, 'learning_rate': 0.01, 'max_depth': 5, 'max_iter': 200, 'min_samples_leaf': 20}
-   Best cross-validation score: 0.9132202050166413


In [None]:
# Histogram gradient boosting configured with the values reported by the grid search
hgb = HistGradientBoostingClassifier(
    learning_rate=0.01,
    max_iter=200,
    max_depth=5,
    min_samples_leaf=20,
    l2_regularization=0,
    random_state=42,
)

# Cross-validated scores (5 folds) on the training split
df_hgb = get_scores_cv(
    hgb,
    "HistGradientBoostingClassifier - Refined",
    X_train,
    y_train,
    cv=5,
)
df_hgb

In [None]:
# One column block per model, side by side; the per-model tables are dropped afterwards
df = pd.concat((df_rf, df_gb, df_hgb), axis="columns")
del df_rf
del df_gb
del df_hgb
df

-   We can see that RandomForestClassifier is still the best model.
-   Using roc_auc for scoring in GridSearchCV, we lower the overfitting of HistGradientBoostingClassifier, but it is still worse than RandomForestClassifier.


#### Comparison


In [None]:
# Build each tuned model and score it with 5-fold cross-validation on the training split
# (the *_best_params dictionaries are defined elsewhere in the notebook)
rf_classifier = RandomForestClassifier(**rf_best_params)
df_rf = get_scores_cv(rf_classifier, "RandomForestClassifier", X_train, y_train, cv=5)

gb_classifier = GradientBoostingClassifier(**gbg_best_params)
df_gb = get_scores_cv(gb_classifier, "GradientBoostingClassifier", X_train, y_train, cv=5)

hbg_classifier = HistGradientBoostingClassifier(**hbg_best_params)
df_hgb = get_scores_cv(hbg_classifier, "HistGradientBoostingClassifier", X_train, y_train, cv=5)

# One column block per model, side by side; the per-model tables are dropped afterwards
df = pd.concat((df_rf, df_gb, df_hgb), axis="columns")
del df_rf
del df_gb
del df_hgb
df

-   First best accuracy has RandomForestClassifier then HistGradientBoostingClassifier and last GradientBoostingClassifier.
-   Then best F1 score has RandomForestClassifier then HistGradientBoostingClassifier and last GradientBoostingClassifier.
-   For Roc_Auc score, RandomForestClassifier is the best model, then GradientBoostingClassifier and last HistGradientBoostingClassifier.
-   For overfitting, no model is overfitting greatly, but HistGradientBoostingClassifier still has biggest overfit out of three models.
-   Because of this we are going to use GradientBoostingClassifier over HistGradientBoostingClassifier.


### Voting & Stacking Classifier


In [None]:
# Base learners, each built from its tuned parameter set
rf = RandomForestClassifier(**rf_best_params)
lgr = LogisticRegression(**lgr_best_params)
gbg = GradientBoostingClassifier(**gbg_best_params)

# Soft-voting ensemble over the three base learners
voting_clf = VotingClassifier(
    estimators=[("rf", rf), ("lgr", lgr), ("gbg", gbg)],
    voting="soft",
    n_jobs=-1,
)

# Stacking ensemble over the same learners with a logistic-regression final estimator
stacking_clf = StackingClassifier(
    estimators=[("rf", rf), ("lgr", lgr), ("gbg", gbg)],
    final_estimator=LogisticRegression(random_state=42),
    cv=5,
    n_jobs=-1,
)

# Display name -> estimator, in evaluation order
models = {
    "Random Forest": rf,
    "Logistic Regression": lgr,
    "Gradient Boosting": gbg,
    "Voting Ensemble": voting_clf,
    "Stacking Ensemble": stacking_clf,
}

# Score every model with 5-fold CV and append its table as a new column block
df = pd.DataFrame()
for name, model in models.items():
    print(f"\nEvaluating {name}")
    df_tmp = get_scores_cv(model, name, X_train, y_train, cv=5)
    df = pd.concat((df, df_tmp), axis="columns")
df

-   Voting Classifier and Stacking Classifier perform similarly, but Stacking Classifier takes longer to run. Therefore, we are going to compare only Voting Classifier with the other models.
-   Since Random Forest Classifier was best model until this point, it will be our primary model to compare against.


In [None]:
# 5-fold cross-validated scores for the tuned forest and for the voting ensemble
df_rf = get_scores_cv(
    rf_classifier,
    "RandomForestClassifier",
    X_train,
    y_train,
    cv=5,
)
df_voting = get_scores_cv(
    voting_clf,
    "VotingClassifier",
    X_train,
    y_train,
    cv=5,
)

# Show both tables side by side
df = pd.concat((df_rf, df_voting), axis="columns")
df

In [None]:
# Forest and voting-ensemble score tables side by side
df = pd.concat((df_rf, df_voting), axis="columns")
df

-   Random Forest Classifier is still the best model across all metrics.
-   The only thing VotingClassifier excels at is lower overfitting, but it is very small difference, therefore negligible.


#### Looking at second VotingClassifier


In [None]:
# Base learners, each built from its tuned parameter set
rf = RandomForestClassifier(**rf_best_params)
lgr = LogisticRegression(**lgr_best_params)
gbg = GradientBoostingClassifier(**gbg_best_params)

# Soft-voting ensemble over all three base learners
voting_clf = VotingClassifier(
    estimators=[("rf", rf), ("lgr", lgr), ("gbg", gbg)],
    voting="soft",
    n_jobs=-1,
)

# Second soft-voting ensemble without logistic regression
voting_clf_2 = VotingClassifier(
    estimators=[("rf", rf), ("gbg", gbg)],
    voting="soft",
    n_jobs=-1,
)

# Display name -> estimator, in evaluation order
models = {
    "Random Forest": rf,
    "Voting Ensemble": voting_clf,
    "Voting Ensemble 2": voting_clf_2,
}

# Score every model with 5-fold CV and append its table as a new column block
df = pd.DataFrame()
for name, model in models.items():
    print(f"\nEvaluating {name}")
    df_tmp = get_scores_cv(model, name, X_train, y_train, cv=5)
    df = pd.concat((df, df_tmp), axis="columns")
df

-   Removing Logistic Regression from the Voting Classifier gives slightly better results in the roc_auc metric.
-   Random Forest Classifier is still the best model across all metrics.


## C


Využite krížovú validáciu (cross validation) na trénovacej množine.


-   We were already using cross validation in previous steps.


## D


Dokážte že Váš nastavený najlepší model je bez overfitingu.


In [None]:
# Tuned random forest, fitted on the training split
rf_classifier = RandomForestClassifier(**rf_best_params).fit(X_train, y_train)

# Train and test metrics side by side, used to judge overfitting
df = get_scores(
    rf_classifier,
    "RandomForestClassifier",
    X_train,
    y_train,
    X_test,
    y_test,
)
df

-   Since train metrics and test metrics are close, the model is not overfitting.


In [None]:
# Tuned random forest, fitted on the training split
rf_classifier = RandomForestClassifier(**rf_best_params).fit(X_train, y_train)

# Training split: ROC points from the second predict_proba column, then the area under them
fpr_train, tpr_train, _ = roc_curve(y_train, rf_classifier.predict_proba(X_train)[:, 1])
roc_auc_train = auc(fpr_train, tpr_train)

# Test split: same computation
fpr_test, tpr_test, _ = roc_curve(y_test, rf_classifier.predict_proba(X_test)[:, 1])
roc_auc_test = auc(fpr_test, tpr_test)

# Both curves on one figure, with the diagonal as the chance-level reference
plt.figure()
plt.plot(
    fpr_train,
    tpr_train,
    color="blue",
    linewidth=2,
    label=f"Train ROC curve (area = {roc_auc_train:.5f})",
)
plt.plot(
    fpr_test,
    tpr_test,
    color="red",
    linewidth=2,
    label=f"Test ROC curve (area = {roc_auc_test:.5f})",
)
plt.plot([0, 1], [0, 1], color="gray", linewidth=2, linestyle="--")
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.show()

-   We can also see stable ROC curve for train and test data, so we can conclude that model is not overfitting.


# 3.4


Vyhodnotenie vplyvu zvolenej stratégie riešenia na klasifikáciu.


## A


Stratégie riešenia chýbajúcich hodnôt a outlierov.


### Definitions


In [None]:
def load_data_raw(file_path: str = "../data/raw") -> tuple[pd.DataFrame, pd.DataFrame]:
    """Load the raw connections/processes tables and split them into train and test sets.

    Only ``connections.csv`` and ``processes.csv`` are read: they are the only tables that
    feed the merge, so the ``devices`` and ``profiles`` files are no longer loaded.

    Args:
        file_path: Directory holding the tab-separated raw CSV files. Defaults to the
            location used throughout the notebook.

    Returns:
        ``(train_data, test_data)``: an 80/20 split (``random_state=42``) of the merged
        table, each with a fresh 0..n-1 index.
    """
    # Read each table and drop exact duplicate rows before merging
    tables: dict[str, pd.DataFrame] = {
        name: pd.read_csv(f"{file_path}/{name}.csv", sep="\t").drop_duplicates()
        for name in ("connections", "processes")
    }

    # Join the two tables on device id, timestamp and target label
    df = pd.merge(tables["connections"], tables["processes"], on=["imei", "ts", "mwra"])
    df["ts"] = pd.to_datetime(df["ts"])

    train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
    return train_data.reset_index(drop=True), test_data.reset_index(drop=True)

In [None]:
# Fresh copy of the raw train/test split
train_data, test_data = load_data_raw()

# Feature columns: everything except the target and the two identifier columns
all_columns = train_data.columns.drop(["mwra", "ts", "imei"])

# Columns the notebook treats as non-Gaussian (handled by the quantile transformer)
non_gaussian_columns = [
    "c.android.vending",
    "c.UCMobile.x86",
    "c.updateassist",
    "c.UCMobile.intl",
    "p.android.vending",
    "p.dogalize",
    "p.olauncher",
    "p.simulator",
    "p.inputmethod.latin",
    "p.android.gms",
    "p.notifier",
    "p.katana",
    "p.gms.persistent",
]

# Remaining feature columns, in their original order
gaussian_columns = all_columns[~all_columns.isin(non_gaussian_columns)]

# Feature order used to name the transformed columns: gaussian columns first, then non-gaussian
transformed_feature_order = pd.Series([*gaussian_columns, *non_gaussian_columns])

In [None]:
def remove_outliers_iqr(data, columns):
    """Return the rows of ``data`` whose ``columns`` values all lie within the 1.5 * IQR fences.

    For every column the fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` (both inclusive),
    computed from ``data`` itself. A row is kept only if it is inside the fences in every
    listed column; the original index and all columns of ``data`` are preserved.
    """
    subset = data[columns]
    first_quartile = subset.quantile(0.25)
    third_quartile = subset.quantile(0.75)
    spread = third_quartile - first_quartile

    # Cell-wise check against both fences, then require it for every column of the row
    inside_fences = (subset >= first_quartile - 1.5 * spread) & (subset <= third_quartile + 1.5 * spread)
    return data[inside_fences.all(axis=1)]

In [None]:
# Gaussian-like columns: standardise, then apply a Yeo-Johnson power transform
general_pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("power_transformer", PowerTransformer(method="yeo-johnson")),
    ]
)

# Non-gaussian columns: map to a normal distribution with a quantile transform
vending_pipeline = Pipeline(
    steps=[
        (
            "quantile_transformer",
            QuantileTransformer(random_state=42, output_distribution="normal"),
        ),
    ]
)

# Route each column group to its pipeline; any column not listed is passed through unchanged
preprocessor = ColumnTransformer(
    transformers=[
        ("general", general_pipe, gaussian_columns),
        ("vending", vending_pipeline, non_gaussian_columns),
    ],
    remainder="passthrough",
)

# Preprocessing followed by selection of the 10 best features by ANOVA F-score
complete_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("selector", SelectKBest(score_func=f_classif, k=10)),
    ]
)

In [None]:
# Empty summary table: one column per cleaning method, filled in by the cleaning cells
number_of_outliers = pd.DataFrame(
    columns=[
        "None-iterative IQR",
        "Iterative IQR",
        "None-iterative Z-score",
        "Iterative Z-score",
    ],
    index=["Number of Outliers", "Percentage of Outliers"],
)

### Cleaning methods


#### None-iterative IQR


##### Cleaning


In [None]:
# Fresh copy of the raw train/test split
train_data, test_data = load_data_raw()

# Size of the training set before any row is dropped
number_of_rows_before_outliers = len(train_data)

# Single IQR pass over every feature column (columns from position 3 onwards), then renumber the rows
train_data = remove_outliers_iqr(train_data, train_data.columns[3:]).reset_index(drop=True)

# Removal statistics, computed once and reused for printing and for the summary table
rows_removed = number_of_rows_before_outliers - train_data.shape[0]
share_removed = (rows_removed / number_of_rows_before_outliers) * 100

print(f"Number of outliers removed: {rows_removed}")
print(f"Percentage of rows removed: {share_removed:.2f}%")

# Record the statistics for this cleaning method
number_of_outliers.loc["Number of Outliers", "None-iterative IQR"] = rows_removed
number_of_outliers.loc["Percentage of Outliers", "None-iterative IQR"] = round(share_removed, 2)

Number of outliers removed: 2767
Percentage of rows removed: 23.17%


##### Use pipelines


In [None]:
# Fit the pipeline on the cleaned training rows only, then reuse the fitted pipeline for the test rows
train_data_processed = complete_pipeline.fit_transform(train_data[all_columns], train_data["mwra"])
test_data_processed = complete_pipeline.transform(test_data[all_columns])

# Boolean mask of the features kept by the selector, mapped onto the transformed column order
feature_mask = complete_pipeline["selector"].get_support()
selected_features = transformed_feature_order[feature_mask]  # order of features is preserved

# Wrap the arrays in DataFrames named after the selected features and attach the target column
train_data_processed = pd.DataFrame(train_data_processed, columns=selected_features).assign(
    mwra=train_data["mwra"]
)
test_data_processed = pd.DataFrame(test_data_processed, columns=selected_features).assign(
    mwra=test_data["mwra"]
)

##### Export cleaned data


In [None]:
# Make sure the output directory exists, then write both processed splits without the index column
os.makedirs(name="../data/clean_methods", exist_ok=True)

train_data_processed.to_csv(path_or_buf="../data/clean_methods/train_1iqr.csv", index=False)
test_data_processed.to_csv(path_or_buf="../data/clean_methods/test_1iqr.csv", index=False)

#### Iterative IQR


##### Cleaning


In [None]:
# Fresh copy of the raw train/test split
train_data, test_data = load_data_raw()

# Size of the training set before any row is dropped
number_of_rows_before_outliers = train_data.shape[0]

# Columns used in the repeated passes: every feature column except p.android.vending
columns_for_iqr = train_data.iloc[:, 3:].columns.difference(["p.android.vending"])

# First pass: IQR filter over all feature columns (including p.android.vending)
train_data = remove_outliers_iqr(train_data, train_data.iloc[:, 3:].columns)

# Result of one more pass over columns_for_iqr, and how many rows that pass would drop.
# The fences are recomputed from the current data by remove_outliers_iqr, replacing the
# hand-written copy of the same bound expression that used to live here.
cleaned = remove_outliers_iqr(train_data, columns_for_iqr)
outliers_count = len(train_data) - len(cleaned)

# Upper limit on the number of repeated passes
max_iterations = 10
iteration = 0

# Keep applying the filter until a pass removes nothing or the limit is reached
while outliers_count > 0 and iteration < max_iterations:
    train_data = cleaned
    cleaned = remove_outliers_iqr(train_data, columns_for_iqr)
    outliers_count = len(train_data) - len(cleaned)
    iteration += 1

# Renumber the remaining rows
train_data = train_data.reset_index(drop=True)

# Removal statistics, computed once and reused for printing and for the summary table
rows_removed = number_of_rows_before_outliers - train_data.shape[0]
share_removed = (rows_removed / number_of_rows_before_outliers) * 100

print(f"Number of outliers removed: {rows_removed}")
print(f"Percentage of outliers removed: {share_removed:.2f}%")

# Record the statistics for this cleaning method
number_of_outliers.loc["Number of Outliers", "Iterative IQR"] = rows_removed
number_of_outliers.loc["Percentage of Outliers", "Iterative IQR"] = round(share_removed, 2)

Number of outliers removed: 2934
Percentage of outliers removed: 24.57%


##### Use pipelines


In [None]:
# Fit the pipeline on the cleaned training rows only, then reuse the fitted pipeline for the test rows
train_data_processed = complete_pipeline.fit_transform(train_data[all_columns], train_data["mwra"])
test_data_processed = complete_pipeline.transform(test_data[all_columns])

# Boolean mask of the features kept by the selector, mapped onto the transformed column order
feature_mask = complete_pipeline["selector"].get_support()
selected_features = transformed_feature_order[feature_mask]  # order of features is preserved

# Wrap the arrays in DataFrames named after the selected features and attach the target column
train_data_processed = pd.DataFrame(train_data_processed, columns=selected_features).assign(
    mwra=train_data["mwra"]
)
test_data_processed = pd.DataFrame(test_data_processed, columns=selected_features).assign(
    mwra=test_data["mwra"]
)

##### Export processed data


In [None]:
# Make sure the output directory exists, then write both processed splits without the index column
os.makedirs(name="../data/clean_methods", exist_ok=True)

train_data_processed.to_csv(path_or_buf="../data/clean_methods/train_itiqr.csv", index=False)
test_data_processed.to_csv(path_or_buf="../data/clean_methods/test_itiqr.csv", index=False)

#### None-iterative Z-score


##### Cleaning


In [None]:
# Fresh copy of the raw train/test split
train_data, test_data = load_data_raw()

# Size of the training set before any row is dropped
number_of_rows_before_outliers = len(train_data)

# Keep rows whose feature values (columns from position 3 onwards) all have |z-score| < 3
z_scores = zscore(train_data.iloc[:, 3:])
train_data = train_data[(np.abs(z_scores) < 3).all(axis=1)].reset_index(drop=True)

# Removal statistics, computed once and reused for printing and for the summary table
rows_removed = number_of_rows_before_outliers - train_data.shape[0]
share_removed = (rows_removed / number_of_rows_before_outliers) * 100

print(f"Number of outliers removed: {rows_removed}")
print(f"Percentage of outliers removed: {share_removed:.2f}%")

# Record the statistics for this cleaning method
number_of_outliers.loc["Number of Outliers", "None-iterative Z-score"] = rows_removed
number_of_outliers.loc["Percentage of Outliers", "None-iterative Z-score"] = round(share_removed, 2)

Number of outliers removed: 477
Percentage of outliers removed: 3.99%


##### Use pipelines


In [None]:
# Fit the pipeline on the training features only, then reuse the fitted pipeline on the test features
train_array = complete_pipeline.fit_transform(train_data[all_columns], train_data["mwra"])
test_array = complete_pipeline.transform(test_data[all_columns])

# Boolean mask reported by the fitted "selector" step, applied to the known output column order
feature_mask = complete_pipeline.named_steps["selector"].get_support()
selected_features = transformed_feature_order[feature_mask]  # order of features is preserved

# Wrap the transformed arrays in labelled DataFrames and attach the "mwra" target column
train_data_processed = pd.DataFrame(train_array, columns=selected_features).assign(mwra=train_data["mwra"])
test_data_processed = pd.DataFrame(test_array, columns=selected_features).assign(mwra=test_data["mwra"])

##### Export processed data


In [None]:
# Make sure the output directory exists, then write both splits without the index column
os.makedirs("../data/clean_methods", exist_ok=True)

for frame, destination in (
    (train_data_processed, "../data/clean_methods/train_1zscore.csv"),
    (test_data_processed, "../data/clean_methods/test_1zscore.csv"),
):
    frame.to_csv(destination, index=False)

#### Iterative Z-score


##### Cleaning


In [None]:
# Load data
train_data, test_data = load_data_raw()

# Get number of rows before removing outliers.
# Recorded here (as in the other cleaning cells) so the statistics below do not
# depend on a value left in the kernel by a previously executed cell.
number_of_rows_before_outliers = train_data.shape[0]

# 1 iteration of cleaning whole dataset of outliers (including p.android.vending)
train_data = train_data[(np.abs(zscore(train_data.iloc[:, 3:])) < 3).all(axis=1)]

# Following iterations use all feature columns except p.android.vending
columns_for_zscore = train_data.iloc[:, 3:].columns.difference(["p.android.vending"])
max_iterations = 10

# Keep removing rows until no row has |z| >= 3 or the iteration cap is reached.
# Z-scores are recomputed on every pass because they are calculated from the
# rows that are still present.
for iteration in range(max_iterations):
    inlier_mask = (np.abs(zscore(train_data[columns_for_zscore])) < 3).all(axis=1)
    if inlier_mask.all():
        break
    train_data = train_data[inlier_mask]

train_data = train_data.reset_index(drop=True)

# Compute the removal statistics once and reuse them for printing and saving
removed_rows = number_of_rows_before_outliers - train_data.shape[0]
removed_percentage = (removed_rows / number_of_rows_before_outliers) * 100

print(f"Number of outliers removed: {removed_rows}")
print(f"Percentage of outliers removed: {removed_percentage:.2f}%")

# Save number of outliers removed
number_of_outliers.loc["Number of Outliers", "Iterative Z-score"] = removed_rows
number_of_outliers.loc["Percentage of Outliers", "Iterative Z-score"] = round(removed_percentage, 2)

Number of outliers removed: 570
Percentage of outliers removed: 4.77%


##### Use pipelines


In [None]:
# Fit the pipeline on the training features only, then reuse the fitted pipeline on the test features
train_array = complete_pipeline.fit_transform(train_data[all_columns], train_data["mwra"])
test_array = complete_pipeline.transform(test_data[all_columns])

# Boolean mask reported by the fitted "selector" step, applied to the known output column order
feature_mask = complete_pipeline.named_steps["selector"].get_support()
selected_features = transformed_feature_order[feature_mask]  # order of features is preserved

# Wrap the transformed arrays in labelled DataFrames and attach the "mwra" target column
train_data_processed = pd.DataFrame(train_array, columns=selected_features).assign(mwra=train_data["mwra"])
test_data_processed = pd.DataFrame(test_array, columns=selected_features).assign(mwra=test_data["mwra"])

##### Export processed data


In [None]:
# Make sure the output directory exists, then write both splits without the index column
os.makedirs("../data/clean_methods", exist_ok=True)

for frame, destination in (
    (train_data_processed, "../data/clean_methods/train_itzscore.csv"),
    (test_data_processed, "../data/clean_methods/test_itzscore.csv"),
):
    frame.to_csv(destination, index=False)

### Show comparison


In [None]:
# Display the table of removed-row counts and percentages filled in by the cleaning cells
number_of_outliers

Unnamed: 0,None-iterative IQR,Iterative IQR,None-iterative Z-score,Iterative Z-score
Number of Outliers,2767.0,2934.0,477.0,570.0
Percentage of Outliers,23.17,24.57,3.99,4.77


### Using on Models


#### Not Deleting outliers


In [None]:
# Baseline: the raw split with no outlier removal applied
train_data, test_data = load_data_raw()

# Fit the pipeline on the training features only, then reuse the fitted pipeline on the test features
train_array = complete_pipeline.fit_transform(train_data[all_columns], train_data["mwra"])
test_array = complete_pipeline.transform(test_data[all_columns])

# Boolean mask reported by the fitted "selector" step, applied to the known output column order
feature_mask = complete_pipeline.named_steps["selector"].get_support()
selected_features = transformed_feature_order[feature_mask]  # order of features is preserved

# Wrap the transformed arrays in labelled DataFrames and attach the "mwra" target column
train_data_processed = pd.DataFrame(train_array, columns=selected_features).assign(mwra=train_data["mwra"])
test_data_processed = pd.DataFrame(test_array, columns=selected_features).assign(mwra=test_data["mwra"])

# Separate the feature matrix from the "mwra" target for both splits
X_train, y_train = train_data_processed.drop(columns="mwra"), train_data_processed["mwra"]
X_test, y_test = test_data_processed.drop(columns="mwra"), test_data_processed["mwra"]

In [None]:
# Random forest with the previously chosen hyper-parameters, scored on data with no outlier removal
rf_classifier = RandomForestClassifier(**rf_best_params)

# get_scores_cv receives the not-yet-fitted estimator; the estimator is fitted only afterwards
df_none_cv = get_scores_cv(rf_classifier, "None", X_train, y_train, cv=5)
df_none_scores = get_scores(rf_classifier.fit(X_train, y_train), "None", X_train, y_train, X_test, y_test)

#### None-iterative IQR


In [None]:
# Read the exported train/test files for this cleaning method back from disk
train_data_processed = pd.read_csv("../data/clean_methods/train_1iqr.csv")
test_data_processed = pd.read_csv("../data/clean_methods/test_1iqr.csv")

# Separate the feature matrix from the "mwra" target for both splits
X_train, y_train = train_data_processed.drop(columns="mwra"), train_data_processed["mwra"]
X_test, y_test = test_data_processed.drop(columns="mwra"), test_data_processed["mwra"]

In [None]:
# Random forest with the previously chosen hyper-parameters
rf_classifier = RandomForestClassifier(**rf_best_params)

# get_scores_cv receives the not-yet-fitted estimator; the estimator is fitted only afterwards
df_1iqr_cv = get_scores_cv(rf_classifier, "1IQR", X_train, y_train, cv=5)
df_1iqr_scores = get_scores(rf_classifier.fit(X_train, y_train), "1IQR", X_train, y_train, X_test, y_test)

#### Iterative IQR


In [None]:
# Read the exported train/test files for this cleaning method back from disk
train_data_processed = pd.read_csv("../data/clean_methods/train_itiqr.csv")
test_data_processed = pd.read_csv("../data/clean_methods/test_itiqr.csv")

# Separate the feature matrix from the "mwra" target for both splits
X_train, y_train = train_data_processed.drop(columns="mwra"), train_data_processed["mwra"]
X_test, y_test = test_data_processed.drop(columns="mwra"), test_data_processed["mwra"]

In [None]:
# Random forest with the previously chosen hyper-parameters
rf_classifier = RandomForestClassifier(**rf_best_params)

# get_scores_cv receives the not-yet-fitted estimator; the estimator is fitted only afterwards
df_itiqr_cv = get_scores_cv(rf_classifier, "1ITIQR", X_train, y_train, cv=5)
df_itiqr_scores = get_scores(rf_classifier.fit(X_train, y_train), "1ITIQR", X_train, y_train, X_test, y_test)

#### None-iterative Z-score


In [None]:
# Read the exported train/test files for this cleaning method back from disk
train_data_processed = pd.read_csv("../data/clean_methods/train_1zscore.csv")
test_data_processed = pd.read_csv("../data/clean_methods/test_1zscore.csv")

# Separate the feature matrix from the "mwra" target for both splits
X_train, y_train = train_data_processed.drop(columns="mwra"), train_data_processed["mwra"]
X_test, y_test = test_data_processed.drop(columns="mwra"), test_data_processed["mwra"]

In [None]:
# Random forest with the previously chosen hyper-parameters
rf_classifier = RandomForestClassifier(**rf_best_params)

# get_scores_cv receives the not-yet-fitted estimator; the estimator is fitted only afterwards
df_1zscore_cv = get_scores_cv(rf_classifier, "1ZSCORE", X_train, y_train, cv=5)
df_1zscore_scores = get_scores(rf_classifier.fit(X_train, y_train), "1ZSCORE", X_train, y_train, X_test, y_test)

#### Iterative Z-score


In [None]:
# Read the exported train/test files for this cleaning method back from disk
train_data_processed = pd.read_csv("../data/clean_methods/train_itzscore.csv")
test_data_processed = pd.read_csv("../data/clean_methods/test_itzscore.csv")

# Separate the feature matrix from the "mwra" target for both splits
X_train, y_train = train_data_processed.drop(columns="mwra"), train_data_processed["mwra"]
X_test, y_test = test_data_processed.drop(columns="mwra"), test_data_processed["mwra"]

In [None]:
# Random forest with the previously chosen hyper-parameters
rf_classifier = RandomForestClassifier(**rf_best_params)

# get_scores_cv receives the not-yet-fitted estimator; the estimator is fitted only afterwards
df_itzscore_cv = get_scores_cv(rf_classifier, "1ITZSCORE", X_train, y_train, cv=5)
df_itzscore_scores = get_scores(rf_classifier.fit(X_train, y_train), "1ITZSCORE", X_train, y_train, X_test, y_test)

### Show comparison


In [None]:
# Put the per-method result tables side by side, one column group per cleaning method
df_cv = pd.concat(
    (df_none_cv, df_1iqr_cv, df_itiqr_cv, df_1zscore_cv, df_itzscore_cv),
    axis="columns",
)
df_scores = pd.concat(
    (df_none_scores, df_1iqr_scores, df_itiqr_scores, df_1zscore_scores, df_itzscore_scores),
    axis="columns",
)

Using cross validation


In [None]:
# Display the side-by-side get_scores_cv results for every cleaning method
df_cv

Unnamed: 0_level_0,Unnamed: 1_level_0,None,None,1IQR,1IQR,1ITIQR,1ITIQR,1ZSCORE,1ZSCORE,1ITZSCORE,1ITZSCORE
Unnamed: 0_level_1,Unnamed: 1_level_1,Mean,Std,Mean,Std,Mean,Std,Mean,Std,Mean,Std
Train,accuracy,0.918551,0.000674,0.922434,0.000418,0.922799,0.000592,0.919349,0.000706,0.919348,0.000809
Train,precision,0.921994,0.000484,0.923368,0.00061,0.923371,0.000618,0.922212,0.00052,0.922232,0.000474
Train,recall,0.950321,0.000878,0.956935,0.000982,0.958551,0.000978,0.952414,0.001061,0.952945,0.001311
Train,f1,0.935943,0.000544,0.939852,0.000345,0.940632,0.000474,0.937069,0.000572,0.937336,0.000661
Train,roc_auc,0.929811,0.001102,0.93617,0.000791,0.935692,0.001433,0.93025,0.000772,0.92971,0.000956
Test,accuracy,0.91451,0.002263,0.9199,0.003009,0.920191,0.003622,0.915315,0.003237,0.915501,0.002895
Test,precision,0.919892,0.002462,0.921469,0.003336,0.921835,0.002945,0.919554,0.002536,0.919453,0.002379
Test,recall,0.945841,0.002809,0.954914,0.0043,0.955984,0.005357,0.948679,0.003944,0.949715,0.004295
Test,f1,0.932682,0.001794,0.937884,0.002369,0.938589,0.002898,0.933885,0.002577,0.934332,0.002323
Test,roc_auc,0.916022,0.004215,0.916354,0.003951,0.914908,0.005026,0.915958,0.003882,0.915743,0.004327


-   Best accuracy: 1ITIQR
-   Best F1: 1ITIQR
-   Best roc_auc: 1IQR


Using Train/Test split


In [None]:
# Display the side-by-side get_scores (Train / Test / Difference) results for every cleaning method
df_scores

Unnamed: 0_level_0,None,None,None,1IQR,1IQR,1IQR,1ITIQR,1ITIQR,1ITIQR,1ZSCORE,1ZSCORE,1ZSCORE,1ITZSCORE,1ITZSCORE,1ITZSCORE
Unnamed: 0_level_1,Train,Test,Difference,Train,Test,Difference,Train,Test,Difference,Train,Test,Difference,Train,Test,Difference
accuracy,0.917609,0.915606,0.002002,0.92197,0.913262,0.008708,0.922411,0.910583,0.011828,0.919152,0.916276,0.002876,0.918667,0.916276,0.002391
precision,0.921579,0.914697,0.006882,0.923243,0.912686,0.010557,0.923219,0.908533,0.014685,0.921934,0.915211,0.006723,0.921526,0.914784,0.006742
recall,0.949184,0.953912,-0.004728,0.95629,0.952304,0.003985,0.958072,0.95284,0.005232,0.952414,0.954448,-0.002034,0.952632,0.954984,-0.002352
f1-score,0.935178,0.933893,0.001285,0.939476,0.932074,0.007401,0.940323,0.93016,0.010163,0.936926,0.934418,0.002508,0.936821,0.934452,0.002369
roc_auc,0.906955,0.902849,0.004106,0.909497,0.900259,0.009238,0.908812,0.896509,0.012303,0.907408,0.903563,0.003845,0.906359,0.903385,0.002974


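-   Best accuracy (Test): 1ITZSCORE, tied with 1ZSCORE (0.916276)
-   Best F1 (Test): 1ITZSCORE
-   Best roc_auc (Test): 1ZSCORE (0.903563), with 1ITZSCORE a close second (0.903385)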

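-   With cross-validation, the methods with higher outlier removal (1IQR and 1ITIQR) score best on accuracy and F1. One possible reason is that the validation folds are taken from the already cleaned training data, so they no longer contain the removed outliers.
-   When the model is trained on the whole training data and tested on the test data, the methods with lower outlier removal (1ZSCORE and 1ITZSCORE) score best, and the IQR methods show the largest Train/Test difference.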
-   Using cross-validation for model comparison is best practice, but 1ITIQR removes 24.57% of the data, which is a substantial amount, whereas 1ITZSCORE removes only 4.77% and targets only very extreme outliers. We are therefore going to use 1ITZSCORE for the further steps: it was among the best performers in the Train/Test split (best F1, tied-best accuracy) and it has a lower std than 1ITIQR in cross-validation, indicating that it is more stable.


## B


Dátová transformácia (scaling, transformer, ...).


### Definitions


Using our old clean data as we used iterative Z-score for cleaning.


In [None]:
def load_data_clean(z_threshold: float = 3.0, max_iterations: int = 10):
    """Return the raw train/test split with iterative Z-score cleaning applied to the training set.

    The CSV files previously read here (``../data/clean/*_itzscore.csv``) appear to hold
    only the 10 already transformed and selected features, so the pipelines defined in
    this section, which expect every raw feature column, fail on them with a KeyError.
    The cleaned but untransformed data is therefore rebuilt from ``load_data_raw()``
    using the same procedure as the "Iterative Z-score" cleaning cell.

    Args:
        z_threshold: Rows with an absolute Z-score at or above this value in any
            inspected column are removed from the training data.
        max_iterations: Upper bound on the number of repeated cleaning passes.

    Returns:
        Tuple ``(train_data, test_data)``: the cleaned training set with a fresh
        index, and the test set exactly as returned by ``load_data_raw()``.
    """
    train_data, test_data = load_data_raw()

    # Feature columns start at position 3, as in the cleaning cells
    feature_columns = train_data.columns[3:]

    # First pass over every feature column (including p.android.vending)
    first_pass_mask = (np.abs(zscore(train_data[feature_columns])) < z_threshold).all(axis=1)
    train_data = train_data[first_pass_mask]

    # Following passes ignore p.android.vending; Z-scores are recomputed on the
    # remaining rows each time, until nothing is flagged or the cap is reached
    iterative_columns = feature_columns.difference(["p.android.vending"])
    for _ in range(max_iterations):
        inlier_mask = (np.abs(zscore(train_data[iterative_columns])) < z_threshold).all(axis=1)
        if inlier_mask.all():
            break
        train_data = train_data[inlier_mask]

    return train_data.reset_index(drop=True), test_data

In [None]:
# Define columns
# Columns routed through the scaler + Yeo-Johnson power-transform pipelines
gaussian_columns = [
    "c.dogalize",
    "c.android.gm",
    "c.android.youtube",
    "c.android.chrome",
    "c.katana",
    "c.raider",
    "p.android.packageinstaller",
    "p.android.settings",
    "p.android.documentsui",
    "p.android.chrome",
    "p.android.gm",
    "p.system",
    "p.android.externalstorage",
    "p.process.gapps",
    "p.google",
    "p.browser.provider",
    "p.android.defcontainer",
]

# Columns routed through the quantile-transform pipeline
log_columns = ["c.android.vending"]

uniform_columns = [
    "c.UCMobile.x86",
    "c.updateassist",
    "c.UCMobile.intl",
    "p.android.vending",
    "p.dogalize",
    "p.olauncher",
    "p.simulator",
    "p.inputmethod.latin",
    "p.android.gms",
    "p.notifier",
    "p.katana",
    "p.gms.persistent",
]

# Column order after the column transformers: gaussian columns first, then log + uniform
transformed_feature_order = pd.Series(gaussian_columns + log_columns + uniform_columns)

# Every feature column. Built from the explicit lists above instead of from whatever
# `train_data` currently holds, so the cell no longer fails when that frame has no
# "ts" / "imei" columns or has already been feature-selected.
all_columns = pd.Index(transformed_feature_order)

KeyError: "['ts', 'imei'] not found in axis"

### Define different pipelines


#### Our pipeline from 2nd Phase


In [None]:
# Gaussian-like columns: standardise, then apply a Yeo-Johnson power transform
gaussian_pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("power_transformer", PowerTransformer(method="yeo-johnson")),
    ]
)

# Remaining columns: map to a normal distribution with a quantile transform
non_gaussian_pipeline = Pipeline(
    [
        ("quantile_transformer", QuantileTransformer(output_distribution="normal", random_state=42)),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("general", gaussian_pipeline, gaussian_columns),
        # The column spec must be one flat list of names; the two lists are
        # concatenated rather than nested inside another list.
        ("other", non_gaussian_pipeline, log_columns + uniform_columns),
    ],
    remainder="passthrough",
)

# Preprocessing followed by selection of the 10 best features by ANOVA F-score
our_pipeline = Pipeline([("preprocessor", preprocessor), ("selector", SelectKBest(f_classif, k=10))])

#### Defining different scalers


In [None]:
# Define per-column pipelines for the gaussian-like columns: one scaler each,
# always followed by a Yeo-Johnson power transform.
# These get their own names so that `min_max_pipeline`, `standard_pipeline` and
# `robust_pipeline` are bound only once, to the complete pipelines at the bottom.
min_max_gaussian_pipeline = Pipeline(
    [
        ("scaler", MinMaxScaler()),
        ("power_transformer", PowerTransformer(method="yeo-johnson")),
    ]
)

standard_gaussian_pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("power_transformer", PowerTransformer(method="yeo-johnson")),
    ]
)

robust_gaussian_pipeline = Pipeline(
    [
        ("scaler", RobustScaler()),
        ("power_transformer", PowerTransformer(method="yeo-johnson")),
    ]
)

# Pipeline for the remaining (log + uniform) columns, identical in every variant
quantile_pipeline = Pipeline(
    [
        ("scaler", QuantileTransformer(output_distribution="normal", random_state=42)),
    ]
)

# ColumnTransformer needs one flat list of column names, not a list of lists
non_gaussian_columns = log_columns + uniform_columns

# Create column transformers
min_max_preprocessor = ColumnTransformer(
    transformers=[
        ("min_max", min_max_gaussian_pipeline, gaussian_columns),
        ("other", quantile_pipeline, non_gaussian_columns),
    ],
    remainder="passthrough",
)

standard_preprocessor = ColumnTransformer(
    transformers=[
        ("standard", standard_gaussian_pipeline, gaussian_columns),
        ("other", quantile_pipeline, non_gaussian_columns),
    ],
    remainder="passthrough",
)

robust_preprocessor = ColumnTransformer(
    transformers=[
        ("robust", robust_gaussian_pipeline, gaussian_columns),
        ("other", quantile_pipeline, non_gaussian_columns),
    ],
    remainder="passthrough",
)

# Create complete pipelines: preprocessing followed by selection of the 10 best features
min_max_pipeline = Pipeline([("preprocessor", min_max_preprocessor), ("selector", SelectKBest(f_classif, k=10))])
standard_pipeline = Pipeline([("preprocessor", standard_preprocessor), ("selector", SelectKBest(f_classif, k=10))])
robust_pipeline = Pipeline([("preprocessor", robust_preprocessor), ("selector", SelectKBest(f_classif, k=10))])

### Using on Models


In [None]:
# Load data
train_data, test_data = load_data_clean()

# Transform data.
# Columns are selected with `transformed_feature_order`, as in the Standard and
# Robust cells, so this cell does not depend on `all_columns`.
train_data_processed = min_max_pipeline.fit_transform(train_data[transformed_feature_order], train_data["mwra"])
test_data_processed = min_max_pipeline.transform(test_data[transformed_feature_order])

# Names of the features kept by the fitted selector step
feature_mask = min_max_pipeline.named_steps["selector"].get_support()
selected_features = transformed_feature_order[feature_mask]  # order of features is preserved

train_data_processed = pd.DataFrame(train_data_processed, columns=selected_features)
test_data_processed = pd.DataFrame(test_data_processed, columns=selected_features)

train_data_processed["mwra"] = train_data["mwra"]
test_data_processed["mwra"] = test_data["mwra"]

# Use on model
rf_classifier = RandomForestClassifier(**rf_best_params)
df_min_max_cv = get_scores_cv(
    rf_classifier, "MinMax", train_data_processed.drop(columns=["mwra"]), train_data_processed["mwra"], cv=5
)

KeyError: "['c.android.gm', 'c.android.chrome', 'c.raider', 'c.android.vending', 'c.UCMobile.x86', 'c.updateassist', 'c.UCMobile.intl', 'p.android.gm', 'p.android.vending', 'p.google', 'p.browser.provider', 'p.android.defcontainer', 'p.dogalize', 'p.olauncher', 'p.simulator', 'p.inputmethod.latin', 'p.android.gms', 'p.notifier', 'p.katana', 'p.gms.persistent'] not in index"

In [None]:
# Load data
train_data, test_data = load_data_clean()

# Transform data
train_data_processed = standard_pipeline.fit_transform(train_data[transformed_feature_order], train_data["mwra"])
test_data_processed = standard_pipeline.transform(test_data[transformed_feature_order])

feature_mask = standard_pipeline.named_steps["selector"].get_support()
selected_features = transformed_feature_order[feature_mask]  # order of features is preserved

train_data_processed = pd.DataFrame(train_data_processed, columns=selected_features)
test_data_processed = pd.DataFrame(test_data_processed, columns=selected_features)

train_data_processed["mwra"] = train_data["mwra"]
test_data_processed["mwra"] = test_data["mwra"]

# Use on model
rf_classifier = RandomForestClassifier(**rf_best_params)
df_standard_cv = get_scores_cv(
    rf_classifier, "Standard", train_data_processed.drop(columns=["mwra"]), train_data_processed["mwra"], cv=5
)

In [None]:
# Load data
train_data, test_data = load_data_clean()

# Transform data
train_data_processed = robust_pipeline.fit_transform(train_data[transformed_feature_order], train_data["mwra"])
test_data_processed = robust_pipeline.transform(test_data[transformed_feature_order])

# Names of the features kept by the fitted selector step
feature_mask = robust_pipeline.named_steps["selector"].get_support()
selected_features = transformed_feature_order[feature_mask]  # order of features is preserved

train_data_processed = pd.DataFrame(train_data_processed, columns=selected_features)
test_data_processed = pd.DataFrame(test_data_processed, columns=selected_features)

train_data_processed["mwra"] = train_data["mwra"]
test_data_processed["mwra"] = test_data["mwra"]

# Use on model.
# A fresh classifier is created here, as in the MinMax and Standard cells, so this
# cell does not depend on an estimator left in the kernel by another cell.
rf_classifier = RandomForestClassifier(**rf_best_params)
df_robust_cv = get_scores_cv(
    rf_classifier, "Robust", train_data_processed.drop(columns=["mwra"]), train_data_processed["mwra"], cv=5
)

### Comparison


In [None]:
# Put the three scaler result tables side by side, then drop the individual tables.
# NOTE(review): this rebinds the name `df`, and the `del` means the cell cannot be
# re-run without first re-running the three evaluation cells.
df = pd.concat((df_min_max_cv, df_standard_cv, df_robust_cv), axis="columns")
del df_min_max_cv, df_standard_cv, df_robust_cv
df

## C


Výber atribútov, výber algoritmov, hyperparameter tuning, ensemble learning.


## D


Ktorý model je Váš najlepší model pre nasadenie (deployment)?


## E


Aký je data pipeline pre jeho vybudovanie na základe Vášho datasetu v produkcii?
