# Start


In [None]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
from graphviz import Source
from IPython.core.magic import register_cell_magic
from IPython.display import HTML, SVG
from matplotlib import pyplot as plt
from scipy.stats import zscore
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    auc,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import (
    GridSearchCV,
    ParameterGrid,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    learning_curve,
    train_test_split,
)
from sklearn.pipeline import FunctionTransformer, Pipeline
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    Normalizer,
    PowerTransformer,
    QuantileTransformer,
    RobustScaler,
    StandardScaler,
)
from sklearn.tree import DecisionTreeClassifier, export_graphviz

## Process data


### Loading data


In [None]:
file_path: str = "../data/raw"
files: tuple[str, ...] = ("connections", "devices", "processes", "profiles")

# Read every tab-separated raw table and drop exact duplicate rows right away.
dataset: dict[str, pd.DataFrame] = {}
for file in files:
    dataset[file] = pd.read_csv(f"{file_path}/{file}.csv", sep="\t").drop_duplicates()

# Join connection and process records that share imei, ts and mwra.
df = dataset["connections"].merge(dataset["processes"], on=["imei", "ts", "mwra"])
df["ts"] = pd.to_datetime(df["ts"])

# Hold out 20 % of the rows; each part gets a fresh 0..n-1 index.
train_data, test_data = (
    part.reset_index(drop=True) for part in train_test_split(df, test_size=0.2, random_state=42)
)

### Cleaning data


In [None]:
# First pass: keep only rows whose |z-score| is below 3 in every column from
# position 3 onward (p.android.vending included)
train_data = train_data[(np.abs(zscore(train_data.iloc[:, 3:])) < 3).all(axis=1)]

# Later passes use all of those columns except p.android.vending for outlier detection
columns_for_zscore = train_data.iloc[:, 3:].columns.difference(["p.android.vending"])
outliers_count = (~(np.abs(zscore(train_data[columns_for_zscore])) < 3).all(axis=1)).sum()
max_iterations = 10
iteration = 0

# Repeat the filter until no outliers remain or max_iterations passes have run;
# z-scores are recomputed on the surviving rows before every pass
while outliers_count > 0:
    train_data = train_data[(np.abs(zscore(train_data[columns_for_zscore])) < 3).all(axis=1)]
    outliers_count = (~(np.abs(zscore(train_data[columns_for_zscore])) < 3).all(axis=1)).sum()
    iteration += 1
    if iteration >= max_iterations:
        break

# Row filtering leaves gaps in the index; renumber from 0
train_data = train_data.reset_index(drop=True)

### Export cleaned data


In [None]:
# Persist the cleaned train split and the untouched test split as CSV
os.makedirs("../data/clean", exist_ok=True)  # no error if the directory already exists

# index=False: the row index is not written as a column
train_data.to_csv("../data/clean/train_data.csv", index=False)
test_data.to_csv("../data/clean/test_data.csv", index=False)

### Import cleaned data


In [None]:
# Reload the splits from ../data/clean. read_csv is called without parse_dates,
# so `ts` is not re-parsed as datetime here.
train_data = pd.read_csv("../data/clean/train_data.csv")
test_data = pd.read_csv("../data/clean/test_data.csv")

### Define columns


In [None]:
# Feature columns: everything except the target (mwra) and the ts / imei columns
all_columns = train_data.drop(columns=["mwra", "ts", "imei"]).columns
# Columns handled by the quantile-transformer branch of the preprocessor
non_gaussian_columns = [
    "c.android.vending",
    "c.UCMobile.x86",
    "c.updateassist",
    "c.UCMobile.intl",
    "p.android.vending",
    "p.dogalize",
    "p.olauncher",
    "p.simulator",
    "p.inputmethod.latin",
    "p.android.gms",
    "p.notifier",
    "p.katana",
    "p.gms.persistent",
]
# Every remaining feature column, in its original order
gaussian_columns = all_columns[~all_columns.isin(non_gaussian_columns)]
# Names in the order gaussian block first, then non-gaussian block; used later to
# label the preprocessed columns, so it must match the order of the transformers
transformed_feature_order = pd.Series(gaussian_columns.tolist() + non_gaussian_columns)

### Define pipelines


In [None]:
# Branch for gaussian_columns: standardise, then apply a Yeo-Johnson power transform
general_pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("power_transformer", PowerTransformer(method="yeo-johnson")),
    ]
)

# Branch for all non_gaussian_columns (not only the *.android.vending ones):
# map the values onto a normal distribution via their quantiles
vending_pipeline = Pipeline(
    [
        ("quantile_transformer", QuantileTransformer(output_distribution="normal", random_state=42)),
    ]
)

# Create column transformer; each branch receives its own column subset and any
# column listed in neither subset is passed through unchanged
preprocessor = ColumnTransformer(
    transformers=[
        ("general", general_pipe, gaussian_columns),
        ("vending", vending_pipeline, non_gaussian_columns),
    ],
    remainder="passthrough",
)

# Create complete pipeline: preprocessing followed by selection of the 10
# features with the highest ANOVA F-score (f_classif)
complete_pipeline = Pipeline([("preprocessor", preprocessor), ("selector", SelectKBest(f_classif, k=10))])

### Transform data


In [None]:
# Fit the pipeline on the training data only, then apply the fitted pipeline to
# the test data (nothing is fitted on the test split)
train_data_processed = complete_pipeline.fit_transform(train_data[all_columns], train_data["mwra"])
test_data_processed = complete_pipeline.transform(test_data[all_columns])

# Boolean mask of the columns kept by SelectKBest, applied to the expected
# post-preprocessing column order to recover the selected feature names
feature_mask = complete_pipeline.named_steps["selector"].get_support()
selected_features = transformed_feature_order[feature_mask]  # order of features is preserved

# Create DataFrames with selected feature names
train_data_processed = pd.DataFrame(train_data_processed, columns=selected_features)
test_data_processed = pd.DataFrame(test_data_processed, columns=selected_features)

# Re-attach the target; the assignment aligns on the row index
# NOTE(review): assumes train_data / test_data still carry a default 0..n-1 index - confirm
train_data_processed["mwra"] = train_data["mwra"]
test_data_processed["mwra"] = test_data["mwra"]

### Export processed data


In [None]:
# Persist the preprocessed splits (selected features plus the mwra column)
os.makedirs("../data/processed", exist_ok=True)  # no error if the directory already exists

train_data_processed.to_csv("../data/processed/train_data.csv", index=False)
test_data_processed.to_csv("../data/processed/test_data.csv", index=False)

### Cleanup


In [None]:
# Remove every name created by the data-processing cells from the namespace.
# A NameError is raised here if any of those cells was skipped.
del (
    file_path,
    files,
    file,
    df,
    train_data,
    test_data,
    columns_for_zscore,
    outliers_count,
    max_iterations,
    iteration,
    all_columns,
    non_gaussian_columns,
    gaussian_columns,
    transformed_feature_order,
    general_pipe,
    vending_pipeline,
    preprocessor,
    complete_pipeline,
    train_data_processed,
    test_data_processed,
    dataset,
    selected_features,
)

## Helper Stuff


In [None]:
# Load the preprocessed splits written to ../data/processed
train_data_processed = pd.read_csv("../data/processed/train_data.csv")
test_data_processed = pd.read_csv("../data/processed/test_data.csv")

# Training features (target column removed) and training target
X_train = train_data_processed.drop(columns=["mwra"])
y_train = train_data_processed["mwra"]

# Test features (target column removed) and test target
X_test = test_data_processed.drop(columns=["mwra"])
y_test = test_data_processed["mwra"]

# Only the X/y splits are needed from here on
del train_data_processed, test_data_processed

In [None]:
def get_scores(model, model_name, X_train, y_train, X_test, y_test):
    """Score an already fitted binary classifier on the train and test sets.

    Args:
        model: Fitted estimator exposing ``predict``; ``predict_proba`` or
            ``decision_function`` is used for ROC AUC when available.
        model_name: Label used as the top level of the column MultiIndex.
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.

    Returns:
        DataFrame indexed by metric ("accuracy", "precision", "recall",
        "f1-score", "roc_auc") with columns (model_name, "Train"),
        (model_name, "Test") and (model_name, "Difference"), where
        Difference = Train - Test.
    """
    metric_names = ["accuracy", "precision", "recall", "f1-score", "roc_auc"]

    def _ranking_scores(X, y_pred):
        # ROC AUC needs continuous scores; this keeps the value consistent with
        # scoring="roc_auc" in get_scores_cv. Hard labels are only the fallback
        # for models that expose neither scoring method.
        if hasattr(model, "predict_proba"):
            # Column 1 holds the probability of the greater (positive) label
            return np.asarray(model.predict_proba(X))[:, 1]
        if hasattr(model, "decision_function"):
            return model.decision_function(X)
        return y_pred

    def _evaluate(X, y_true):
        y_pred = model.predict(X)
        return np.array(
            [
                accuracy_score(y_true, y_pred),
                precision_score(y_true, y_pred),
                recall_score(y_true, y_pred),
                f1_score(y_true, y_pred),
                roc_auc_score(y_true, _ranking_scores(X, y_pred)),
            ],
            dtype=float,
        )

    train = _evaluate(X_train, y_train)
    test = _evaluate(X_test, y_test)

    return pd.DataFrame(
        np.column_stack([train, test, train - test]),
        index=metric_names,
        columns=pd.MultiIndex.from_product([[model_name], ["Train", "Test", "Difference"]]),
    )

In [None]:
def get_scores_cv(model, model_name, X, y, cv):
    """Cross-validate ``model`` and tabulate mean/std of several metrics.

    Args:
        model: Unfitted estimator passed to ``cross_validate``.
        model_name: Label used as the top level of the column MultiIndex.
        X, y: Features and labels to cross-validate on.
        cv: Cross-validation specification forwarded to ``cross_validate``.

    Returns:
        DataFrame whose rows are (split, metric) pairs, with split in
        "Train", "Test", "Difference" (per-fold train minus test), and whose
        columns are (model_name, "Mean") and (model_name, "Std").
    """
    metrics = ["accuracy", "precision", "recall", "f1", "roc_auc"]

    table = pd.DataFrame(
        index=pd.MultiIndex.from_product([["Train", "Test", "Difference"], metrics]),
        columns=pd.MultiIndex.from_product([[model_name], ["Mean", "Std"]]),
    )

    # Each scorer is registered under its own name, so the result keys are
    # "train_<metric>" and "test_<metric>"
    cv_results = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring={name: name for name in metrics},
        return_train_score=True,
    )

    for name in metrics:
        per_fold = {
            "Train": cv_results[f"train_{name}"],
            "Test": cv_results[f"test_{name}"],
        }
        # Fold-wise gap between training and validation score
        per_fold["Difference"] = per_fold["Train"] - per_fold["Test"]

        for split, values in per_fold.items():
            table.loc[(split, name), (model_name, "Mean")] = values.mean()
            table.loc[(split, name), (model_name, "Std")] = values.std()

    return table

In [None]:
def plot_learning_curve(estimator, X, y, train_sizes, cv, scoring, title):
    """Plot training and cross-validation error against training-set size.

    The plotted error is ``1 - mean fold score``, so ``scoring`` should name a
    metric whose best value is 1 (for example "accuracy" or "roc_auc").

    Args:
        estimator: Estimator forwarded to ``learning_curve``.
        X, y: Features and labels.
        train_sizes: Training-set sizes forwarded to ``learning_curve``.
        cv: Cross-validation specification.
        scoring: Scorer name; also used in the y-axis label.
        title: Figure title.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, train_sizes=train_sizes, scoring=scoring, n_jobs=-1
    )
    # Average over folds, then convert score to error
    train_error = 1 - train_scores.mean(axis=1)
    cv_error = 1 - test_scores.mean(axis=1)

    plt.figure()
    # Both curves show errors, so the legend says "Error" like the y-axis does
    plt.plot(train_sizes, train_error, label="Training Error")
    plt.plot(train_sizes, cv_error, label="Cross-Validation Error")
    plt.xlabel("Training Size")
    plt.ylabel(f"{scoring} Error")
    plt.title(title)
    plt.legend()
    plt.show()

In [None]:
def plot_model_complexity_curve(model, X_train, y_train, X_test, y_test, max_depth_range):
    """Plot train and test error of a decision tree for each ``max_depth``.

    NOTE(review): the ``model`` argument is never used; a fixed
    DecisionTreeClassifier configuration is built for every depth. Confirm
    whether callers expect their own estimator to be evaluated.

    Args:
        model: Ignored (see note above).
        X_train, y_train: Data the tree is fitted on.
        X_test, y_test: Data used for the "Validation Error" curve.
        max_depth_range: Iterable of ``max_depth`` values; also the x-axis.
    """

    def _errors_for(depth):
        # Fit one tree at this depth and return (train error, test error)
        tree = DecisionTreeClassifier(
            criterion="gini",
            max_depth=depth,
            min_samples_split=10,
            min_samples_leaf=1,
            ccp_alpha=0.001,
            random_state=42,
        )
        tree.fit(X_train, y_train)
        train_accuracy = accuracy_score(y_train, tree.predict(X_train))
        test_accuracy = accuracy_score(y_test, tree.predict(X_test))
        return 1 - train_accuracy, 1 - test_accuracy

    error_pairs = [_errors_for(depth) for depth in max_depth_range]
    train_errors = [train_err for train_err, _ in error_pairs]
    val_errors = [val_err for _, val_err in error_pairs]

    # Error as a function of model complexity (tree depth)
    plt.figure()
    plt.plot(max_depth_range, train_errors, label="Training Error")
    plt.plot(max_depth_range, val_errors, label="Validation Error")
    plt.xlabel("Max Depth")
    plt.ylabel("Error")
    plt.title("Learning Curve (Model Complexity)")
    plt.legend()
    plt.show()

In [None]:
def compare_confusion_matrix(models, X_train, y_train, X_test, y_test):
    """Fit each model and draw its test-set confusion matrix side by side.

    Args:
        models: Sequence of ``(model_name, model)`` pairs. Every model is
            (re)fitted in place on the training data.
        X_train, y_train: Data the models are fitted on.
        X_test, y_test: Data the confusion matrices are computed on.
    """
    # squeeze=False always yields a 2-D axes array, so a single model works too
    # (by default plt.subplots returns a bare Axes for a 1x1 grid)
    _, axes = plt.subplots(1, len(models), figsize=(15, 5), squeeze=False)

    for ax, (model_name, model) in zip(axes[0], models):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        conf_matrix = confusion_matrix(y_test, y_pred)
        sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
        ax.set_title(model_name)

    plt.show()

In [None]:
def show_confusion_matrix(model, model_name, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and plot its test-set confusion matrix.

    The model is fitted in place. ``model_name`` becomes the plot title.
    """
    model.fit(X_train, y_train)
    sns.heatmap(
        confusion_matrix(y_test, model.predict(X_test)),
        annot=True,
        fmt="d",
        cmap="Blues",
        cbar=False,
    )
    plt.title(model_name)
    plt.show()

In [None]:
# Keyword arguments for a random forest (n_estimators, ccp_alpha, ...)
rf_best_params = dict(
    n_estimators=300,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=1,
    ccp_alpha=0.001,
    random_state=42,
)

# Keyword arguments for logistic regression (C, penalty, solver, ...)
lgr_best_params = dict(C=0.01, max_iter=100, penalty="l2", solver="lbfgs", tol=0.001)

# Keyword arguments for gradient boosting (subsample, learning_rate, ...)
gbg_best_params = dict(
    subsample=0.8,
    n_estimators=100,
    min_samples_split=2,
    min_samples_leaf=1,
    max_depth=4,
    learning_rate=0.01,
    random_state=42,
)

# Keyword arguments for histogram-based gradient boosting (l2_regularization, max_iter, ...)
hbg_best_params = dict(
    l2_regularization=2.0,
    learning_rate=0.1,
    max_depth=7,
    max_iter=200,
    min_samples_leaf=20,
    random_state=42,
)

In [None]:
@register_cell_magic
def ignore(line, cell):
    """Cell magic ``%%ignore``: receive the cell's source and do nothing with it."""
    return None

# 3.1


-   Jednoduchý klasifikátor na základe závislosti v dátach.


## A


Naimplementujte jednoduchý ID3 klasifikátor s hĺbkou min 2 (vrátane root/koreň).


In [None]:
class Node:
    """Plain record for one tree node.

    All fields default to ``None``. A node stores a split description
    (``feature``, ``threshold``), its two children (``left``, ``right``), a
    prediction (``value``) and the split's ``info_gain``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None, info_gain=None):
        # Split description
        self.feature, self.threshold = feature, threshold
        # Child nodes
        self.left, self.right = left, right
        # Prediction and split quality
        self.value, self.info_gain = value, info_gain

In [None]:
class ID3Classifier:
    """Decision-tree classifier grown greedily by entropy-based information gain.

    Every internal node tests ``x[feature] <= threshold``; candidate thresholds
    are the distinct values of the feature among the rows reaching the node.

    Note on ``max_depth``: a node may still split when its depth equals
    ``max_depth`` (the root has depth 0), so leaves can sit at depth
    ``max_depth + 1``.
    """

    def __init__(self, max_depth: int = 2):
        self.max_depth = max_depth
        self.tree = None  # root Node, set by fit()

    def build_tree(self, X, y, curr_depth: int = 0):
        """Recursively grow the subtree for rows ``X`` / labels ``y``.

        Returns an internal Node when a split with positive information gain
        exists and the depth limit allows it, otherwise a leaf Node holding
        the majority label.
        """
        _, num_features = X.shape

        if curr_depth <= self.max_depth:
            best_split = self.get_best_split(X, y, num_features)
            # get(): the dict is empty when there is no candidate split at all
            if best_split.get("info_gain", 0) > 0:
                left_subtree = self.build_tree(best_split["X_left"], best_split["y_left"], curr_depth + 1)
                right_subtree = self.build_tree(best_split["X_right"], best_split["y_right"], curr_depth + 1)

                return Node(
                    feature=best_split["feature"],
                    threshold=best_split["threshold"],
                    left=left_subtree,
                    right=right_subtree,
                    info_gain=best_split["info_gain"],
                )

        return Node(value=self.calculate_leaf_value(y))

    def get_best_split(self, X, y, num_features):
        """Search every (feature, threshold) pair for the highest information gain.

        Returns a dict with keys "feature", "threshold", "X_left", "X_right",
        "y_left", "y_right" and "info_gain"; the dict is empty when there is
        nothing to search (no rows or no features). Ties keep the first
        candidate found.
        """
        best_split = {}
        max_info_gain = -float("inf")

        for feature in range(num_features):
            for threshold in np.unique(X[:, feature]):
                X_left, X_right, y_left, y_right = self.split(X, y, feature, threshold)
                curr_info_gain = self.information_gain(y, y_left, y_right)

                if curr_info_gain > max_info_gain:
                    max_info_gain = curr_info_gain
                    best_split = {
                        "feature": feature,
                        "threshold": threshold,
                        "X_left": X_left,
                        "X_right": X_right,
                        "y_left": y_left,
                        "y_right": y_right,
                        "info_gain": curr_info_gain,
                    }

        return best_split

    @staticmethod
    def split(X, y, feature, threshold):
        """Partition rows into ``value <= threshold`` (left) and ``value > threshold`` (right)."""
        column = X[:, feature]
        goes_left = column <= threshold
        # Computed separately (not ~goes_left) so NaN rows land on neither side
        goes_right = column > threshold

        return X[goes_left], X[goes_right], y[goes_left], y[goes_right]

    def information_gain(self, parent, l_child, r_child):
        """Return parent entropy minus the size-weighted entropy of both children."""
        total = len(parent)
        if total == 0:
            # Nothing to split, hence nothing to gain
            return 0.0

        weight_l = len(l_child) / total
        weight_r = len(r_child) / total

        return self.entropy(parent) - (weight_l * self.entropy(l_child) + weight_r * self.entropy(r_child))

    @staticmethod
    def entropy(y):
        """Return the Shannon entropy (base 2) of the label array ``y``; 0 for empty input."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y) if len(y) else counts.astype(float)
        return -np.sum(probabilities * np.log2(probabilities))

    @staticmethod
    def calculate_leaf_value(y):
        """Return the most frequent label in ``y``; ties go to the label seen first.

        Raises:
            ValueError: if ``y`` is empty.
        """
        # Single counting pass; insertion order keeps "first seen wins" for ties
        counts = {}
        for label in y:
            counts[label] = counts.get(label, 0) + 1
        return max(counts, key=counts.get)

    def fit(self, X, y):
        """Grow the tree from features ``X`` (2-D array-like) and labels ``y``.

        Raises:
            ValueError: if there are no training rows.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[0] == 0:
            raise ValueError("Cannot fit ID3Classifier on an empty training set.")
        self.tree = self.build_tree(X, y)

    def predict(self, X):
        """Return a list with the predicted label for every row of ``X`` (2-D array-like)."""
        return [self.traverse_tree(x, self.tree) for x in np.asarray(X)]

    def traverse_tree(self, x, node):
        """Follow the split tests from ``node`` down to a leaf and return its value."""
        # A node whose value is set is a leaf
        while node.value is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

## B


Vyhodnoťte Váš ID3 klasifikátor pomocou metrík accuracy, precision a recall.


In [None]:
# ID3Classifier indexes its inputs NumPy-style (X[:, feature]), so pass the
# underlying arrays rather than the pandas objects
id3_classifier = ID3Classifier(max_depth=5)
id3_classifier.fit(X_train.values, y_train.values)

In [None]:
def get_scores_id3(model, model_name, X_train, y_train, X_test, y_test):
    """Score a fitted ID3Classifier on the train and test sets.

    Thin wrapper around ``get_scores``: the ID3 model indexes its input
    NumPy-style, so the feature frames are converted to arrays first.

    Args:
        model: Fitted model exposing ``predict`` on NumPy arrays.
        model_name: Label used as the top level of the column MultiIndex.
        X_train, X_test: Feature DataFrames (``.values`` is taken).
        y_train, y_test: True labels.

    Returns:
        The same metrics table that ``get_scores`` produces.
    """
    return get_scores(model, model_name, X_train.values, y_train, X_test.values, y_test)


# Train/test metrics of the fitted ID3 model; kept in scores_id3 for later comparisons
scores_id3 = get_scores_id3(id3_classifier, "ID3", X_train, y_train, X_test, y_test)
scores_id3

## C


Zistite, či Váš ID3 klasifikátor má overfit.


In [None]:
# Show the table again: the "Difference" column is the train-minus-test gap
scores_id3

-   Since train and test metrics are close, the model is likely not overfitting.


# 3.2


Trénovanie a vyhodnotenie klasifikátorov strojového učenia.


## A


Na trénovanie využite jeden stromový algoritmus v scikit-learn.


### Initialize DecisionTreeClassifier


In [None]:
# Baseline tree: a depth cap plus cost-complexity pruning to limit overfitting,
# with a fixed seed
dst_classifier = DecisionTreeClassifier(max_depth=15, ccp_alpha=0.001, random_state=42)

### Metrics


In [None]:
# 5-fold cross-validated metrics (mean and std) for the decision tree
df_1 = get_scores_cv(dst_classifier, "DecisionTreeClassifier", X_train, y_train, cv=5)
df_1

In [None]:
# dst_classifier is a DecisionTreeClassifier, so title the plot accordingly
show_confusion_matrix(dst_classifier, "DecisionTreeClassifier", X_train, y_train, X_test, y_test)

-   Even a fairly simple model gives good results without overfitting.


## B


Porovnajte s jedným iným nestromovým algoritmom v scikit-learn.


### Initialize LogisticRegression


In [None]:
# Logistic regression with library defaults; only the seed is fixed
log_reg = LogisticRegression(random_state=42)

### Metrics


In [None]:
# 5-fold cross-validated metrics (mean and std) for logistic regression
df_2 = get_scores_cv(log_reg, "LogisticRegression", X_train, y_train, cv=5)
df_2

In [None]:
# Fits log_reg on the training set and plots its confusion matrix on the test set
show_confusion_matrix(log_reg, "LogisticRegression", X_train, y_train, X_test, y_test)

-   Basic logistic regression also gives quite good results without overfitting.


### Comparison


In [None]:
# Put both cross-validation tables side by side (one column group per model)
df = pd.concat([df_1, df_2], axis=1)
del df_1, df_2
df

In [None]:
# dst_classifier is a DecisionTreeClassifier, so label its panel accordingly
compare_confusion_matrix(
    [("DecisionTreeClassifier", dst_classifier), ("LogisticRegression", log_reg)], X_train, y_train, X_test, y_test
)

-   Basic Logistic Regression gives slightly better results than basic Decision Tree Classifier.


## C


Porovnajte výsledky s ID3 z prvého kroku.


In [None]:
# Fresh instances fitted on the full training set; get_scores calls predict on
# the models it is given, so they must be fitted first
dst_classifier = DecisionTreeClassifier(max_depth=15, ccp_alpha=0.001, random_state=42)
dst_classifier.fit(X_train, y_train)

log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)

# Same train/test metric layout as scores_id3, so the three tables can be concatenated
df_1 = get_scores(dst_classifier, "DecisionTreeClassifier", X_train, y_train, X_test, y_test)
df_2 = get_scores(log_reg, "LogisticRegression", X_train, y_train, X_test, y_test)
df = pd.concat([scores_id3, df_1, df_2], axis=1)
df

-   Our ID3 somehow outperforms DecisionTreeClassifier.
-   However, we have not yet tuned the parameters of DecisionTreeClassifier, so it may still perform better.
-   Also our model doesn't have many parameters to tune.


## D


Vizualizujte natrénované pravidlá minimálne pre jeden Vami vybraný algoritmus.


In [None]:
# Initialize DecisionTreeClassifier
dst_classifier = DecisionTreeClassifier(max_depth=15, ccp_alpha=0.001, random_state=42)

# Fit the classifier to the training data
dst_classifier.fit(X_train, y_train)

# Generate graph; feature_names is passed as a plain list because some
# scikit-learn releases reject a pandas Index here
graph = Source(
    export_graphviz(
        dst_classifier, out_file=None, class_names=["no", "yes"], filled=True, feature_names=list(X_train.columns)
    )
)

# Display graph
display(SVG(graph.pipe(format="svg")))

# Scale the rendered SVG to the width of the output cell
style = "<style>svg{width:100%;height:70%;}</style>"
HTML(style)

## E


Vyhodnoťte natrénované modely pomocou metrík accuracy, precision a recall.


-   We already did this but let's do it again.


### Initialize classifiers


In [None]:
# Initialize unfitted classifiers with the same settings as in the earlier sections
dst_classifier = DecisionTreeClassifier(max_depth=15, ccp_alpha=0.001, random_state=42)
log_reg = LogisticRegression(random_state=42)

### Compare metrics


In [None]:
# 5-fold cross-validated metrics for both models
df_1 = get_scores_cv(dst_classifier, "DecisionTreeClassifier", X_train, y_train, cv=5)
df_2 = get_scores_cv(log_reg, "LogisticRegression", X_train, y_train, cv=5)

# Side-by-side table, one column group per model
df = pd.concat([df_1, df_2], axis=1)
del df_1, df_2
df

### Compare matrices


In [None]:
# Fits both models on the training set and plots their test-set confusion matrices
compare_confusion_matrix(
    [
        ("DecisionTreeClassifier", dst_classifier),
        ("LogisticRegression", log_reg),
    ],
    X_train,
    y_train,
    X_test,
    y_test,
)

### Comparison


-   As we already said, Logistic Regression gives slightly better results than Decision Tree Classifier.
-   However, both models are quite good and don't overfit.


# 3.3


Optimalizácia alias hyperparameter tuning.


## A


Vyskúšajte rôzne nastavenie hyperparametrov (tuning) pre zvolený algoritmus tak,
aby ste optimalizovali výkonnosť (bez underfittingu).

We will test parameters for RandomForestClassifier as we know this will be better than DecisionTreeClassifier.


### Decision Tree vs Random Forest


In [None]:
%%ignore
# Define the parameter grid (4 * 3 * 3 * 4 * 4 = 576 combinations)
param_grid = {
    "max_depth": [5, 10, 15, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "ccp_alpha": [0.0, 0.0005, 0.001, 0.0015],
    "min_impurity_decrease": [0.0, 0.0005, 0.001, 0.0015],
}

# Initialize the DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier(random_state=42)

# Initialize GridSearchCV: exhaustive search, 5-fold CV, ranked by accuracy
grid_search = GridSearchCV(
    estimator=dt_classifier, param_grid=param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=1
)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

-   Best Parameters: {'ccp_alpha': 0.0005, 'max_depth': 10, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 4, 'min_samples_split': 2}
-   Best Score: 0.9026635738931255


In [None]:
# Decision tree with tuned parameters vs. a random forest with hand-picked settings
dt_classifier = DecisionTreeClassifier(
    max_depth=10, min_samples_split=2, min_samples_leaf=4, ccp_alpha=0.0005, random_state=42
)
rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=15, ccp_alpha=0.001, random_state=42)

# 5-fold cross-validated metrics for both models
df_1 = get_scores_cv(dt_classifier, "DecisionTreeClassifier", X_train, y_train, cv=5)
df_2 = get_scores_cv(rf_classifier, "RandomForestClassifier", X_train, y_train, cv=5)

# Side-by-side table, one column group per model
df = pd.concat([df_1, df_2], axis=1)
del df_1, df_2
df

-   This shows the difference between an optimized Decision Tree and a basic Random Forest (with basic parameters to avoid overfitting).
-   We can clearly see that Random Forest is better, and therefore we are going to use it for the further steps.


### First, find baseline parameters


#### n_estimators


In [None]:
# Initialize the RandomForestClassifier with 100 trees; the seed is fixed like
# in every other experiment so the comparison with 1000 trees is reproducible
rf_classifier = RandomForestClassifier(
    n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_depth=20, ccp_alpha=0.001, random_state=42
)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Print the scores
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

In [None]:
# Initialize the RandomForestClassifier with 1000 trees; the seed is fixed like
# in every other experiment so the comparison with 100 trees is reproducible
rf_classifier = RandomForestClassifier(
    n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_depth=20, ccp_alpha=0.001, random_state=42
)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Print the scores
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

-   We see that more estimators did help, but not greatly. Also, at some point there is very little improvement from adding more estimators.
-   It also takes more time to train the model.


#### ccp_alpha


In [None]:
# ccp_alpha candidate 1 of 2: 0.002 (entropy criterion, 100 trees, fixed seed)
rf_classifier = RandomForestClassifier(criterion="entropy", n_estimators=100, random_state=42, ccp_alpha=0.002)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Train/test metrics and their difference
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

In [None]:
# ccp_alpha candidate 2 of 2: 0.005 (entropy criterion, 100 trees, fixed seed)
rf_classifier = RandomForestClassifier(criterion="entropy", n_estimators=100, random_state=42, ccp_alpha=0.005)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Train/test metrics and their difference
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

-   Judging by ROC AUC, ccp_alpha=0.002 performs better than ccp_alpha=0.005.


#### max_features


In [None]:
# max_features candidate 1 of 2: "sqrt", with max_depth capped at 7
rf_classifier = RandomForestClassifier(
    criterion="entropy",
    n_estimators=100,
    random_state=42,
    ccp_alpha=0.002,
    max_features="sqrt",
    max_depth=7,
)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Train/test metrics and their difference
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

In [None]:
# max_features candidate 2 of 2: "log2", with max_depth capped at 7
rf_classifier = RandomForestClassifier(
    criterion="entropy",
    n_estimators=100,
    random_state=42,
    ccp_alpha=0.002,
    max_features="log2",
    max_depth=7,
)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Train/test metrics and their difference
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

-   When max_depth is also set, the two options differ in ROC AUC; sqrt is better than log2.


#### criterion


In [None]:
# criterion candidate 1 of 2: "entropy" (100 trees, ccp_alpha=0.002, fixed seed)
rf_classifier = RandomForestClassifier(
    criterion="entropy",
    n_estimators=100,
    random_state=42,
    ccp_alpha=0.002,
)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Train/test metrics and their difference
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

In [None]:
# criterion candidate 2 of 2: "gini" (100 trees, ccp_alpha=0.002, fixed seed)
rf_classifier = RandomForestClassifier(
    criterion="gini",
    n_estimators=100,
    random_state=42,
    ccp_alpha=0.002,
)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Train/test metrics and their difference
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

-   Judging by ROC AUC, entropy is better than gini.


### RandomizedSearchCV


-   In previous step, we found that n_estimators=100, ccp_alpha=0.002, max_features=sqrt, criterion=entropy are better.
-   However, we are not going to look at max_features and criterion in the first tuning method; we will revisit them at the end, once we have the best primary hyperparameters.
-   Not using max_features and criterion in first tuning gives more priority to other hyperparameters.
-   Our primary hyperparameters are n_estimators, max_depth, min_samples_split, min_samples_leaf, ccp_alpha.


In [None]:
%%ignore
# Define the broad parameter grid (3 * 4 * 3 * 3 * 3 = 324 combinations)
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [5, 10, 15, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "ccp_alpha": [0.001, 0.002, 0.003],
}

# Initialize the RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

# Initialize RandomizedSearchCV: 100 sampled combinations, ranked by ROC AUC
random_search = RandomizedSearchCV(
    estimator=rf_classifier,
    param_distributions=param_grid,
    n_iter=100,
    n_jobs=-1,
    verbose=1,
    cv=10,  # 10-fold cross-validation for more reliable results
    scoring="roc_auc",
    random_state=42,
)

# Fit the classifier to the training data
random_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = random_search.best_params_
best_score = random_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score:.5f}")

-   Best Parameters: {'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 15, 'ccp_alpha': 0.002}
-   Best Score: 0.91579


### GridSearchCV


#### First iteration


In [None]:
%%ignore
# Best parameters from RandomizedSearchCV (hard-coded so this cell runs on its own)
best_params = {
    "n_estimators": 300,
    "max_depth": 15,
    "min_samples_split": 10,
    "min_samples_leaf": 2,
    "ccp_alpha": 0.002,
}

# Define the refined parameter grid for GridSearchCV: values around each best
# parameter (min_samples_leaf and ccp_alpha are only extended upward)
param_grid_refined = {
    "n_estimators": [
        best_params["n_estimators"] - 100,
        best_params["n_estimators"],
        best_params["n_estimators"] + 100,
    ],
    "max_depth": [
        best_params["max_depth"] - 5,
        best_params["max_depth"],
        best_params["max_depth"] + 5,
    ],
    "min_samples_split": [
        best_params["min_samples_split"] - 5,
        best_params["min_samples_split"],
        best_params["min_samples_split"] + 5,
    ],
    "min_samples_leaf": [
        best_params["min_samples_leaf"],
        best_params["min_samples_leaf"] + 1,
    ],
    "ccp_alpha": [
        best_params["ccp_alpha"],
        best_params["ccp_alpha"] + 0.001,
        best_params["ccp_alpha"] + 0.002,
    ],
}

# Base estimator with only the seed fixed; the grid above supplies the tuned parameters
rf_refined = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search_refined = GridSearchCV(
    estimator=rf_refined,
    param_grid=param_grid_refined,
    verbose=1,
    n_jobs=-1,
    cv=10,  # 10-fold cross-validation for more reliable results
    scoring="roc_auc",
)

# Fit the classifier to the training data
grid_search_refined.fit(X_train, y_train)

# Get the best parameters and best score
best_params_refined = grid_search_refined.best_params_
best_score_refined = grid_search_refined.best_score_

print(f"Best Parameters after Refining: {best_params_refined}")
print(f"Best Score after Refining: {best_score_refined:.5f}")

-   Best Parameters after Refining: {'ccp_alpha': 0.002, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 15, 'n_estimators': 400}
-   Best Score after Refining: 0.91606
-   The trend for n_estimators is, bigger is better (until some point). So in next iteration we will ignore this parameter.


#### Second iteration


My colleague found different parameters to be better, so we will look at them in this step.


In [None]:
%%ignore
# Candidate values for this round of the grid search
param_grid_refined = {
    "n_estimators": [200],
    "max_depth": [15],
    "min_samples_split": [5, 10],
    "min_samples_leaf": [1, 2],
    "ccp_alpha": [0.001, 0.002],
}

# Plain forest with a fixed seed; the tuned hyperparameters come from the grid
rf_refined = RandomForestClassifier(random_state=42)

# Exhaustive search over the grid, scored by ROC AUC
grid_search_refined = GridSearchCV(
    rf_refined,
    param_grid_refined,
    scoring="roc_auc",
    cv=10,  # 10-fold cross-validation for more reliable results
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training data
grid_search_refined.fit(X_train, y_train)

# Keep the winning combination and its mean cross-validated score
best_params_refined, best_score_refined = (
    grid_search_refined.best_params_,
    grid_search_refined.best_score_,
)

print(f"Best Parameters after Refining: {best_params_refined}")
print(f"Best Score after Refining: {best_score_refined:.5f}")

-   Best Parameters after Refining: {'ccp_alpha': 0.001, 'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
-   Best Score after Refining: 0.91596
-   From these observations:
    -   We can conclude that max_depth=~15 is the best, as it was 15 in every iteration.
    -   We can conclude that ccp_alpha=~0.001 or ccp_alpha=~0.002 is the best.
    -   We can conclude that min_samples_leaf=~1 or a little higher is the best.
    -   We can conclude that min_samples_split=~10 is the best.
    -   We can conclude that higher n_estimators is better (at some value it will be worse).


#### Third iteration


Let's look at ccp_alpha and max_depth in more detail.


In [None]:
%%ignore
# Candidate values for this round: only max_depth and ccp_alpha vary
# NOTE(review): these ccp_alpha candidates are around 0.01, while the
# surrounding notes discuss values around 0.001 - confirm the intended scale.
param_grid_refined = {
    "n_estimators": [300],
    "max_depth": [14, 15, 16],
    "min_samples_split": [10],
    "min_samples_leaf": [1],
    "ccp_alpha": [0.0095, 0.01, 0.0105],
}

# Plain forest with a fixed seed; the tuned hyperparameters come from the grid
rf_refined = RandomForestClassifier(random_state=42)

# Exhaustive search over the grid, scored by ROC AUC
grid_search_refined = GridSearchCV(
    rf_refined,
    param_grid_refined,
    scoring="roc_auc",
    cv=10,  # 10-fold cross-validation for more reliable results
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training data
grid_search_refined.fit(X_train, y_train)

# Keep the winning combination and its mean cross-validated score
best_params_refined, best_score_refined = (
    grid_search_refined.best_params_,
    grid_search_refined.best_score_,
)

print(f"Best Parameters after Refining: {best_params_refined}")
print(f"Best Score after Refining: {best_score_refined:.5f}")

-   ccp_alpha=0.001 is best
-   max_depth=15 is best


#### Fourth iteration


Now let's look at best n_estimators.


In [None]:
%%ignore
# Candidate values for this round: only n_estimators varies (100..1000)
param_grid_refined = {
    "n_estimators": [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    "max_depth": [15],
    "min_samples_split": [10],
    "min_samples_leaf": [1],
    "ccp_alpha": [0.001],
}

# Plain forest with a fixed seed; the tuned hyperparameters come from the grid
rf_refined = RandomForestClassifier(random_state=42)

# Exhaustive search over the grid, scored by ROC AUC
grid_search_refined = GridSearchCV(
    rf_refined,
    param_grid_refined,
    scoring="roc_auc",
    cv=10,  # 10-fold cross-validation for more reliable results
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training data
grid_search_refined.fit(X_train, y_train)

# Keep the winning combination and its mean cross-validated score
best_params_refined, best_score_refined = (
    grid_search_refined.best_params_,
    grid_search_refined.best_score_,
)

print(f"Best Parameters after Refining: {best_params_refined}")
print(f"Best Score after Refining: {best_score_refined:.5f}")

-   n_estimators=300 is best.


#### Final iteration


As discussed earlier, we will also look at max_features and criterion.


In [None]:
%%ignore
# Candidate values for this round: only max_features and criterion vary
param_grid_refined = {
    "n_estimators": [300],
    "max_depth": [15],
    "min_samples_split": [10],
    "min_samples_leaf": [1],
    "ccp_alpha": [0.001],
    "max_features": ["sqrt", "log2"],
    "criterion": ["entropy", "gini"],
}

# Plain forest with a fixed seed; the tuned hyperparameters come from the grid
rf_refined = RandomForestClassifier(random_state=42)

# Exhaustive search over the grid, scored by ROC AUC
grid_search_refined = GridSearchCV(
    rf_refined,
    param_grid_refined,
    scoring="roc_auc",
    cv=10,  # 10-fold cross-validation for more reliable results
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training data
grid_search_refined.fit(X_train, y_train)

# Keep the winning combination and its mean cross-validated score
best_params_refined, best_score_refined = (
    grid_search_refined.best_params_,
    grid_search_refined.best_score_,
)

print(f"Best Parameters after Refining: {best_params_refined}")
print(f"Best Score after Refining: {best_score_refined:.5f}")

-   Best Parameters after Refining: {'ccp_alpha': 0.001, 'criterion': 'gini', 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}
-   Best Score after Refining: 0.91632
-   We can see that max_features=sqrt and criterion=gini are best and they are also default values.


Our final parameters are:

-   n_estimators=300
-   max_depth=15
-   min_samples_split=10
-   min_samples_leaf=1
-   ccp_alpha=0.001
-   max_features="sqrt"
-   criterion="gini"


Note that we ran these tests without feature selection in the pipeline, as we were getting better results without it. (This will be discussed further in 3.4.)

Since we learned that it is best practice to always use feature selection, we will use it in the next step and check whether the parameters change.


In [None]:
%%ignore
# Candidate values for this round of the grid search
param_grid_refined = {
    "n_estimators": [300],
    "max_depth": [10, 15],
    "min_samples_split": [5, 10],
    "min_samples_leaf": [1, 2],
    "ccp_alpha": [0.001, 0.0015],
}

# Plain forest with a fixed seed; the tuned hyperparameters come from the grid
rf_refined = RandomForestClassifier(random_state=42)

# Exhaustive search over the grid, scored by ROC AUC
grid_search_refined = GridSearchCV(
    rf_refined,
    param_grid_refined,
    scoring="roc_auc",
    cv=10,  # 10-fold cross-validation for more reliable results
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training data
grid_search_refined.fit(X_train, y_train)

# Keep the winning combination and its mean cross-validated score
best_params_refined, best_score_refined = (
    grid_search_refined.best_params_,
    grid_search_refined.best_score_,
)

print(f"Best Parameters after Refining: {best_params_refined}")
print(f"Best Score after Refining: {best_score_refined:.5f}")

-   Best Parameters after Refining: {'ccp_alpha': 0.001, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
-   Best Score after Refining: 0.91591
-   Best score without feature selection was 0.91632 vs 0.91591 with feature selection.
-   As hinted before, using feature selection gives slightly worse results. But we will look at it in more detail in 3.4.
-   We can also see that parameters did change and therefore it is always crucial to try different parameters when changing pipeline.


Our final parameters with feature selection are:

-   n_estimators=300
-   max_depth=10
-   min_samples_split=5
-   min_samples_leaf=1
-   ccp_alpha=0.001
-   max_features="sqrt"
-   criterion="gini"


### GridSearchCV - Logistic Regression


-   Let's also look at Logistic Regression.


In [None]:
%%ignore
# Logistic regression using the lbfgs solver; everything else is searched
log_reg = LogisticRegression(random_state=42, solver="lbfgs")

# Candidate values: no penalty vs. L2, plus regularisation strength,
# iteration budget and stopping tolerance
parameters = {
    "penalty": [None, "l2"],
    "C": [0.001, 0.005, 0.01, 0.05],
    "max_iter": [100, 150, 200, 250, 300],
    "tol": [0.0001, 0.0005, 0.001, 0.005],
}

# Exhaustive 10-fold search scored by ROC AUC
grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=parameters,
    cv=10,
    n_jobs=-1,
    scoring="roc_auc",
)
grid_search.fit(X_train, y_train)

# Keep the winning combination and its mean cross-validated score
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score:.5f}")

-   Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'tol': 0.0005}
-   Best Score: 0.91217


In [None]:
%%ignore
# Logistic regression using the liblinear solver; everything else is searched
log_reg = LogisticRegression(random_state=42, solver="liblinear")

# Candidate values: L1 vs. L2 penalty, plus regularisation strength,
# iteration budget and stopping tolerance
parameters = {
    "penalty": ["l1", "l2"],
    "C": [0.001, 0.005, 0.01, 0.05],
    "max_iter": [100, 150, 200, 250, 300],
    "tol": [0.0001, 0.0005, 0.001, 0.005],
}

# Exhaustive 10-fold search scored by ROC AUC
grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=parameters,
    cv=10,
    n_jobs=-1,
    scoring="roc_auc",
)
grid_search.fit(X_train, y_train)

# Keep the winning combination and its mean cross-validated score
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score:.5f}")

-   Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'tol': 0.001}
-   Best Score: 0.91243


In [None]:
# lbfgs variant
# NOTE(review): tol=0.001 here, while the lbfgs grid search above reported
# tol=0.0005 as best - confirm which value is intended.
log_reg = LogisticRegression(
    solver="lbfgs",
    penalty="l2",
    C=0.01,
    tol=0.001,
    max_iter=100,
    random_state=42,
)
log_reg.fit(X_train, y_train)
df_1 = get_scores(log_reg, "LogisticRegression - lbfgs", X_train, y_train, X_test, y_test)

# liblinear variant
# NOTE(review): penalty="l1" here, while the liblinear grid search above
# reported penalty "l2" as best - confirm this is deliberate.
log_reg = LogisticRegression(
    solver="liblinear",
    penalty="l1",
    C=0.01,
    tol=0.001,
    max_iter=100,
    random_state=42,
)
log_reg.fit(X_train, y_train)
df_2 = get_scores(log_reg, "LogisticRegression - liblinear", X_train, y_train, X_test, y_test)

# Put both score tables side by side and display them
df = pd.concat((df_1, df_2), axis="columns")
df

-   Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs', 'tol': 0.001}


## B


Vyskúšajte kombinácie modelov (ensemble) pre zvolený algoritmus tak, aby ste
optimalizovali výkonnosť (bez underfitingu).


We already used RandomForestClassifier in previous step, but we will now explore more ensemble methods.


### Basic Ensemble Models


#### RandomForestClassifier


In [None]:
# Random forest configured from the rf_best_params mapping
rf_classifier = RandomForestClassifier(**rf_best_params)

# 5-fold cross-validated scores, displayed as the cell output
df_rf = get_scores_cv(rf_classifier, "RandomForestClassifier", X_train, y_train, cv=5)
df_rf

#### GradientBoostingClassifier


In [None]:
# Gradient boosting with library defaults and a fixed seed
gb_classifier = GradientBoostingClassifier(random_state=42)

# 5-fold cross-validated scores, displayed as the cell output
df_gb = get_scores_cv(gb_classifier, "GradientBoostingClassifier", X_train, y_train, cv=5)
df_gb

#### HistGradientBoostingClassifier


In [None]:
# Histogram-based gradient boosting with library defaults and a fixed seed
hgb_classifier = HistGradientBoostingClassifier(random_state=42)

# 5-fold cross-validated scores, displayed as the cell output
df_hgb = get_scores_cv(hgb_classifier, "HistGradientBoostingClassifier", X_train, y_train, cv=5)
df_hgb

#### Comparison


In [None]:
# Score tables of the three models side by side
df = pd.concat((df_rf, df_gb, df_hgb), axis="columns")
df

-   We can see that RandomForestClassifier is the best model for our data as we used hyperparameter tuning on it.
-   HistGradientBoostingClassifier shows promising results but overfits more than RandomForestClassifier or GradientBoostingClassifier


#### Hyperparameter tuning for GradientBoostingClassifier


In [None]:
%%ignore
# Discrete candidate values sampled by the randomized search
param_grid = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.1, 0.2],
    "max_depth": [3, 4, 5],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "subsample": [0.8, 0.9, 1.0],
}

# Gradient boosting with a fixed seed; tuned values come from the search
gb_classifier = GradientBoostingClassifier(random_state=42)

# 100 sampled combinations, each evaluated with 5-fold CV
random_search = RandomizedSearchCV(
    gb_classifier,
    param_grid,
    n_iter=100,
    scoring="roc_auc",  # ROC_AUC for lowering overfitting
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

# Run the search on the training data
random_search.fit(X_train, y_train)

# Keep the winning combination and its mean cross-validated score
best_params, best_score = random_search.best_params_, random_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score:.5f}")

-   Best Parameters: {'subsample': 0.8, 'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 4, 'learning_rate': 0.01}
-   Best Score: 0.91491


In [None]:
# Gradient boosting configured from the gbg_best_params mapping
gb_classifier = GradientBoostingClassifier(**gbg_best_params)

# 5-fold cross-validated scores, displayed as the cell output
df_gb = get_scores_cv(gb_classifier, "GradientBoostingClassifier", X_train, y_train, cv=5)
df_gb

#### Hyperparameter tuning for HistGradientBoostingClassifier


In [None]:
%%ignore
# Candidate values explored by the grid search
param_grid = {
    "learning_rate": [0.01, 0.1, 0.3],
    "max_depth": [3, 5, 7],
    "max_iter": [100, 200],
    "min_samples_leaf": [20, 50],
    "l2_regularization": [0, 1.0, 2.0],
}

# Histogram-based gradient boosting with a fixed seed
hgb = HistGradientBoostingClassifier(random_state=42)

# Exhaustive search with 5-fold CV
grid_search = GridSearchCV(
    hgb,
    param_grid,
    scoring="roc_auc",  # ROC_AUC for lowering overfitting
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training data
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Unfitted model configured with the winning combination
best_model = HistGradientBoostingClassifier(random_state=42, **grid_search.best_params_)

-   Best parameters: {'l2_regularization': 0, 'learning_rate': 0.01, 'max_depth': 5, 'max_iter': 200, 'min_samples_leaf': 20}
-   Best cross-validation score: 0.9132202050166413


In [None]:
# Histogram-based gradient boosting with the hyperparameters written out
hgb = HistGradientBoostingClassifier(
    learning_rate=0.01,
    max_iter=200,
    max_depth=5,
    min_samples_leaf=20,
    l2_regularization=0,
    random_state=42,
)

# 5-fold cross-validated scores, displayed as the cell output
df_hgb = get_scores_cv(hgb, "HistGradientBoostingClassifier - Refined", X_train, y_train, cv=5)
df_hgb

In [None]:
# Score tables side by side; the per-model tables are dropped afterwards
df = pd.concat((df_rf, df_gb, df_hgb), axis="columns")
del df_rf, df_gb, df_hgb
df

-   We can see that RandomForestClassifier is still the best model.
-   Using roc_auc for scoring in GridSearchCV, we lower the overfitting of HistGradientBoostingClassifier, but it is still worse than RandomForestClassifier.


#### Comparison


In [None]:
# Random forest: build from its parameter mapping and score with 5-fold CV
rf_classifier = RandomForestClassifier(**rf_best_params)
df_rf = get_scores_cv(rf_classifier, "RandomForestClassifier", X_train, y_train, cv=5)

# Gradient boosting
gb_classifier = GradientBoostingClassifier(**gbg_best_params)
df_gb = get_scores_cv(gb_classifier, "GradientBoostingClassifier", X_train, y_train, cv=5)

# Histogram-based gradient boosting
hbg_classifier = HistGradientBoostingClassifier(**hbg_best_params)
df_hgb = get_scores_cv(hbg_classifier, "HistGradientBoostingClassifier", X_train, y_train, cv=5)

# Score tables side by side
df = pd.concat((df_rf, df_gb, df_hgb), axis="columns")
df

-   First best accuracy has RandomForestClassifier then HistGradientBoostingClassifier and last GradientBoostingClassifier.
-   Then best F1 score has RandomForestClassifier then HistGradientBoostingClassifier and last GradientBoostingClassifier.
-   For Roc_Auc score, RandomForestClassifier is the best model, then GradientBoostingClassifier and last HistGradientBoostingClassifier.
-   For overfitting, no model is overfitting greatly, but HistGradientBoostingClassifier still has biggest overfit out of three models.
-   Because of this we are going to use GradientBoostingClassifier over HistGradientBoostingClassifier.


### Voting & Stacking Classifier


In [None]:
# Three base learners, each configured from its own parameter mapping
rf = RandomForestClassifier(**rf_best_params)
lgr = LogisticRegression(**lgr_best_params)
gbg = GradientBoostingClassifier(**gbg_best_params)

# Soft-voting ensemble over the three base learners
voting_clf = VotingClassifier(
    estimators=[("rf", rf), ("lgr", lgr), ("gbg", gbg)],
    voting="soft",
    n_jobs=-1,
)

# Stacking ensemble: same base learners, logistic regression as final estimator
stacking_clf = StackingClassifier(
    estimators=[("rf", rf), ("lgr", lgr), ("gbg", gbg)],
    final_estimator=LogisticRegression(random_state=42),
    cv=5,
    n_jobs=-1,
)

# Label -> estimator, in the order the score columns should appear
models = dict(
    [
        ("Random Forest", rf),
        ("Logistic Regression", lgr),
        ("Gradient Boosting", gbg),
        ("Voting Ensemble", voting_clf),
        ("Stacking Ensemble", stacking_clf),
    ]
)

# Score every model with 5-fold CV and append its table as new columns
df = pd.DataFrame()
for name, model in models.items():
    print(f"\nEvaluating {name}")
    df_tmp = get_scores_cv(model, name, X_train, y_train, cv=5)
    df = pd.concat((df, df_tmp), axis="columns")
df

-   Stacking Classifier performs better than Voting Classifier.
-   Since Random Forest Classifier was best model until this point, it will be our primary model to compare against.


In [None]:
# Cross-validated scores of the random forest
df_rf = get_scores_cv(rf_classifier, "RandomForestClassifier", X_train, y_train, cv=5)
# Cross-validated scores of the stacking ensemble (kept in df_voting)
df_voting = get_scores_cv(stacking_clf, "Stacking Classifier", X_train, y_train, cv=5)

# Both score tables side by side
df = pd.concat((df_rf, df_voting), axis="columns")
df

-   Random Forest Classifier is still the best model across all metrics.
-   The only thing the Stacking Classifier excels at is lower overfitting, but the difference is very small and therefore negligible.


#### Looking at second VotingClassifier


In [None]:
# Three base learners, each configured from its own parameter mapping
rf = RandomForestClassifier(**rf_best_params)
lgr = LogisticRegression(**lgr_best_params)
gbg = GradientBoostingClassifier(**gbg_best_params)

# Soft-voting ensemble over all three base learners
voting_clf = VotingClassifier(
    estimators=[("rf", rf), ("lgr", lgr), ("gbg", gbg)],
    voting="soft",
    n_jobs=-1,
)

# Second soft-voting ensemble without logistic regression
voting_clf_2 = VotingClassifier(
    estimators=[("rf", rf), ("gbg", gbg)],
    voting="soft",
    n_jobs=-1,
)

# Label -> estimator, in the order the score columns should appear
models = dict(
    [
        ("Random Forest", rf),
        ("Voting Ensemble", voting_clf),
        ("Voting Ensemble 2", voting_clf_2),
    ]
)

# Score every model with 5-fold CV and append its table as new columns
df = pd.DataFrame()
for name, model in models.items():
    print(f"\nEvaluating {name}")
    df_tmp = get_scores_cv(model, name, X_train, y_train, cv=5)
    df = pd.concat((df, df_tmp), axis="columns")
df

-   Removing Logistic Regression from the Voting Classifier gives slightly better results in the roc_auc metric.
-   Random Forest Classifier is still the best model across all metrics.


## C


Využite krížovú validáciu (cross validation) na trénovacej množine.


-   We were already using cross validation in previous steps.


## D


Dokážte že Váš nastavený najlepší model je bez overfitingu.


In [None]:
# Fit a fresh random forest on the training split (fit returns the estimator)
rf_classifier = RandomForestClassifier(**rf_best_params).fit(X_train, y_train)

# Train and test scores, displayed as the cell output
df = get_scores(rf_classifier, "RandomForestClassifier", X_train, y_train, X_test, y_test)
df

-   Since train metrics and test metrics are close, the model is not overfitting.


In [None]:
# Fit a fresh random forest on the training split (fit returns the estimator)
rf_classifier = RandomForestClassifier(**rf_best_params).fit(X_train, y_train)

# ROC points and area for the training split (second probability column)
fpr_train, tpr_train, _ = roc_curve(y_train, rf_classifier.predict_proba(X_train)[:, 1])
roc_auc_train = auc(fpr_train, tpr_train)

# ROC points and area for the test split
fpr_test, tpr_test, _ = roc_curve(y_test, rf_classifier.predict_proba(X_test)[:, 1])
roc_auc_test = auc(fpr_test, tpr_test)

# Both curves on one figure, with the diagonal as a reference line
plt.figure()
plt.plot(
    fpr_train,
    tpr_train,
    label=f"Train ROC curve (area = {roc_auc_train:.5f})",
    color="blue",
    lw=2,
)
plt.plot(
    fpr_test,
    tpr_test,
    label=f"Test ROC curve (area = {roc_auc_test:.5f})",
    color="red",
    lw=2,
)
plt.plot([0, 1], [0, 1], linestyle="--", color="gray", lw=2)
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")
plt.show()

-   We can also see stable ROC curve for train and test data, so we can conclude that model is not overfitting.


# 3.4


Vyhodnotenie vplyvu zvolenej stratégie riešenia na klasifikáciu.


## A


Stratégie riešenia chýbajúcich hodnôt a outlierov.


### Definitions


In [None]:
def load_data_raw():
    """Load the raw tables, merge connections with processes and split them.

    Reads the four tab-separated CSV files from ``../data/raw``, drops
    duplicate rows in each, merges ``connections`` with ``processes`` on
    ``imei``, ``ts`` and ``mwra`` (pandas' default inner join), parses ``ts``
    as datetime and makes an 80/20 split with a fixed seed.

    Returns:
        tuple: ``(train_data, test_data)`` DataFrames with reset indexes.
    """
    raw_dir: str = "../data/raw"
    table_names: tuple[str, ...] = ("connections", "devices", "processes", "profiles")

    # One de-duplicated DataFrame per raw file, keyed by table name
    tables: dict[str, pd.DataFrame] = {
        name: pd.read_csv(f"{raw_dir}/{name}.csv", sep="\t").drop_duplicates() for name in table_names
    }

    merged = tables["connections"].merge(tables["processes"], on=["imei", "ts", "mwra"])
    merged["ts"] = pd.to_datetime(merged["ts"])

    train_part, test_part = train_test_split(merged, test_size=0.2, random_state=42)
    return train_part.reset_index(drop=True), test_part.reset_index(drop=True)

In [None]:
# Fresh train/test split
train_data, test_data = load_data_raw()

# Feature columns: everything except the target, timestamp and device id
all_columns = train_data.columns.drop(["mwra", "ts", "imei"])

# Columns routed to the quantile-transformer branch of the preprocessor
non_gaussian_columns = [
    "c.android.vending",
    "c.UCMobile.x86",
    "c.updateassist",
    "c.UCMobile.intl",
    "p.android.vending",
    "p.dogalize",
    "p.olauncher",
    "p.simulator",
    "p.inputmethod.latin",
    "p.android.gms",
    "p.notifier",
    "p.katana",
    "p.gms.persistent",
]

# Remaining feature columns, in their original order
gaussian_columns = all_columns[~all_columns.isin(non_gaussian_columns)]

# Feature names in the order gaussian columns first, then non-gaussian ones
transformed_feature_order = pd.Series([*gaussian_columns, *non_gaussian_columns])

In [None]:
def remove_outliers_iqr(data, columns):
    """Keep only rows whose values lie inside the 1.5*IQR fences.

    Args:
        data: DataFrame to filter.
        columns: Labels of the columns to check.

    Returns:
        The rows of ``data`` for which every checked column is within
        ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``; the original index is kept.
    """
    checked = data[columns]
    q_low, q_high = checked.quantile(0.25), checked.quantile(0.75)
    whisker = 1.5 * (q_high - q_low)

    # A row survives only if each checked value is inside both fences
    inside = checked.ge(q_low - whisker) & checked.le(q_high + whisker)
    return data[inside.all(axis=1)]

In [None]:
# Branch for gaussian_columns: standard scaling, then Yeo-Johnson transform
general_pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("power_transformer", PowerTransformer(method="yeo-johnson")),
    ]
)

# Branch for non_gaussian_columns: quantile transform to a normal distribution
vending_pipeline = Pipeline(
    steps=[
        (
            "quantile_transformer",
            QuantileTransformer(output_distribution="normal", random_state=42),
        ),
    ]
)

# Route each column group to its branch; any other column passes through
preprocessor = ColumnTransformer(
    transformers=[
        ("general", general_pipe, gaussian_columns),
        ("vending", vending_pipeline, non_gaussian_columns),
    ],
    remainder="passthrough",
)

# Preprocessing followed by selection of the 10 best features (ANOVA F-test)
complete_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("selector", SelectKBest(f_classif, k=10)),
    ]
)

In [None]:
# Empty summary table: one column per cleaning method, filled in cell by cell
number_of_outliers = pd.DataFrame(
    columns=["None-iterative IQR", "Iterative IQR", "None-iterative Z-score", "Iterative Z-score"],
    index=["Number of Outliers", "Percentage of Outliers"],
)

### Cleaning methods


#### None-iterative IQR


##### Cleaning


In [None]:
# Fresh train/test split
train_data, test_data = load_data_raw()

# Row count before cleaning, used for the removed-rows statistics
number_of_rows_before_outliers = len(train_data)

# One IQR pass over every column from the fourth onwards, then a clean index
train_data = remove_outliers_iqr(train_data, train_data.columns[3:]).reset_index(drop=True)

# Report how many rows were dropped
print(f"Number of outliers removed: {number_of_rows_before_outliers - len(train_data)}")
print(
    f"Percentage of rows removed: {((number_of_rows_before_outliers - len(train_data)) / number_of_rows_before_outliers) * 100:.2f}%"
)

# Record the same figures in the summary table
number_of_outliers.loc["Number of Outliers", "None-iterative IQR"] = number_of_rows_before_outliers - len(
    train_data
)
number_of_outliers.loc["Percentage of Outliers", "None-iterative IQR"] = round(
    ((number_of_rows_before_outliers - len(train_data)) / number_of_rows_before_outliers) * 100, 2
)

##### Use pipelines


In [None]:
# Fit the pipeline on the training features, then apply it to the test features
train_data_processed = complete_pipeline.fit_transform(train_data.loc[:, all_columns], train_data["mwra"])
test_data_processed = complete_pipeline.transform(test_data.loc[:, all_columns])

# Names of the features kept by the selector
feature_mask = complete_pipeline.named_steps["selector"].get_support()
selected_features = transformed_feature_order[feature_mask]  # order of features is preserved

# Training frame: selected features plus the target column
train_data_processed = pd.DataFrame(data=train_data_processed, columns=selected_features)
train_data_processed["mwra"] = train_data["mwra"]

# Test frame: selected features plus the target column
test_data_processed = pd.DataFrame(data=test_data_processed, columns=selected_features)
test_data_processed["mwra"] = test_data["mwra"]

##### Export cleaned data


In [None]:
# Make sure the output folder exists, then write both splits without the index
os.makedirs(name="../data/clean_methods", exist_ok=True)

train_data_processed.to_csv(path_or_buf="../data/clean_methods/train_1iqr.csv", index=False)
test_data_processed.to_csv(path_or_buf="../data/clean_methods/test_1iqr.csv", index=False)

#### Iterative IQR


##### Cleaning


In [None]:
# Load data
train_data, test_data = load_data_raw()

# Get number of rows before removing outliers
number_of_rows_before_outliers = train_data.shape[0]

# Columns checked in the repeated passes (everything except p.android.vending)
columns_for_iqr = train_data.iloc[:, 3:].columns.difference(["p.android.vending"])

# First pass: remove outliers using IQR method (in all columns)
train_data = remove_outliers_iqr(train_data, train_data.iloc[:, 3:].columns)

# Number of rows the next pass would drop. This reuses remove_outliers_iqr
# instead of repeating its bound arithmetic, so the count can never drift
# from what the removal step actually does.
outliers_count = train_data.shape[0] - remove_outliers_iqr(train_data, columns_for_iqr).shape[0]

# Define maximum number of iterations
max_iterations = 10
iteration = 0

# Remove outliers iteratively: removing rows shifts the quartiles, so new
# outliers can appear after each pass
while outliers_count > 0:
    # Remove outliers using IQR method (in all columns except p.android.vending)
    train_data = remove_outliers_iqr(train_data, columns_for_iqr)

    # Number of rows the next pass would drop
    outliers_count = train_data.shape[0] - remove_outliers_iqr(train_data, columns_for_iqr).shape[0]

    # Increment iteration and stop if maximum number of iterations reached
    iteration += 1
    if iteration >= max_iterations:
        break

# Reset index
train_data = train_data.reset_index(drop=True)

# Print number of outliers removed
print(f"Number of outliers removed: {number_of_rows_before_outliers - train_data.shape[0]}")
print(
    f"Percentage of outliers removed: {((number_of_rows_before_outliers - train_data.shape[0]) / number_of_rows_before_outliers) * 100:.2f}%"
)

# Save number of outliers removed
number_of_outliers.loc["Number of Outliers", "Iterative IQR"] = (
    number_of_rows_before_outliers - train_data.shape[0]
)
number_of_outliers.loc["Percentage of Outliers", "Iterative IQR"] = round(
    ((number_of_rows_before_outliers - train_data.shape[0]) / number_of_rows_before_outliers) * 100, 2
)

##### Use pipelines


In [None]:
# Fit the pipeline on the training features, then apply it to the test features
train_data_processed = complete_pipeline.fit_transform(train_data.loc[:, all_columns], train_data["mwra"])
test_data_processed = complete_pipeline.transform(test_data.loc[:, all_columns])

# Names of the features kept by the selector
feature_mask = complete_pipeline.named_steps["selector"].get_support()
selected_features = transformed_feature_order[feature_mask]  # order of features is preserved

# Training frame: selected features plus the target column
train_data_processed = pd.DataFrame(data=train_data_processed, columns=selected_features)
train_data_processed["mwra"] = train_data["mwra"]

# Test frame: selected features plus the target column
test_data_processed = pd.DataFrame(data=test_data_processed, columns=selected_features)
test_data_processed["mwra"] = test_data["mwra"]

##### Export processed data


In [None]:
# Make sure the output folder exists, then write both splits without the index
os.makedirs(name="../data/clean_methods", exist_ok=True)

train_data_processed.to_csv(path_or_buf="../data/clean_methods/train_itiqr.csv", index=False)
test_data_processed.to_csv(path_or_buf="../data/clean_methods/test_itiqr.csv", index=False)

#### None-iterative Z-score


##### Cleaning


In [None]:
# Fresh train/test split
train_data, test_data = load_data_raw()

# Row count before cleaning, used for the removed-rows statistics
number_of_rows_before_outliers = len(train_data)

# Keep rows whose |z-score| is below 3 in every column from the fourth onwards
train_data = train_data.loc[(np.abs(zscore(train_data.iloc[:, 3:])) < 3).all(axis=1)].reset_index(drop=True)

# Report how many rows were dropped
print(f"Number of outliers removed: {number_of_rows_before_outliers - len(train_data)}")
print(
    f"Percentage of outliers removed: {((number_of_rows_before_outliers - len(train_data)) / number_of_rows_before_outliers) * 100:.2f}%"
)

# Record the same figures in the summary table
number_of_outliers.loc["Number of Outliers", "None-iterative Z-score"] = number_of_rows_before_outliers - len(
    train_data
)
number_of_outliers.loc["Percentage of Outliers", "None-iterative Z-score"] = round(
    ((number_of_rows_before_outliers - len(train_data)) / number_of_rows_before_outliers) * 100, 2
)

##### Use pipelines


In [None]:
# Fit the pipeline on the training features, then apply it to the test features
train_data_processed = complete_pipeline.fit_transform(train_data.loc[:, all_columns], train_data["mwra"])
test_data_processed = complete_pipeline.transform(test_data.loc[:, all_columns])

# Names of the features kept by the selector
feature_mask = complete_pipeline.named_steps["selector"].get_support()
selected_features = transformed_feature_order[feature_mask]  # order of features is preserved

# Training frame: selected features plus the target column
train_data_processed = pd.DataFrame(data=train_data_processed, columns=selected_features)
train_data_processed["mwra"] = train_data["mwra"]

# Test frame: selected features plus the target column
test_data_processed = pd.DataFrame(data=test_data_processed, columns=selected_features)
test_data_processed["mwra"] = test_data["mwra"]

##### Export processed data


In [None]:
# Make sure the output folder exists, then write both splits without the index
os.makedirs(name="../data/clean_methods", exist_ok=True)

train_data_processed.to_csv(path_or_buf="../data/clean_methods/train_1zscore.csv", index=False)
test_data_processed.to_csv(path_or_buf="../data/clean_methods/test_1zscore.csv", index=False)

#### Iterative Z-score


##### Cleaning


In [None]:
# Load data
train_data, test_data = load_data_raw()

# Get number of rows before removing outliers. Set here explicitly so the
# statistics below do not depend on a value left behind by an earlier cell.
number_of_rows_before_outliers = train_data.shape[0]

# 1 iteration of cleaning whole dataset of outliers (including p.android.vending)
train_data = train_data[(np.abs(zscore(train_data.iloc[:, 3:])) < 3).all(axis=1)]

# Using all columns except p.android.vending for outlier detection
columns_for_zscore = train_data.iloc[:, 3:].columns.difference(["p.android.vending"])

# Number of rows the next pass would drop
outliers_count = (~(np.abs(zscore(train_data[columns_for_zscore])) < 3).all(axis=1)).sum()

# Define maximum number of iterations
max_iterations = 10
iteration = 0

# Remove outliers iteratively: z-scores are recomputed on the remaining rows
# after each pass, so new outliers can appear
while outliers_count > 0:
    train_data = train_data[(np.abs(zscore(train_data[columns_for_zscore])) < 3).all(axis=1)]
    outliers_count = (~(np.abs(zscore(train_data[columns_for_zscore])) < 3).all(axis=1)).sum()

    # Increment iteration and stop if maximum number of iterations reached
    iteration += 1
    if iteration >= max_iterations:
        break

# Reset index
train_data = train_data.reset_index(drop=True)

# Print number of outliers removed
print(f"Number of outliers removed: {number_of_rows_before_outliers - train_data.shape[0]}")
print(
    f"Percentage of outliers removed: {((number_of_rows_before_outliers - train_data.shape[0]) / number_of_rows_before_outliers) * 100:.2f}%"
)

# Save number of outliers removed
number_of_outliers.loc["Number of Outliers", "Iterative Z-score"] = (
    number_of_rows_before_outliers - train_data.shape[0]
)
number_of_outliers.loc["Percentage of Outliers", "Iterative Z-score"] = round(
    ((number_of_rows_before_outliers - train_data.shape[0]) / number_of_rows_before_outliers) * 100, 2
)

##### Use pipelines


In [None]:
# Fit and transform training data, transform test data
# (the pipeline is fitted on the training split only and merely applied to the test split)
train_data_processed = complete_pipeline.fit_transform(train_data[all_columns], train_data["mwra"])
test_data_processed = complete_pipeline.transform(test_data[all_columns])

# Get selected features immediately after fitting
# get_support() is the boolean mask of the columns kept by the "selector" step
feature_mask = complete_pipeline.named_steps["selector"].get_support()
selected_features = transformed_feature_order[feature_mask]  # order of features is preserved

# Create DataFrames with selected feature names
train_data_processed = pd.DataFrame(train_data_processed, columns=selected_features)
test_data_processed = pd.DataFrame(test_data_processed, columns=selected_features)

# Re-attach the target column; pandas aligns this assignment on the row index,
# so both source frames are expected to carry a 0..n-1 index
train_data_processed["mwra"] = train_data["mwra"]
test_data_processed["mwra"] = test_data["mwra"]

##### Export processed data


In [None]:
# Make sure the output directory exists (exist_ok=True: no error if it is already there)
os.makedirs("../data/clean_methods", exist_ok=True)

# Persist the processed train/test frames of the "itzscore" variant;
# index=False keeps the row index out of the CSV files
train_data_processed.to_csv("../data/clean_methods/train_itzscore.csv", index=False)
test_data_processed.to_csv("../data/clean_methods/test_itzscore.csv", index=False)

### Show comparison


In [None]:
# Display the table of removed-outlier counts/percentages filled in by the cleaning cells
number_of_outliers

### Using on Models


#### Not Deleting outliers


In [None]:
# Load data (no outlier removal is applied in this variant)
train_data, test_data = load_data_raw()

# Fit and transform training data, transform test data
# (the pipeline is fitted on the training split only and merely applied to the test split)
train_data_processed = complete_pipeline.fit_transform(train_data[all_columns], train_data["mwra"])
test_data_processed = complete_pipeline.transform(test_data[all_columns])

# Get selected features immediately after fitting
# get_support() is the boolean mask of the columns kept by the "selector" step
feature_mask = complete_pipeline.named_steps["selector"].get_support()
selected_features = transformed_feature_order[feature_mask]  # order of features is preserved

# Create DataFrames with selected feature names
train_data_processed = pd.DataFrame(train_data_processed, columns=selected_features)
test_data_processed = pd.DataFrame(test_data_processed, columns=selected_features)

# Re-attach the target column; pandas aligns this assignment on the row index
# NOTE(review): assumes load_data_raw() returns frames with a 0..n-1 index - confirm
train_data_processed["mwra"] = train_data["mwra"]
test_data_processed["mwra"] = test_data["mwra"]

# Split features from the target for training ...
X_train = train_data_processed.drop(columns=["mwra"])
y_train = train_data_processed["mwra"]

# ... and for the held-out test data
X_test = test_data_processed.drop(columns=["mwra"])
y_test = test_data_processed["mwra"]

In [None]:
# Fresh Random Forest configured with the previously chosen hyperparameters
rf_classifier = RandomForestClassifier(**rf_best_params)

# Scores under the label "None": first via get_scores_cv with cv=5 on the training data,
# then via get_scores after fitting on the full training set
# (both helpers are defined elsewhere in the notebook)
df_none_cv = get_scores_cv(rf_classifier, "None", X_train, y_train, cv=5)
rf_classifier.fit(X_train, y_train)
df_none_scores = get_scores(rf_classifier, "None", X_train, y_train, X_test, y_test)

#### Non-iterative IQR


In [None]:
# Load the processed data exported for the "1iqr" cleaning variant
train_data_processed = pd.read_csv("../data/clean_methods/train_1iqr.csv")
test_data_processed = pd.read_csv("../data/clean_methods/test_1iqr.csv")

# Training features (target column removed) and training target
X_train = train_data_processed.drop(columns=["mwra"])
y_train = train_data_processed["mwra"]

# Test features (target column removed) and test target
X_test = test_data_processed.drop(columns=["mwra"])
y_test = test_data_processed["mwra"]

In [None]:
# Initialize the RandomForestClassifier with the previously chosen hyperparameters
rf_classifier = RandomForestClassifier(**rf_best_params)

# Show the scores under the label "1IQR": get_scores_cv with cv=5 on the training data,
# then get_scores after fitting on the full training set
df_1iqr_cv = get_scores_cv(rf_classifier, "1IQR", X_train, y_train, cv=5)
rf_classifier.fit(X_train, y_train)
df_1iqr_scores = get_scores(rf_classifier, "1IQR", X_train, y_train, X_test, y_test)

#### Iterative IQR


In [None]:
# Load the processed data exported for the "itiqr" cleaning variant
train_data_processed = pd.read_csv("../data/clean_methods/train_itiqr.csv")
test_data_processed = pd.read_csv("../data/clean_methods/test_itiqr.csv")

# Training features (target column removed) and training target
X_train = train_data_processed.drop(columns=["mwra"])
y_train = train_data_processed["mwra"]

# Test features (target column removed) and test target
X_test = test_data_processed.drop(columns=["mwra"])
y_test = test_data_processed["mwra"]

In [None]:
# Initialize the RandomForestClassifier with the previously chosen hyperparameters
rf_classifier = RandomForestClassifier(**rf_best_params)

# Show the scores under the label "1ITIQR": get_scores_cv with cv=5 on the training data,
# then get_scores after fitting on the full training set
df_itiqr_cv = get_scores_cv(rf_classifier, "1ITIQR", X_train, y_train, cv=5)
rf_classifier.fit(X_train, y_train)
df_itiqr_scores = get_scores(rf_classifier, "1ITIQR", X_train, y_train, X_test, y_test)

#### Non-iterative Z-score


In [None]:
# Load the processed data exported for the "1zscore" cleaning variant
train_data_processed = pd.read_csv("../data/clean_methods/train_1zscore.csv")
test_data_processed = pd.read_csv("../data/clean_methods/test_1zscore.csv")

# Training features (target column removed) and training target
X_train = train_data_processed.drop(columns=["mwra"])
y_train = train_data_processed["mwra"]

# Test features (target column removed) and test target
X_test = test_data_processed.drop(columns=["mwra"])
y_test = test_data_processed["mwra"]

In [None]:
# Initialize the RandomForestClassifier with the previously chosen hyperparameters
rf_classifier = RandomForestClassifier(**rf_best_params)

# Show the scores under the label "1ZSCORE": get_scores_cv with cv=5 on the training data,
# then get_scores after fitting on the full training set
df_1zscore_cv = get_scores_cv(rf_classifier, "1ZSCORE", X_train, y_train, cv=5)
rf_classifier.fit(X_train, y_train)
df_1zscore_scores = get_scores(rf_classifier, "1ZSCORE", X_train, y_train, X_test, y_test)

#### Iterative Z-score


In [None]:
# Load the processed data exported for the "itzscore" cleaning variant
train_data_processed = pd.read_csv("../data/clean_methods/train_itzscore.csv")
test_data_processed = pd.read_csv("../data/clean_methods/test_itzscore.csv")

# Training features (target column removed) and training target
X_train = train_data_processed.drop(columns=["mwra"])
y_train = train_data_processed["mwra"]

# Test features (target column removed) and test target
X_test = test_data_processed.drop(columns=["mwra"])
y_test = test_data_processed["mwra"]

In [None]:
# Initialize the RandomForestClassifier with the previously chosen hyperparameters
rf_classifier = RandomForestClassifier(**rf_best_params)

# Show the scores under the label "1ITZSCORE": get_scores_cv with cv=5 on the training data,
# then get_scores after fitting on the full training set
df_itzscore_cv = get_scores_cv(rf_classifier, "1ITZSCORE", X_train, y_train, cv=5)
rf_classifier.fit(X_train, y_train)
df_itzscore_scores = get_scores(rf_classifier, "1ITZSCORE", X_train, y_train, X_test, y_test)

### Show comparison


In [None]:
# Put the per-method result frames side by side (axis=1: one group of columns per method):
# df_cv holds the cross-validation results, df_scores the train/test split results
df_cv = pd.concat([df_none_cv, df_1iqr_cv, df_itiqr_cv, df_1zscore_cv, df_itzscore_cv], axis=1)
df_scores = pd.concat(
    [df_none_scores, df_1iqr_scores, df_itiqr_scores, df_1zscore_scores, df_itzscore_scores], axis=1
)

Using cross validation


In [None]:
# Display the cross-validation comparison of all outlier-removal variants
df_cv

-   Best accuracy: 1ITIQR
-   Best F1: 1ITIQR
-   Best roc_auc: 1IQR


Using Train/Test split


In [None]:
# Display the train/test split comparison of all outlier-removal variants
df_scores

-   Best accuracy: 1ITZSCORE
-   Best F1: 1ITZSCORE
-   Best roc_auc: 1ITZSCORE

-   We can see that cross-validation prefers lower outlier removal; this may be because there is not enough data for the model to reach optimal learning.
-   When the model is trained on the whole training set and evaluated on the test set, it prefers higher outlier removal, which is what we expected.
-   Using cross-validation for model comparison is best practice. However, 1ITIQR removes 24.57% of the data, which is a substantial amount, whereas 1ITZSCORE removes only 4.77% and targets only very extreme outliers. We are therefore going to use 1ITZSCORE for further steps: it had the best performance in the Train/Test split and it also has a lower std in cross-validation, indicating that it is more stable.


## B


Dátová transformácia (scaling, transformer, ...).


### Definitions


Using our old clean data as we used iterative Z-score for cleaning.


In [None]:
def load_data_clean():
    """Read the cleaned train/test splits stored under ``../data/clean``.

    Returns:
        tuple: ``(train_data, test_data)`` - the DataFrames read from
        ``train_data.csv`` and ``test_data.csv`` (in that order).
    """
    clean_dir = "../data/clean"

    # Cleaned data (Iterative Z-score); the train file is read first, then the test file
    train_csv, test_csv = (f"{clean_dir}/{split}_data.csv" for split in ("train", "test"))
    return pd.read_csv(train_csv), pd.read_csv(test_csv)

In [None]:
# Define columns
train_data, test_data = load_data_clean()
# Every column except the target ("mwra") and the identifiers ("ts", "imei") is a feature
all_columns = train_data.drop(columns=["mwra", "ts", "imei"]).columns

# Columns routed through the scaler + PowerTransformer branch of the preprocessors
# (presumably the roughly bell-shaped features, judging by the name)
gaussian_columns = [
    "c.dogalize",
    "c.android.gm",
    "c.android.youtube",
    "c.android.chrome",
    "c.katana",
    "c.raider",
    "p.android.packageinstaller",
    "p.android.settings",
    "p.android.documentsui",
    "p.android.chrome",
    "p.android.gm",
    "p.system",
    "p.android.externalstorage",
    "p.process.gapps",
    "p.google",
    "p.browser.provider",
    "p.android.defcontainer",
]

# Handled together with uniform_columns by the QuantileTransformer branch
log_columns = ["c.android.vending"]

# Columns routed through the QuantileTransformer branch of the preprocessors
uniform_columns = [
    "c.UCMobile.x86",
    "c.updateassist",
    "c.UCMobile.intl",
    "p.android.vending",
    "p.dogalize",
    "p.olauncher",
    "p.simulator",
    "p.inputmethod.latin",
    "p.android.gms",
    "p.notifier",
    "p.katana",
    "p.gms.persistent",
]


# Column order of the preprocessors' output: the ColumnTransformers list the gaussian
# branch first and the log + uniform branch second, so names are concatenated the same way.
# Kept as a Series so it can be indexed with a boolean feature mask.
transformed_feature_order = pd.Series(gaussian_columns + log_columns + uniform_columns)

### Define different pipelines


#### Defining different scalers


In [None]:
def _scaled_power_pipeline(scaler):
    """Chain *scaler* with a Yeo-Johnson PowerTransformer (steps "scaler", "power_transformer")."""
    return Pipeline(
        steps=[
            ("scaler", scaler),
            ("power_transformer", PowerTransformer(method="yeo-johnson")),
        ]
    )


def _column_preprocessor(branch_name, gaussian_branch):
    """Send gaussian columns through *gaussian_branch*, log + uniform columns through the quantile branch."""
    return ColumnTransformer(
        transformers=[
            (branch_name, gaussian_branch, gaussian_columns),
            ("other", quantile_pipeline, log_columns + uniform_columns),
        ],
        remainder="passthrough",
    )


def _with_top10_selector(column_transformer):
    """Append a SelectKBest(f_classif, k=10) "selector" step to a "preprocessor" step."""
    return Pipeline(steps=[("preprocessor", column_transformer), ("selector", SelectKBest(f_classif, k=10))])


# One scaler-specific branch per candidate scaler; only the first step differs
min_max_pipeline = _scaled_power_pipeline(MinMaxScaler())
standard_pipeline = _scaled_power_pipeline(StandardScaler())
robust_pipeline = _scaled_power_pipeline(RobustScaler())

# Shared branch for the log + uniform columns
quantile_pipeline = Pipeline(
    steps=[("scaler", QuantileTransformer(output_distribution="normal", random_state=42))]
)

# Column transformers: gaussian columns get the scaler-specific branch
min_max_preprocessor = _column_preprocessor("min_max", min_max_pipeline)
standard_preprocessor = _column_preprocessor("standard", standard_pipeline)
robust_preprocessor = _column_preprocessor("robust", robust_pipeline)

# Complete pipelines: preprocessing followed by selection of the 10 best features
complete_min_max_pipeline = _with_top10_selector(min_max_preprocessor)
complete_standard_pipeline = _with_top10_selector(standard_preprocessor)
complete_robust_pipeline = _with_top10_selector(robust_preprocessor)

### Using on Models


In [None]:
# Load data
train_data, test_data = load_data_clean()

# Transform data (pipeline fitted on the training split only, then applied to the test split)
train_data_processed = complete_min_max_pipeline.fit_transform(train_data[all_columns], train_data["mwra"])
test_data_processed = complete_min_max_pipeline.transform(test_data[all_columns])

# Boolean mask of the columns kept by the "selector" step, mapped back to feature names
feature_mask = complete_min_max_pipeline.named_steps["selector"].get_support()
selected_features = transformed_feature_order[feature_mask]  # order of features is preserved

train_data_processed = pd.DataFrame(train_data_processed, columns=selected_features)
test_data_processed = pd.DataFrame(test_data_processed, columns=selected_features)

# Re-attach the target column; pandas aligns this assignment on the row index
train_data_processed["mwra"] = train_data["mwra"]
test_data_processed["mwra"] = test_data["mwra"]

# Use on model
# NOTE(review): scaler and selector were fitted on the whole training set above, so the
# cv=5 folds scored here are not independent of the preprocessing - confirm this is intended
rf_classifier = RandomForestClassifier(**rf_best_params)
df_min_max_cv = get_scores_cv(
    rf_classifier, "MinMax", train_data_processed.drop(columns=["mwra"]), train_data_processed["mwra"], cv=5
)

In [None]:
# Load data
train_data, test_data = load_data_clean()

# Transform data (pipeline fitted on the training split only, then applied to the test split)
train_data_processed = complete_standard_pipeline.fit_transform(train_data[all_columns], train_data["mwra"])
test_data_processed = complete_standard_pipeline.transform(test_data[all_columns])

# Boolean mask of the columns kept by the "selector" step, mapped back to feature names
feature_mask = complete_standard_pipeline.named_steps["selector"].get_support()
selected_features = transformed_feature_order[feature_mask]  # order of features is preserved

train_data_processed = pd.DataFrame(train_data_processed, columns=selected_features)
test_data_processed = pd.DataFrame(test_data_processed, columns=selected_features)

# Re-attach the target column; pandas aligns this assignment on the row index
train_data_processed["mwra"] = train_data["mwra"]
test_data_processed["mwra"] = test_data["mwra"]

# Use on model: scores under the label "Standard" via get_scores_cv with cv=5
rf_classifier = RandomForestClassifier(**rf_best_params)
df_standard_cv = get_scores_cv(
    rf_classifier, "Standard", train_data_processed.drop(columns=["mwra"]), train_data_processed["mwra"], cv=5
)

In [None]:
# Load data
train_data, test_data = load_data_clean()

# Transform data (pipeline fitted on the training split only, then applied to the test split)
train_data_processed = complete_robust_pipeline.fit_transform(train_data[all_columns], train_data["mwra"])
test_data_processed = complete_robust_pipeline.transform(test_data[all_columns])

# Boolean mask of the columns kept by the "selector" step, mapped back to feature names
feature_mask = complete_robust_pipeline.named_steps["selector"].get_support()
selected_features = transformed_feature_order[feature_mask]  # order of features is preserved

train_data_processed = pd.DataFrame(train_data_processed, columns=selected_features)
test_data_processed = pd.DataFrame(test_data_processed, columns=selected_features)

# Re-attach the target column; pandas aligns this assignment on the row index
train_data_processed["mwra"] = train_data["mwra"]
test_data_processed["mwra"] = test_data["mwra"]

# Use on model
# Create the classifier here, as the MinMax and Standard cells do, so this cell does not
# depend on whichever estimator a previously executed cell left in `rf_classifier`
rf_classifier = RandomForestClassifier(**rf_best_params)
df_robust_cv = get_scores_cv(
    rf_classifier, "Robust", train_data_processed.drop(columns=["mwra"]), train_data_processed["mwra"], cv=5
)

### Comparison


In [None]:
# Put the three scaler results side by side (axis=1), drop the now redundant
# per-scaler frames from the namespace, and display the combined table
df = pd.concat([df_min_max_cv, df_standard_cv, df_robust_cv], axis=1)
del df_min_max_cv, df_standard_cv, df_robust_cv
df

-   All results are the same, so it looks like the choice of scaler doesn't matter.


### More unique pipelines


In [None]:
# Define new pipelines
# Each branch below is applied to the gaussian columns; the variants differ in the scaler
# and in whether a row-wise Normalizer sits between the scaler and the power transform.

standard_pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("power_transformer", PowerTransformer(method="yeo-johnson")),
    ]
)

standard_norm_pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("normalizer", Normalizer()),
        ("power_transformer", PowerTransformer(method="yeo-johnson")),
    ]
)


minmax_norm_pipeline = Pipeline(
    [
        ("minmax", MinMaxScaler()),
        ("normalizer", Normalizer()),
        ("power_transformer", PowerTransformer(method="yeo-johnson")),
    ]
)

maxabs_pipeline = Pipeline(
    [
        ("maxabs", MaxAbsScaler()),
        ("power_transformer", PowerTransformer(method="yeo-johnson")),
    ]
)

max_abs_norm_pipeline = Pipeline(
    [
        ("maxabs", MaxAbsScaler()),
        ("normalizer", Normalizer()),
        ("power_transformer", PowerTransformer(method="yeo-johnson")),
    ]
)

# Shared branch for the log + uniform columns
quantile_pipeline = Pipeline(
    [
        ("scaler", QuantileTransformer(output_distribution="normal", random_state=42)),
    ]
)

# Create column transformers
standard_preprocessor = ColumnTransformer(
    transformers=[
        ("standard", standard_pipeline, gaussian_columns),
        ("other", quantile_pipeline, log_columns + uniform_columns),
    ],
    remainder="passthrough",
)

standard_norm_preprocessor = ColumnTransformer(
    transformers=[
        ("standard_norm", standard_norm_pipeline, gaussian_columns),
        ("other", quantile_pipeline, log_columns + uniform_columns),
    ],
    remainder="passthrough",
)

minmax_norm_preprocessor = ColumnTransformer(
    transformers=[
        ("minmax_norm", minmax_norm_pipeline, gaussian_columns),
        ("other", quantile_pipeline, log_columns + uniform_columns),
    ],
    remainder="passthrough",
)

maxabs_preprocessor = ColumnTransformer(
    transformers=[
        ("maxabs", maxabs_pipeline, gaussian_columns),
        ("other", quantile_pipeline, log_columns + uniform_columns),
    ],
    remainder="passthrough",
)

max_abs_norm_preprocessor = ColumnTransformer(
    transformers=[
        ("max_abs_norm", max_abs_norm_pipeline, gaussian_columns),
        ("other", quantile_pipeline, log_columns + uniform_columns),
    ],
    remainder="passthrough",
)

# Create complete pipelines (preprocessing followed by selection of the 10 best features)
complete_standard_pipeline = Pipeline(
    [("preprocessor", standard_preprocessor), ("selector", SelectKBest(f_classif, k=10))]
)
complete_standard_norm_pipeline = Pipeline(
    [("preprocessor", standard_norm_preprocessor), ("selector", SelectKBest(f_classif, k=10))]
)
# Note: from here on this name refers to the MinMax + Normalizer variant
complete_min_max_pipeline = Pipeline(
    [("preprocessor", minmax_norm_preprocessor), ("selector", SelectKBest(f_classif, k=10))]
)

complete_maxabs_pipeline = Pipeline(
    [("preprocessor", maxabs_preprocessor), ("selector", SelectKBest(f_classif, k=10))]
)
complete_max_abs_norm_pipeline = Pipeline(
    [("preprocessor", max_abs_norm_preprocessor), ("selector", SelectKBest(f_classif, k=10))]
)

# Labels become the column names of the comparison table, so they must describe the
# pipeline they point to: the plain MaxAbs pipeline has no Normalizer step and is labelled
# "MaxAbs", while the variant with a Normalizer is "MaxAbs Normalized".
pipelines = {
    "Standard": complete_standard_pipeline,
    "Standard Normalized": complete_standard_norm_pipeline,
    "MinMax Normalized": complete_min_max_pipeline,
    "MaxAbs": complete_maxabs_pipeline,
    "MaxAbs Normalized": complete_max_abs_norm_pipeline,
}

### Using on Models and Comparison


In [None]:
# Load data
train_data, test_data = load_data_clean()

# Result table, grown by one group of columns per evaluated pipeline
df = pd.DataFrame()
for name, pipeline in pipelines.items():
    # Transform data (pipeline fitted on the training split only, then applied to the test split)
    train_data_processed = pipeline.fit_transform(train_data[all_columns], train_data["mwra"])
    test_data_processed = pipeline.transform(test_data[all_columns])

    # Boolean mask of the columns kept by the "selector" step, mapped back to feature names
    feature_mask = pipeline.named_steps["selector"].get_support()
    selected_features = transformed_feature_order[feature_mask]  # order of features is preserved

    train_data_processed = pd.DataFrame(train_data_processed, columns=selected_features)
    test_data_processed = pd.DataFrame(test_data_processed, columns=selected_features)

    # Re-attach the target column; pandas aligns this assignment on the row index
    train_data_processed["mwra"] = train_data["mwra"]
    test_data_processed["mwra"] = test_data["mwra"]

    # Use on model
    # Fresh classifier per pipeline (as in the feature-selection loop later on) instead of
    # relying on whichever estimator a previously executed cell left in `rf_classifier`
    rf_classifier = RandomForestClassifier(**rf_best_params)
    df_tmp = get_scores_cv(
        rf_classifier, name, train_data_processed.drop(columns=["mwra"]), train_data_processed["mwra"], cv=5
    )
    df = pd.concat([df, df_tmp], axis=1)
df

-   MinMax Normalized is best pipeline in all metrics except roc_auc.
-   Standard is best pipeline in roc_auc metric.


#### Comparing Standard and MinMax Normalized


In [None]:
# Rebuild only the two finalists: Standard and MinMax + Normalizer.

# Shared branch for the log + uniform columns
quantile_pipeline = Pipeline(
    steps=[("scaler", QuantileTransformer(output_distribution="normal", random_state=42))]
)

# Branches applied to the gaussian columns
standard_pipeline = Pipeline(
    steps=[("scaler", StandardScaler()), ("power_transformer", PowerTransformer(method="yeo-johnson"))]
)
minmax_norm_pipeline = Pipeline(
    steps=[
        ("minmax", MinMaxScaler()),
        ("normalizer", Normalizer()),
        ("power_transformer", PowerTransformer(method="yeo-johnson")),
    ]
)

# Column transformers; columns not listed in either branch are passed through unchanged
standard_preprocessor = ColumnTransformer(
    remainder="passthrough",
    transformers=[
        ("standard", standard_pipeline, gaussian_columns),
        ("other", quantile_pipeline, log_columns + uniform_columns),
    ],
)
minmax_norm_preprocessor = ColumnTransformer(
    remainder="passthrough",
    transformers=[
        ("minmax_norm", minmax_norm_pipeline, gaussian_columns),
        ("other", quantile_pipeline, log_columns + uniform_columns),
    ],
)

# Complete pipelines: preprocessing followed by selection of the 10 best features
complete_standard_pipeline = Pipeline(
    steps=[("preprocessor", standard_preprocessor), ("selector", SelectKBest(f_classif, k=10))]
)
complete_min_max_pipeline = Pipeline(
    steps=[("preprocessor", minmax_norm_preprocessor), ("selector", SelectKBest(f_classif, k=10))]
)

In [None]:
# Load data
train_data, test_data = load_data_clean()

# Transform data (pipeline fitted on the training split only, then applied to the test split)
train_data_processed = complete_standard_pipeline.fit_transform(train_data[all_columns], train_data["mwra"])
test_data_processed = complete_standard_pipeline.transform(test_data[all_columns])

# Boolean mask of the columns kept by the "selector" step, mapped back to feature names
feature_mask = complete_standard_pipeline.named_steps["selector"].get_support()
selected_features = transformed_feature_order[feature_mask]  # order of features is preserved

train_data_processed = pd.DataFrame(train_data_processed, columns=selected_features)
test_data_processed = pd.DataFrame(test_data_processed, columns=selected_features)

# Re-attach the target column; pandas aligns this assignment on the row index
train_data_processed["mwra"] = train_data["mwra"]
test_data_processed["mwra"] = test_data["mwra"]

# One 50-bin histogram per column (selected features and "mwra") on a 10x3 subplot grid
train_data_processed.plot(kind="hist", bins=50, figsize=(20, 20), subplots=True, layout=(10, 3))
plt.tight_layout()
plt.show()

In [None]:
# Load data
train_data, test_data = load_data_clean()

# Transform data (pipeline fitted on the training split only, then applied to the test split)
train_data_processed = complete_min_max_pipeline.fit_transform(train_data[all_columns], train_data["mwra"])
test_data_processed = complete_min_max_pipeline.transform(test_data[all_columns])

# Boolean mask of the columns kept by the "selector" step, mapped back to feature names
feature_mask = complete_min_max_pipeline.named_steps["selector"].get_support()
selected_features = transformed_feature_order[feature_mask]  # order of features is preserved

train_data_processed = pd.DataFrame(train_data_processed, columns=selected_features)
test_data_processed = pd.DataFrame(test_data_processed, columns=selected_features)

# Re-attach the target column; pandas aligns this assignment on the row index
train_data_processed["mwra"] = train_data["mwra"]
test_data_processed["mwra"] = test_data["mwra"]

# One 50-bin histogram per column (selected features and "mwra") on a 10x3 subplot grid
train_data_processed.plot(kind="hist", bins=50, figsize=(20, 20), subplots=True, layout=(10, 3))
plt.tight_layout()
plt.show()

-   The data looks fine in both cases, except for p.android.documentsui, which looks better with the Standard Scaler.
-   Our primary metric could be accuracy, but since our data is moderately imbalanced, roc_auc is also important.
-   Using a Normalizer followed by a PowerTransformer seems unusual; it adds complexity, and there is a risk of over-processing.
-   Therefore we are going to stick with our Standard Scaler pipeline from the 2nd Phase, since the metrics are not that different and it is simpler (it also has a higher roc_auc than complete_min_max_pipeline).


## C


Výber atribútov, výber algoritmov, hyperparameter tuning, ensemble learning.


### Attribute selection


In [None]:
# Branch for the log + uniform columns
quantile_pipeline = Pipeline(
    steps=[("scaler", QuantileTransformer(output_distribution="normal", random_state=42))]
)

# Branch for the gaussian columns
standard_pipeline = Pipeline(
    steps=[("scaler", StandardScaler()), ("power_transformer", PowerTransformer(method="yeo-johnson"))]
)

# Column transformer shared by every pipeline below
preprocessor = ColumnTransformer(
    remainder="passthrough",
    transformers=[
        ("standard", standard_pipeline, gaussian_columns),
        ("quantile", quantile_pipeline, log_columns + uniform_columns),
    ],
)

# Baseline without feature selection ...
complete_pipeline_nofs = Pipeline(steps=[("preprocessor", preprocessor)])

# ... and one pipeline per candidate number of features kept by SelectKBest
(
    complete_pipeline_fs_5,
    complete_pipeline_fs_7,
    complete_pipeline_fs_10,
    complete_pipeline_fs_15,
) = (
    Pipeline(steps=[("preprocessor", preprocessor), ("selector", SelectKBest(f_classif, k=top_k))])
    for top_k in (5, 7, 10, 15)
)

# Label -> pipeline; the labels become column names in the comparison table
pipelines = {
    "5 Features": complete_pipeline_fs_5,
    "7 Features": complete_pipeline_fs_7,
    "10 Features": complete_pipeline_fs_10,
    "15 Features": complete_pipeline_fs_15,
}

In [None]:
# Load data
train_data, test_data = load_data_clean()

# Result table, grown by one group of columns per evaluated configuration
df = pd.DataFrame()

# No feature selection
train_data_processed = complete_pipeline_nofs.fit_transform(train_data[all_columns], train_data["mwra"])
test_data_processed = complete_pipeline_nofs.transform(test_data[all_columns])

# Without a selector every transformed column is kept, so the full name list applies
train_data_processed = pd.DataFrame(train_data_processed, columns=transformed_feature_order)
test_data_processed = pd.DataFrame(test_data_processed, columns=transformed_feature_order)

# Re-attach the target column; pandas aligns this assignment on the row index
train_data_processed["mwra"] = train_data["mwra"]
test_data_processed["mwra"] = test_data["mwra"]

rf_classifier = RandomForestClassifier(**rf_best_params)
df_tmp = get_scores_cv(
    rf_classifier, "None", train_data_processed.drop(columns=["mwra"]), train_data_processed["mwra"], cv=5
)
df = pd.concat([df, df_tmp], axis=1)

# Feature selection
for name, pipeline in pipelines.items():
    # Transform data
    train_data_processed = pipeline.fit_transform(train_data[all_columns], train_data["mwra"])
    test_data_processed = pipeline.transform(test_data[all_columns])

    # Boolean mask of the columns kept by the "selector" step, mapped back to feature names
    feature_mask = pipeline.named_steps["selector"].get_support()
    selected_features = transformed_feature_order[feature_mask]  # order of features is preserved

    train_data_processed = pd.DataFrame(train_data_processed, columns=selected_features)
    test_data_processed = pd.DataFrame(test_data_processed, columns=selected_features)

    train_data_processed["mwra"] = train_data["mwra"]
    test_data_processed["mwra"] = test_data["mwra"]

    # Use on model (a fresh classifier for every configuration)
    rf_classifier = RandomForestClassifier(**rf_best_params)
    df_tmp = get_scores_cv(
        rf_classifier, name, train_data_processed.drop(columns=["mwra"]), train_data_processed["mwra"], cv=5
    )
    df = pd.concat([df, df_tmp], axis=1)

# NOTE: after the loop, train_data_processed / test_data_processed hold the output of the
# LAST pipeline in `pipelines`, not the no-feature-selection data
df

-   As mentioned in previous steps, we initially did not use feature selection (because that gave the best results) and later adopted it after discovering that it is considered best practice.
-   One explanation for this is that Random Forest performs its own feature selection.
-   Interestingly, in Phase 2 we showed the feature importances and only around 10 features had some usefulness, yet Random Forest still seems to make some use of the less important features.
-   No feature selection ("None") gave better results in every category: train/test metrics, stability (std) and overfitting.
-   Therefore we are going to use no feature selection for further steps, although skipping feature selection may seem unusual.


### Algorithm Pick


-   Base algorithm is decision tree.
-   We are going to use Random Forest, as we compared it to a number of different algorithms and it was always the best.


### Hyperparameter tuning


-   We already did hyperparameter tuning in previous steps for Random Forest with no feature selection.
-   Results were:
    -   n_estimators=300
    -   max_depth=15
    -   min_samples_split=10
    -   min_samples_leaf=1
    -   ccp_alpha=0.001
    -   max_features="sqrt"
    -   criterion="gini"


In [None]:
# Search grid: only max_depth and min_samples_split vary, everything else is pinned
paramters = {
    "n_estimators": [300],
    "max_depth": [10, 15],  # 10 was with feature selection
    "min_samples_split": [5, 10],  # 5 was with feature selection
    "min_samples_leaf": [1],
    "ccp_alpha": [0.001],
    "random_state": [42],
}

# Rebuild the data WITHOUT feature selection before tuning. The frames left behind by the
# feature-selection comparison loop are the output of its last pipeline, which would
# silently tune the model on a reduced feature set.
train_data, test_data = load_data_clean()
train_data_processed = pd.DataFrame(
    complete_pipeline_nofs.fit_transform(train_data[all_columns], train_data["mwra"]),
    columns=transformed_feature_order,
)
test_data_processed = pd.DataFrame(
    complete_pipeline_nofs.transform(test_data[all_columns]),
    columns=transformed_feature_order,
)

# Re-attach the target column; pandas aligns this assignment on the row index
train_data_processed["mwra"] = train_data["mwra"]
test_data_processed["mwra"] = test_data["mwra"]

# Exhaustive 5-fold grid search on the training data, using all cores
rf_classifier = RandomForestClassifier()
rf_grid = GridSearchCV(rf_classifier, param_grid=paramters, cv=5, n_jobs=-1, verbose=2)
rf_grid.fit(train_data_processed.drop(columns=["mwra"]), train_data_processed["mwra"])
rf_grid.best_params_

-   This is just a confirmation that we are going to use Random Forest with these parameters.


### Ensemble learning


-   Sole Random Forest performed better than ensemble models.


## D


Ktorý model je Váš najlepší model pre nasadenie (deployment)?


In [None]:
# Hyperparameters of the model chosen for deployment (Random Forest, fixed seed)
rf_best_params = dict(
    n_estimators=300,
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=1,
    ccp_alpha=0.001,
    random_state=42,
)

# Unfitted estimator configured with those hyperparameters
rf_classifier = RandomForestClassifier(**rf_best_params)

## E


Aký je data pipeline pre jeho vybudovanie na základe Vášho datasetu v produkcii?


### Define pipeline


In [None]:
def create_clean_data():
    """Build the cleaned train/test CSV files from the raw data.

    Steps:
        1. Read the tab-separated raw files from ``../data/raw`` and drop duplicate rows.
        2. Inner-merge ``connections`` with ``processes`` on ``imei``, ``ts`` and ``mwra``.
        3. Split 80/20 into train and test (``random_state=42``).
        4. Remove outliers from the TRAINING split only: one |z-score| < 3 pass over all
           feature columns, then up to ``max_iterations`` further passes that ignore
           ``p.android.vending``.
        5. Write both splits to ``../data/clean`` (the test split is left uncleaned).

    Returns:
        None. The result is written to ``../data/clean/train_data.csv`` and
        ``../data/clean/test_data.csv``.
    """
    file_path: str = "../data/raw"
    files: tuple[str, ...] = ("connections", "devices", "processes", "profiles")

    dataset: dict[str, pd.DataFrame] = {}
    for file in files:
        dataset[file] = pd.read_csv(f"{file_path}/{file}.csv", sep="\t").drop_duplicates()

    # Only connections and processes take part in the merged feature table
    df = pd.merge(dataset["connections"], dataset["processes"], on=["imei", "ts", "mwra"])
    df["ts"] = pd.to_datetime(df.ts)

    train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
    train_data = train_data.reset_index(drop=True)
    test_data = test_data.reset_index(drop=True)

    # 1 iteration of cleaning whole dataset of outliers (including p.android.vending)
    # (columns from position 3 onwards are treated as the numeric features)
    train_data = train_data[(np.abs(zscore(train_data.iloc[:, 3:])) < 3).all(axis=1)]

    # Using all columns except p.android.vending for outlier detection
    columns_for_zscore = train_data.iloc[:, 3:].columns.difference(["p.android.vending"])
    max_iterations = 10

    # Removing rows changes mean/std, so new rows may cross the threshold afterwards:
    # repeat until nothing is flagged or the iteration cap is reached. The mask is computed
    # once per pass and reused for both the stop check and the filtering.
    within_limits = (np.abs(zscore(train_data[columns_for_zscore])) < 3).all(axis=1)
    for _ in range(max_iterations):
        if within_limits.all():
            break
        train_data = train_data[within_limits]
        within_limits = (np.abs(zscore(train_data[columns_for_zscore])) < 3).all(axis=1)

    train_data = train_data.reset_index(drop=True)

    os.makedirs("../data/clean", exist_ok=True)
    train_data.to_csv("../data/clean/train_data.csv", index=False)
    test_data.to_csv("../data/clean/test_data.csv", index=False)

In [None]:
def get_processed_data():
    """Transform the cleaned data and return model-ready train/test sets.

    Reads ``../data/clean/train_data.csv`` and ``../data/clean/test_data.csv``, fits the
    preprocessing pipeline (StandardScaler + Yeo-Johnson for the gaussian columns,
    QuantileTransformer for the log + uniform columns, no feature selection) on the
    training split, applies it to both splits and stores the results in
    ``../data/processed``.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)`` where the ``X`` frames hold the
        transformed features and the ``y`` series hold the ``mwra`` target.
    """
    # Load data
    train_data = pd.read_csv("../data/clean/train_data.csv")
    test_data = pd.read_csv("../data/clean/test_data.csv")

    # Define columns
    all_columns = train_data.drop(columns=["mwra", "ts", "imei"]).columns
    gaussian_columns = [
        "c.dogalize",
        "c.android.gm",
        "c.android.youtube",
        "c.android.chrome",
        "c.katana",
        "c.raider",
        "p.android.packageinstaller",
        "p.android.settings",
        "p.android.documentsui",
        "p.android.chrome",
        "p.android.gm",
        "p.system",
        "p.android.externalstorage",
        "p.process.gapps",
        "p.google",
        "p.browser.provider",
        "p.android.defcontainer",
    ]
    log_columns = ["c.android.vending"]
    uniform_columns = [
        "c.UCMobile.x86",
        "c.updateassist",
        "c.UCMobile.intl",
        "p.android.vending",
        "p.dogalize",
        "p.olauncher",
        "p.simulator",
        "p.inputmethod.latin",
        "p.android.gms",
        "p.notifier",
        "p.katana",
        "p.gms.persistent",
    ]
    # Output column order of the ColumnTransformer below: gaussian branch first,
    # then the log + uniform branch
    transformed_feature_order = pd.Series(gaussian_columns + log_columns + uniform_columns)

    # Define pipeline
    standard_pipeline = Pipeline(
        [
            ("scaler", StandardScaler()),
            ("power_transformer", PowerTransformer(method="yeo-johnson")),
        ]
    )
    quantile_pipeline = Pipeline(
        [
            ("scaler", QuantileTransformer(output_distribution="normal", random_state=42)),
        ]
    )
    preprocessor = ColumnTransformer(
        transformers=[
            ("standard", standard_pipeline, gaussian_columns),
            ("quantile", quantile_pipeline, log_columns + uniform_columns),
        ],
        remainder="passthrough",
    )
    complete_pipeline = Pipeline([("preprocessor", preprocessor)])

    # Transform data (fit on the training split only, then apply to the test split)
    train_data_processed = complete_pipeline.fit_transform(train_data[all_columns], train_data["mwra"])
    test_data_processed = complete_pipeline.transform(test_data[all_columns])

    train_data_processed = pd.DataFrame(train_data_processed, columns=transformed_feature_order)
    test_data_processed = pd.DataFrame(test_data_processed, columns=transformed_feature_order)

    # Re-attach the target column; pandas aligns this assignment on the row index
    train_data_processed["mwra"] = train_data["mwra"]
    test_data_processed["mwra"] = test_data["mwra"]

    # Create the directory that is actually written to, and store each split in its own
    # file (the test file must contain the TEST frame, not a second copy of the train frame)
    os.makedirs("../data/processed", exist_ok=True)
    train_data_processed.to_csv("../data/processed/train_data.csv", index=False)
    test_data_processed.to_csv("../data/processed/test_data.csv", index=False)

    X_train = train_data_processed.drop(columns=["mwra"])
    y_train = train_data_processed["mwra"]

    X_test = test_data_processed.drop(columns=["mwra"])
    y_test = test_data_processed["mwra"]

    return X_train, y_train, X_test, y_test

### Use pipeline


In [None]:
# End-to-end run: rebuild the cleaned CSV files from the raw data, then load and
# transform them into model-ready train/test sets
create_clean_data()
X_train, y_train, X_test, y_test = get_processed_data()

# Hyperparameters of the model chosen for deployment (Random Forest, fixed seed)
rf_best_params = {
    "n_estimators": 300,
    "max_depth": 15,
    "min_samples_split": 10,
    "min_samples_leaf": 1,
    "ccp_alpha": 0.001,
    "random_state": 42,
}
rf_classifier = RandomForestClassifier(**rf_best_params)

# Train on the full training set
rf_classifier.fit(X_train, y_train)

# Score the fitted model under the label "Processed" and display the result
# (get_scores is defined elsewhere in the notebook)
df = get_scores(rf_classifier, "Processed", X_train, y_train, X_test, y_test)
df

-   This is our final pipeline and model for deployment.
