# Start


In [None]:
import os

import numpy as np
import pandas as pd
from graphviz import Source
from IPython.display import HTML, SVG
from matplotlib import pyplot as plt
from scipy.stats import zscore
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import (
    GridSearchCV,
    ParameterGrid,
    RandomizedSearchCV,
    learning_curve,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from tqdm.notebook import tqdm

## Process data


### Loading data


In [None]:
# Folder holding the raw tab-separated exports, and the tables to read from it.
file_path: str = "../data/raw"
files: tuple[str, ...] = ("connections", "devices", "processes", "profiles")

# Read every table and discard exact duplicate rows straight away.
dataset: dict[str, pd.DataFrame] = {
    table: pd.read_csv(f"{file_path}/{table}.csv", sep="\t").drop_duplicates() for table in files
}

# Only connections and processes are joined; rows must agree on imei, ts and mwra.
df = dataset["connections"].merge(dataset["processes"], on=["imei", "ts", "mwra"])
df["ts"] = pd.to_datetime(df["ts"])

# 80/20 split with a fixed seed; both parts get a fresh 0..n-1 index.
train_data, test_data = (
    split.reset_index(drop=True) for split in train_test_split(df, test_size=0.2, random_state=42)
)

### Cleaning data


In [None]:
number_of_rows_before_outliers = train_data.shape[0]


def _within_three_sigma(frame):
    """Return a per-row boolean mask that is True when every column's |z-score| is below 3."""
    # NOTE(review): a zero-variance column yields NaN z-scores, which fail the `< 3`
    # comparison and would therefore mark every row as an outlier.
    return (np.abs(zscore(frame)) < 3).all(axis=1)


# First pass: drop outliers using every feature column (including p.android.vending).
# The first three columns are skipped - presumably the imei/ts/mwra merge keys; confirm column order.
train_data = train_data[_within_three_sigma(train_data.iloc[:, 3:])]

# Later passes use every feature column except p.android.vending.
columns_for_zscore = train_data.iloc[:, 3:].columns.difference(["p.android.vending"])
max_iterations = 10
iteration = 0

# Removing rows changes each column's mean/std, which can expose new outliers, so repeat
# until none remain or the iteration cap is reached. Each mask is computed once and reused
# for both counting and filtering.
inlier_mask = _within_three_sigma(train_data[columns_for_zscore])
outliers_count = (~inlier_mask).sum()
while outliers_count > 0 and iteration < max_iterations:
    train_data = train_data[inlier_mask]
    inlier_mask = _within_three_sigma(train_data[columns_for_zscore])
    outliers_count = (~inlier_mask).sum()
    iteration += 1

train_data = train_data.reset_index(drop=True)

### Export cleaned data


In [None]:
# Persist the cleaned splits; the target folder is created on first use.
os.makedirs("../data/clean", exist_ok=True)

for cleaned_frame, destination in (
    (train_data, "../data/clean/train_data.csv"),
    (test_data, "../data/clean/test_data.csv"),
):
    cleaned_frame.to_csv(destination, index=False)

### Import cleaned data


In [None]:
# Reload the cleaned splits from disk so the notebook can be resumed from this cell.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/clean/train_data.csv", "../data/clean/test_data.csv")
)

### Define columns


In [None]:
# Feature columns: everything except the target (mwra) and the ts / imei columns.
all_columns = train_data.columns.drop(["mwra", "ts", "imei"])

# Features routed to the quantile transformer instead of the scaler + power transform.
non_gaussian_columns = [
    "c.android.vending",
    "c.UCMobile.x86",
    "c.updateassist",
    "c.UCMobile.intl",
    "p.android.vending",
    "p.dogalize",
    "p.olauncher",
    "p.simulator",
    "p.inputmethod.latin",
    "p.android.gms",
    "p.notifier",
    "p.katana",
    "p.gms.persistent",
]

# Every remaining feature, kept in its original column order.
is_non_gaussian = all_columns.isin(non_gaussian_columns)
gaussian_columns = all_columns[~is_non_gaussian]

# Column order used to label transformed output: gaussian block first, then non-gaussian.
transformed_feature_order = pd.Series([*gaussian_columns, *non_gaussian_columns])

### Define pipelines


In [None]:
# Gaussian-like features: standardise, then apply a Yeo-Johnson power transform.
gaussian_steps = [
    ("scaler", StandardScaler()),
    ("power_transformer", PowerTransformer(method="yeo-johnson")),
]
general_pipe = Pipeline(steps=gaussian_steps)

# Non-gaussian features: map onto a normal distribution via quantiles (seeded for repeatability).
non_gaussian_steps = [
    ("quantile_transformer", QuantileTransformer(output_distribution="normal", random_state=42)),
]
vending_pipeline = Pipeline(steps=non_gaussian_steps)

# Route each column group to its pipeline; any column not listed is passed through untouched.
column_routes = [
    ("general", general_pipe, gaussian_columns),
    ("vending", vending_pipeline, non_gaussian_columns),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder="passthrough")

# Single-step wrapper so later stages can be appended to the same pipeline object.
complete_pipeline = Pipeline(steps=[("preprocessor", preprocessor)])

### Transform data


In [None]:
# Learn the preprocessing on the training split only, then apply the fitted transform to the test split.
train_matrix = complete_pipeline.fit_transform(train_data[all_columns], train_data["mwra"])
test_matrix = complete_pipeline.transform(test_data[all_columns])

# Wrap the transformed arrays in DataFrames labelled with transformed_feature_order
# (gaussian columns first, then non-gaussian) and attach the mwra target again.
train_data_processed = pd.DataFrame(train_matrix, columns=transformed_feature_order).assign(
    mwra=train_data["mwra"]
)
test_data_processed = pd.DataFrame(test_matrix, columns=transformed_feature_order).assign(
    mwra=test_data["mwra"]
)

### Export processed data


In [None]:
# Persist the transformed splits; the target folder is created on first use.
os.makedirs("../data/processed", exist_ok=True)

for processed_frame, destination in (
    (train_data_processed, "../data/processed/train_data.csv"),
    (test_data_processed, "../data/processed/test_data.csv"),
):
    processed_frame.to_csv(destination, index=False)

# 3.1


-   Jednoduchý klasifikátor na základe závislosti v dátach.


In [None]:
# Reload the processed splits so this section can run independently of the cells above.
train_data_processed, test_data_processed = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/processed/train_data.csv", "../data/processed/test_data.csv")
)

# Full feature matrices and targets (no feature selection).
X_train, y_train = train_data_processed.drop(columns=["mwra"]), train_data_processed["mwra"]
X_test, y_test = test_data_processed.drop(columns=["mwra"]), test_data_processed["mwra"]

# Keep the 7 features with the highest ANOVA F-score; the selector is fitted on the training split only.
selector = SelectKBest(f_classif, k=7)
train_selected = selector.fit_transform(X_train, y_train)
selected_features = X_train.columns[selector.get_support()]

# Apply the same column subset to both splits.
X_train_selected = X_train.loc[:, selected_features]
X_test_selected = X_test.loc[:, selected_features]

## A


Naimplementujte jednoduchý ID3 klasifikátor s hĺbkou min 2 (vrátane root/koreň).


### Not using feature selection


In [None]:
# Depth-2 decision tree with entropy-based splits, used as the ID3-style classifier.
dtc_nofs = DecisionTreeClassifier(max_depth=2, criterion="entropy", random_state=42)

# Train on the full (unselected) feature set.
dtc_nofs.fit(X=X_train, y=y_train)

In [None]:
# Render the fitted tree. Passing feature_names makes every split node show the column it
# tests instead of an anonymous x[i] index.
# NOTE(review): class_names are assigned in ascending class order - assumes mwra is 0 = "no", 1 = "yes".
graph = Source(
    export_graphviz(
        dtc_nofs,
        out_file=None,
        feature_names=list(X_train.columns),
        class_names=["no", "yes"],
        filled=True,
    )
)

display(SVG(graph.pipe(format="svg")))

# Emit a CSS rule as the cell output so the inline SVG scales with the notebook width.
style = "<style>svg{width:100%;height:70%;}</style>"
HTML(style)

### Using Feature selection


In [None]:
# Depth-2 decision tree with entropy-based splits, used as the ID3-style classifier.
dtc_fs = DecisionTreeClassifier(max_depth=2, criterion="entropy", random_state=42)

# Train on the SelectKBest feature subset only.
dtc_fs.fit(X=X_train_selected, y=y_train)

In [None]:
# Render the fitted tree. Passing feature_names makes every split node show the selected
# column it tests instead of an anonymous x[i] index.
# NOTE(review): class_names are assigned in ascending class order - assumes mwra is 0 = "no", 1 = "yes".
graph = Source(
    export_graphviz(
        dtc_fs,
        out_file=None,
        feature_names=list(X_train_selected.columns),
        class_names=["no", "yes"],
        filled=True,
    )
)

display(SVG(graph.pipe(format="svg")))

# Emit a CSS rule as the cell output so the inline SVG scales with the notebook width.
style = "<style>svg{width:100%;height:70%;}</style>"
HTML(style)

## B


Vyhodnoťte Váš ID3 klasifikátor pomocou metrík accuracy, precision a recall.


### Metrics


In [None]:
# Score the tree on the data it was trained on.
y_pred_train = dtc_nofs.predict(X_train)

# Accuracy, precision and recall, computed in that order.
accuracy, precision, recall = (
    metric(y_train, y_pred_train) for metric in (accuracy_score, precision_score, recall_score)
)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

In [None]:
# Score the tree on the held-out test data.
y_pred_test = dtc_nofs.predict(X_test)

# Accuracy, precision and recall, computed in that order.
accuracy, precision, recall = (
    metric(y_test, y_pred_test) for metric in (accuracy_score, precision_score, recall_score)
)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

In [None]:
# Fresh, unfitted depth-2 entropy tree for the learning curves.
dtc_nofs = DecisionTreeClassifier(max_depth=2, criterion="entropy", random_state=42)

# 30 evenly spaced training-set fractions from 10% to 100%.
train_sizes = np.linspace(start=0.1, stop=1.0, num=30)


# Function to plot learning curves
def plot_learning_curve(estimator, X, y, train_sizes, cv, scoring, title):
    """Plot mean training and cross-validation scores against training-set size.

    Args:
        estimator: Estimator passed to ``learning_curve``.
        X: Feature matrix.
        y: Target values.
        train_sizes: Training-set sizes (or fractions) to evaluate.
        cv: Cross-validation setting forwarded to ``learning_curve``.
        scoring: Metric name forwarded to ``learning_curve``; also used as the y-axis label.
        title: Figure title.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, train_sizes=train_sizes, scoring=scoring, n_jobs=-1
    )
    # Average over the CV folds (axis 1). The legend says "score" and the y-axis carries the
    # metric name, so plot the metric itself rather than its complement (1 - score).
    train_scores_mean = train_scores.mean(axis=1)
    test_scores_mean = test_scores.mean(axis=1)

    plt.figure()
    plt.plot(train_sizes, train_scores_mean, label="Training score")
    plt.plot(train_sizes, test_scores_mean, label="Cross-Validation Score")
    plt.xlabel("Training Size")
    plt.ylabel(scoring)
    plt.title(title)
    plt.legend()
    plt.show()


# One learning curve per metric: accuracy, precision, recall (5-fold CV on the training split).
for scoring_name, plot_title in (
    ("accuracy", "Learning Curve (Accuracy)"),
    ("precision", "Learning Curve (Precision)"),
    ("recall", "Learning Curve (Recall)"),
):
    plot_learning_curve(
        dtc_nofs, X_train, y_train, train_sizes, cv=5, scoring=scoring_name, title=plot_title
    )

In [None]:
from sklearn.metrics import auc, roc_curve

# The estimator was re-created for the learning curves, so fit it again before scoring.
dtc_nofs.fit(X_train, y_train)

# Predicted probability of the second class (column 1) for each split.
proba_train = dtc_nofs.predict_proba(X_train)[:, 1]
proba_test = dtc_nofs.predict_proba(X_test)[:, 1]

# ROC points and area under the curve, train first and test second.
fpr_train, tpr_train, _ = roc_curve(y_train, proba_train)
roc_auc_train = auc(fpr_train, tpr_train)
fpr_test, tpr_test, _ = roc_curve(y_test, proba_test)
roc_auc_test = auc(fpr_test, tpr_test)

# Draw both curves plus the diagonal chance line.
plt.figure()
for fpr, tpr, colour, legend_text in (
    (fpr_train, tpr_train, "blue", f"Train ROC curve (area = {roc_auc_train:.2f})"),
    (fpr_test, tpr_test, "red", f"Test ROC curve (area = {roc_auc_test:.2f})"),
):
    plt.plot(fpr, tpr, color=colour, lw=2, label=legend_text)
plt.plot([0, 1], [0, 1], color="gray", lw=2, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.show()

## C


Zistite, či Váš ID3 klasifikátor má overfit.


-   Since train metrics and test metrics are close, the model is not overfitting.


# 3.2


Trénovanie a vyhodnotenie klasifikátorov strojového učenia.


## A


Na trénovanie využite jeden stromový algoritmus v scikit-learn.


In [None]:
# 100-tree entropy forest with cost-complexity pruning (ccp_alpha=0.01) and a fixed seed.
rf_classifier = RandomForestClassifier(n_estimators=100, criterion="entropy", ccp_alpha=0.01, random_state=42)
rf_classifier.fit(X_train, y_train)

# Metrics on the training split.
y_pred_train = rf_classifier.predict(X_train)
accuracy_rf, precision_rf, recall_rf = (
    metric(y_train, y_pred_train) for metric in (accuracy_score, precision_score, recall_score)
)

print("Train metrics")
print(f"RandomForestClassifier Accuracy: {accuracy_rf:.2f}")
print(f"RandomForestClassifier Precision: {precision_rf:.2f}")
print(f"RandomForestClassifier Recall: {recall_rf:.2f}")

# Metrics on the held-out test split (the *_rf variables are overwritten with test values).
y_pred_test = rf_classifier.predict(X_test)
accuracy_rf, precision_rf, recall_rf = (
    metric(y_test, y_pred_test) for metric in (accuracy_score, precision_score, recall_score)
)

print("Test metrics")
print(f"RandomForestClassifier Accuracy: {accuracy_rf:.2f}")
print(f"RandomForestClassifier Precision: {precision_rf:.2f}")
print(f"RandomForestClassifier Recall: {recall_rf:.2f}")

## B


Porovnajte s jedným iným nestromovým algoritmom v scikit-learn.


In [None]:
# Non-tree baseline: logistic regression with a raised iteration cap and a fixed seed.
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X_train, y_train)

# Metrics on the training split.
y_pred_train_log_reg = log_reg.predict(X_train)
accuracy_log_reg_train, precision_log_reg_train, recall_log_reg_train = (
    metric(y_train, y_pred_train_log_reg) for metric in (accuracy_score, precision_score, recall_score)
)

print("Train metrics")
print(f"LogisticRegression Accuracy: {accuracy_log_reg_train:.2f}")
print(f"LogisticRegression Precision: {precision_log_reg_train:.2f}")
print(f"LogisticRegression Recall: {recall_log_reg_train:.2f}")

# Metrics on the held-out test split.
y_pred_test_log_reg = log_reg.predict(X_test)
accuracy_log_reg_test, precision_log_reg_test, recall_log_reg_test = (
    metric(y_test, y_pred_test_log_reg) for metric in (accuracy_score, precision_score, recall_score)
)

print("Test metrics")
print(f"LogisticRegression Accuracy: {accuracy_log_reg_test:.2f}")
print(f"LogisticRegression Precision: {precision_log_reg_test:.2f}")
print(f"LogisticRegression Recall: {recall_log_reg_test:.2f}")

## C


Porovnajte výsledky s ID3 z prvého kroku.


## D


Vizualizujte natrénované pravidlá minimálne pre jeden Vami vybraný algoritmus.


## E


Vyhodnoťte natrénované modely pomocou metrík accuracy, precision a recall.


# 3.3


Optimalizácia alias hyperparameter tuning.


In [None]:
# Search space for the random forest.
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2"],
    "ccp_alpha": [0.0, 0.005, 0.01],
    "criterion": ["gini", "entropy"],
}

# Base estimator; the search clones it and sets the candidate parameters itself.
rf = RandomForestClassifier(random_state=42)

# Full Cartesian product of the grid, kept so the size of the exhaustive search can be inspected.
param_list = list(ParameterGrid(param_grid))

# Exhaustive alternative over every combination in param_list. It is defined but deliberately
# NOT fitted here because of its cost; results below come from the randomized search.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, n_jobs=-1, cv=3, scoring="accuracy")

# Randomized search: 100 sampled combinations, 3-fold CV each. Seeded so the sampled
# candidates are reproducible; verbose=1 reports progress from the search itself.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=100,
    n_jobs=-1,
    cv=3,
    scoring="accuracy",
    random_state=42,
    verbose=1,
)

# Fit exactly once. The search already iterates over its candidates internally, so wrapping
# the fit in a loop over param_list only repeats the same search.
# Selected features are used for a slightly faster computation.
random_search.fit(X_train_selected, y_train)

# Read the results from the search that was actually fitted.
best_params = random_search.best_params_
best_score = random_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score:.2f}")

## A


Vyskúšajte rôzne nastavenie hyperparametrov (tuning) pre zvolený algoritmus tak,
aby ste optimalizovali výkonnosť (bez underfittingu).


## B


Vyskúšajte kombinácie modelov (ensemble) pre zvolený algoritmus tak, aby ste
optimalizovali výkonnosť (bez underfittingu).


## C


Využite krížovú validáciu (cross validation) na trénovacej množine.


## D


Dokážte, že Váš nastavený najlepší model je bez overfittingu.


# 3.4


Vyhodnotenie vplyvu zvolenej stratégie riešenia na klasifikáciu.


## A


Stratégie riešenia chýbajúcich hodnôt a outlierov.


## B


Dátová transformácia (scaling, transformer, ...).


## C


Výber atribútov, výber algoritmov, hyperparameter tuning, ensemble learning.


## D


Ktorý model je Váš najlepší model pre nasadenie (deployment)?


## E


Aký je data pipeline pre jeho vybudovanie na základe Vášho datasetu v produkcii?
