# Start


In [78]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import anderson, iqr, skew, zscore
from sklearn.feature_selection import (
    SelectKBest,
    VarianceThreshold,
    f_classif,
    f_regression,
    mutual_info_classif,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
    MinMaxScaler,
    PowerTransformer,
    QuantileTransformer,
    StandardScaler,
)

In [3]:
file_path: str = "../dataset"
files: tuple[str, ...] = ("connections", "devices", "processes", "profiles")

dataset: dict[str, pd.DataFrame] = {}
for file in files:
    dataset[file] = pd.read_csv(f"{file_path}/{file}.csv", sep="\t")
    dataset[file] = dataset[file].drop_duplicates()

In [4]:
df = pd.merge(dataset["connections"], dataset["processes"], on=["imei", "ts", "mwra"], how="inner")
df["ts"] = pd.to_datetime(df.ts)

# 2.1


## A

### **Zadanie:** Dáta si rozdeľte na trénovaciu a testovaciu množinu podľa vami preddefinovaného pomeru. Ďalej pracujte len s trénovacím datasetom.


In [5]:
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

## B

### **Zadanie:** Transformujte dáta na vhodný formát pre ML t.j. jedno pozorovanie musí byť opísané jedným riadkom a každý atribút musí byť v numerickom formáte (encoding). Iteratívne integrujte aj kroky v predspracovaní dát z prvej fázy (missing values, outlier detection) ako celok.


In [None]:
duplicate_ts = train_data.duplicated(subset="ts").any()
print(f"Are there any duplicate timestamps? {duplicate_ts}")

-   There are no duplicates in datetime, therefore every observation is in one row


In [None]:
train_data.dtypes.value_counts()

-   All columns are in numerical format


In [None]:
null_counts = train_data.isnull().sum()
null_counts = null_counts[null_counts > 0]
null_counts

-   There are no missing values


Graph showing the boxplot and outliers of the dataset.


In [None]:
train_data.iloc[:, 3:].plot(
    kind="box", vert=False, figsize=(20, 20), flierprops=dict(marker="o", color="r", alpha=0.5)
)
plt.show()

In [None]:
norm_distribution = True

for col in train_data.columns[2:]:
    result = anderson(train_data[col], dist="norm")

    if not result.fit_result.success:
        norm_distribution = False
        print(f"{col} is not normally distributed")

if norm_distribution:
    print("All columns are normally distributed")

We are going to use Z-score (3), to detect outliers.


In [None]:
number_of_rows_before = train_data.shape[0]
print(f"Number of rows before removing outliers: {number_of_rows_before}")

train_data = train_data[(np.abs(zscore(train_data.iloc[:, 3:])) < 3).all(axis=1)]

number_of_rows_after = train_data.shape[0]
print(f"Number of rows after removing outliers: {number_of_rows_after}")
print(f"Number of rows removed: {number_of_rows_before - number_of_rows_after}")
print(
    f"Percentage of rows removed: {(number_of_rows_before - number_of_rows_after) / number_of_rows_before * 100:.2f}%"
)

-   Removing 4% of the is ok, therefore we are not going to use methods to replace outliers


In [None]:
train_data.iloc[:, 3:].plot(
    kind="box", vert=False, figsize=(20, 20), flierprops=dict(marker="o", color="r", alpha=0.5)
)
plt.show()

In [None]:
train_data["p.android.vending"].plot(kind="hist", bins=100)
plt.show()
train_data["p.android.vending"].plot(kind="box", vert=False)
plt.show()

In [None]:
def identify_outliers(a):
    lower = a.quantile(0.25) - 1.5 * iqr(a)
    upper = a.quantile(0.75) + 1.5 * iqr(a)

    return a[(a > upper) | (a < lower)]


train_data_copy = train_data.copy()
outliers = train_data_copy[["p.android.vending"]].apply(identify_outliers)
train_data_copy = train_data_copy.drop(outliers.index)
print(f"Number of outliers: {train_data_copy[["p.android.vending"]].apply(identify_outliers).count().values[0]}")
print(
    f"Percentage of outliers: {train_data_copy[["p.android.vending"]].apply(identify_outliers).count().values[0] / train_data_copy.shape[0] * 100:.2f}%"
)

In [None]:
train_data["p.android.vending"].plot(kind="hist", bins=100)
plt.show()
train_data["p.android.vending"].plot(kind="box", vert=False)
plt.show()

-   In column **'p.android.vending'** there were too many outliers
-   We tried to remove them using 25% and 75% quantiles, but there was almost no noticeable difference
-   Using this we would remove 1183 (12.23%) rows which is quite substantial amount, therefore we decided not ot manipulate with outliers in this column


## C

### **Zadanie:** Transformujte atribúty dát pre strojové učenie podľa dostupných techník minimálne: scaling (2 techniky), transformers (2 techniky) a ďalšie. Cieľom je aby ste testovali efekty a vhodne kombinovali v dátovom pipeline (od časti 2.3 a v 3. fáze).


In [16]:
def skewness_type(skew_value: float) -> str:
    if skew_value <= -1:
        return "Highly Negative Skew"

    elif skew_value <= -0.5:
        return "Moderately Negative Skew"

    elif skew_value <= 0.5:
        return "Approximately Symmetric"

    elif skew_value <= 1:
        return "Moderately Positive Skew"

    return "Highly Positive Skew"


def kurtosis_type(kurtosis_value: float) -> str:
    if kurtosis_value < -1:
        return "Negative Kurtosis"

    elif kurtosis_value < -0.5:
        return "Moderately Negative Kurtosis"

    elif kurtosis_value < 0.5:
        return "Approximately Normal Kurtosis"

    elif kurtosis_value < 1:
        return "Moderately Positive Kurtosis"

    return "Positive Kurtosis"

In [None]:
columns = train_data.columns[3:].tolist()
skew_values = [skew(train_data[col]) for col in columns]
kurtosis_values = [train_data[col].kurtosis() for col in columns]

data = {
    "columns": columns,
    "skew": skew_values,
    "kurtosis": kurtosis_values,
}

shape_train_data = pd.DataFrame(data)

shape_train_data["result skew"] = shape_train_data["skew"].apply(skewness_type)
shape_train_data["result kurtosis"] = shape_train_data["kurtosis"].apply(kurtosis_type)
shape_train_data

In [None]:
train_data.iloc[:, 3:].plot(kind="hist", bins=50, figsize=(20, 20), subplots=True, layout=(10, 3))
plt.tight_layout()
plt.show()

In [None]:
scaler = MinMaxScaler()
scaled_data_minmax = scaler.fit_transform(train_data.iloc[:, 3:])

scaled_data_minmax = pd.DataFrame(scaled_data_minmax, columns=train_data.columns[3:])
scaled_data_minmax.plot(kind="hist", bins=50, figsize=(20, 20), subplots=True, layout=(10, 3))
plt.tight_layout()
plt.show()

In [None]:
scaler = StandardScaler()
scaled_data_standard = scaler.fit_transform(train_data.iloc[:, 3:])

scaled_data_standard = pd.DataFrame(scaled_data_standard, columns=train_data.columns[3:])

scaled_data_standard.plot(kind="hist", bins=50, figsize=(20, 20), subplots=True, layout=(10, 3))
plt.tight_layout()
plt.show()

In [None]:
scaled_data_standard.drop(columns=["p.android.vending"]).plot(
    kind="hist", bins=50, figsize=(20, 20), subplots=True, layout=(10, 3)
)
plt.tight_layout()
plt.show()

In [None]:
power_transformer = PowerTransformer(method="yeo-johnson")
transformed_data_power = power_transformer.fit_transform(train_data.iloc[:, 3:])

transformed_data_power = pd.DataFrame(transformed_data_power, columns=train_data.columns[3:])
transformed_data_power.plot(kind="hist", bins=50, figsize=(20, 20), subplots=True, layout=(10, 3))
plt.tight_layout()
plt.show()

In [None]:
quantile_transformer = QuantileTransformer(output_distribution="normal", random_state=42)
transformed_data = quantile_transformer.fit_transform(train_data.iloc[:, 3:])

transformed_data = pd.DataFrame(transformed_data, columns=train_data.columns[3:])
transformed_data.plot(kind="hist", bins=50, figsize=(20, 20), subplots=True, layout=(10, 3))
plt.tight_layout()
plt.show()

In [None]:
power_transformer = PowerTransformer(method="yeo-johnson")
transformed_data_power = power_transformer.fit_transform(scaled_data_minmax)

transformed_data_power = pd.DataFrame(transformed_data_power, columns=train_data.columns[3:])
transformed_data_power.plot(kind="hist", bins=50, figsize=(20, 20), subplots=True, layout=(10, 3))
plt.tight_layout()
plt.show()

In [None]:
power_transformer = PowerTransformer(method="yeo-johnson")
transformed_data_power = power_transformer.fit_transform(scaled_data_standard)

transformed_data_power = pd.DataFrame(transformed_data_power, columns=train_data.columns[3:])
transformed_data_power.plot(kind="hist", bins=50, figsize=(20, 20), subplots=True, layout=(10, 3))
plt.tight_layout()
plt.show()

In [None]:
scaler = StandardScaler()
scaled_data_minmax = scaler.fit_transform(train_data.iloc[:, 3:])

scaled_data_minmax = pd.DataFrame(scaled_data_minmax, columns=train_data.columns[3:])
quantile_transformer = QuantileTransformer(output_distribution="normal", random_state=42)
scaled_data_minmax["p.android.vending"] = quantile_transformer.fit_transform(
    scaled_data_minmax[["p.android.vending"]]
)
scaled_data_minmax = pd.DataFrame(scaled_data_minmax, columns=train_data.columns[3:])

scaled_data_minmax.plot(kind="hist", bins=50, figsize=(20, 20), subplots=True, layout=(10, 3))
plt.xlim(-4, 4)
plt.tight_layout()
plt.show()

In [None]:
scaler = StandardScaler()
scaled_data_minmax = scaler.fit_transform(train_data.iloc[:, 3:])

scaled_data_minmax = pd.DataFrame(scaled_data_minmax, columns=train_data.columns[3:])
quantile_transformer = QuantileTransformer(output_distribution="normal", random_state=42)
scaled_data_minmax["p.android.vending"] = quantile_transformer.fit_transform(
    scaled_data_minmax[["p.android.vending"]]
)
scaled_data_minmax = pd.DataFrame(scaled_data_minmax, columns=train_data.columns[3:])

scaled_data_minmax.plot(kind="hist", bins=50, figsize=(20, 20), subplots=True, layout=(10, 3))
plt.tight_layout()
plt.show()

## D

### **Zadanie:** Zdôvodnite Vaše voľby/rozhodnutie pre realizáciu (t.j. zdokumentovanie)


In previous sections we have gathered these insights:

-   Scaling:
    -   We identified need for scaling data, because data has different ranges
    -   Min-Max Scaling showed promising results, when not combining with anything else
    -   Standard Scaler showed also promising results, the only problem was column **'p.android.vending'** as it has too big range and many outliers (we used Z-score(3), maybe using quantile detection could improved this), the scaled graph also showed outliers
-   Transformers:
    -   Power Transformer didn't show that promising results for uniform graphs (it made the a little logarithmic), but promising results for normal graphs
    -   Quantile Transformer transforms data to perfect normal distribution, but it could distort linear correlations
-   Combinations:
    -   Using Min-Max Scaler and Power Transformer showed promising results, as it transform data to normal distributions and also kept uniform distributions (didn't make them logarithmic, but there is some very slight logarithmic effect)
    -   Using Standard Scaler and Power Transformer showed even more promising results compared to Using Min-Max Scaler and Power Transformer, as it transform data to normal distributions and also kept uniform distributions without any logarithmic effect
-   Final choice:
    -   We are going to use Standard Scaler and Power Transformer, and after this we apply Quantile Transformer for **'p.android.vending'**


# 2.2


## A

### **Zadanie:** Zistite, ktoré atribúty (features) vo vašich dátach pre ML sú informatívne k predikovanej premennej (minimálne 3 techniky s porovnaním medzi sebou).


### Variance Threshold


In [None]:
selector = VarianceThreshold(threshold=(0.8 * (1 - 0.8)))
selector.fit(train_data.iloc[:, 3:])
support_mask = selector.get_support()

removed_columns = train_data.iloc[:, 3:].columns[~support_mask]

print("Removed columns:", removed_columns)

-   We see thins method would remove **'p.android.vending'** feature, which is the most problematic feature that we have in our dataset


In [None]:
variance_threshold_data = train_data.iloc[:, 3:]

thresholds = np.arange(0.0, 0.55, 0.05)
results = list()
for t in thresholds:
    transform = VarianceThreshold(threshold=t)
    tmp = transform.fit_transform(variance_threshold_data)
    n_features = tmp.shape[1]
    print(">Threshold=%.2f, Features=%d" % (t, n_features))
    results.append(n_features)

-   We see different threshold values have no effect on the results


### Mutual Information


In [98]:
X = train_data.drop(columns=["mwra", "ts", "imei"])
y = train_data["mwra"]

In [99]:
def mic(X, y):
    selector = mutual_info_classif(X, y)
    scores = pd.Series(selector, index=X.columns).sort_values(ascending=False)

    plt.figure(figsize=(16, 16))
    ax = scores.plot(kind="barh")

    for i, v in enumerate(scores):
        ax.text(v + 0.001, i, f"{v:.2f}", va="center")

    plt.gca().invert_yaxis()
    plt.show()


def mic_threshold(X, y):
    selector = mutual_info_classif(X, y)
    scores = pd.Series(selector, index=X.columns).sort_values(ascending=False)
    scores = scores[scores > 0.05]

    plt.figure(figsize=(16, 8))
    ax = scores.plot(kind="barh")

    for i, v in enumerate(scores):
        ax.text(v + 0.001, i, f"{v:.2f}", va="center")

    plt.gca().invert_yaxis()
    plt.show()
    print(scores)

In [None]:
mic(X, y)
mic_threshold(X, y)

-   We got these features as best:
    -   **p.android.settings**, **c.katana**, **p.system**, **c.android.youtube**, **p.android.packageinstaller**, **p.android.documentsui**, **p.android.externalstorage**


### Chi-Squared


-   We don't have categorical data, therefore we won't use this method


### F Statistic


-   Our target variable **mwra** could be specified as categorical (True, False), so we are gonna try this method


Selecting K-best (k=7) features using F-statistic


In [None]:
selector = SelectKBest(score_func=f_regression, k=7)
x_tmp = selector.fit_transform(X, y)
mask = selector.get_support()

selected_columns = X.columns[mask]
scores = selector.scores_[mask]

selected_features = pd.DataFrame({"Feature": selected_columns, "Score": scores})

sorted_features = selected_features.sort_values(by="Score", ascending=False).reset_index(drop=True)
sorted_features

-   We got these features as best:
    -   **p.android.settings**, **c.katana**, **p.system**, **c.android.youtube**, **p.android.chrome**, **p.android.externalstorage**, **p.android.packageinstaller**
-   Compared to MI, we got some different features


Using F-statistic to select features


In [None]:
selector = f_classif(X, y)
x_tmp = pd.Series(selector[0], index=X.columns).sort_values(ascending=False)
x_tmp

plt.figure(figsize=(10, 6))
x_tmp.plot(kind="barh")
plt.gca().invert_yaxis()
plt.show()

In [None]:
selector = f_classif(X, y)
x_tmp = pd.Series(selector[0], index=X.columns).sort_values(ascending=False)
x_tmp = x_tmp[x_tmp > 900]

plt.figure(figsize=(10, 6))
x_tmp.plot(kind="barh")
plt.gca().invert_yaxis()
plt.show()

pd.DataFrame(x_tmp)

-   We got these features as best:
    -   **p.android.settings**, **c.katana**, **p.system**, **c.android.youtube**, **p.android.chrome**, **p.android.externalstorage**, **p.android.packageinstaller**, **c.dogalize**, **p.android.documentsui**
-   Now we see a little bit more difference between MI and F-statistic, as we got 2 extra features


## B

### **Zadanie:** Zoraďte zistené atribúty v poradí podľa dôležitosti.


## C

### **Zadanie:** Zdôvodnite Vaše voľby/rozhodnutie pre realizáciu (t.j. zdokumentovanie)


# 2.3


## A

### **Zadanie:** Upravte váš kód realizujúci predspracovanie trénovacej množiny tak, aby ho bolo možné bez ďalších úprav znovu použiť na predspracovanie testovacej množiny v kontexte strojového učenia.


## B

### **Zadanie:** Využite možnosti sklearn.pipeline
