# Start


In [143]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import anderson, skew, zscore
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
    MinMaxScaler,
    PowerTransformer,
    QuantileTransformer,
    StandardScaler,
)

In [144]:
# Location of the tab-separated source files and the table names to load
file_path: str = "../dataset"
files: tuple[str, ...] = ("connections", "devices", "processes", "profiles")

# Read each table and drop exact duplicate rows in a single step
dataset: dict[str, pd.DataFrame] = {}
for file in files:
    dataset[file] = pd.read_csv(f"{file_path}/{file}.csv", sep="\t").drop_duplicates()

# 2.1


In [145]:
# Inner-join connections with processes on the shared key columns
df = dataset["connections"].merge(
    dataset["processes"],
    how="inner",
    on=["imei", "ts", "mwra"],
)
# Parse the timestamp column into datetime values
df["ts"] = pd.to_datetime(df["ts"])

## A

### **Zadanie:** Dáta si rozdeľte na trénovaciu a testovaciu množinu podľa vami preddefinovaného pomeru. Ďalej pracujte len s trénovacím datasetom.


In [146]:
# 80/20 split with a fixed seed; both parts get a fresh 0..n-1 index
train_data, test_data = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=42)
)

## B

### **Zadanie:** Transformujte dáta na vhodný formát pre ML t.j. jedno pozorovanie musí byť opísané jedným riadkom a každý atribút musí byť v numerickom formáte (encoding). Iteratívne integrujte aj kroky v predspracovaní dát z prvej fázy (missing values, outlier detection) ako celok.


In [None]:
# True when at least one timestamp value occurs more than once
duplicate_ts = train_data["ts"].duplicated().any()
print(f"Are there any duplicate timestamps? {duplicate_ts}")

-   There are no duplicate timestamps, therefore every observation is described by exactly one row


In [None]:
# Tally how many columns share each dtype
(
    train_data
    .dtypes
    .value_counts()
)

-   All columns are in numerical format


In [None]:
# Missing values per column, keeping only the columns that actually have some
null_counts = train_data.isna().sum()
null_counts = null_counts.loc[null_counts.gt(0)]
null_counts

-   There are no missing values


Graph showing the boxplot and outliers of the dataset.


In [None]:
# Horizontal box plot of every column from position 3 onwards
train_data.iloc[:, 3:].plot.box(
    vert=False,
    figsize=(20, 20),
    flierprops={"marker": "o", "color": "r", "alpha": 0.5},
)
plt.show()

In [None]:
# Anderson-Darling normality check for every feature column.
#
# `result.fit_result.success` only reports whether the distribution parameters
# could be fitted; it says nothing about normality. The hypothesis of normality
# is rejected when the test statistic exceeds the critical value that belongs
# to the chosen significance level.
SIGNIFICANCE_PERCENT = 5.0

norm_distribution = True

# Skip the first three columns, as every other cell in this section does
# NOTE(review): presumably these are the merge keys imei/ts/mwra - confirm
for col in train_data.columns[3:]:
    result = anderson(train_data[col], dist="norm")

    # Locate the critical value that corresponds to the 5 % level
    level_idx = int(
        np.flatnonzero(np.isclose(result.significance_level, SIGNIFICANCE_PERCENT))[0]
    )

    if result.statistic > result.critical_values[level_idx]:
        norm_distribution = False
        print(f"{col} is not normally distributed")

if norm_distribution:
    print("All columns are normally distributed")

We are going to use the Z-score method (keeping only rows where |z| < 3 for every feature) to detect outliers.


In [None]:
# Rows are kept only when every feature has an absolute z-score below this value
Z_SCORE_THRESHOLD = 3

number_of_rows_before = train_data.shape[0]
print(f"Number of rows before removing outliers: {number_of_rows_before}")

# z-scores are computed per column over the columns from position 3 onwards
within_threshold = (np.abs(zscore(train_data.iloc[:, 3:])) < Z_SCORE_THRESHOLD).all(axis=1)

# Reset the index after filtering: frames rebuilt later from scaler output get
# a fresh 0..n-1 index, and a gapped index here would break row alignment
train_data = train_data[within_threshold].reset_index(drop=True)

number_of_rows_after = train_data.shape[0]
print(f"Number of rows after removing outliers: {number_of_rows_after}")
print(f"Number of rows removed: {number_of_rows_before - number_of_rows_after}")
print(
    f"Percentage of rows removed: {(number_of_rows_before - number_of_rows_after) / number_of_rows_before * 100:.2f}%"
)

-   Removing 4% of the rows is acceptable, therefore we are not going to use methods to replace outliers


In [None]:
# Same horizontal box plot as before, now on the outlier-filtered data
train_data.iloc[:, 3:].plot.box(
    vert=False,
    figsize=(20, 20),
    flierprops={"marker": "o", "color": "r", "alpha": 0.5},
)
plt.show()

## C

### **Zadanie:** Transformujte atribúty dát pre strojové učenie podľa dostupných techník minimálne: scaling (2 techniky), transformers (2 techniky) a ďalšie. Cieľom je aby ste testovali efekty a vhodne kombinovali v dátovom pipeline (od časti 2.3 a v 3. fáze).


In [154]:
def skewness_type(skew_value: float) -> str:
    """Map a skewness coefficient to a descriptive label.

    Args:
        skew_value: Skewness coefficient of a distribution.

    Returns:
        "Undefined (NaN)" when the value is NaN, otherwise one of
        "Highly Negative Skew" (<= -1), "Moderately Negative Skew" (<= -0.5),
        "Approximately Symmetric" (<= 0.5), "Moderately Positive Skew" (<= 1)
        or "Highly Positive Skew" (> 1).
    """
    # NaN fails every comparison below and would otherwise fall through to
    # the "Highly Positive Skew" label
    if np.isnan(skew_value):
        return "Undefined (NaN)"

    if skew_value <= -1:
        return "Highly Negative Skew"

    if skew_value <= -0.5:
        return "Moderately Negative Skew"

    if skew_value <= 0.5:
        return "Approximately Symmetric"

    if skew_value <= 1:
        return "Moderately Positive Skew"

    return "Highly Positive Skew"


def kurtosis_type(kurtosis_value: float) -> str:
    """Map a kurtosis value to a descriptive label.

    Args:
        kurtosis_value: Kurtosis of a distribution.

    Returns:
        "Undefined (NaN)" when the value is NaN, otherwise one of
        "Negative Kurtosis" (< -1), "Moderately Negative Kurtosis" (< -0.5),
        "Approximately Normal Kurtosis" (< 0.5),
        "Moderately Positive Kurtosis" (< 1) or "Positive Kurtosis" (>= 1).
    """
    # NaN fails every comparison below and would otherwise fall through to
    # the "Positive Kurtosis" label
    if np.isnan(kurtosis_value):
        return "Undefined (NaN)"

    if kurtosis_value < -1:
        return "Negative Kurtosis"

    if kurtosis_value < -0.5:
        return "Moderately Negative Kurtosis"

    if kurtosis_value < 0.5:
        return "Approximately Normal Kurtosis"

    if kurtosis_value < 1:
        return "Moderately Positive Kurtosis"

    return "Positive Kurtosis"

In [None]:
# Skewness and kurtosis of every training column from position 3 onwards
columns = list(train_data.columns[3:])
skew_values = [skew(train_data.loc[:, name]) for name in columns]
kurtosis_values = [train_data.loc[:, name].kurtosis() for name in columns]

data = dict(columns=columns, skew=skew_values, kurtosis=kurtosis_values)

# Attach a descriptive label next to each numeric statistic
shape_train_data = pd.DataFrame(data).assign(
    **{
        "result skew": lambda frame: frame["skew"].map(skewness_type),
        "result kurtosis": lambda frame: frame["kurtosis"].map(kurtosis_type),
    }
)
shape_train_data

In [None]:
# One 50-bin histogram per column, arranged on a 10 x 3 grid
train_data.iloc[:, 3:].plot.hist(
    bins=50,
    figsize=(20, 20),
    subplots=True,
    layout=(10, 3),
)
plt.tight_layout()
plt.show()

In [None]:
# Min-max scaling of every column from position 3 onwards
scaler = MinMaxScaler()
scaled_data_minmax = scaler.fit_transform(train_data.iloc[:, 3:])

# The scaler returns a bare array; restore both the column names and the row
# labels of train_data so the scaled frame stays aligned with its source rows
scaled_data_minmax = pd.DataFrame(
    scaled_data_minmax,
    columns=train_data.columns[3:],
    index=train_data.index,
)
scaled_data_minmax.plot(kind="hist", bins=50, figsize=(20, 20), subplots=True, layout=(10, 3))
plt.tight_layout()
plt.show()

In [None]:
# Skewness and kurtosis of every min-max scaled column
columns = list(scaled_data_minmax.columns)
skew_values = [skew(scaled_data_minmax.loc[:, name]) for name in columns]
kurtosis_values = [scaled_data_minmax.loc[:, name].kurtosis() for name in columns]

data = dict(columns=columns, skew=skew_values, kurtosis=kurtosis_values)

# Attach a descriptive label next to each numeric statistic
shape_train_data = pd.DataFrame(data).assign(
    **{
        "result skew": lambda frame: frame["skew"].map(skewness_type),
        "result kurtosis": lambda frame: frame["kurtosis"].map(kurtosis_type),
    }
)
shape_train_data

In [None]:
# Standardisation of every column from position 3 onwards
scaler = StandardScaler()
scaled_data_standard = scaler.fit_transform(train_data.iloc[:, 3:])

# The scaler returns a bare array; restore both the column names and the row
# labels of train_data so the scaled frame stays aligned with its source rows
scaled_data_standard = pd.DataFrame(
    scaled_data_standard,
    columns=train_data.columns[3:],
    index=train_data.index,
)
# The x axis is limited to (-5, 5) on every histogram
scaled_data_standard.plot(kind="hist", bins=50, figsize=(20, 20), subplots=True, layout=(10, 3), xlim=(-5, 5))
plt.tight_layout()
plt.show()

In [None]:
# Yeo-Johnson power transform of every column from position 3 onwards
power_transformer = PowerTransformer(method="yeo-johnson")
transformed_data_power = power_transformer.fit_transform(train_data.iloc[:, 3:])

# The transformer returns a bare array; restore both the column names and the
# row labels of train_data so the result stays aligned with its source rows
transformed_data_power = pd.DataFrame(
    transformed_data_power,
    columns=train_data.columns[3:],
    index=train_data.index,
)
transformed_data_power.plot(kind="hist", bins=50, figsize=(20, 20), subplots=True, layout=(10, 3))
plt.tight_layout()
plt.show()

In [None]:
# Skewness and kurtosis of every power-transformed column
columns = list(transformed_data_power.columns)
skew_values = [skew(transformed_data_power.loc[:, name]) for name in columns]
kurtosis_values = [transformed_data_power.loc[:, name].kurtosis() for name in columns]

data = dict(columns=columns, skew=skew_values, kurtosis=kurtosis_values)

# Attach a descriptive label next to each numeric statistic
shape_train_data = pd.DataFrame(data).assign(
    **{
        "result skew": lambda frame: frame["skew"].map(skewness_type),
        "result kurtosis": lambda frame: frame["kurtosis"].map(kurtosis_type),
    }
)
shape_train_data

In [None]:
# Quantile transform to a normal output distribution (seeded for repeatability)
quantile_transformer = QuantileTransformer(output_distribution="normal", random_state=42)
transformed_data = quantile_transformer.fit_transform(train_data.iloc[:, 3:])

# The transformer returns a bare array; restore both the column names and the
# row labels of train_data so the result stays aligned with its source rows
transformed_data = pd.DataFrame(
    transformed_data,
    columns=train_data.columns[3:],
    index=train_data.index,
)
transformed_data.plot(kind="hist", bins=50, figsize=(20, 20), subplots=True, layout=(10, 3))
plt.tight_layout()
plt.show()

In [None]:
# Skewness and kurtosis of every quantile-transformed column
columns = list(transformed_data.columns)
skew_values = [skew(transformed_data.loc[:, name]) for name in columns]
kurtosis_values = [transformed_data.loc[:, name].kurtosis() for name in columns]

data = dict(columns=columns, skew=skew_values, kurtosis=kurtosis_values)

# Attach a descriptive label next to each numeric statistic
shape_train_data = pd.DataFrame(data).assign(
    **{
        "result skew": lambda frame: frame["skew"].map(skewness_type),
        "result kurtosis": lambda frame: frame["kurtosis"].map(kurtosis_type),
    }
)
shape_train_data

In [None]:
# log(1 + x) applied column by column to the min-max scaled data
log_transformed_data = scaled_data_minmax.apply(np.log1p)

log_transformed_data.plot.hist(
    bins=50,
    figsize=(20, 20),
    subplots=True,
    layout=(10, 3),
)
plt.tight_layout()
plt.show()

## D

### **Zadanie:** Zdôvodnite Vaše voľby/rozhodnutie pre realizáciu (t.j. zdokumentovanie)


# 2.2


## A

### **Zadanie:** Zistite, ktoré atribúty (features) vo vašich dátach pre ML sú informatívne k predikovanej premennej (minimálne 3 techniky s porovnaním medzi sebou).


## B

### **Zadanie:** Zoraďte zistené atribúty v poradí podľa dôležitosti.


## C

### **Zadanie:** Zdôvodnite Vaše voľby/rozhodnutie pre realizáciu (t.j. zdokumentovanie)


# 2.3


## A

### **Zadanie:** Upravte váš kód realizujúci predspracovanie trénovacej množiny tak, aby ho bolo možné bez ďalších úprav znovu použiť na predspracovanie testovacej množiny v kontexte strojového učenia.


## B

### **Zadanie:** Využite možnosti sklearn.pipeline
