# Start


Importing the necessary libraries


In [1]:
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from statsmodels.stats.power import TTestIndPower

Loading dataset


In [2]:
# Directory that holds the tab-separated tables, and the table names to load.
file_path: str = "../dataset"
files: tuple[str, ...] = ("connections", "devices", "processes", "profiles")

# One DataFrame per table, keyed by the table name.
dataset: dict[str, pd.DataFrame] = {
    file: pd.read_csv(f"{file_path}/{file}.csv", sep="\t") for file in files
}

# 1.1 - Basic description of data along with their characteristics


## A.)


#### Connections description


In [None]:
# Print column names, dtypes and non-null counts of the connections table.
dataset["connections"].info()

-   Total Entries: 15,108
-   Total Columns: 13
-   Column Types:
-   11 columns of type float64
-   1 column of type int64
-   1 column of type object
-   There are no missing values in this data


In [None]:
# Display the raw "ts" column to see what the values look like.
dataset["connections"]["ts"]

-   The object column "ts" is date and time


In [5]:
# Parse the "ts" values with pandas and store the datetime result back in the same column.
dataset["connections"]["ts"] = pd.to_datetime(dataset["connections"]["ts"])

-   Cast the "ts" column to datetime


In [None]:
## describe() output extended with an explicitly labelled "median" row
connection_summary = dataset["connections"].describe()
median = dataset["connections"].select_dtypes(include=["float64", "int64"]).median()
connection_summary.loc["median"] = median

## imei is left out: statistics over it have no meaning
connection_summary = connection_summary.drop(columns=["imei"])
connection_summary

-   From these tables above we can see imei is a long integer and looks like an ID, if we look at processes table, we can also see same values indicating this could be an Id of device.
-   Another assumption we can make is that columns specifying a connection type (columns starting with c. such as c.android.youtube) have values ranging from 0 to 100, this could indicate that it is a percentage amount of time that the connection was established.


-   The first few rows might indicate that the data was sampled at a 1 minute interval.
-   Let's look at it closer.


In [None]:
# "ts" values of all connections in ascending order (each value keeps its original row label).
times = dataset["connections"].sort_values(by="ts")["ts"]
times

-   Now we see it looks like samples are in a 1 minute interval.
-   Let's go further.


In [None]:
# Elapsed seconds between each timestamp and the one before it (the first row has no
# predecessor and is dropped).
# total_seconds() is used rather than Timedelta.seconds: .seconds is only the seconds
# *component* of the difference, so a gap of exactly one day would be counted as a
# repeated timestamp and a gap of one day plus one minute as a regular one-minute step.
gap_seconds = times.diff().dropna().dt.total_seconds()

# Consecutive records that share the same timestamp.
same_times: int = int((gap_seconds == 0).sum())
# Consecutive records whose gap is neither zero nor exactly one minute.
non_minute_differences: int = int(((gap_seconds != 0) & (gap_seconds != 60)).sum())

print(f"Non minute differences: {non_minute_differences}")
print(f"Same times: {same_times}")

-   From this we can see that there is data for every minute, and sometimes more than one record for the same time.
-   There are either some duplicates, or data for different devices at the same time.


In [None]:
# (imei, ts) pairs ordered by device and then by time.
# A single two-key sort yields the same table as grouping by imei and sorting every group
# inside a Python-level apply(), without the per-group overhead.
# NOTE(review): unlike groupby, sort_values keeps rows whose imei is missing (placed last);
# the notebook reports no missing values in connections, so the result is unchanged here.
times = (
    dataset["connections"][["imei", "ts"]]
    .sort_values(by=["imei", "ts"], ascending=True)
    .reset_index(drop=True)
)
times

-   If we assume that columns starting with "c." are representing percentage amount of time being active during a time window, we need to group them by device serial number (imei) and then look at the time difference.


In [None]:
# Number of rows for each distinct "mwra" value in connections.
dataset["connections"]["mwra"].value_counts()

-   mwra is (Malware-related-activity)
-   In data there are only values 1.0 and 0.0 indicating if there was a malware activity in specific time frame.


#### Devices description


In [None]:
# Print column names, dtypes and non-null counts of the devices table.
dataset["devices"].info()

In [None]:
## describe() output extended with an explicitly labelled "median" row
devices_summary = dataset["devices"].describe()
median = dataset["devices"].select_dtypes(include=["float64", "int64"]).median()
devices_summary.loc["median"] = median

## imei is left out: statistics over it have no meaning
devices_summary = devices_summary.drop(columns=["imei"])
devices_summary

In [None]:
# Preview the first rows of the devices table.
dataset["devices"].head()

-   "store_name" object is a string
-   "code" is string, holding code for state
-   "location" is a string, containing continent and city


#### Processes description


In [None]:
# Print column names, dtypes and non-null counts of the processes table.
dataset["processes"].info()

In [15]:
# Parse the "ts" values with pandas and store the datetime result back in the same column.
dataset["processes"]["ts"] = pd.to_datetime(dataset["processes"]["ts"])

In [None]:
## describe() output extended with an explicitly labelled "median" row
processes_summary = dataset["processes"].describe()
median = dataset["processes"].select_dtypes(include=["float64", "int64"]).median()
processes_summary.loc["median"] = median

## imei is left out: statistics over it have no meaning
processes_summary = processes_summary.drop(columns=["imei"])
processes_summary

In [None]:
# Preview the first rows of the processes table.
dataset["processes"].head()

#### Profiles description


In [None]:
# Print column names, dtypes and non-null counts of the profiles table.
dataset["profiles"].info()

In [19]:
# Parse the "birthdate" values with pandas and store the datetime result back in the same column.
dataset["profiles"]["birthdate"] = pd.to_datetime(dataset["profiles"]["birthdate"])

In [None]:
## describe() output extended with an explicitly labelled "median" row
profiles_summary = dataset["profiles"].describe()
median = dataset["profiles"].select_dtypes(include=["float64", "int64"]).median()
profiles_summary.loc["median"] = median

## imei is left out: statistics over it have no meaning
profiles_summary = profiles_summary.drop(columns=["imei"])
profiles_summary

In [None]:
# Preview the first rows of the profiles table.
dataset["profiles"].head()

In [None]:
# Null count per column, for every table.
null_values = {file: data.isnull().sum() for file, data in dataset.items()}

# Only tables that contain at least one null are printed.
for file, nulls in null_values.items():
    if nulls.sum() != 0:
        print(f"Null values in {file} dataset:")
        print(nulls)
        print("\n")

## B.)


### MWRA


-   First we look at the most important column "mwra" and look at it more in depth.


In [None]:
# Relative frequency of each "mwra" value in connections, expressed in percent.
dataset["connections"]["mwra"].value_counts(normalize=True) * 100

-   In "connections" we can see that positive mwra is ~62%, so the classes are imbalanced towards positive cases; a model trained on this data might therefore misclassify some connections. The closer the split is to 50/50, the better.


In [None]:
# Relative frequency of each "mwra" value in processes, expressed in percent.
dataset["processes"]["mwra"].value_counts(normalize=True) * 100

-   "mwra" is the same for "processes" as it is for "connections"


### Connections


#### Defining the variables


In [25]:
# Connection columns under analysis; the three lists below are index-aligned with this one.
all_str_connections: list[str] = [
    "c.android.chrome",
    "c.dogalize",
    "c.android.gm",
    "c.android.youtube",
    "c.katana",
    "c.raider",
    "c.android.vending",
    "c.UCMobile.x86",
    "c.updateassist",
    "c.UCMobile.intl",
]

# Column data, its mean and its standard deviation for every listed connection.
all_data_connections: list[pd.Series] = [dataset["connections"][name] for name in all_str_connections]
all_means_connections: list[float] = [series.mean() for series in all_data_connections]
all_std_connections: list[float] = [series.std() for series in all_data_connections]

# Per-column names bound to the very same objects held in the lists.
(
    chrome_data,
    dogalize_data,
    gm_data,
    youtube_data,
    katana_data,
    raider_data,
    vending_data,
    x86_data,
    updateassist_data,
    intl_data,
) = all_data_connections

(
    chrome_mean,
    dogalize_mean,
    gm_mean,
    youtube_mean,
    katana_mean,
    raider_mean,
    vending_mean,
    x86_mean,
    updateassist_mean,
    intl_mean,
) = all_means_connections

(
    chrome_std,
    dogalize_std,
    gm_std,
    youtube_std,
    katana_std,
    raider_std,
    vending_std,
    x86_std,
    updateassist_std,
    intl_std,
) = all_std_connections

#### Measure of dispersion


In [None]:
# describe() of every column from the fourth onwards, plus two derived spread rows.
summary = dataset["connections"].iloc[:, 3:].describe()
spread = pd.DataFrame(
    {
        "full_range": summary.loc["max"] - summary.loc["min"],
        "interquartile_range": summary.loc["75%"] - summary.loc["25%"],
    }
).T
df = pd.concat([summary, spread])
df

#### Measure of center


In [None]:
apps = all_str_connections
means = all_means_connections
medians = [series.median() for series in all_data_connections]

# For each column: how often its most frequent value occurs, and which values reach that count.
max_values = []
most_occurring_values = []

for series in all_data_connections:
    value_counts = series.value_counts()
    max_val = value_counts.max()
    max_values.append(max_val)
    # at most the first three of the tied values are kept
    most_occurring_values.append(value_counts[value_counts == max_val].index.tolist()[:3])

data = {
    "connection": apps,
    "mean": means,
    "median": medians,
    "mode_count": max_values,
    "mode_values": most_occurring_values,
}

df = pd.DataFrame(data)
df

#### Measure of shape


In [28]:
def skewness_type(skew_value: float) -> str:
    """Classify a skewness coefficient into a descriptive label.

    Args:
        skew_value: Skewness coefficient to classify.

    Returns:
        "Highly Negative Skew" for values <= -1, "Moderately Negative Skew" for
        (-1, -0.5], "Approximately Symmetric" for (-0.5, 0.5], "Moderately Positive
        Skew" for (0.5, 1], "Highly Positive Skew" above 1, and "Undefined" for NaN.
    """
    # NaN fails every comparison below and would otherwise be reported as highly positive.
    if np.isnan(skew_value):
        return "Undefined"

    if skew_value <= -1:
        return "Highly Negative Skew"

    elif skew_value <= -0.5:
        return "Moderately Negative Skew"

    elif skew_value <= 0.5:
        return "Approximately Symmetric"

    elif skew_value <= 1:
        return "Moderately Positive Skew"

    return "Highly Positive Skew"

In [29]:
def kurtosis_type(kurtosis_value: float) -> str:
    """Classify a kurtosis coefficient into a descriptive label.

    Args:
        kurtosis_value: Kurtosis coefficient to classify.

    Returns:
        "Negative Kurtosis" below -1, "Moderately Negative Kurtosis" for [-1, -0.5),
        "Approximately Normal Kurtosis" for [-0.5, 0.5), "Moderately Positive Kurtosis"
        for [0.5, 1), "Positive Kurtosis" from 1 upwards, and "Undefined" for NaN.
    """
    # NaN fails every comparison below and would otherwise be reported as positive kurtosis.
    if np.isnan(kurtosis_value):
        return "Undefined"

    if kurtosis_value < -1:
        return "Negative Kurtosis"

    elif kurtosis_value < -0.5:
        return "Moderately Negative Kurtosis"

    elif kurtosis_value < 0.5:
        return "Approximately Normal Kurtosis"

    elif kurtosis_value < 1:
        return "Moderately Positive Kurtosis"

    return "Positive Kurtosis"

In [None]:
# Skewness and kurtosis of every connection column, with a textual verdict for each.
data = {
    "connection": all_str_connections,
    "skew": [stats.skew(series) for series in all_data_connections],
    "kurtosis": [stats.kurtosis(series) for series in all_data_connections],
}

shape_df = pd.DataFrame(data)
shape_df = shape_df.assign(
    **{
        "result skew": shape_df["skew"].map(skewness_type),
        "result kurtosis": shape_df["kurtosis"].map(kurtosis_type),
    }
).set_index("connection")
shape_df

#### Histograms with KDE


In [None]:
_, axes = plt.subplots(5, 2, figsize=(16, 26))

## One panel per connection; axes.flat walks the 5x2 grid row by row.
panels = zip(axes.flat, all_str_connections, all_data_connections, all_means_connections)
for ax, name, series, mean in panels:
    sns.histplot(series, bins=30, kde=True, ax=ax)
    ax.axvline(mean, color="r", linestyle="--", label=f"Mean: {mean:.2f}")
    ## Legend-only entries. Empty line-less artists are used instead of axvline(),
    ## because axvline() defaults to x=0 and can widen the x-axis to include 0
    ## even when the line itself is invisible.
    ax.plot([], [], linestyle="", label=f"Skewness: {shape_df.loc[name, 'skew']:.2f}")
    ax.plot([], [], linestyle="", label=f"Kurtosis: {shape_df.loc[name, 'kurtosis']:.2f}")
    ax.set_title(f"Distribution of {name}")
    ax.legend()
    ax.grid(True, alpha=0.3)

## Adjust the layout
plt.tight_layout(w_pad=3, h_pad=3)
plt.show()

In [None]:
_, axes = plt.subplots(5, 2, figsize=(16, 26))

## Each panel: density histogram, KDE of the data, and a reference model curve.
panels = zip(
    axes.flat, all_str_connections, all_data_connections, all_means_connections, all_std_connections
)
for i, (ax, name, series, mean, std) in enumerate(panels):
    sns.histplot(
        data=series, bins=30, stat="density", alpha=0.3, color="gray", label="Histogram", ax=ax
    )
    sns.kdeplot(data=series, color="blue", label="Actual Distribution", linewidth=2, ax=ax)

    low, high = series.min(), series.max()
    if i <= 5:  ## We know that first 6 connections are not uniform
        x = np.linspace(low, high, 100)
        model_pdf = stats.norm.pdf(x, mean, std)
        model_label = "Gaussian Model"
    else:
        x = np.linspace(low, high, series.size)
        model_pdf = stats.uniform(loc=low, scale=high - low).pdf(x)
        model_label = "Uniform Model"
    ax.plot(x, model_pdf, color="red", linestyle="--", label=model_label, linewidth=2)

    ax.set_title(f"Distribution of {name}")
    ax.legend()
    ax.grid(True, alpha=0.3)


## Adjust the layout
plt.tight_layout(w_pad=3, h_pad=3)
plt.show()

#### Boxplots


In [None]:
_, axes = plt.subplots(5, 2, figsize=(16, 26))

## One boxplot per connection; axes.flat walks the grid row by row.
for ax, name, series in zip(axes.flat, all_str_connections, all_data_connections):
    sns.boxplot(series, ax=ax)
    ax.set_title(f"Distribution of {name}")
    ax.grid(True, alpha=0.3)

## Adjust the layout
plt.tight_layout(w_pad=3, h_pad=3)
plt.show()

#### Q-Q plots


In [None]:
_, axes = plt.subplots(5, 2, figsize=(16, 26))

## One Q-Q plot per connection; axes.flat walks the grid row by row.
for ax, name, series in zip(axes.flat, all_str_connections, all_data_connections):
    sm.qqplot(series, fit=True, line="45", ax=ax)
    ax.set_title(f"Distribution of {name}")
    ax.grid(True, alpha=0.3)

## Adjust the layout
plt.tight_layout(w_pad=3, h_pad=3)
plt.show()

### Processes


#### Defining the variables


In [35]:
# Process columns under analysis; the three lists below are index-aligned with this one.
all_str_processes: list[str] = [
    "p.android.chrome",
    "p.dogalize",
    "p.katana",
    "p.android.settings",
    "p.system",
    "p.simulator",
]

# Column data, its mean and its standard deviation for every listed process.
all_data_processes: list[pd.Series] = [dataset["processes"][name] for name in all_str_processes]
all_means_processes: list[float] = [series.mean() for series in all_data_processes]
all_std_processes: list[float] = [series.std() for series in all_data_processes]

# Per-column names bound to the very same objects held in the lists.
(
    chrome_data,
    dogalize_data,
    katana_data,
    settings_data,
    system_data,
    simulator_data,
) = all_data_processes

(
    chrome_mean,
    dogalize_mean,
    katana_mean,
    settings_mean,
    system_mean,
    simulator_mean,
) = all_means_processes

(
    chrome_std,
    dogalize_std,
    katana_std,
    settings_std,
    system_std,
    simulator_std,
) = all_std_processes

#### Measure of dispersion


In [None]:
# describe() of the analysed process columns, plus two derived spread rows.
summary = dataset["processes"][all_str_processes].describe()
spread = pd.DataFrame(
    {
        "full_range": summary.loc["max"] - summary.loc["min"],
        "interquartile_range": summary.loc["75%"] - summary.loc["25%"],
    }
).T
df = pd.concat([summary, spread])
df

#### Measure of center


In [None]:
apps = all_str_processes
means = all_means_processes
medians = [series.median() for series in all_data_processes]

# For each column: how often its most frequent value occurs, and which values reach that count.
max_values = []
most_occurring_values = []

for series in all_data_processes:
    value_counts = series.value_counts()
    max_val = value_counts.max()
    max_values.append(max_val)
    # at most the first three of the tied values are kept
    most_occurring_values.append(value_counts[value_counts == max_val].index.tolist()[:3])

data = {
    "process": apps,
    "mean": means,
    "median": medians,
    "mode_count": max_values,
    "mode_values": most_occurring_values,
}

df = pd.DataFrame(data)
df

#### Measure of shape


In [None]:
# Skewness and kurtosis of every process column, with a textual verdict for each.
data = {
    "process": all_str_processes,
    "skew": [stats.skew(series) for series in all_data_processes],
    "kurtosis": [stats.kurtosis(series) for series in all_data_processes],
}

shape_df = pd.DataFrame(data)
shape_df = shape_df.assign(
    **{
        "result skew": shape_df["skew"].map(skewness_type),
        "result kurtosis": shape_df["kurtosis"].map(kurtosis_type),
    }
).set_index("process")
shape_df

#### Histograms with KDE


In [None]:
fig, axes = plt.subplots(3, 2, figsize=(16, 16))

# One panel per process; axes.flat walks the 3x2 grid row by row.
panels = zip(axes.flat, all_str_processes, all_data_processes, all_means_processes)
for ax, name, series, mean in panels:
    sns.histplot(series, bins=30, kde=True, ax=ax)
    ax.axvline(mean, color="r", linestyle="--", label=f"Mean: {mean:.2f}")
    # Legend-only entries. Empty line-less artists are used instead of axvline(),
    # because axvline() defaults to x=0 and can widen the x-axis to include 0
    # even when the line itself is invisible.
    ax.plot([], [], linestyle="", label=f"Skewness: {shape_df.loc[name, 'skew']:.2f}")
    ax.plot([], [], linestyle="", label=f"Kurtosis: {shape_df.loc[name, 'kurtosis']:.2f}")
    ax.set_title(f"Distribution of {name}")
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout(w_pad=3, h_pad=3)
plt.show()

In [None]:
_, axes = plt.subplots(3, 2, figsize=(16, 16))

# Columns that get a normal reference curve; every other column gets a uniform one.
gaussian_models = [chrome_data, settings_data, system_data]

# Each panel: density histogram, KDE of the data, and a reference model curve.
panels = zip(axes.flat, all_str_processes, all_data_processes, all_means_processes, all_std_processes)
for ax, name, series, mean, std in panels:
    sns.histplot(
        data=series, bins=30, stat="density", alpha=0.3, color="gray", label="Histogram", ax=ax
    )
    sns.kdeplot(data=series, color="blue", label="Actual Distribution", linewidth=2, ax=ax)

    low, high = series.min(), series.max()
    # identity check: the list holds the very Series objects stored in all_data_processes
    if any(series is model for model in gaussian_models):
        x = np.linspace(low, high, 100)
        model_pdf = stats.norm.pdf(x, mean, std)
        model_label = "Gaussian Model"
    else:
        x = np.linspace(low, high, series.size)
        model_pdf = stats.uniform(loc=low, scale=high - low).pdf(x)
        model_label = "Uniform Model"
    ax.plot(x, model_pdf, color="red", linestyle="--", label=model_label, linewidth=2)

    ax.set_title(f"Distribution of {name}")
    ax.legend()
    ax.grid(True, alpha=0.3)

# Same spacing and explicit show() as the other figure cells of this notebook.
plt.tight_layout(w_pad=3, h_pad=3)
plt.show()

#### Boxplots


In [None]:
_, axes = plt.subplots(3, 2, figsize=(16, 16))

# One boxplot per process; axes.flat walks the grid row by row.
for ax, name, series in zip(axes.flat, all_str_processes, all_data_processes):
    sns.boxplot(series, ax=ax)
    ax.set_title(f"Distribution of {name}")
    ax.grid(True, alpha=0.3)


plt.tight_layout(w_pad=3, h_pad=3)
plt.show()

#### Q-Q plots


In [None]:
_, axes = plt.subplots(3, 2, figsize=(16, 16))

# One Q-Q plot per process; axes.flat walks the grid row by row.
for ax, name, series in zip(axes.flat, all_str_processes, all_data_processes):
    sm.qqplot(series, fit=True, line="45", ax=ax)
    ax.set_title(f"Distribution of {name}")
    ax.grid(True, alpha=0.3)


plt.tight_layout(w_pad=3, h_pad=3)
plt.show()

## C.)


In [None]:
# Pearson correlation of every connections column from the third onwards.
matica = dataset["connections"].iloc[:, 2:].corr()

# Hide the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones(matica.shape, dtype=bool))

plt.figure(figsize=(14, 8))
sns.heatmap(matica, mask=mask, annot=True, cmap="coolwarm", vmin=-1, vmax=1)
plt.xticks(rotation=45, ha="right")
plt.show()

In [None]:
# Spearman rank correlation of every connections column from the third onwards.
matica = dataset["connections"].iloc[:, 2:].corr(method="spearman")

# Hide the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones(matica.shape, dtype=bool))

plt.figure(figsize=(14, 8))
sns.heatmap(matica, mask=mask, annot=True, cmap="coolwarm", vmin=-1, vmax=1)
plt.show()

In [None]:
# Pearson correlation of every processes column from the third onwards, rounded to 2 decimals.
matica = dataset["processes"].iloc[:, 2:].corr().round(2)

# Hide the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones(matica.shape, dtype=bool))

plt.figure(figsize=(14, 8))
sns.heatmap(matica, mask=mask, annot=True, cmap="coolwarm", vmin=-1, vmax=1)
plt.xticks(rotation=45, ha="right")
plt.show()

In [None]:
# Pairwise plots of the connections columns from the third onwards, coloured by "mwra".
pair_data = dataset["connections"].iloc[:, 2:]
mwra_palette = {0.0: "blue", 1.0: "red"}
sns.pairplot(pair_data, hue="mwra", diag_kind="kde", palette=mwra_palette)
plt.show()

In [None]:
processes_columns = ["p.android.chrome", "p.dogalize", "p.katana", "p.android.gm", "p.android.vending"]
connections_columns = ["c.android.chrome", "c.dogalize", "c.katana", "c.android.gm", "c.android.vending"]

# NOTE(review): the column-wise concat pairs rows purely by index label; this presumes that
# row i of both tables describes the same observation - TODO confirm.
process_part = dataset["processes"][processes_columns]
connection_part = dataset["connections"][connections_columns]
combined_df = pd.concat([process_part, connection_part], axis=1)

correlation_matrix = combined_df.corr(method="pearson")

# Rows: processes, columns: connections.
filtered_corr = correlation_matrix.loc[processes_columns, connections_columns]

# Mask everything except the main diagonal (same app as process and as connection).
mask = ~np.eye(*filtered_corr.shape, dtype=bool)

sns.heatmap(
    filtered_corr,
    mask=mask,
    annot=True,
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    xticklabels=connections_columns,
    yticklabels=processes_columns,
)
plt.xlabel("Connections")
plt.ylabel("Processes")
plt.title("Korelácia medzi Processes a Connections (len stredná diagonála)")
plt.xticks(rotation=45, ha="right")
plt.show()

In [None]:
# Scatter of p.android.settings against the mwra flag with a fitted regression line in red.
processes = dataset["processes"]
mwra, settings = processes["mwra"], processes["p.android.settings"]

sns.regplot(x=mwra, y=settings, line_kws={"color": "red"})

## D.)


In [None]:
connections_list = all_str_connections
mwra_list = ["mwra"]

# Both column sets come from the same table, so one selection replaces the concat.
combined_df = dataset["connections"][connections_list + mwra_list]

correlation_matrix = combined_df.corr(method="pearson")

# Keep only the correlation of each connection column with mwra.
filtered_corr = correlation_matrix.loc[connections_list, mwra_list]

plt.figure(figsize=(8, 6))
sns.heatmap(
    filtered_corr,
    annot=True,
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    xticklabels=mwra_list,
    yticklabels=connections_list,
)
# Labels match what is on each axis: the mwra column (x) and the connection columns (y).
plt.xlabel("Target")
plt.ylabel("Connections")
plt.show()

In [None]:
processes_list = dataset["processes"].columns[3:]
mwra_list = ["mwra"]

# Both column sets come from the same table, so one selection replaces the concat.
combined_df = dataset["processes"][[*processes_list, *mwra_list]]

correlation_matrix = combined_df.corr(method="pearson")

# Keep only the correlation of each process column with mwra.
filtered_corr = correlation_matrix.loc[processes_list, mwra_list]

plt.figure(figsize=(8, 10))
sns.heatmap(
    filtered_corr,
    annot=True,
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    xticklabels=mwra_list,
    yticklabels=processes_list,
)
# Labels match what is on each axis: the mwra column (x) and the process columns (y).
plt.xlabel("Target")
plt.ylabel("Processes")
plt.show()

In [None]:
# Print column names, dtypes and non-null counts of the devices table.
dataset["devices"].info()

In [None]:
# Mean mwra per continent, taken from the part of "location" before the first "/".
# The vectorised .str accessor passes missing locations through as NaN, whereas calling
# x.split() inside apply() raises on a non-string value.
# NOTE(review): both tables still contain duplicate rows at this point (they are dropped
# later in the notebook), which presumably over-weights some devices here - confirm.
df = dataset["devices"].copy()
df["continent"] = df["location"].str.split("/", n=1).str[0]
df = df.merge(dataset["connections"][["imei", "mwra"]], on="imei")
df = df.groupby("continent").agg({"mwra": "mean"})
df.plot(kind="barh", legend=False)

## E.)


Dokumentujte Vaše prvotné zamyslenie k riešeniu zadania projektu, napr. sú
niektoré atribúty medzi sebou závislé? od ktorých atribútov závisí predikovaná
premenná? či je potrebné kombinovať záznamy z viacerých súborov?


#### Correlations

Connections:

-   Correlation matrix for **mwra** in **connections** shows slight correlations of **mwra** with **c.dogalize** and **c.android.youtube**
-   Surprisingly there is a negative correlation with **c.katana**

Processes:

-   Correlation matrix for **mwra** in **processes** shows slight correlation of **mwra** with **p.android.settings**
-   There is also slight negative correlation with **p.system**

#### Combination of data

-   We can combine data from **connections** and **processes** by **imei** to get more data, especially important data that will be crucial in our model


# 1.2 - Identification of problems, integration and data cleaning


## A.)


In 1.1-A we already transformed the "ts" column to datetime. With this we can expect every instance to be of the same format; if there were any errors in the data, the function would throw an error.

But here is also a simple function to check if the data is in the correct format.


In [None]:
def check_correct_format(date: str) -> bool:
    """Check that *date* starts with a ``YYYY-MM-DD HH:MM:SS`` timestamp.

    Only the leading 19 characters are checked, so trailing content is accepted.
    Besides the separators, all date and time fields must be digits. Values that
    are too short or are not strings are reported as invalid instead of raising.

    Args:
        date: Raw timestamp text.

    Returns:
        True when the format matches; otherwise prints a message and returns False.
    """
    import re  # local import keeps the cell self-contained

    if isinstance(date, str) and re.match(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", date):
        return True
    print(f"Date {date} is not in correct format")
    return False


# The in-memory "ts" columns were already converted to datetime, so the raw CSV text is
# read again. The path is built from file_path, the same way the tables were first loaded.
for table in ("connections", "processes"):
    df = pd.read_csv(f"{file_path}/{table}.csv", sep="\t")
    bool_val = df["ts"].apply(check_correct_format).all()
    print(bool_val)

-   The format of date time in "connections" and "processes" is correct


In [None]:
# One line per table: total number of missing cells, or "no" when there are none.
for key, frame in dataset.items():
    missing_total = frame.isnull().sum().sum()
    has_missing_values = missing_total > 0
    print(f"DataFrame {key:<12} has {missing_total if has_missing_values else 'no':<4} missing values")

-   We see that **profiles** have some missing values


In [None]:
# Boolean flag per profiles column: does it contain at least one null?
column_has_null = dataset["profiles"].isnull().any()
missing_columns = dataset["profiles"].columns[column_has_null]
print("Columns with missing values:", missing_columns)

-   We can see the columns with missing values in **profiles** are: "address", "job", "residence", "birthdate"
-   These columns hold no important information for our analysis, so we can drop this table in later part


In [None]:
# One line per table: number of fully duplicated rows, or "no" when there are none.
for key, frame in dataset.items():
    duplicate_total = frame.duplicated().sum()
    has_duplicates = duplicate_total > 0
    print(f'DataFrame {key:<12} has {duplicate_total if has_duplicates else "no":<4} duplicates')

Duplicates in **connections**


In [None]:
# Show up to five duplicated connection rows; capping the sample size avoids the
# ValueError that sample(5) raises when fewer than five duplicates exist.
connection_duplicates = dataset["connections"][dataset["connections"].duplicated(keep="first")]
connection_duplicates.sample(min(5, len(connection_duplicates)), random_state=42)

Duplicates in **devices**


In [None]:
# Rows of devices that repeat an earlier row exactly (the first occurrence is not listed).
dataset["devices"][dataset["devices"].duplicated(keep="first")]

Duplicates in **processes**


In [None]:
# Show up to five duplicated process rows; capping the sample size avoids the
# ValueError that sample(5) raises when fewer than five duplicates exist.
process_duplicates = dataset["processes"][dataset["processes"].duplicated(keep="first")]
process_duplicates.sample(min(5, len(process_duplicates)), random_state=42)

Dropping duplicates in all tables


In [60]:
# Remove fully duplicated rows from every table, modifying the stored frames in place.
for frame in dataset.values():
    frame.drop_duplicates(inplace=True)

## B.)


In [None]:
# Row counts of profiles before and after discarding every row that has any NaN.
before_drop = len(dataset["profiles"])
profiles_copy = dataset["profiles"].dropna()
after_drop = len(profiles_copy)

print(f"DataFrame profiles had {before_drop} rows before dropping NaN values")
print(f"DataFrame profiles has {after_drop} rows after dropping NaN values")

-   We see that after the drop we have only 137 of the original 2571 rows, which results in almost no data to work with


In [None]:
profiles_copy = dataset["profiles"].copy()

before_manipulating = profiles_copy.isnull().sum().sum()
number_of_rows = len(profiles_copy)
number_of_values = profiles_copy.count().sum()
missing_values = before_manipulating

# Categorical gaps are filled with the most frequent value (mode). The result is assigned
# back instead of calling fillna(inplace=True) on a column selection: that chained form
# works on a temporary under pandas Copy-on-Write and would leave the frame unchanged.
for column in ("job", "residence"):
    profiles_copy[column] = profiles_copy[column].fillna(profiles_copy[column].mode()[0])

# Birth dates: interpolate the year linearly along the rows. limit_direction="both" also
# fills gaps at the very start, which would otherwise stay NaN and break astype(int).
birth_year = profiles_copy["birthdate"].dt.year.interpolate(method="linear", limit_direction="both")
imputed_birthdate = pd.to_datetime(birth_year.round().astype(int), format="%Y", errors="coerce")
# Only missing dates are replaced; observed birth dates keep their month and day.
profiles_copy["birthdate"] = profiles_copy["birthdate"].fillna(imputed_birthdate)


le_residence = LabelEncoder()
profiles_copy["residence_encoded"] = le_residence.fit_transform(
    profiles_copy["residence"].astype(str)
)  # Encoding

le_address = LabelEncoder()
profiles_copy["address_encoded"] = le_address.fit_transform(
    profiles_copy["address"].fillna("NaN").astype(str)
)  # missing addresses get the placeholder label "NaN" so they can be encoded

# Independent float copy: NaN cannot be stored in an integer column, and writing into a
# slice of profiles_copy would trigger chained-assignment warnings.
subset = profiles_copy[["address_encoded", "residence_encoded"]].astype(float)
subset.loc[profiles_copy["address"].isnull(), "address_encoded"] = np.nan

imputer = KNNImputer(n_neighbors=2)
subset_imputed = imputer.fit_transform(subset)  # using kNN

# NOTE(review): the averaged neighbour code is rounded back to a class; it may land on the
# "NaN" placeholder class, in which case the address becomes the text "NaN" - confirm.
profiles_copy["address_encoded"] = subset_imputed[:, 0]
profiles_copy["address"] = le_address.inverse_transform(profiles_copy["address_encoded"].round().astype(int))

profiles_copy.drop(columns=["address_encoded", "residence_encoded"], inplace=True)

after_manipulating = profiles_copy.isnull().sum().sum()
number_of_rows_new = len(profiles_copy)
number_of_values_new = profiles_copy.count().sum()
missing_values_new = after_manipulating

# The counters are numbers of missing cells, not rows, so the message says "values".
print(
    f"Before manipulating: {before_manipulating} missing values, after manipulating: {after_manipulating} missing values"
)
print(f"Number of rows before: {number_of_rows}, after: {number_of_rows_new}")
print(
    f"Number of values before: {number_of_values} + {missing_values} missing ({number_of_values + missing_values}), after: {number_of_values_new} + {missing_values_new} missing ({number_of_values_new + missing_values_new})"
)
profiles_copy

## C.)


In [None]:
# Horizontal boxplots of the connections columns from the fourth onwards; outliers drawn as red circles.
outlier_marker = {"marker": "o", "color": "r", "alpha": 0.5}
dataset["connections"].iloc[:, 3:].plot.box(vert=False, figsize=(14, 10), flierprops=outlier_marker)

In [None]:
# Horizontal boxplots of the processes columns from the fourth onwards; outliers drawn as red circles.
outlier_marker = {"marker": "o", "color": "r", "alpha": 0.5}
dataset["processes"].iloc[:, 3:].plot.box(vert=False, figsize=(14, 14), flierprops=outlier_marker)

#### Outlier detections using **Z-score**


Number of outliers in **Connections**


In [None]:
# Column-wise z-scores; a row counts as an outlier when any of its columns exceeds |z| = 3.
z_scores = dataset["connections"].iloc[:, 2:].apply(stats.zscore)
row_is_outlier = z_scores.abs().gt(3).any(axis=1)
outliers = z_scores.loc[row_is_outlier]
print(f"Number of connections with |z-score| > 3: {len(outliers)}")

Number of outliers in **Processes**


In [None]:
# Column-wise z-scores; a row counts as an outlier when any of its columns exceeds |z| = 3.
z_scores = dataset["processes"].iloc[:, 2:].apply(stats.zscore)
row_is_outlier = z_scores.abs().gt(3).any(axis=1)
outliers = z_scores.loc[row_is_outlier]
print(f"Number of processes with |z-score| > 3: {len(outliers)}")

Removing outliers from **Connections** and showing boxplot


In [None]:
connections_copy = dataset["connections"].copy()
print("Number of connections before removing outliers:\t", connections_copy.shape[0])

# Keep only the rows whose every column (third onwards) has |z-score| below 3.
abs_z = np.abs(stats.zscore(connections_copy.iloc[:, 2:]))
within_limits = (abs_z < 3).all(axis=1)
connections_copy = connections_copy[within_limits]
print("Number of connections after removing outliers:\t", connections_copy.shape[0])

In [None]:
# Same boxplot drawn twice: the full table, then the z-score filtered copy.
for frame, title in (
    (dataset["connections"], "Before removing outliers using Z-score"),
    (connections_copy, "After removing outliers using Z-score"),
):
    frame.iloc[:, 3:].plot(
        kind="box", vert=False, figsize=(10, 8), flierprops=dict(marker="o", color="r", alpha=0.5)
    )
    plt.title(title)
    plt.show()

#### Outlier detections using **IQR**


In [69]:
def identify_outliers(a):
    """Return the values of ``a`` that lie outside the 1.5 * IQR fences.

    The interquartile range is derived from the same ``Series.quantile`` calls
    that give the quartiles, so missing values are ignored consistently.
    (``scipy.stats.iqr`` propagates NaN by default, which would turn both
    fences into NaN and hide every outlier as soon as one value is missing.)

    Args:
        a: pandas Series of numeric values.

    Returns:
        The elements of ``a`` strictly below ``Q1 - 1.5 * IQR`` or strictly
        above ``Q3 + 1.5 * IQR``, keeping their original index labels.
    """
    q1 = a.quantile(0.25)
    q3 = a.quantile(0.75)
    iqr = q3 - q1

    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    return a[(a > upper) | (a < lower)]

In [None]:
# Filter a copy so the original connections table stays untouched.
connections_copy = dataset["connections"].copy()
print("Number of connections before removing outliers:\t", len(connections_copy))

# The index of `outlier` holds every row flagged in at least one of the
# inspected columns (position 2 onwards); those rows are removed.
outlier = connections_copy.iloc[:, 2:].apply(identify_outliers)
connections_copy = connections_copy.drop(index=outlier.index)
print("Number of connections after removing outliers:\t", len(connections_copy))

In [None]:
# Box plot of the unfiltered connections data, for comparison with the filtered copy below.
dataset["connections"].iloc[:, 3:].plot.box(
    vert=False,
    figsize=(10, 8),
    flierprops={"marker": "o", "color": "r", "alpha": 0.5},
)
plt.title("Before removing outliers using IQR")
plt.show()

# Same plot for the copy that went through the IQR filter.
connections_copy.iloc[:, 3:].plot.box(
    vert=False,
    figsize=(10, 8),
    flierprops={"marker": "o", "color": "r", "alpha": 0.5},
)
plt.title("After removing outliers using IQR")
plt.show()

In [72]:
# Disabled alternative: instead of dropping whole rows, mask individual outlier values with NaN.
# z_scores = dataset["connections"].iloc[:, 2:].apply(stats.zscore)

# # Build a mask for values whose Z-score is greater than 3 or less than -3
# outlier_mask = z_scores.abs() > 3

# # Apply the mask to set values to NaN only in the columns that contain outliers
# dataset["connections"].iloc[:, 2:] = dataset["connections"].iloc[:, 2:].mask(outlier_mask, np.nan)

# dataset["connections"]

# 1.3 Formulation and statistical verification of hypotheses about data


## A.)


### c.android.youtube


$H_0$: c.android.youtube has the same values for mwra = 0 and mwra = 1

$H_A$: c.android.youtube has different (higher) values for mwra = 0 and mwra = 1


In [73]:
# Start from a fresh copy so the filtering in the following cells does not modify dataset["connections"].
connections_copy = dataset["connections"].copy()

In [None]:
# Normal-activity sample (mwra == 0) of c.android.youtube: rows with missing
# values are dropped first, then the rows flagged by the IQR rule.
without_mwra = connections_copy.loc[connections_copy["mwra"] == 0, ["c.android.youtube", "mwra"]].dropna()
outlier = without_mwra.apply(identify_outliers)
# mwra was only needed for the selection; drop it and renumber the rows from 0.
without_mwra = without_mwra.drop(index=outlier.index).drop(columns="mwra").reset_index(drop=True)
without_mwra.head()

In [None]:
# Malware-related sample (mwra == 1) of c.android.youtube: rows with missing
# values are dropped first, then the rows flagged by the IQR rule.
with_mwra = connections_copy.loc[connections_copy["mwra"] == 1, ["c.android.youtube", "mwra"]].dropna()
outlier = with_mwra.apply(identify_outliers)
# mwra was only needed for the selection; drop it and renumber the rows from 0.
with_mwra = with_mwra.drop(index=outlier.index).drop(columns="mwra").reset_index(drop=True)
with_mwra.head()

In [None]:
# Anderson-Darling normality test for the malware-related c.android.youtube sample.
anderson_result = stats.anderson(with_mwra["c.android.youtube"], dist="norm")
print(anderson_result)

# `fit_result.success` only reports that the distribution parameters could be
# fitted; it is not the outcome of the normality test. Normality is not rejected
# at the 5 % level when the statistic stays below the matching critical value.
critical_value_5pct = anderson_result.critical_values[list(anderson_result.significance_level).index(5.0)]
print(f"\nDoes fit normal distribution: {bool(anderson_result.statistic < critical_value_5pct)}")

In [None]:
# Anderson-Darling normality test for the normal-activity c.android.youtube sample.
anderson_result = stats.anderson(without_mwra["c.android.youtube"], dist="norm")
print(anderson_result)

# `fit_result.success` only reports that the distribution parameters could be
# fitted; it is not the outcome of the normality test. Normality is not rejected
# at the 5 % level when the statistic stays below the matching critical value.
critical_value_5pct = anderson_result.critical_values[list(anderson_result.significance_level).index(5.0)]
print(f"\nDoes fit normal distribution: {bool(anderson_result.statistic < critical_value_5pct)}")

In [None]:
# Levene's test for equal variances between the two groups; the result is shown as the cell output.
stats.levene(with_mwra["c.android.youtube"], without_mwra["c.android.youtube"])

In [None]:
# Overlaid histograms of both groups. Note that `data_to_plot` ends up holding
# the object returned by the plot call, not the DataFrame that was plotted.
data_to_plot = pd.DataFrame(
    {
        "With MWRA": with_mwra["c.android.youtube"],
        "Without MWRA": without_mwra["c.android.youtube"],
    }
).plot.hist(bins=30, alpha=0.5, figsize=(10, 6))

In [None]:
# Welch's t-test (equal_var=False): compares the group means without assuming equal variances.
stat, p_value = stats.ttest_ind(
    with_mwra["c.android.youtube"], without_mwra["c.android.youtube"], equal_var=False
)

print(f"Welch’s T-test Statistic: {stat}")
print(f"p_value: {float(p_value):.10f}")

if p_value < 0.05:
    print(
        "There is a significant difference between the c.android.youtube with malware-related-activity and without."
    )
    # The direction is reported only once H0 is rejected; otherwise the sign of
    # the statistic is not evidence of a difference. The statistic is computed
    # as (with_mwra - without_mwra), so a negative value means the
    # malware-related group has the lower mean.
    if stat < 0:
        print("c.android.youtube has higher weight in normal activity")
    else:
        print("c.android.youtube has higher weight in malware-related activity")
else:
    print("No significant difference between the two groups.")

In [None]:
# Post-hoc power of the two-sample t-test for c.android.youtube.
power_analysis = TTestIndPower()

# Per-group mean and standard deviation of the tested column.
mean_with_mwra = with_mwra["c.android.youtube"].mean()
mean_without_mwra = without_mwra["c.android.youtube"].mean()
std_with_mwra = with_mwra["c.android.youtube"].std()
std_without_mwra = without_mwra["c.android.youtube"].std()

# Square root of the unweighted mean of the two variances (group sizes are not taken into account).
pooled_std = ((std_with_mwra**2 + std_without_mwra**2) / 2) ** 0.5

print("std of with mwra:", std_with_mwra)
print("std of without mwra:", std_without_mwra)
print("pooled std:", pooled_std)

# Standardized mean difference (with_mwra minus without_mwra), used as the effect size.
effect_size = (mean_with_mwra - mean_without_mwra) / pooled_std

alpha = 0.05
# NOTE(review): sample_size is not read anywhere in the cells visible here — confirm it is still needed.
sample_size = len(with_mwra) + len(without_mwra)

# nobs1 is the size of the with_mwra group; ratio expresses the without_mwra size relative to it.
power = power_analysis.power(
    effect_size=effect_size, nobs1=len(with_mwra), alpha=alpha, ratio=len(without_mwra) / len(with_mwra)
)

# 0.8 is the threshold this notebook uses for "sufficient" power.
if power > 0.8:
    print(f"Yes, the strength of the test is sufficient. Power: {power:.2f}")
else:
    print(f"No, the strength of the test is not sufficient. Power: {power:.2f}")

In [None]:
# nobs1 is left unset, so solve_power returns the required size of the first
# sample (with_mwra); the second sample is then expected to hold nobs1 * ratio
# observations.
required_sample_size = power_analysis.solve_power(
    effect_size=effect_size, alpha=alpha, power=0.8, ratio=len(without_mwra) / len(with_mwra)
)
print(f"required data size for the strength of 0.8 is: {required_sample_size:.0f}")
print(
    "actual size data of with_mwra: ",
    len(with_mwra),
    " actual size data of without_mwra: ",
    len(without_mwra),
)
# ratio is the observed group ratio, so without_mwra meets its own requirement
# (required_sample_size * ratio) exactly when with_mwra meets
# required_sample_size. Comparing len(without_mwra) against the with_mwra
# requirement gave the wrong verdict whenever the two groups differ in size.
if len(with_mwra) < required_sample_size:
    print("We need to collect more data")
else:
    print("We have enough data")

### p.android.settings


$H_0$: p.android.settings has the same values for mwra = 0 and mwra = 1

$H_A$: p.android.settings has different (higher) values for mwra = 0 and mwra = 1


In [None]:
# Split p.android.settings by mwra. Rows with missing values are dropped first,
# as in the c.android.youtube preparation, so NaNs cannot propagate into the
# IQR filter or the Anderson, Levene and t-test calls that follow.
with_mwra = dataset["processes"][(dataset["processes"]["mwra"] == 1)][["mwra", "p.android.settings"]].dropna()
without_mwra = dataset["processes"][(dataset["processes"]["mwra"] == 0)][["mwra", "p.android.settings"]].dropna()

print("Number of with_mwra before removing outliers:\t", with_mwra.shape[0])
print("Number of without_mwra before removing outliers:\t", without_mwra.shape[0])

# Remove every row flagged by the IQR rule, separately for each group.
outliers = with_mwra.apply(identify_outliers)
with_mwra = with_mwra.drop(outliers.index)

outliers = without_mwra.apply(identify_outliers)
without_mwra = without_mwra.drop(outliers.index)

print("Number of with_mwra after removing outliers:\t", with_mwra.shape[0])
print("Number of without_mwra after removing outliers:\t", without_mwra.shape[0])

In [None]:
# Anderson-Darling normality test for the malware-related p.android.settings sample.
anderson_result = stats.anderson(with_mwra["p.android.settings"], dist="norm")
print(anderson_result)

# `fit_result.success` only reports that the distribution parameters could be
# fitted; it is not the outcome of the normality test. Normality is not rejected
# at the 5 % level when the statistic stays below the matching critical value.
critical_value_5pct = anderson_result.critical_values[list(anderson_result.significance_level).index(5.0)]
print(f"\nDoes fit normal distribution: {bool(anderson_result.statistic < critical_value_5pct)}")

In [None]:
# Anderson-Darling normality test for the normal-activity p.android.settings sample.
anderson_result = stats.anderson(without_mwra["p.android.settings"], dist="norm")
print(anderson_result)

# `fit_result.success` only reports that the distribution parameters could be
# fitted; it is not the outcome of the normality test. Normality is not rejected
# at the 5 % level when the statistic stays below the matching critical value.
critical_value_5pct = anderson_result.critical_values[list(anderson_result.significance_level).index(5.0)]
print(f"\nDoes fit normal distribution: {bool(anderson_result.statistic < critical_value_5pct)}")

In [None]:
# Levene's test for equal variances between the two groups; the result is shown as the cell output.
stats.levene(with_mwra["p.android.settings"], without_mwra["p.android.settings"])

In [None]:
# Overlaid histograms of both groups. Note that `data_to_plot` ends up holding
# the object returned by the plot call, not the DataFrame that was plotted.
data_to_plot = pd.DataFrame(
    {
        "With MWRA": with_mwra["p.android.settings"],
        "Without MWRA": without_mwra["p.android.settings"],
    }
).plot.hist(bins=30, alpha=0.5, figsize=(10, 6))

In [None]:
# Welch's t-test (equal_var=False): compares the group means without assuming equal variances.
stat, p_value = stats.ttest_ind(
    with_mwra["p.android.settings"], without_mwra["p.android.settings"], equal_var=False
)

print(f"Welch’s T-test Statistic: {stat}")
print(p_value)

if p_value < 0.05:
    print(
        "There is a significant difference between the p.android.settings with malware-related-activity and without."
    )
    # The direction is reported only once H0 is rejected; otherwise the sign of
    # the statistic is not evidence of a difference. The statistic is computed
    # as (with_mwra - without_mwra), so a negative value means the
    # malware-related group has the lower mean.
    if stat < 0:
        print("p.android.settings has higher weight in normal activity")
    else:
        print("p.android.settings has higher weight in malware-related activity")
else:
    print("No significant difference between the two groups.")

TODO: check whether the `mwra` values in the connections and processes tables match over time


In [None]:
# Post-hoc power of the two-sample t-test for p.android.settings.
power_analysis = TTestIndPower()

# Per-group mean and standard deviation of the tested column.
mean_with_mwra = with_mwra["p.android.settings"].mean()
mean_without_mwra = without_mwra["p.android.settings"].mean()
std_with_mwra = with_mwra["p.android.settings"].std()
std_without_mwra = without_mwra["p.android.settings"].std()

# Square root of the unweighted mean of the two variances (group sizes are not taken into account).
pooled_std = ((std_with_mwra**2 + std_without_mwra**2) / 2) ** 0.5
# Standardized mean difference (with_mwra minus without_mwra), used as the effect size.
effect_size = (mean_with_mwra - mean_without_mwra) / pooled_std

alpha = 0.05
# NOTE(review): sample_size is not read anywhere in the cells visible here — confirm it is still needed.
sample_size = len(with_mwra) + len(without_mwra)

# nobs1 is the size of the with_mwra group; ratio expresses the without_mwra size relative to it.
power = power_analysis.power(
    effect_size=effect_size, nobs1=len(with_mwra), alpha=alpha, ratio=len(without_mwra) / len(with_mwra)
)

# 0.8 is the threshold this notebook uses for "sufficient" power.
if power > 0.8:
    print(f"Yes, the strength of the test is sufficient. Power: {power:.2f}")
else:
    print(f"No, the strength of the test is not sufficient. Power: {power:.2f}")

In [None]:
# nobs1 is left unset, so solve_power returns the required size of the first
# sample (with_mwra); the second sample is then expected to hold nobs1 * ratio
# observations.
required_sample_size = power_analysis.solve_power(
    effect_size=effect_size, alpha=alpha, power=0.8, ratio=len(without_mwra) / len(with_mwra)
)
print(f"required data size for the strength of 0.8 is: {required_sample_size:.0f}")
print(
    "actual size data of with_mwra: ",
    len(with_mwra),
    " actual size data of without_mwra: ",
    len(without_mwra),
)
# ratio is the observed group ratio, so without_mwra meets its own requirement
# (required_sample_size * ratio) exactly when with_mwra meets
# required_sample_size. Comparing len(without_mwra) against the with_mwra
# requirement gave the wrong verdict whenever the two groups differ in size.
if len(with_mwra) < required_sample_size:
    print("We need to collect more data")
else:
    print("We have enough data")

# Nothing Important


In [None]:
import sys


def format_size(size):
    """Render a byte count as a string in bytes, KB or MB, whichever fits best."""
    kib = 1024
    mib = kib * kib

    # Largest unit first; anything under 1 KB is printed as a raw byte count.
    if size >= mib:
        return f"{size / mib:.2f} MB"
    if size >= kib:
        return f"{size / kib:.2f} KB"
    return f"{size} bytes"


def print_memory_usage():
    """Print the size of every module-level variable, largest first.

    Sizes come from ``sys.getsizeof``, which is shallow: it does not follow
    references into contained objects. Variables whose size cannot be
    determined are listed as such and are left out of the printed total, so
    one unmeasurable object no longer turns the sum into "inf MB".
    """
    unknown = float("inf")  # placeholder that sorts undetermined sizes to the top
    memory_usage_list = []

    for name, var in globals().items():
        try:
            size = sys.getsizeof(var)
        except TypeError:
            size = unknown
        memory_usage_list.append((name, size))

    # Sort the list by memory usage in descending order
    memory_usage_list.sort(key=lambda item: item[1], reverse=True)

    # Only sizes that could actually be measured contribute to the total
    total = sum(size for _, size in memory_usage_list if size != unknown)

    # Print the sorted list
    print("Memory usage of variables (sorted):")
    print(f"Memory sum: {format_size(total)}")
    for name, size in memory_usage_list:
        if size == unknown:
            print(f"Memory usage of {name}: Unable to determine size")
        else:
            print(f"Memory usage of {name}: {format_size(size)}")


# Report the memory held by the notebook's module-level variables.
print_memory_usage()