# Start


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np
from typing import Optional
import statsmodels.api as sm


# Directory holding the tab-separated exports, and the names of the tables to load.
file_path: str = "../dataset"
files: tuple[str, ...] = ("connections", "devices", "processes", "profiles")

# Table name -> DataFrame; each table is read from "<file_path>/<name>.csv".
dataset: dict[str, pd.DataFrame] = {
    table: pd.read_csv(f"{file_path}/{table}.csv", sep="\t") for table in files
}

-   importing the necessary libraries
-   declaring the variables


# A.)


### Connections description


In [None]:
# Print the concise summary (columns, non-null counts, dtypes) of the connections table.
dataset["connections"].info()

-   Total Entries: 15,108
-   Total Columns: 13
-   Column Types:
    -   11 columns of type float64
    -   1 column of type int64
    -   1 column of type object
-   There are no missing values in this data


In [None]:
# Display the "ts" column to see what the single object-typed column holds.
dataset["connections"]["ts"]

-   The object column "ts" is date and time


In [4]:
# Parse the "ts" column with pandas so it is stored as datetimes.
dataset["connections"]["ts"] = pd.to_datetime(dataset["connections"]["ts"])

-   Cast the "ts" column to datetime


In [None]:
# Median of the float64/int64 columns, to sit alongside describe()'s statistics.
median = dataset["connections"].select_dtypes(include=["float64", "int64"]).median()

connection_summary = dataset["connections"].describe()
connection_summary.loc["median"] = median

# imei is left out: summary statistics over it carry no meaning.
connection_summary = connection_summary.drop(columns=["imei"])
connection_summary

-   From the tables above we can see that imei is a long integer that looks like an ID. The processes table contains the same values, which suggests that it identifies a device.
-   Another assumption concerns the columns specifying a connection type (those starting with "c.", such as c.android.youtube): their values range from 0 to 100, which could mean they give the percentage of time during which the connection was established.


-   The first few rows suggest that the data was sampled at 1-minute intervals.
-   Let's look at it closer.


In [None]:
# Timestamps in ascending order, to eyeball the spacing between consecutive samples.
dataset["connections"].sort_values(by="ts", ascending=True)["ts"]

-   Now we see it looks like samples are in a 1 minute interval.
-   Let's go further.


In [None]:
# Every connections timestamp in ascending order (all devices mixed together).
times = pd.to_datetime(dataset["connections"]["ts"]).sort_values()

# Gap between each timestamp and the one before it; the first row has no
# predecessor, so its NaT gap is dropped.
# The gaps are compared as full Timedelta values on purpose: Timedelta.seconds
# only holds the sub-day component, so a gap of exactly one day would read as
# 0 seconds and a gap of one day plus one minute as 60 seconds.
gaps = times.diff().dropna()
no_gap = pd.Timedelta(0)
one_minute = pd.Timedelta(minutes=1)

# Consecutive records sharing the same timestamp.
same_times: int = int((gaps == no_gap).sum())
# Consecutive records whose distance is neither zero nor exactly one minute.
non_minute_differences: int = int(((gaps != no_gap) & (gaps != one_minute)).sum())

print(f"Non minute differences: {non_minute_differences}")
print(f"Same times: {same_times}")

-   From this we can see that there is data for every minute, and that some timestamps occur more than once.


In [None]:
# (imei, ts) pairs ordered by device and, within each device, by time.
# A two-key sort yields the same frame as grouping by imei and sorting every
# group through apply(), without the per-group Python callback.
times = (
    dataset["connections"][["imei", "ts"]]
    .sort_values(by=["imei", "ts"], ascending=True)
    .reset_index(drop=True)
)
times

-   If we assume that the columns starting with "c." represent the percentage of time a connection was active during a time window, we need to group the rows by device serial number (imei) and then look at the time differences within each device.


In [None]:
# Number of rows for each distinct value of the "mwra" column.
dataset["connections"]["mwra"].value_counts()

-   mwra stands for malware-related activity.
-   The data contains only the values 1.0 and 0.0, indicating whether there was malware activity in the given time frame.


### Devices description


In [None]:
# Print the concise summary (columns, non-null counts, dtypes) of the devices table.
dataset["devices"].info()

In [None]:
# Median of the float64/int64 columns, to sit alongside describe()'s statistics.
median = dataset["devices"].select_dtypes(include=["float64", "int64"]).median()

devices_summary = dataset["devices"].describe()
devices_summary.loc["median"] = median

# imei is left out: summary statistics over it carry no meaning.
devices_summary = devices_summary.drop(columns=["imei"])
devices_summary

In [None]:
# First rows of the devices table, to see what the columns contain.
dataset["devices"].head()

-   "store_name" object is a string
-   "code" is string, holding code for state
-   "location" is a string, containing continent and city


### Processes description


In [None]:
# Print the concise summary (columns, non-null counts, dtypes) of the processes table.
dataset["processes"].info()

In [14]:
# Parse the "ts" column with pandas so it is stored as datetimes.
dataset["processes"]["ts"] = pd.to_datetime(dataset["processes"]["ts"])

In [None]:
# Median of the float64/int64 columns, to sit alongside describe()'s statistics.
median = dataset["processes"].select_dtypes(include=["float64", "int64"]).median()

processes_summary = dataset["processes"].describe()
processes_summary.loc["median"] = median

# imei is left out: summary statistics over it carry no meaning.
processes_summary = processes_summary.drop(columns=["imei"])
processes_summary

In [None]:
# First rows of the processes table, to see what the columns contain.
dataset["processes"].head()

### Profiles description


In [None]:
# Print the concise summary (columns, non-null counts, dtypes) of the profiles table.
dataset["profiles"].info()

In [None]:
# Median of the float64/int64 columns, to sit alongside describe()'s statistics.
median = dataset["profiles"].select_dtypes(include=["float64", "int64"]).median()

profiles_summary = dataset["profiles"].describe()
profiles_summary.loc["median"] = median

# imei is left out: summary statistics over it carry no meaning.
profiles_summary = profiles_summary.drop(columns=["imei"])
profiles_summary

In [None]:
# First rows of the profiles table, to see what the columns contain.
dataset["profiles"].head()

In [None]:
# Table name -> Series of missing-value counts, one entry per column.
null_values = {name: frame.isnull().sum() for name, frame in dataset.items()}

# Report only the tables that actually contain missing values.
for name, counts in null_values.items():
    if counts.sum() != 0:
        print(f"Null values in {name} dataset:")
        print(counts)
        print("\n")

# B.)


### MWRA


-   First we look at the most important column "mwra" and look at it more in depth.


In [None]:
# Share of each "mwra" value in connections, scaled from a fraction to percent.
dataset["connections"]["mwra"].value_counts(normalize=True) * 100

-   In "connections" the positive mwra class makes up ~62% of the rows, so positive cases are over-represented. A model trained on this data may therefore be biased towards the positive class and misclassify some connections; the closer the split is to 50/50, the better.


In [None]:
# Share of each "mwra" value in processes, scaled from a fraction to percent.
dataset["processes"]["mwra"].value_counts(normalize=True) * 100

-   "mwra" is the same for "processes" as it is for "connections"


## Connections


### Defining the variables


In [23]:
# Single source of truth: the connection columns, in the order shared by every
# parallel list below.  Deriving the lists from it keeps them index-aligned.
all_str_connections: list[str] = [
    "c.android.chrome",
    "c.dogalize",
    "c.android.gm",
    "c.android.youtube",
    "c.katana",
    "c.raider",
    "c.android.vending",
    "c.UCMobile.x86",
    "c.updateassist",
    "c.UCMobile.intl",
]

# Column data, mean and standard deviation, one entry per name above.
all_data_connections: list[pd.Series] = [
    dataset["connections"][column] for column in all_str_connections
]
all_means_connections: list[float] = [series.mean() for series in all_data_connections]
all_std_connections: list[float] = [series.std() for series in all_data_connections]

# Per-column aliases, kept for the cells that refer to one connection by name.
# The unpacking order must match all_str_connections.
(
    chrome_data,
    dogalize_data,
    gm_data,
    youtube_data,
    katana_data,
    raider_data,
    vending_data,
    x86_data,
    updateassist_data,
    intl_data,
) = all_data_connections

(
    chrome_mean,
    dogalize_mean,
    gm_mean,
    youtube_mean,
    katana_mean,
    raider_mean,
    vending_mean,
    x86_mean,
    updateassist_mean,
    intl_mean,
) = all_means_connections

(
    chrome_std,
    dogalize_std,
    gm_std,
    youtube_std,
    katana_std,
    raider_std,
    vending_std,
    x86_std,
    updateassist_std,
    intl_std,
) = all_std_connections

### Measure of dispersion


In [None]:
# Select the connection columns by name rather than by position, so the table
# does not silently change if the CSV column order does (the processes section
# selects its columns the same way).
df = dataset["connections"][all_str_connections].describe()

# Two extra spread measures on top of describe(): max - min and Q3 - Q1.
df.loc["full_range"] = df.loc["max"] - df.loc["min"]
df.loc["interquartile_range"] = df.loc["75%"] - df.loc["25%"]
df

### Measure of center


In [None]:
apps = all_str_connections
means = all_means_connections
medians = [series.median() for series in all_data_connections]

# For every connection column: how often its most frequent value occurs, and
# which value(s) reach that count.  Ties are all collected, then capped at
# three entries to keep the table readable.
max_values = []
most_occurring_values = []
for series in all_data_connections:
    value_counts = series.value_counts()
    max_val = value_counts.max()
    max_values.append(max_val)
    most_occurring_values.append(value_counts[value_counts == max_val].index.tolist()[:3])

# One row per connection with its measures of centre.
data = {
    "connection": apps,
    "mean": means,
    "median": medians,
    "mode_count": max_values,
    "mode_values": most_occurring_values,
}

df = pd.DataFrame(data)
df

### Measure of shape


In [26]:
def skewness_type(skew_value: float) -> str:
    """Classify a skewness coefficient into a descriptive label.

    Args:
        skew_value: Skewness coefficient to classify.

    Returns:
        "Undefined Skew" when the value is NaN; otherwise the label of the
        first matching band: ``<= -1`` highly negative, ``<= -0.5`` moderately
        negative, ``<= 0.5`` approximately symmetric, ``<= 1`` moderately
        positive, anything larger highly positive.
    """
    # NaN fails every comparison below and would otherwise fall through to the
    # final "Highly Positive Skew" label.
    if np.isnan(skew_value):
        return "Undefined Skew"

    if skew_value <= -1:
        return "Highly Negative Skew"

    if skew_value <= -0.5:
        return "Moderately Negative Skew"

    if skew_value <= 0.5:
        return "Approximately Symmetric"

    if skew_value <= 1:
        return "Moderately Positive Skew"

    return "Highly Positive Skew"

In [27]:
def kurtosis_type(kurtosis_value: float) -> str:
    """Classify a kurtosis coefficient into a descriptive label.

    The bands are centred on 0, so the value is expected to be measured
    relative to 0.

    Args:
        kurtosis_value: Kurtosis coefficient to classify.

    Returns:
        "Undefined Kurtosis" when the value is NaN; otherwise the label of the
        first matching band: ``< -1`` negative, ``< -0.5`` moderately negative,
        ``< 0.5`` approximately normal, ``< 1`` moderately positive, anything
        larger positive.
    """
    # NaN fails every comparison below and would otherwise fall through to the
    # final "Positive Kurtosis" label.
    if np.isnan(kurtosis_value):
        return "Undefined Kurtosis"

    if kurtosis_value < -1:
        return "Negative Kurtosis"

    if kurtosis_value < -0.5:
        return "Moderately Negative Kurtosis"

    if kurtosis_value < 0.5:
        return "Approximately Normal Kurtosis"

    if kurtosis_value < 1:
        return "Moderately Positive Kurtosis"

    return "Positive Kurtosis"

In [None]:
# Skewness and kurtosis of every connection column, computed with scipy.stats
# over the same column list used everywhere else in this section.
data = {
    "connection": all_str_connections,
    "skew": [stats.skew(series) for series in all_data_connections],
    "kurtosis": [stats.kurtosis(series) for series in all_data_connections],
}

shape_df = pd.DataFrame(data)

# Human-readable classification of both coefficients.
shape_df["result skew"] = shape_df["skew"].apply(skewness_type)
shape_df["result kurtosis"] = shape_df["kurtosis"].apply(kurtosis_type)

# Index by connection name so later cells can look values up by name.
shape_df.set_index("connection", inplace=True)
shape_df

### Histograms with KDE


In [None]:
_, axes = plt.subplots(5, 2, figsize=(16, 26))

# One subplot per connection: histogram with KDE, the mean marked as a dashed
# line, and the shape statistics listed in the legend.
for ax, name, series, mean in zip(
    axes.flat, all_str_connections, all_data_connections, all_means_connections
):
    sns.histplot(series, bins=30, kde=True, ax=ax)
    ax.axvline(mean, color="r", linestyle="--", label=f"Mean: {mean:.2f}")

    # Legend-only entries: artists with no data carry the label without drawing
    # anything.  An invisible axvline() would still sit at its default x=0 and
    # can stretch the x-axis to include 0.
    ax.plot([], [], linestyle="", label=f"Skewness: {shape_df.loc[name, 'skew']:.2f}")
    ax.plot([], [], linestyle="", label=f"Kurtosis: {shape_df.loc[name, 'kurtosis']:.2f}")

    ax.set_title(f"Distribution of {name}")
    ax.legend()
    ax.grid(True, alpha=0.3)

# Adjust the layout
plt.tight_layout(w_pad=3, h_pad=3)
plt.show()

In [None]:
_, axes = plt.subplots(5, 2, figsize=(16, 26))

# Connections compared against a Gaussian model; every other connection is
# compared against a uniform model.  Named explicitly so the choice does not
# depend on the position of a column in the lists.
gaussian_connections = {
    "c.android.chrome",
    "c.dogalize",
    "c.android.gm",
    "c.android.youtube",
    "c.katana",
    "c.raider",
}

for ax, name, series, mean, std in zip(
    axes.flat,
    all_str_connections,
    all_data_connections,
    all_means_connections,
    all_std_connections,
):
    sns.histplot(
        data=series,
        bins=30,
        stat="density",
        alpha=0.3,
        color="gray",
        label="Histogram",
        ax=ax,
    )
    sns.kdeplot(
        data=series,
        color="blue",
        label="Actual Distribution",
        linewidth=2,
        ax=ax,
    )

    # Evaluate the theoretical density over the observed value range.  100
    # points are plenty for a smooth bell curve and for the flat uniform line.
    low = series.min()
    high = series.max()
    x = np.linspace(low, high, 100)

    if name in gaussian_connections:
        model_density = stats.norm.pdf(x, mean, std)
        model_label = "Gaussian Model"
    else:
        model_density = stats.uniform(loc=low, scale=high - low).pdf(x)
        model_label = "Uniform Model"

    ax.plot(x, model_density, color="red", linestyle="--", label=model_label, linewidth=2)

    ax.set_title(f"Distribution of {name}")
    ax.legend()
    ax.grid(True, alpha=0.3)


# Adjust the layout
plt.tight_layout(w_pad=3, h_pad=3)
plt.show()

### Boxplots


In [None]:
_, axes = plt.subplots(5, 2, figsize=(16, 26))

# One boxplot per connection column, filling the grid row by row.
for ax, name, series in zip(axes.flat, all_str_connections, all_data_connections):
    sns.boxplot(data=series, ax=ax)
    ax.set_title(f"Distribution of {name}")
    ax.grid(True, alpha=0.3)

# Adjust the layout
plt.tight_layout(w_pad=3, h_pad=3)
plt.show()

### Q-Q plots


In [None]:
_, axes = plt.subplots(5, 2, figsize=(16, 26))

# One Q-Q plot per connection column (fitted parameters, 45-degree reference
# line), filling the grid row by row.
for ax, name, series in zip(axes.flat, all_str_connections, all_data_connections):
    sm.qqplot(series, fit=True, line="45", ax=ax)
    ax.set_title(f"Distribution of {name}")
    ax.grid(True, alpha=0.3)

# Adjust the layout
plt.tight_layout(w_pad=3, h_pad=3)
plt.show()

## Processes


### Defining the variables


In [33]:
# Single source of truth: the process columns, in the order shared by every
# parallel list below.  Deriving the lists from it keeps them index-aligned.
all_str_processes: list[str] = [
    "p.android.chrome",
    "p.dogalize",
    "p.katana",
    "p.android.settings",
    "p.system",
    "p.simulator",
]

# Column data, mean and standard deviation, one entry per name above.
all_data_processes: list[pd.Series] = [
    dataset["processes"][column] for column in all_str_processes
]
all_means_processes: list[float] = [series.mean() for series in all_data_processes]
all_std_processes: list[float] = [series.std() for series in all_data_processes]

# Per-column aliases, kept for the cells that refer to one process by name.
# NOTE: chrome_*, dogalize_* and katana_* rebind the names used in the
# connections section.  The unpacking order must match all_str_processes.
(
    chrome_data,
    dogalize_data,
    katana_data,
    settings_data,
    system_data,
    simulator_data,
) = all_data_processes

(
    chrome_mean,
    dogalize_mean,
    katana_mean,
    settings_mean,
    system_mean,
    simulator_mean,
) = all_means_processes

(
    chrome_std,
    dogalize_std,
    katana_std,
    settings_std,
    system_std,
    simulator_std,
) = all_std_processes

### Measure of dispersion


In [None]:
# describe() over the process columns only.
df = dataset["processes"].loc[:, all_str_processes].describe()

# Rows needed for the two extra spread measures, read before df is extended.
lowest, highest = df.loc["min"], df.loc["max"]
first_quartile, third_quartile = df.loc["25%"], df.loc["75%"]

df.loc["full_range"] = highest - lowest
df.loc["interquartile_range"] = third_quartile - first_quartile
df

### Measure of center


In [None]:
apps = all_str_processes
means = all_means_processes
medians = [series.median() for series in all_data_processes]

# For every process column: how often its most frequent value occurs, and which
# value(s) reach that count.  Ties are all collected, then capped at three
# entries to keep the table readable.
max_values = []
most_occurring_values = []
for series in all_data_processes:
    value_counts = series.value_counts()
    max_val = value_counts.max()
    max_values.append(max_val)
    most_occurring_values.append(value_counts[value_counts == max_val].index.tolist()[:3])

# One row per process with its measures of centre.
data = {
    "process": apps,
    "mean": means,
    "median": medians,
    "mode_count": max_values,
    "mode_values": most_occurring_values,
}

df = pd.DataFrame(data)
df

### Measure of shape


In [None]:
# Skewness and kurtosis of every process column, computed with scipy.stats over
# the same column list used everywhere else in this section.
data = {
    "process": all_str_processes,
    "skew": [stats.skew(series) for series in all_data_processes],
    "kurtosis": [stats.kurtosis(series) for series in all_data_processes],
}

shape_df = pd.DataFrame(data)

# Human-readable classification of both coefficients.
shape_df["result skew"] = shape_df["skew"].apply(skewness_type)
shape_df["result kurtosis"] = shape_df["kurtosis"].apply(kurtosis_type)

# Index by process name so later cells can look values up by name.
shape_df.set_index("process", inplace=True)
shape_df

### Histograms with KDE


In [None]:
fig, axes = plt.subplots(3, 2, figsize=(16, 16))

# One subplot per process: histogram with KDE, the mean marked as a dashed
# line, and the shape statistics listed in the legend.
for ax, name, series, mean in zip(
    axes.flat, all_str_processes, all_data_processes, all_means_processes
):
    sns.histplot(series, bins=30, kde=True, ax=ax)
    ax.axvline(mean, color="r", linestyle="--", label=f"Mean: {mean:.2f}")

    # Legend-only entries: artists with no data carry the label without drawing
    # anything.  An invisible axvline() would still sit at its default x=0 and
    # can stretch the x-axis to include 0.
    ax.plot([], [], linestyle="", label=f"Skewness: {shape_df.loc[name, 'skew']:.2f}")
    ax.plot([], [], linestyle="", label=f"Kurtosis: {shape_df.loc[name, 'kurtosis']:.2f}")

    ax.set_title(f"Distribution of {name}")
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout(w_pad=3, h_pad=3)
plt.show()

In [None]:
_, axes = plt.subplots(3, 2, figsize=(16, 16))

# Processes compared against a Gaussian model; every other process is compared
# against a uniform model.  Membership is decided by column name: an identity
# check on the Series objects would silently stop matching as soon as the lists
# are rebuilt from the DataFrame.
gaussian_processes = {"p.android.chrome", "p.android.settings", "p.system"}

for ax, name, series, mean, std in zip(
    axes.flat,
    all_str_processes,
    all_data_processes,
    all_means_processes,
    all_std_processes,
):
    sns.histplot(
        data=series,
        bins=30,
        stat="density",
        alpha=0.3,
        color="gray",
        label="Histogram",
        ax=ax,
    )
    sns.kdeplot(
        data=series,
        color="blue",
        label="Actual Distribution",
        linewidth=2,
        ax=ax,
    )

    # Evaluate the theoretical density over the observed value range.  100
    # points are plenty for a smooth bell curve and for the flat uniform line.
    low = series.min()
    high = series.max()
    x = np.linspace(low, high, 100)

    if name in gaussian_processes:
        model_density = stats.norm.pdf(x, mean, std)
        model_label = "Gaussian Model"
    else:
        model_density = stats.uniform(loc=low, scale=high - low).pdf(x)
        model_label = "Uniform Model"

    ax.plot(x, model_density, color="red", linestyle="--", label=model_label, linewidth=2)

    ax.set_title(f"Distribution of {name}")
    ax.legend()
    ax.grid(True, alpha=0.3)

# Same layout handling as the other figure cells.
plt.tight_layout(w_pad=3, h_pad=3)
plt.show()

### Boxplots


In [None]:
_, axes = plt.subplots(3, 2, figsize=(16, 16))

# One boxplot per process column, filling the grid row by row.
for ax, name, series in zip(axes.flat, all_str_processes, all_data_processes):
    sns.boxplot(data=series, ax=ax)
    ax.set_title(f"Distribution of {name}")
    ax.grid(True, alpha=0.3)


plt.tight_layout(w_pad=3, h_pad=3)
plt.show()

### Q-Q plots


In [None]:
_, axes = plt.subplots(3, 2, figsize=(16, 16))

# One Q-Q plot per process column (fitted parameters, 45-degree reference
# line), filling the grid row by row.
for ax, name, series in zip(axes.flat, all_str_processes, all_data_processes):
    sm.qqplot(series, fit=True, line="45", ax=ax)
    ax.set_title(f"Distribution of {name}")
    ax.grid(True, alpha=0.3)


plt.tight_layout(w_pad=3, h_pad=3)
plt.show()