In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np
from typing import Optional


# Directory holding the tab-separated source tables, and the table names to load.
file_path: str = "../dataset"
files: tuple[str, ...] = ("connections", "devices", "processes", "profiles")

# Read every table into a dict keyed by its file name (without the extension).
dataset: dict[str, pd.DataFrame] = {}
for file in files:
    csv_location = f"{file_path}/{file}.csv"
    dataset[file] = pd.read_csv(csv_location, sep="\t")

A.)


Connections description


In [None]:
# Print the structural summary (columns, dtypes, non-null counts) of the connections table.
dataset["connections"].info()

-   Total Entries: 15,108
-   Total Columns: 13
-   Column Types:
-   11 columns of type float64
-   1 column of type int64
-   1 column of type object
-   There are no missing values in this data


In [None]:
# Display the raw "ts" column of the connections table.
dataset["connections"]["ts"]

-   The object-typed column "ts" holds date and time values.


In [None]:
# Summary statistics from describe(), extended with a "median" row.
connection_summary = dataset["connections"].describe()
median = dataset["connections"].select_dtypes(include=["float64", "int64"]).median()
connection_summary.loc["median"] = median

# imei is an identifier, so statistics computed over it carry no meaning.
connection_summary = connection_summary.drop(columns=["imei"])
connection_summary

-   From the tables above we can see that imei is a long integer that looks like an ID; the processes table contains the same values, which suggests that it is the ID of a device.
-   Another assumption we can make concerns the columns specifying a connection type (columns starting with "c.", such as c.android.youtube): their values range from 0 to 100, which could indicate the percentage of time during which the connection was established.


-   The first few rows might indicate that the data was sampled at 1-minute intervals.
-   Let's look at it closer.


In [None]:
# Timestamps in ascending order (only the "ts" column is sorted and shown).
dataset["connections"]["ts"].sort_values(ascending=True)

-   After sorting, the samples do look like they were taken at 1-minute intervals.
-   Let's go further.


In [None]:
# Parse the timestamps before sorting so the order is chronological rather than
# the lexicographic order of the raw strings.
times = pd.to_datetime(dataset["connections"]["ts"]).sort_values()

# Gap between each sample and the one before it, in seconds. total_seconds()
# measures the whole gap, whereas Timedelta.seconds is only the seconds component
# (days and fractions are dropped): a gap of exactly one day would be counted as
# "same time" and one day plus a minute as a regular 60-second step.
gaps = times.diff().dropna().dt.total_seconds()

# Consecutive samples sharing a timestamp.
same_times: int = int((gaps == 0).sum())
# Consecutive samples that are neither simultaneous nor exactly one minute apart.
non_minute_differences: int = int(((gaps != 0) & (gaps != 60)).sum())

print(f"Non minute differences: {non_minute_differences}")
print(f"Same times: {same_times}")

-   From this we can see that there is data for every minute, sometimes with more than one record at the same time.


In [None]:
# (imei, ts) pairs ordered by device and then by timestamp within each device.
# A single two-key sort yields the same rows as grouping by imei and sorting each
# group inside apply(), without running a Python lambda once per device.
times = (
    dataset["connections"][["imei", "ts"]]
    .sort_values(by=["imei", "ts"], ascending=True)
    .reset_index(drop=True)
)
times

-   If we assume that the columns starting with "c." represent the percentage of time an application was active during a time window, we need to group the rows by device serial number (imei) and then look at the time differences.


In [None]:
# Count how often each distinct "mwra" value occurs in the connections table.
dataset["connections"]["mwra"].value_counts()

-   mwra stands for malware-related activity.
-   The data contains only the values 1.0 and 0.0, indicating whether there was malware activity in the specific time frame.


Devices description


In [None]:
# Print the structural summary (columns, dtypes, non-null counts) of the devices table.
dataset["devices"].info()

In [None]:
# Summary statistics from describe(), extended with a "median" row.
devices_summary = dataset["devices"].describe()
median = dataset["devices"].select_dtypes(include=["float64", "int64"]).median()
devices_summary.loc["median"] = median

# imei is an identifier, so statistics computed over it carry no meaning.
devices_summary = devices_summary.drop(columns=["imei"])
devices_summary

In [None]:
# Preview the first rows of the devices table.
dataset["devices"].head()

-   "store_name" object is a string
-   "code" is string, holding code for state
-   "location" is a string, containing continent and city


Processes description


In [None]:
# Print the structural summary (columns, dtypes, non-null counts) of the processes table.
dataset["processes"].info()

In [None]:
# Summary statistics from describe(), extended with a "median" row.
processes_summary = dataset["processes"].describe()
median = dataset["processes"].select_dtypes(include=["float64", "int64"]).median()
processes_summary.loc["median"] = median

# imei is an identifier, so statistics computed over it carry no meaning.
processes_summary = processes_summary.drop(columns=["imei"])
processes_summary

In [None]:
# Preview the first rows of the processes table.
dataset["processes"].head()

Profiles description


In [None]:
# Print the structural summary (columns, dtypes, non-null counts) of the profiles table.
dataset["profiles"].info()

In [None]:
# Summary statistics from describe(), extended with a "median" row.
profiles_summary = dataset["profiles"].describe()
median = dataset["profiles"].select_dtypes(include=["float64", "int64"]).median()
profiles_summary.loc["median"] = median

# imei is an identifier, so statistics computed over it carry no meaning.
profiles_summary = profiles_summary.drop(columns=["imei"])
profiles_summary

In [None]:
# Preview the first rows of the profiles table.
dataset["profiles"].head()

B


-   First we look at the most important column "mwra" and look at it more in depth.


In [None]:
# Share of each "mwra" value in the connections table, expressed in percent.
dataset["connections"]["mwra"].value_counts(normalize=True).mul(100)

-   In "connections" we can see that positive mwra makes up ~62% of the rows, so there are more positive cases than negative ones; a model trained on this data later might therefore misclassify some connections. I would say the closer the split is to 50/50, the better.


In [None]:
# Share of each "mwra" value in the processes table, expressed in percent.
dataset["processes"]["mwra"].value_counts(normalize=True).mul(100)

-   "mwra" is the same for "processes" as it is for "connections"


## Declaring and Initializing variables for further use.


In [21]:
# For every "c.*" column of the connections table keep the series itself together
# with its mean and standard deviation; the plotting cells reuse these names.
connections = dataset["connections"]

chrome_data = connections["c.android.chrome"]
chrome_mean, chrome_std = chrome_data.mean(), chrome_data.std()

dogalize_data = connections["c.dogalize"]
dogalize_mean, dogalize_std = dogalize_data.mean(), dogalize_data.std()

gm_data = connections["c.android.gm"]
gm_mean, gm_std = gm_data.mean(), gm_data.std()

youtube_data = connections["c.android.youtube"]
youtube_mean, youtube_std = youtube_data.mean(), youtube_data.std()

katana_data = connections["c.katana"]
katana_mean, katana_std = katana_data.mean(), katana_data.std()

raider_data = connections["c.raider"]
raider_mean, raider_std = raider_data.mean(), raider_data.std()

vending_data = connections["c.android.vending"]
vending_mean, vending_std = vending_data.mean(), vending_data.std()

x86_data = connections["c.UCMobile.x86"]
x86_mean, x86_std = x86_data.mean(), x86_data.std()

updateassist_data = connections["c.updateassist"]
updateassist_mean, updateassist_std = updateassist_data.mean(), updateassist_data.std()

intl_data = connections["c.UCMobile.intl"]
intl_mean, intl_std = intl_data.mean(), intl_data.std()

In [None]:
# 30-bin histogram with a KDE overlay; the dashed red line marks the column mean.
ax = sns.histplot(chrome_data, bins=30, kde=True)
ax.axvline(chrome_mean, color="r", linestyle="--", label=f"Mean: {chrome_mean:.2f}")
ax.set(title="Distribution of c.android.chrome", xlabel="Time active", ylabel="Numbers of occurrences")
ax.legend()
plt.show()

In [None]:
# Synthetic sample from a normal distribution with the same mean, standard
# deviation and size as the chrome column. The fixed random_state makes the
# sample (and every plot built from it) identical on each run.
chrome_distribution = stats.norm.rvs(
    loc=chrome_mean, scale=chrome_std, size=chrome_data.size, random_state=42
)
ax = sns.histplot(chrome_distribution, bins=30, kde=True)
ax.set(title="Normal distribution", xlabel="Value", ylabel="Numbers of occurrences")
plt.show()

In [None]:
# Overlay the observed column (blue) and the synthetic normal sample (black) on
# one axes. Both histograms carry a label so the legend identifies them
# alongside the dashed red mean line.
ax = sns.histplot(chrome_data, bins=30, kde=True, color="blue", label="c.android.chrome")
sns.histplot(chrome_distribution, bins=30, kde=True, color="black", label="Normal sample", ax=ax)
ax.axvline(chrome_mean, color="r", linestyle="--", label=f"Mean: {chrome_mean:.2f}")
ax.set(
    title="Comparison of c.android.chrome and normal distribution",
    xlabel="Time active",
    ylabel="Numbers of occurrences",
)
ax.legend()
plt.show()

In [None]:
# Fit a normal distribution to the column; mu and std are the fitted parameters.
mu, std = stats.norm.fit(chrome_data)

# Density-scaled histogram so it is comparable with the PDF drawn below.
# kde expects a boolean: the string "True" merely happened to be truthy.
sns.histplot(data=chrome_data, bins=25, stat="density", kde=True, color="r")

# Evaluate the fitted PDF across the current x-axis range and overlay it in black.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = stats.norm.pdf(x, mu, std)

plt.plot(x, p, "k", linewidth=2)
title = "Fit Values: {:.2f} and {:.2f}".format(mu, std)
plt.title(title)
plt.show()

-   The red dashed line marks the mean of the data.
-   We can see that the data is approximately symmetric, but slightly skewed to the right.


In [None]:
# Sample skewness of the column (scipy.stats.skew with its default arguments).
chrome_skew = stats.skew(chrome_data)
print(f"Skewness of c.android.chrome: {chrome_skew}")

-   The skewness is 0.20270904314934854, so the data is approximately symmetric but has a slightly heavier right tail.


In [None]:
# Single box of the column's values on the y-axis. A box plot has no count
# axis, so the x-axis carries the column name instead of "occurrences".
ax = sns.boxplot(data=chrome_data)
ax.set(title="Boxplot of c.android.chrome", xlabel="c.android.chrome", ylabel="Time active")
plt.show()

-   If we assume the values in columns are active time, we should also look at the range 0-100


In [None]:
# Same distribution drawn on the full 0-100 axis. legend() is required for the
# label of the dashed mean line to be displayed.
ax = sns.histplot(chrome_data, bins=25, kde=True)
ax.axvline(chrome_mean, color="r", linestyle="--", label=f"Mean: {chrome_mean:.2f}")
ax.set_xlim(0, 100)
ax.set(title="Distribution of c.android.chrome", xlabel="Time active", ylabel="Numbers of occurrences")
ax.legend()
plt.show()

-   Here we see that there are few outliers among the lower values, but quite a few of them near the maximum value.


In [None]:
# 30-bin histogram with a KDE overlay; the dashed red line marks the column mean.
ax = sns.histplot(dogalize_data, bins=30, kde=True)
ax.axvline(dogalize_mean, color="r", linestyle="--", label=f"Mean: {dogalize_mean:.2f}")
ax.set(title="Distribution of c.dogalize", xlabel="Time active", ylabel="Numbers of occurrences")
ax.legend()
plt.show()

In [None]:
# Fit a normal distribution to the column; mu and std are the fitted parameters.
mu, std = stats.norm.fit(dogalize_data)

# Density-scaled histogram so it is comparable with the PDF drawn below.
# kde expects a boolean: the string "True" merely happened to be truthy.
sns.histplot(data=dogalize_data, bins=25, stat="density", kde=True, color="r")

# Evaluate the fitted PDF across the current x-axis range and overlay it in black.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = stats.norm.pdf(x, mu, std)

plt.plot(x, p, "k", linewidth=2)
title = "Fit Values: {:.2f} and {:.2f}".format(mu, std)
plt.title(title)
plt.show()

In [None]:
# Sample skewness of the column (scipy.stats.skew with its default arguments).
dogalize_skew = stats.skew(dogalize_data)
print(f"Skewness of c.dogalize: {dogalize_skew}")

In [None]:
# Single box of the column's values on the y-axis. A box plot has no count
# axis, so the x-axis carries the column name instead of "occurrences".
ax = sns.boxplot(data=dogalize_data)
ax.set(title="Boxplot of c.dogalize", xlabel="c.dogalize", ylabel="Time active")
plt.show()

In [None]:
# Same distribution drawn on the full 0-100 axis. legend() is required for the
# label of the dashed mean line to be displayed.
ax = sns.histplot(dogalize_data, bins=25, kde=True)
ax.axvline(dogalize_mean, color="r", linestyle="--", label=f"Mean: {dogalize_mean:.2f}")
ax.set_xlim(0, 100)
ax.set(title="Distribution of c.dogalize", xlabel="Time active", ylabel="Numbers of occurrences")
ax.legend()
plt.show()

In [None]:
# 30-bin histogram with a KDE overlay; the dashed red line marks the column mean.
ax = sns.histplot(gm_data, bins=30, kde=True)
ax.axvline(gm_mean, color="r", linestyle="--", label=f"Mean: {gm_mean:.2f}")
ax.set(title="Distribution of c.android.gm", xlabel="Time active", ylabel="Numbers of occurrences")
ax.legend()
plt.show()

In [None]:
# Fit a normal distribution to the column; mu and std are the fitted parameters.
mu, std = stats.norm.fit(gm_data)

# Density-scaled histogram so it is comparable with the PDF drawn below.
# kde expects a boolean: the string "True" merely happened to be truthy.
sns.histplot(data=gm_data, bins=25, stat="density", kde=True, color="r")

# Evaluate the fitted PDF across the current x-axis range and overlay it in black.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = stats.norm.pdf(x, mu, std)

plt.plot(x, p, "k", linewidth=2)
title = "Fit Values: {:.2f} and {:.2f}".format(mu, std)
plt.title(title)
plt.show()

In [None]:
# Sample skewness of the column (scipy.stats.skew with its default arguments).
gm_skew = stats.skew(gm_data)
print(f"Skewness of c.android.gm: {gm_skew}")

In [None]:
# Single box of the column's values on the y-axis. A box plot has no count
# axis, so the x-axis carries the column name instead of "occurrences".
ax = sns.boxplot(data=gm_data)
ax.set(title="Boxplot of c.android.gm", xlabel="c.android.gm", ylabel="Time active")
plt.show()

In [None]:
# Same distribution drawn on the full 0-100 axis. legend() is required for the
# label of the dashed mean line to be displayed.
ax = sns.histplot(gm_data, bins=25, kde=True)
ax.axvline(gm_mean, color="r", linestyle="--", label=f"Mean: {gm_mean:.2f}")
ax.set_xlim(0, 100)
ax.set(title="Distribution of c.android.gm", xlabel="Time active", ylabel="Numbers of occurrences")
ax.legend()
plt.show()

In [None]:
# 30-bin histogram with a KDE overlay; the dashed red line marks the column mean.
ax = sns.histplot(youtube_data, bins=30, kde=True)
ax.axvline(youtube_mean, color="r", linestyle="--", label=f"Mean: {youtube_mean:.2f}")
ax.set(title="Distribution of c.android.youtube", xlabel="Time active", ylabel="Numbers of occurrences")
ax.legend()
plt.show()

In [None]:
# Fit a normal distribution to the column; mu and std are the fitted parameters.
mu, std = stats.norm.fit(youtube_data)

# Density-scaled histogram so it is comparable with the PDF drawn below.
# kde expects a boolean: the string "True" merely happened to be truthy.
sns.histplot(data=youtube_data, bins=25, stat="density", kde=True, color="r")

# Evaluate the fitted PDF across the current x-axis range and overlay it in black.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = stats.norm.pdf(x, mu, std)

plt.plot(x, p, "k", linewidth=2)
title = "Fit Values: {:.2f} and {:.2f}".format(mu, std)
plt.title(title)
plt.show()

In [None]:
# Sample skewness of the column (scipy.stats.skew with its default arguments).
youtube_skew = stats.skew(youtube_data)
print(f"Skewness of c.android.youtube: {youtube_skew}")

In [None]:
# Single box of the column's values on the y-axis. A box plot has no count
# axis, so the x-axis carries the column name instead of "occurrences".
ax = sns.boxplot(data=youtube_data)
ax.set(title="Boxplot of c.android.youtube", xlabel="c.android.youtube", ylabel="Time active")
plt.show()

In [None]:
# Same distribution drawn on the full 0-100 axis. legend() is required for the
# label of the dashed mean line to be displayed.
ax = sns.histplot(youtube_data, bins=25, kde=True)
ax.axvline(youtube_mean, color="r", linestyle="--", label=f"Mean: {youtube_mean:.2f}")
ax.set_xlim(0, 100)
ax.set(title="Distribution of c.android.youtube", xlabel="Time active", ylabel="Numbers of occurrences")
ax.legend()
plt.show()

In [None]:
# 30-bin histogram with a KDE overlay; the dashed red line marks the column mean.
ax = sns.histplot(katana_data, bins=30, kde=True)
ax.axvline(katana_mean, color="r", linestyle="--", label=f"Mean: {katana_mean:.2f}")
ax.set(title="Distribution of c.katana", xlabel="Time active", ylabel="Numbers of occurrences")
ax.legend()
plt.show()

In [None]:
# Fit a normal distribution to the column; mu and std are the fitted parameters.
mu, std = stats.norm.fit(katana_data)

# Density-scaled histogram so it is comparable with the PDF drawn below.
# kde expects a boolean: the string "True" merely happened to be truthy.
sns.histplot(data=katana_data, bins=25, stat="density", kde=True, color="r")

# Evaluate the fitted PDF across the current x-axis range and overlay it in black.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = stats.norm.pdf(x, mu, std)

plt.plot(x, p, "k", linewidth=2)
title = "Fit Values: {:.2f} and {:.2f}".format(mu, std)
plt.title(title)
plt.show()

In [None]:
# Sample skewness of the column (scipy.stats.skew with its default arguments).
katana_skew = stats.skew(katana_data)
print(f"Skewness of c.katana: {katana_skew}")

In [None]:
# Single box of the column's values on the y-axis. A box plot has no count
# axis, so the x-axis carries the column name instead of "occurrences".
ax = sns.boxplot(data=katana_data)
ax.set(title="Boxplot of c.katana", xlabel="c.katana", ylabel="Time active")
plt.show()

In [None]:
# Same distribution drawn on the full 0-100 axis. legend() is required for the
# label of the dashed mean line to be displayed. The title uses the full column
# name "c.katana", consistent with the other plots of this column.
ax = sns.histplot(katana_data, bins=25, kde=True)
ax.axvline(katana_mean, color="r", linestyle="--", label=f"Mean: {katana_mean:.2f}")
ax.set_xlim(0, 100)
ax.set(title="Distribution of c.katana", xlabel="Time active", ylabel="Numbers of occurrences")
ax.legend()
plt.show()

In [None]:
# 30-bin histogram with a KDE overlay; the dashed red line marks the column mean.
ax = sns.histplot(raider_data, bins=30, kde=True)
ax.axvline(raider_mean, color="r", linestyle="--", label=f"Mean: {raider_mean:.2f}")
ax.set(title="Distribution of c.raider", xlabel="Time active", ylabel="Numbers of occurrences")
ax.legend()
plt.show()

In [None]:
# Fit a normal distribution to the column; mu and std are the fitted parameters.
mu, std = stats.norm.fit(raider_data)

# Density-scaled histogram so it is comparable with the PDF drawn below.
# kde expects a boolean: the string "True" merely happened to be truthy.
sns.histplot(data=raider_data, bins=25, stat="density", kde=True, color="r")

# Evaluate the fitted PDF across the current x-axis range and overlay it in black.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = stats.norm.pdf(x, mu, std)

plt.plot(x, p, "k", linewidth=2)
title = "Fit Values: {:.2f} and {:.2f}".format(mu, std)
plt.title(title)
plt.show()

In [None]:
# Sample skewness of the column (scipy.stats.skew with its default arguments).
raider_skew = stats.skew(raider_data)
print(f"Skewness of c.raider: {raider_skew}")

In [None]:
# Single box of the column's values on the y-axis. A box plot has no count
# axis, so the x-axis carries the column name instead of "occurrences".
ax = sns.boxplot(data=raider_data)
ax.set(title="Boxplot of c.raider", xlabel="c.raider", ylabel="Time active")
plt.show()

In [None]:
# Same distribution drawn on the full 0-100 axis. legend() is required for the
# label of the dashed mean line to be displayed.
ax = sns.histplot(raider_data, bins=25, kde=True)
ax.axvline(raider_mean, color="r", linestyle="--", label=f"Mean: {raider_mean:.2f}")
ax.set_xlim(0, 100)
ax.set(title="Distribution of c.raider", xlabel="Time active", ylabel="Numbers of occurrences")
ax.legend()
plt.show()

In [None]:
# 30-bin histogram with a KDE overlay; the dashed red line marks the column mean.
ax = sns.histplot(vending_data, bins=30, kde=True)
ax.axvline(vending_mean, color="r", linestyle="--", label=f"Mean: {vending_mean:.2f}")
ax.set(title="Distribution of c.android.vending", xlabel="Time active", ylabel="Numbers of occurrences")
ax.legend()
plt.show()

In [None]:
# Fit a normal distribution to the column; mu and std are the fitted parameters.
mu, std = stats.norm.fit(vending_data)

# Density-scaled histogram so it is comparable with the PDF drawn below.
# kde expects a boolean: the string "True" merely happened to be truthy.
sns.histplot(data=vending_data, bins=25, stat="density", kde=True, color="r")

# Evaluate the fitted PDF across the current x-axis range and overlay it in black.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = stats.norm.pdf(x, mu, std)

plt.plot(x, p, "k", linewidth=2)
title = "Fit Values: {:.2f} and {:.2f}".format(mu, std)
plt.title(title)
plt.show()

In [None]:
# Sample skewness of the column (scipy.stats.skew with its default arguments).
vending_skew = stats.skew(vending_data)
print(f"Skewness of c.android.vending: {vending_skew}")

In [None]:
# Single box of the column's values on the y-axis. A box plot has no count
# axis, so the x-axis carries the column name instead of "occurrences".
ax = sns.boxplot(data=vending_data)
ax.set(title="Boxplot of c.android.vending", xlabel="c.android.vending", ylabel="Time active")
plt.show()

In [None]:
# Same distribution drawn on the full 0-100 axis. legend() is required for the
# label of the dashed mean line to be displayed.
ax = sns.histplot(vending_data, bins=25, kde=True)
ax.axvline(vending_mean, color="r", linestyle="--", label=f"Mean: {vending_mean:.2f}")
ax.set_xlim(0, 100)
ax.set(title="Distribution of c.android.vending", xlabel="Time active", ylabel="Numbers of occurrences")
ax.legend()
plt.show()

In [None]:
# 30-bin histogram with a KDE overlay; the dashed red line marks the column mean.
ax = sns.histplot(x86_data, bins=30, kde=True)
ax.axvline(x86_mean, color="r", linestyle="--", label=f"Mean: {x86_mean:.2f}")
ax.set(title="Distribution of c.UCMobile.x86", xlabel="Time active", ylabel="Numbers of occurrences")
ax.legend()
plt.show()

In [None]:
# Fit a normal distribution to the column; mu and std are the fitted parameters.
mu, std = stats.norm.fit(x86_data)

# Density-scaled histogram so it is comparable with the PDF drawn below.
# kde expects a boolean: the string "True" merely happened to be truthy.
sns.histplot(data=x86_data, bins=25, stat="density", kde=True, color="r")

# Evaluate the fitted PDF across the current x-axis range and overlay it in black.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = stats.norm.pdf(x, mu, std)

plt.plot(x, p, "k", linewidth=2)
title = "Fit Values: {:.2f} and {:.2f}".format(mu, std)
plt.title(title)
plt.show()

In [None]:
# Sample skewness of the column (scipy.stats.skew with its default arguments).
x86_skew = stats.skew(x86_data)
print(f"Skewness of c.UCMobile.x86: {x86_skew}")

In [None]:
# Single box of the column's values on the y-axis. A box plot has no count
# axis, so the x-axis carries the column name instead of "occurrences".
ax = sns.boxplot(data=x86_data)
ax.set(title="Boxplot of c.UCMobile.x86", xlabel="c.UCMobile.x86", ylabel="Time active")
plt.show()

In [None]:
# Same distribution drawn on the full 0-100 axis. legend() is required for the
# label of the dashed mean line to be displayed.
ax = sns.histplot(x86_data, bins=25, kde=True)
ax.axvline(x86_mean, color="r", linestyle="--", label=f"Mean: {x86_mean:.2f}")
ax.set_xlim(0, 100)
ax.set(title="Distribution of c.UCMobile.x86", xlabel="Time active", ylabel="Numbers of occurrences")
ax.legend()
plt.show()

In [None]:
# 30-bin histogram with a KDE overlay; the dashed red line marks the column mean.
ax = sns.histplot(updateassist_data, bins=30, kde=True)
ax.axvline(updateassist_mean, color="r", linestyle="--", label=f"Mean: {updateassist_mean:.2f}")
ax.set(title="Distribution of c.updateassist", xlabel="Time active", ylabel="Numbers of occurrences")
ax.legend()
plt.show()

In [None]:
# Fit a normal distribution to the column; mu and std are the fitted parameters.
mu, std = stats.norm.fit(updateassist_data)

# Density-scaled histogram so it is comparable with the PDF drawn below.
# kde expects a boolean: the string "True" merely happened to be truthy.
sns.histplot(data=updateassist_data, bins=25, stat="density", kde=True, color="r")

# Evaluate the fitted PDF across the current x-axis range and overlay it in black.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = stats.norm.pdf(x, mu, std)

plt.plot(x, p, "k", linewidth=2)
title = "Fit Values: {:.2f} and {:.2f}".format(mu, std)
plt.title(title)
plt.show()

In [None]:
# Sample skewness of the column (scipy.stats.skew with its default arguments).
updateassist_skew = stats.skew(updateassist_data)
print(f"Skewness of c.updateassist: {updateassist_skew}")

In [None]:
# Single box of the column's values on the y-axis. A box plot has no count
# axis, so the x-axis carries the column name instead of "occurrences".
ax = sns.boxplot(data=updateassist_data)
ax.set(title="Boxplot of c.updateassist", xlabel="c.updateassist", ylabel="Time active")
plt.show()

In [None]:
# Same distribution drawn on the full 0-100 axis. legend() is required for the
# label of the dashed mean line to be displayed.
ax = sns.histplot(updateassist_data, bins=25, kde=True)
ax.axvline(updateassist_mean, color="r", linestyle="--", label=f"Mean: {updateassist_mean:.2f}")
ax.set_xlim(0, 100)
ax.set(title="Distribution of c.updateassist", xlabel="Time active", ylabel="Numbers of occurrences")
ax.legend()
plt.show()

In [None]:
# 30-bin histogram with a KDE overlay; the dashed red line marks the column mean.
ax = sns.histplot(intl_data, bins=30, kde=True)
ax.axvline(intl_mean, color="r", linestyle="--", label=f"Mean: {intl_mean:.2f}")
ax.set(title="Distribution of c.UCMobile.intl", xlabel="Time active", ylabel="Numbers of occurrences")
ax.legend()
plt.show()

In [None]:
# Fit a normal distribution to the column; mu and std are the fitted parameters.
mu, std = stats.norm.fit(intl_data)

# Density-scaled histogram so it is comparable with the PDF drawn below.
# kde expects a boolean: the string "True" merely happened to be truthy.
sns.histplot(data=intl_data, bins=25, stat="density", kde=True, color="r")

# Evaluate the fitted PDF across the current x-axis range and overlay it in black.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = stats.norm.pdf(x, mu, std)

plt.plot(x, p, "k", linewidth=2)
title = "Fit Values: {:.2f} and {:.2f}".format(mu, std)
plt.title(title)
plt.show()

In [None]:
# Sample skewness of the column (scipy.stats.skew with its default arguments).
intl_skew = stats.skew(intl_data)
print(f"Skewness of c.UCMobile.intl: {intl_skew}")

In [None]:
# Single box of the column's values on the y-axis. A box plot has no count
# axis, so the x-axis carries the column name instead of "occurrences".
ax = sns.boxplot(data=intl_data)
ax.set(title="Boxplot of c.UCMobile.intl", xlabel="c.UCMobile.intl", ylabel="Time active")
plt.show()

In [None]:
# Same distribution drawn on the full 0-100 axis. legend() is required for the
# label of the dashed mean line to be displayed.
ax = sns.histplot(intl_data, bins=25, kde=True)
ax.axvline(intl_mean, color="r", linestyle="--", label=f"Mean: {intl_mean:.2f}")
ax.set_xlim(0, 100)
ax.set(title="Distribution of c.UCMobile.intl", xlabel="Time active", ylabel="Numbers of occurrences")
ax.legend()
plt.show()