In [687]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Optional


file_path: str = "../dataset"
files: tuple[str, ...] = ("connections", "devices", "processes", "profiles")

# Read every table in one pass. The files are parsed as tab-separated (sep="\t")
# even though they carry a .csv extension; each frame is keyed by its base file name.
dataset: dict[str, pd.DataFrame] = {
    name: pd.read_csv(f"{file_path}/{name}.csv", sep="\t") for name in files
}

Connections description


In [None]:
# Print the schema summary (column dtypes and non-null counts) of the connections table.
dataset["connections"].info()

-   Total Entries: 15,108
-   Total Columns: 13
-   Column Types:
-   11 columns of type float64
-   1 column of type int64
-   1 column of type object
-   There are no missing values in this data


In [None]:
# Display the raw "ts" column to inspect how its values are formatted.
dataset["connections"]["ts"]

-   The object-typed column "ts" holds date and time values.


In [None]:
# Median of every float64/int64 column of the connections table.
median = (
    dataset["connections"]
    .select_dtypes(include=["float64", "int64"])
    .median()
)

# Append the medians as an extra "median" row under the standard describe() statistics.
summary = dataset["connections"].describe()
summary.loc["median"] = median
summary

-   From the tables above we can see that imei is a long integer and looks like an ID. The processes table contains the same values, which suggests that it is the ID of a device.
-   Another assumption we can make concerns the columns specifying a connection type (columns starting with "c.", such as c.android.youtube): their values range from 0 to 100, which could indicate the percentage of time during which the connection was established.


-   The first few rows might indicate that the data was sampled at 1-minute intervals.
-   Let's look at it closer.


In [None]:
# Show the "ts" values in ascending order (only the column itself is sorted).
dataset["connections"]["ts"].sort_values(ascending=True)

-   After sorting, the samples still appear to be spaced at 1-minute intervals.
-   Let's go further.


In [None]:
# For every imei, list that group's rows (imei and ts columns only) ordered by ts
# ascending, then display the result.
# NOTE(review): "ts" is not converted to datetime in this cell, so if it is still a
# string column the ordering is lexicographic — confirm the format sorts chronologically.
# `times` is reassigned by the next cell, so this value is only used for display here.
times = (
    dataset["connections"]
    .groupby(by="imei")[["imei", "ts"]]
    .apply(lambda x: x.sort_values(by="ts", ascending=True))
)
times

In [None]:
# Parse the timestamps BEFORE sorting so the order is chronological rather than the
# lexicographic order of the raw values.
times = pd.to_datetime(dataset["connections"]["ts"]).sort_values()

# Gap between each sample and its predecessor. The first sample has no predecessor,
# so its gap is NaT and is dropped.
gaps = times.diff().dropna()

# Whole seconds of the FULL gap. `Timedelta.seconds` only exposes the seconds
# component (0..86399) and discards whole days, so a gap of exactly one day would be
# counted as "same time" and one day plus a minute as a regular one-minute step.
# Truncating to an integer keeps the whole-second granularity of the comparison.
gap_seconds = gaps.dt.total_seconds().astype("int64")

# Consecutive samples sharing the same timestamp.
same_times: int = int((gap_seconds == 0).sum())
# Consecutive samples whose gap is neither zero nor exactly one minute.
non_minute_differences: int = int((~gap_seconds.isin([0, 60])).sum())

print(f"Non minute differences: {non_minute_differences}")
print(f"Same times: {same_times}")

From this we can see that there is data for every minute, and that some timestamps occur more than once.


In [None]:
# Count how many rows carry each distinct "mwra" value.
dataset["connections"]["mwra"].value_counts()

-   mwra stands for malware-related activity.
-   In data there are only values 1.0 and 0.0 indicating if there was a malware activity in specific time frame


In [None]:
# Values of the "c.dogalize" connection column.
dogalize = dataset["connections"]["c.dogalize"]
# NOTE(review): `mwra` is kept only as a backward-compatible alias. Despite its name
# it holds the "c.dogalize" column, NOT the "mwra" column — remove it once nothing
# else in the notebook refers to it.
mwra = dogalize

# Histogram (5 bins) with a kernel density estimate overlaid, drawn on an explicit
# Axes so the figure can carry its own title.
fig, ax = plt.subplots()
sns.histplot(dogalize, bins=5, kde=True, ax=ax)
ax.set_title('Distribution of "c.dogalize"')
plt.show()

In [None]:
# `median` is the per-column median Series computed in the summary cell above.
# The "imei" column is left out of the chart: the notes above describe it as a
# long-integer identifier, so its median is not a measurement and would share a
# y-axis with the 0-100 connection columns. errors="ignore" keeps the cell working
# if "imei" is absent.
median_to_plot = median.drop(labels="imei", errors="ignore")

# One bar per column. (A Series has a single axis, so no transpose is needed.)
ax = median_to_plot.plot(kind="bar", figsize=(10, 6), width=0.8)

# Title and axis labels describe what is actually plotted: medians only.
ax.set_title("Median of Each Numeric Column in Connections")
ax.set_xlabel("Columns")
ax.set_ylabel("Median value")

# Rotate the x-axis labels for better readability
plt.xticks(rotation=45, ha="right")

# Show the plot
plt.tight_layout()
plt.show()

Devices description


In [None]:
# info() prints the schema summary of the devices table as a side effect; describe()
# is the cell's last expression, so its statistics table is the rendered output.
dataset["devices"].info()
dataset["devices"].describe()

In [None]:
# Preview the first rows of the devices table.
dataset["devices"].head()

Processes description


In [None]:
# info() prints the schema summary of the processes table as a side effect; describe()
# is the cell's last expression, so its statistics table is the rendered output.
dataset["processes"].info()
dataset["processes"].describe()

In [None]:
# Preview the first rows of the processes table.
dataset["processes"].head()

Profiles description


In [None]:
# info() prints the schema summary of the profiles table as a side effect; describe()
# is the cell's last expression, so its statistics table is the rendered output.
dataset["profiles"].info()
dataset["profiles"].describe()

In [None]:
# Preview the first rows of the profiles table.
dataset["profiles"].head()