## 📊 Dataset Insights

In this notebook, we first provide some information, such as the average APK size and average description length of our ground-truth data.

Next, we present statistics about the level of granularity (how class IDs are distributed across Google Play categories, and vice versa) and generate plots for selected categories.

#### Import

In [None]:
# IMPORT
from   tqdm                 import tqdm
import matplotlib.pyplot    as plt
import pandas               as pd
import numpy                as np
import os

#### Parameters

In [None]:
# Dataset Path
# CSV loaded into appsDF by the "Load Data" cells of this notebook.
INPUT_PATH = "../0_Data/CSV/0_AndroCatSet.csv"

# AndrozooInfo
# CSV loaded into androzooDF; ANDROZOO_COLUMNS is passed to read_csv as the column names.
# Only 'sha256' and 'apk_size' are used further down in this notebook.
ANDROZOO_PATH    = "../0_DatasetCreation/0_androzoo.csv"
ANDROZOO_COLUMNS = ['sha256','sha1','md5','dex_date','apk_size','pkg_name','vercode','vt_detection','vt_scan_date','dex_size','markets']

In [None]:
# Where to save output file
# Receives the per-class summary table (infoDF) written with to_csv in section 2.
OUTPUT_PATH = "./datasetInsights.csv"

In [None]:
# Initialize tqdm for pandas.
# Required before the progress_apply call used to compute description lengths.
tqdm.pandas()

In [None]:
# Marker printed at the start of a run.
print("⚡ START ⚡\n")

### Load Data

In [None]:
# Read the apps table from INPUT_PATH and report how many rows it holds.
appsDF = pd.read_csv(INPUT_PATH, index_col=False)
print("#️⃣ Apps: {}".format(len(appsDF)))

### Load AndroZoo

In [None]:
# Read the AndroZoo index using the column names from ANDROZOO_COLUMNS.
# NOTE(review): because names= is given without header=, the first line of the file is
# read as data; if the CSV carries its own header row it is counted as an app — TODO confirm.
androzooDF = pd.read_csv(ANDROZOO_PATH, names=ANDROZOO_COLUMNS, index_col=False)
print("#️⃣ Androzoo Apps: {}".format(len(androzooDF)))

###  1. Retrieve APK Size and compute Description Length

In [None]:
# Create a dictionary mapping sha256 values to apk_size values from androzooDF
apkSizeMap = androzooDF.set_index('sha256')['apk_size'].to_dict()

# Look up each app's APK size by hash; apps whose sha256 is absent from AndroZoo get 0.
# The filled column is assigned back explicitly: calling fillna(inplace=True) on
# appsDF['apkSize'] is chained assignment, which warns on pandas 2.x and no longer
# updates appsDF once copy-on-write is enabled.
# NOTE(review): zero-filled sizes take part in the per-class averages computed
# in the next section — confirm this is intended.
appsDF['apkSize'] = appsDF['sha256'].map(apkSizeMap).fillna(0)
appsDF.head(3)

In [None]:
# Description length in characters. A missing description (NaN after read_csv)
# counts as 0 instead of as the 3 characters of the string "nan".
appsDF['descriptionLength'] = appsDF['googlePlayDescription'].progress_apply(
    lambda text: 0 if pd.isna(text) else len(str(text))
)
appsDF.head(3)

### 2. Compute Avg Description Length and Avg APK Size

In [None]:
# Per-class mean and standard deviation of APK size and description length.
infoDF = appsDF.groupby('classID').agg({
    'apkSize': ['mean', 'std'],
    'descriptionLength': ['mean', 'std']
}).reset_index()

# Flatten the two-level column index produced by agg() into plain names.
infoDF.columns = ['classID', 'avgApkSize', 'stdApkSize', 'avgDescriptionLength', 'stdDescriptionLength']

# Scale APK sizes by 1e6 (presumably bytes -> MB; confirm the unit of apk_size)
# and truncate the description statistics to whole numbers.
infoDF['avgApkSize']            = (infoDF['avgApkSize'] / 1000000).round(2)
infoDF['stdApkSize']            = (infoDF['stdApkSize'] / 1000000).round(2)
infoDF['avgDescriptionLength']  = infoDF['avgDescriptionLength'].astype(int)
infoDF['stdDescriptionLength']  = infoDF['stdDescriptionLength'].astype(int)

# Append a summary row holding the mean of every numeric column across classes.
# DataFrame.append no longer exists in pandas 2.x, so the row is added with pd.concat,
# and it is labelled directly instead of through a hard-coded row index, which was
# only correct when the dataset had exactly 50 classes.
totalRow = infoDF.mean(numeric_only=True).round(2).to_dict()
totalRow['classID'] = "TOTAL"
infoDF = pd.concat([infoDF, pd.DataFrame([totalRow])], ignore_index=True)

# The concatenation turns the integer columns into floats; cast them back.
infoDF['avgDescriptionLength']  = infoDF['avgDescriptionLength'].astype(int)
infoDF['stdDescriptionLength']  = infoDF['stdDescriptionLength'].astype(int)

# Save the result
infoDF.to_csv(OUTPUT_PATH, index=False)

### 3. Load Data again

In [None]:
# Start again from the file on disk, discarding the columns added in sections 1-2.
appsDF = pd.read_csv(INPUT_PATH, index_col=False)
print("#️⃣ APPS: {}".format(len(appsDF)))

# The package name and description text are not used by the remaining sections.
appsDF = appsDF.drop(columns=['pkgName', 'googlePlayDescription'])

appsDF.head(3)

### 4. Group the DataFrame by 'classID' and count the occurrences of 'googlePlayCategoryID'

In [None]:
# For every classID, count how many apps fall into each googlePlayCategoryID.
counts = appsDF.groupby('classID')['googlePlayCategoryID'].value_counts()

# Entries of one classID are adjacent in `counts`, so a header is printed
# each time the classID differs from the one seen on the previous entry.
prevClassID = None

for (classID, googlePlayCategoryID), count in counts.items():
    if classID != prevClassID:
        print("\n🏷️ classID : {}".format(classID))
        prevClassID = classID

    print("-- gpCategoryID: {:20} Count: {}".format(googlePlayCategoryID, count))

### 5. Group the DataFrame by 'googlePlayCategoryID' and count the occurrences of 'classID'

In [None]:
# For every googlePlayCategoryID, count how many apps belong to each classID.
counts = appsDF.groupby('googlePlayCategoryID')['classID'].value_counts()

# Entries of one category are adjacent in `counts`, so a header is printed
# each time the category differs from the one seen on the previous entry.
prevGpID = None

for (gpID, classID), count in counts.items():
    if gpID != prevGpID:
        print("\n🏷️ gpID : {}".format(gpID))
        prevGpID = gpID

    print("-- classID: {:25} Count: {}".format(classID, count))

### 6. Plots

The distribution of class IDs among the Play Store categories of TOOLS and HOUSE_AND_HOME.

In [None]:
# Directory that receives the exported figures.
PLOTS_PATH = "./Plots/"

# Make sure the directory exists and report which case applied.
if os.path.exists(PLOTS_PATH):
    print("Folder already exists:", PLOTS_PATH)
else:
    os.makedirs(PLOTS_PATH)
    print("Folder created:", PLOTS_PATH)

# Font sizes
TITLE_FONTSIZE = 18
TICKS_FONTSIZE = 13
AXIS_FONTSIZE  = 14

# Fill colours for the bar segments; the plotting cells cycle through them in order.
COLORS = [
    "#68B984", "#F0A04B", "#89CFFD", "#FFCB42",
    "#BFACE2", "#E96479", "#5DA7DB",
]

#### CategoryID: Tools

In [None]:
# googlePlayCategoryID value used to filter appsDF in the next cell.
TO_PLOT = "TOOLS"

In [None]:
# Keep only the apps of the selected Google Play category and count them per classID.
plotDF = appsDF[appsDF['googlePlayCategoryID'] == TO_PLOT]
plotDF = plotDF.groupby('googlePlayCategoryID')['classID'].value_counts().reset_index(name='numApps')

# Classes holding less than 3% of the category's apps are relabelled 'Other'.
plotDF.loc[plotDF['numApps'] / plotDF['numApps'].sum() < 0.03, 'classID'] = 'Other'

# Merge the relabelled rows. Only the count column is summed: summing the whole frame
# would also "sum" the googlePlayCategoryID strings, i.e. concatenate them.
plotDF = plotDF.groupby('classID', as_index=False)['numApps'].sum()
plotDF = plotDF.sort_values('numApps', ascending=False)
plotDF.head(3)

In [None]:
# Segment widths, with a leading 0 so that the entries before position i
# add up to the left edge of segment i.
VALUES = plotDF['numApps'].values
VALUES = np.insert(VALUES, 0, 0)

# Take a private copy of the labels: the array returned by .values can be a view of
# plotDF's data (read-only under copy-on-write), so writing into it would either
# silently edit plotDF or raise.
LABELS = plotDF['classID'].to_numpy(copy=True)

# NOTE(review): the override is positional — it renames whichever class is fifth after
# sorting and raises IndexError when fewer than five classes remain. Confirm against the data.
LABELS[4] = "BarcodeScanner"

In [None]:
# A single horizontal bar split into one segment per class; the axes themselves are hidden.
fig, ax = plt.subplots(figsize=(35, 3))
ax.axis('off')

# leftEdges[k] is the combined width of VALUES[0..k], i.e. the left edge of segment k + 1.
leftEdges = np.cumsum(VALUES)
totalApps = sum(plotDF['numApps'])

for pos in range(1, len(VALUES)):
    width = VALUES[pos]
    left = leftEdges[pos - 1]

    ax.barh(0,
            width,
            left=left,
            edgecolor='black',
            color=COLORS[(pos - 1) % len(COLORS)])

    # Alternate the label height so that neighbouring labels do not overlap.
    labelY = -0.7 if pos % 2 == 0 else -1
    centerX = left + width / 2

    # Class name below the bar, share and absolute count inside the segment.
    ax.text(centerX, labelY, LABELS[pos - 1], ha='center', va='center', fontsize=26)
    ax.text(centerX, 0, '{:.1f}%\n({:,})'.format(width / totalApps * 100, width), ha='center', va='center', fontsize=24)

    # Vertical connector from the segment down to its label.
    ax.plot([centerX, centerX], [-0.41, labelY + 0.1], color='black')

# Clip the x-axis to the full extent of the bar.
ax.set_xlim(0, leftEdges[-1])

# Save
fig.savefig(PLOTS_PATH + 'toolsOverview.png', dpi=300, bbox_inches='tight')
fig.savefig(PLOTS_PATH + 'toolsOverview.pdf', bbox_inches='tight')

# Display the chart
plt.show()

#### CategoryID:  House_And_Home

In [None]:
# Apps in the HOUSE_AND_HOME category, leaving out the 'Translator' class,
# counted per classID.
isHouseAndHome = appsDF['googlePlayCategoryID'] == "HOUSE_AND_HOME"
isTranslator = appsDF['classID'] == 'Translator'

plotDF = appsDF[isHouseAndHome & ~isTranslator]
plotDF = plotDF.groupby('googlePlayCategoryID')['classID'].value_counts().reset_index(name='numApps')
plotDF

In [None]:
# Segment widths, with a leading 0 so that the entries before position i
# add up to the left edge of segment i.
VALUES = np.insert(plotDF['numApps'].values, 0, 0)

# Segment labels, in the same order as the widths.
LABELS = plotDF['classID'].values

In [None]:
# A single horizontal bar split into one segment per class; the axes themselves are hidden.
fig, ax = plt.subplots(figsize=(35, 3))
ax.axis('off')

# leftEdges[k] is the combined width of VALUES[0..k], i.e. the left edge of segment k + 1.
leftEdges = np.cumsum(VALUES)
totalApps = sum(plotDF['numApps'])

for pos in range(1, len(VALUES)):
    width = VALUES[pos]
    left = leftEdges[pos - 1]

    ax.barh(0,
            width,
            left=left,
            edgecolor='black',
            color=COLORS[(pos - 1) % len(COLORS)])

    # Alternate the label height so that neighbouring labels do not overlap.
    labelY = -0.7 if pos % 2 == 0 else -1
    centerX = left + width / 2

    # Class name below the bar, share and absolute count inside the segment.
    ax.text(centerX, labelY, LABELS[pos - 1], ha='center', va='center', fontsize=26)
    ax.text(centerX, 0, '{:.1f}%\n({:,})'.format(width / totalApps * 100, width), ha='center', va='center', fontsize=24)

    # Vertical connector from the segment down to its label.
    ax.plot([centerX, centerX], [-0.41, labelY + 0.1], color='black')

# Clip the x-axis to the full extent of the bar.
ax.set_xlim(0, leftEdges[-1])

# Save
fig.savefig(PLOTS_PATH + 'HouseAndHomeOverview.png', dpi=300, bbox_inches='tight')
fig.savefig(PLOTS_PATH + 'HouseAndHomeOverview.pdf', bbox_inches='tight')

# Display the chart
plt.show()

In [None]:
# Marker printed once every cell above has run.
print("\n🔚 END")