## Common families across the years 2013–2024

In [1]:
# Build the index -> family-name lookup: each family's key is simply its
# position in the list below.
family_map = dict(enumerate([
    'airpush', 'dianjin', 'dnotua', 'ewind', 'fakeapp',
    'plankton', 'smsagent', 'smspay', 'smsreg', 'umpay',
]))

# Show the resulting mapping
print(family_map)

{0: 'airpush', 1: 'dianjin', 2: 'dnotua', 3: 'ewind', 4: 'fakeapp', 5: 'plankton', 6: 'smsagent', 7: 'smspay', 8: 'smsreg', 9: 'umpay'}


In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# One report CSV per year from 2013 through 2024. 2015 is left out because
# no report exists for that year.
csv_files = [
    f'/home/ihossain/ISMAIL/CADE/reports/drebin_{year}/intermediate/mlp_detect_results_all_m10.0_lambda0.1.csv'
    for year in (*range(2013, 2015), *range(2016, 2025))
]

# Read every yearly report, then stack them into one frame with a fresh
# 0..n-1 index.
df_list = list(map(pd.read_csv, csv_files))
df = pd.concat(df_list, ignore_index=True)

# Family name looked up from the value in 'real_label' via family_map
df['Malware Families'] = df['real_label'].map(family_map)

# Legend-friendly drift label derived from the 'Y'/'N' flag in 'is_drift'
df['DriftStatus'] = df['is_drift'].map({'Y': 'Drift', 'N': 'Non-drift'})

# Sample counts per (family, drift status): rows are families, columns are
# drift statuses, and combinations that never occur are filled with 0.
counts = (
    df.groupby(['Malware Families', 'DriftStatus'])
    .size()
    .unstack(fill_value=0)
)

# Plotting: box plot of 'min_distance' per malware family, split by drift
# status, with per-family sample counts written above the boxes.
fig, ax = plt.subplots(figsize=(7, 5), dpi=300)

# Pin the x-axis category order explicitly (order of first appearance, with
# missing/NaN family names dropped). The same list drives both the box plot
# and the annotation loop, so each count is guaranteed to sit over its own box
# even when some rows have no family name.
family_order = df['Malware Families'].dropna().unique().tolist()

sns.boxplot(data=df, x='Malware Families', y='min_distance', hue='DriftStatus',
            order=family_order,
            palette={'Drift': 'red', 'Non-drift': 'green'},
            fliersize=2, linewidth=1.5, ax=ax)

# Add counts as text annotations above the largest plotted distance:
# non-drift count (green) on the lower row, drift count (red) on the row above.
y_top = df['min_distance'].max()  # loop-invariant, so compute it once
for i, family in enumerate(family_order):
    if family not in counts.index:
        # No rows with a known drift status for this family; nothing to show.
        continue
    drift_count = counts.loc[family, 'Drift'] if 'Drift' in counts.columns else 0
    non_drift_count = counts.loc[family, 'Non-drift'] if 'Non-drift' in counts.columns else 0
    ax.text(i, y_top + 0.06, f"{non_drift_count}",
            ha='center', va='top', fontsize=8, color='green')
    ax.text(i, y_top + 0.09, f"{drift_count}",
            ha='center', va='top', fontsize=8, color='red')

ax.set_ylabel("Dist. to nearest centroid")
plt.setp(ax.get_xticklabels(), rotation=45)
ax.legend(title="")  # keep the legend entries but drop the legend title
fig.tight_layout()
plt.show()
