In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
from scipy.sparse import load_npz

# Load VirusTotal data.
# NOTE: vt_df is not consumed anywhere in this cell -- the hash merge is disabled and the
# per-sample detection count is read from the 'vt_count' field of the meta files instead.
vt_df = pd.read_csv("/home/shared-datasets/Feature_extraction/all_hash_vtdetect_added.csv")
vt_df['vt_detection'] = pd.to_numeric(vt_df['vt_detection'], errors='coerce')
vt_df['sha256'] = vt_df['sha256'].str.lower()

# Directory containing meta files
data_dir = "/home/shared-datasets/Feature_extraction/npz_yearwise_Final_rerun_0.001"
thresholds = range(0, 100)
results = {t: {'benign': 0, 'malware': 0} for t in thresholds}  # not filled in by this cell

vt_feat_index = {t: {0: [], 1: []} for t in thresholds}  # not filled in by this cell
# vt_features[vt_count][label] -> list of dense feature rows,
# where label is 0 for rows with y == 0 and 1 for rows with y == 1.
vt_features = {t: {0: [], 1: []} for t in thresholds}

# Aggregate feature rows across all years except 2015
for year in range(2013, 2026):
    if year == 2015:
        continue

    meta_file1 = os.path.join(data_dir, f"{year}_meta_train.npz")
    meta_file2 = os.path.join(data_dir, f"{year}_meta_test.npz")
    train_data_dir = f'{data_dir}/{year}_X_train.npz'
    test_data_dir = f'{data_dir}/{year}_X_test.npz'

    # A year only contributes when all four files are present. Checking up front avoids
    # crashing on a missing test-meta file and avoids indexing rows that have no features.
    required_files = (meta_file1, meta_file2, train_data_dir, test_data_dir)
    if not all(os.path.exists(path) for path in required_files):
        continue

    # SECURITY: allow_pickle=True lets np.load unpickle object arrays, which can execute
    # arbitrary code. Only use it on files from a trusted source.
    # The context managers close the underlying .npz file handles once the arrays are read.
    with np.load(meta_file1, allow_pickle=True) as meta1:
        meta1_df = pd.DataFrame({k: meta1[k] for k in meta1.files})
    with np.load(meta_file2, allow_pickle=True) as meta2:
        meta2_df = pd.DataFrame({k: meta2[k] for k in meta2.files})
    # Train rows first, then test rows -- the same order used for the feature matrix below,
    # so a positional index into full_meta_df is also a row index into data_X.
    full_meta_df = pd.concat([meta1_df, meta2_df], axis=0, ignore_index=True)

    train_data_X = load_npz(train_data_dir).toarray()
    test_data_X = load_npz(test_data_dir).toarray()
    data_X = np.concatenate((train_data_X, test_data_X), axis=0)

    # Positional alignment is the only link between labels and features; refuse to go on
    # if the two sources disagree on the number of samples.
    if len(data_X) != len(full_meta_df):
        raise ValueError(
            f"{year}: {len(full_meta_df)} meta rows but {len(data_X)} feature rows; "
            "meta and X files are not aligned"
        )

    y_vals = full_meta_df['y'].astype(int).to_numpy()
    vt_counts = full_meta_df['vt_count'].astype(int).to_numpy()

    # Rows whose vt_count is not one of the tracked thresholds have no bucket; report
    # them instead of failing with a KeyError.
    n_out_of_range = int((~np.isin(vt_counts, list(thresholds))).sum())
    if n_out_of_range:
        print(f"{year}: skipped {n_out_of_range} rows with vt_count outside "
              f"[{thresholds.start}, {thresholds.stop})")

    is_benign = y_vals == 0
    is_malware = y_vals == 1
    for vts in thresholds:
        at_vts = vt_counts == vts
        # flatnonzero yields ascending positions, i.e. the original row order.
        benign_rows = np.flatnonzero(at_vts & is_benign)
        malware_rows = np.flatnonzero(at_vts & is_malware)
        if benign_rows.size > 0:
            vt_features[vts][0].extend(data_X[benign_rows])
        if malware_rows.size > 0:
            vt_features[vts][1].extend(data_X[malware_rows])

In [3]:
# One line per VT detection count: number of label-0 rows, then number of label-1 rows.
for vts, rows_by_label in vt_features.items():
    n_label0 = len(rows_by_label[0])
    n_label1 = len(rows_by_label[1])
    print(f"Processing {vts} {n_label0} {n_label1}")

Processing 0 638475 0
Processing 1 0 0
Processing 2 0 0
Processing 3 0 0
Processing 4 0 44979
Processing 5 0 43103
Processing 6 0 40134
Processing 7 0 35046
Processing 8 0 28937
Processing 9 0 22331
Processing 10 0 17335
Processing 11 0 14258
Processing 12 0 12433
Processing 13 0 12876
Processing 14 0 12242
Processing 15 0 10271
Processing 16 0 8813
Processing 17 0 7854
Processing 18 0 6717
Processing 19 0 6143
Processing 20 0 5557
Processing 21 0 4978
Processing 22 0 4409
Processing 23 0 3736
Processing 24 0 3275
Processing 25 0 2835
Processing 26 0 2646
Processing 27 0 2562
Processing 28 0 2089
Processing 29 0 2013
Processing 30 0 1827
Processing 31 0 1786
Processing 32 0 1462
Processing 33 0 1254
Processing 34 0 1060
Processing 35 0 897
Processing 36 0 856
Processing 37 0 762
Processing 38 0 691
Processing 39 0 562
Processing 40 0 378
Processing 41 0 316
Processing 42 0 182
Processing 43 0 120
Processing 44 0 64
Processing 45 0 39
Processing 46 0 30
Processing 47 0 20
Processing 48 

In [4]:
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt

# Notebook-wide matplotlib defaults: bold text throughout, slightly larger titles,
# and legends drawn without a frame.
for rc_key, rc_value in (
    ("font.size", 14),
    ("font.weight", "bold"),
    ("axes.labelweight", "bold"),
    ("axes.titlesize", 16),
    ("axes.titleweight", "bold"),
    ("xtick.labelsize", 13),
    ("ytick.labelsize", 13),
    ("legend.fontsize", 13),
    ("legend.frameon", False),
):
    plt.rcParams[rc_key] = rc_value

In [None]:


# t-SNE of benign samples (vt_count == 0, label 0) against malware samples at each
# vt_count from 4 to 13, one panel per vt_count.

# Upper bound on the number of points taken from each class per panel. Running t-SNE on
# every row (hundreds of thousands of benign samples) is not practical, so each class is
# randomly subsampled. Set to None to embed every sample.
MAX_SAMPLES_PER_CLASS = 10_000
TSNE_PERPLEXITY = 30
SAMPLING_SEED = 42


def _sample_rows(rows, max_rows, rng):
    """Return up to `max_rows` entries of `rows` as an array, drawn uniformly without replacement.

    Sampling is done on the list before converting to an array so the full set of rows
    is never copied into one large dense matrix. The original order of the kept rows is
    preserved. With `max_rows=None`, or when `rows` is already small enough, all rows
    are returned.
    """
    if max_rows is not None and len(rows) > max_rows:
        keep = np.sort(rng.choice(len(rows), size=max_rows, replace=False))
        rows = [rows[i] for i in keep]
    return np.array(rows)


# Re-created on every run of the cell so re-executing it gives the same subsample.
rng = np.random.default_rng(SAMPLING_SEED)

fig, axes = plt.subplots(2, 5, figsize=(25, 10))
axes = axes.flatten()
# The same benign subsample is shared by all panels so they are directly comparable.
benign_feats = _sample_rows(vt_features[0][0], MAX_SAMPLES_PER_CLASS, rng)
for i, vts in enumerate(range(4, 14)):
    malware_feats = _sample_rows(vt_features[vts][1], MAX_SAMPLES_PER_CLASS, rng)
    # Only plot if both classes have samples
    if len(benign_feats) > 0 and len(malware_feats) > 0:
        n_benign = len(benign_feats)
        n_malware = len(malware_feats)
        X = np.vstack([benign_feats, malware_feats])
        y = np.array([0]*n_benign + [1]*n_malware)
        # t-SNE requires perplexity to be strictly smaller than the number of samples.
        perplexity = min(TSNE_PERPLEXITY, len(X) - 1)
        tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
        X_embedded = tsne.fit_transform(X)
        ax = axes[i]
        ax.scatter(X_embedded[y==0, 0], X_embedded[y==0, 1], label='Benign', alpha=0.5, s=10)
        ax.scatter(X_embedded[y==1, 0], X_embedded[y==1, 1], label='Malware', alpha=0.5, s=10)
        ax.set_title(f'vts={vts}')
        ax.set_xlabel('t-SNE 1')
        ax.set_ylabel('t-SNE 2')
        ax.legend()
    else:
        axes[i].set_title(f'vts={vts} (insufficient data)')
        axes[i].axis('off')

plt.tight_layout()
plt.show()