## Get the Lambda Metadata

In [None]:
import numpy as np
from scipy.sparse import load_npz
import os

# Prefixes that mark a label as a singleton (a sample with no named family).
SINGLETON_PREFIXES = ("singleton", "SINGLETON")
# Labels starting with any of these are not treated as named families.
EXCLUDED_PREFIXES = SINGLETON_PREFIXES + ("UNKNOWN", "-")


def summarize_family_labels(families):
    """Split family labels into a singleton count and the named families.

    Args:
        families: Iterable of family-label strings.

    Returns:
        A ``(singleton_count, named_families)`` tuple. ``singleton_count`` is
        the number of labels starting with "singleton" or "SINGLETON".
        ``named_families`` is ``np.unique`` of the labels that neither start
        with "singleton", "SINGLETON", "UNKNOWN" or "-" nor contain "benign".
    """
    singleton_count = 0
    named = []
    for family in families:
        if family.startswith(SINGLETON_PREFIXES):
            singleton_count += 1
        elif not family.startswith(EXCLUDED_PREFIXES) and "benign" not in family:
            named.append(family)
    return singleton_count, np.unique(named)


# Per-year results: number of singleton labels, and unique named families.
year_singleton = {}
year_families = {}
# NOTE(review): this cell scans 2013-2025 while the feature-extraction cells
# further down scan 2013-2024 -- confirm which range is intended.
for year in range(2013, 2026):
    data_dir_train = f'/home/shared-datasets/Feature_extraction/npz_yearwise_Final/{year}_meta_train.npz'
    data_dir_test = f'/home/shared-datasets/Feature_extraction/npz_yearwise_Final/{year}_meta_test.npz'

    # Years that do not have both the train and the test split are skipped.
    if not (os.path.exists(data_dir_train) and os.path.exists(data_dir_test)):
        continue

    # Only the 'family' array is needed, so read just that key instead of
    # concatenating every array in the archives. The context managers close
    # the underlying files once the labels are in memory.
    # NOTE: allow_pickle=True can execute arbitrary code while unpickling;
    # only load archives from a trusted source.
    with np.load(data_dir_train, allow_pickle=True) as data_train, \
            np.load(data_dir_test, allow_pickle=True) as data_test:
        families = np.concatenate((data_train['family'], data_test['family']))

    year_singleton[year], year_families[year] = summarize_family_labels(families)


In [None]:
# Report the number of unique families found in each year, then their sum.
# A family that occurs in several years is counted once per year.
unique_counts = [len(names) for names in year_families.values()]
for year, n_unique in zip(year_families, unique_counts):
    print(f'year: {year}, unique families: {n_unique}')
total_families = sum(unique_counts)

print("total families: ", total_families)

In [None]:
# Compute the families that occur in every one of the first
# YEARS_IN_INTERSECTION loaded years (year_families keeps insertion order,
# i.e. the order in which the years were loaded).
# NOTE(review): the cutoff of 11 is hard-coded, so any later years are not
# part of the intersection even though the messages below say "all years" --
# confirm this is intended.
YEARS_IN_INTERSECTION = 11
all_families = list(year_families.values())[:YEARS_IN_INTERSECTION]
family_sets = [set(families) for families in all_families]

# set.intersection() needs at least one set; when no year was loaded the
# unpacked call would raise TypeError, so fall back to an empty result.
common_families_all_years = set.intersection(*family_sets) if family_sets else set()

print(f"Number of common families across all years: {len(common_families_all_years)}")
print(f"Common families: {common_families_all_years}")

In [None]:
# Families selected from the common-family output above, leaving out 'UNKNOWN'.
target_families = [
    'airpush',
    'dianjin',
    'dnotua',
    'ewind',
    'fakeapp',
    'plankton',
    'smsagent',
    'smspay',
    'smsreg',
    'umpay',
]
# Position of each family inside target_families.
family_to_index = dict(zip(target_families, range(len(target_families))))

## Get the Lambda Features

In [None]:
import numpy as np
from scipy.sparse import load_npz
import os

def collect_target_indices(families, targets):
    """Group row positions by family label, keeping only the target families.

    Args:
        families: Sequence of family labels, one per sample row.
        targets: Iterable of family names to keep.

    Returns:
        Dict mapping each target family that occurs in ``families`` to the
        ascending list of positions at which it occurs. Families that never
        occur get no entry.
    """
    wanted = set(targets)
    positions = {}
    for i, family in enumerate(families):
        if family in wanted:
            positions.setdefault(family, []).append(i)
    return positions


# family_year_indices[family][year] -> row positions of that family within the
# year's labels, where the train labels come first and the test labels follow.
family_year_indices = {}
for year in range(2013, 2025):
    data_dir_train = f'/home/shared-datasets/Feature_extraction/npz_yearwise_Final/{year}_meta_train.npz'
    data_dir_test = f'/home/shared-datasets/Feature_extraction/npz_yearwise_Final/{year}_meta_test.npz'

    # Years that do not have both the train and the test split are skipped.
    if not (os.path.exists(data_dir_train) and os.path.exists(data_dir_test)):
        continue

    # Only the 'family' array is needed, so read just that key instead of
    # concatenating every array in the archives. The context managers close
    # the underlying files once the labels are in memory.
    # NOTE: allow_pickle=True can execute arbitrary code while unpickling;
    # only load archives from a trusted source.
    with np.load(data_dir_train, allow_pickle=True) as data_train, \
            np.load(data_dir_test, allow_pickle=True) as data_test:
        families = np.concatenate((data_train['family'], data_test['family']), axis=0)

    # Record this year's positions under each target family that is present.
    for family, indices in collect_target_indices(families, target_families).items():
        family_year_indices.setdefault(family, {})[year] = indices

In [None]:
# family_features[family] is a list with one feature matrix per available
# year, in ascending year order; each matrix holds the rows of that family.
# NOTE(review): assumes the rows of {year}_X_train/_X_test are in the same
# order as the labels in {year}_meta_train/_meta_test -- confirm.
family_features = {family: [] for family in target_families}
for year in range(2013, 2025):
    train_data_dir = f'/home/shared-datasets/Feature_extraction/npz_yearwise_Final/{year}_X_train.npz'
    test_data_dir = f'/home/shared-datasets/Feature_extraction/npz_yearwise_Final/{year}_X_test.npz'

    # Years that do not have both the train and the test split are skipped.
    if not (os.path.exists(train_data_dir) and os.path.exists(test_data_dir)):
        continue

    # Load and densify each year once and reuse it for every family, instead
    # of re-reading the same files once per family.
    train_data_X = load_npz(train_data_dir).toarray()
    test_data_X = load_npz(test_data_dir).toarray()
    data_X = np.concatenate((train_data_X, test_data_X), axis=0)

    for family in target_families:
        # A family may have no samples in a given year. Use an empty row
        # selection (shape (0, n_features)) rather than raising KeyError, so
        # every family keeps exactly one entry per loaded year.
        indices = family_year_indices.get(family, {}).get(year, [])
        row_ids = np.asarray(indices, dtype=np.intp)
        # Integer-array indexing copies the rows, so data_X is not kept alive.
        family_features[family].append(data_X[row_ids])


In [None]:
# Show how many per-year feature matrices were collected for each family.
for family, yearly_matrices in family_features.items():
    print(f'family {family} {len(yearly_matrices)}')

## Calculate Jaccard similarity score (Stability score)

In [None]:
def calculate_jaccard_similarity(set1, set2):
    """
    Return the Jaccard similarity |set1 ∩ set2| / |set1 ∪ set2| of two sets.

    When both sets are empty the union is empty and 0 is returned.
    """
    combined = set1.union(set2)
    if len(combined) == 0:
        return 0
    shared = set1.intersection(set2)
    return len(shared) / len(combined)

# jaccard_results[family] -> list of Jaccard similarities between each pair
# of consecutive feature groups of that family (one score fewer than groups).
jaccard_results = {}

for family, feature_groups in family_features.items():
    jaccard_scores = []
    previous_rows = None
    for group in feature_groups:
        # Each row becomes a hashable tuple, so a group is compared as the
        # set of its distinct feature vectors. Every group is converted once
        # and reused as the left-hand side of the next comparison.
        current_rows = set(map(tuple, group))
        if previous_rows is not None:
            jaccard_scores.append(
                calculate_jaccard_similarity(previous_rows, current_rows)
            )
        previous_rows = current_rows

    # Store the results for the family
    jaccard_results[family] = jaccard_scores

# Display the Jaccard similarity results
for family, scores in jaccard_results.items():
    print(f"Jaccard similarity for family '{family}': {scores}")

In [None]:
# Number the families in the order they appear in jaccard_results:
# position -> family name.
jaccard_key_map = dict(enumerate(jaccard_results))

# Print the map
print(jaccard_key_map)

## Plot the Stability Score

In [None]:
import matplotlib.pyplot as plt

# Draw one line per family: x is the index of the consecutive-group pair,
# y is the Jaccard (stability) score of that pair.
plt.figure(figsize=(7, 5), dpi=500)

# One distinct marker per family for up to 10 families; the modulo below
# cycles through the list if there are more families than markers.
markers = ['o', 's', 'D', '^', 'v', 'P', '*', 'X', 'h', '<']
for idx, family in jaccard_key_map.items():
    y_values = jaccard_results[family]
    # Derive the x positions from the data: a fixed range(10) makes
    # plt.plot raise whenever a family does not have exactly 10 scores.
    x_values = list(range(len(y_values)))
    plt.plot(x_values, y_values, marker=markers[idx % len(markers)], linestyle='-', label=f'{family}', linewidth=2)

# Add labels, title, and legend
# plt.xlabel("Group Pair", fontsize=12)
# plt.ylabel("Stability score", fontsize=12)
# plt.title("Jaccard Similarity Scores Between Consecutive Groups", fontsize=14)
plt.xticks(rotation=45)
# plt.grid(True)
plt.legend()

# Show the plot
plt.tight_layout()
plt.show()