# Generate Device Subsets

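Unlike the subset approach, the purpose of this notebook is to generate 5-10 combinations of different devices (using all of the rows belonging to each device) for each cardinality from 2 to `len(devices)`. Duplicate combinations are removed afterwards, so the largest cardinalities can end up with fewer combinations (for example, only one combination contains every device).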

In [1]:
import random
from itertools import combinations

import pandas as pd
from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


In [3]:
# Load the per-device test strings from CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at the local copy of the data.
test_csv_path = "C:/Users/fabio/Documents/GitHub/CompactProbes/data/train_test/bin_test_new.csv"
string_df = pd.read_csv(test_csv_path)

In [4]:
# Show the loaded frame as the cell output to inspect its columns and rows.
string_df

Unnamed: 0,label,concatenated
0,GooglePixel3A_L,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...
1,GooglePixel3A_L,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...
2,GooglePixel3A_L,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...
3,GooglePixel3A_L,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...
4,GooglePixel3A_L,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...
...,...,...
573,iPhoneXR_U,0001101000101101010000000001011111111111000000...
574,iPhoneXR_U,0001101000101101000000000001011111111111000000...
575,iPhoneXR_U,0001101000101101000000000001011111111111000000...
576,iPhoneXR_U,0001101000101101010000000001011111111111000000...


In [5]:
# Collect the distinct values of the "label" column (one entry per device).
labels = string_df.loc[:, "label"].unique()

In [6]:
# Show the array of distinct device labels as the cell output.
labels

array(['GooglePixel3A_L', 'GooglePixel3A_V', 'HuaweiP20_G',
       'OppoFindX3Neo_A', 'S21Ultra_M', 'SamsungJ6_K', 'SamsungM31_A',
       'SamsungS4_C', 'SamsungS6_H', 'XiaomiRedmi4_B', 'XiaomiRedmi5_J',
       'iPhone11_F', 'iPhone12Pro_C', 'iPhoneXR_A', 'iPhoneXR_U'],
      dtype=object)

In [7]:
# Accumulator for the sampled device combinations; each entry is a dict
# holding the combination and its length.
all_combinations_list = list()

In [8]:
# Sample random device subsets: for each of `batches` passes, draw one random
# subset for every cardinality from 2 up to the total number of devices.
max_devices = len(labels)
batches = 10

# Fixed seed so the sampled subsets are reproducible across runs.
random.seed(42)

# Start from an empty list so that re-running this cell does not append a
# second copy of every record.
all_combinations_list = []

# The pool of device labels does not change between iterations, so build it once.
device_pool = string_df["label"].unique().tolist()

for i in tqdm(range(batches), desc="⚠️ Batch Processing"):
    for r in tqdm(range(2, max_devices + 1), desc="↘️ Batch #" + str(i + 1)):
        # random.sample draws r distinct labels; that draw is already the one
        # and only r-sized combination of itself, so store it directly.
        labels_combination = tuple(random.sample(device_pool, r))
        all_combinations_list.append({
            'combination': labels_combination,
            'length': r,
        })

↘️ Batch #1: 100%|██████████| 14/14 [00:00<00:00, 7854.50it/s]
↘️ Batch #2: 100%|██████████| 14/14 [00:00<00:00, 8051.59it/s]
↘️ Batch #3: 100%|██████████| 14/14 [00:00<00:00, 9957.65it/s]
↘️ Batch #4: 100%|██████████| 14/14 [00:00<00:00, 9563.56it/s]
↘️ Batch #5: 100%|██████████| 14/14 [00:00<00:00, 5166.31it/s]
↘️ Batch #6: 100%|██████████| 14/14 [00:00<00:00, 10087.66it/s]
↘️ Batch #7: 100%|██████████| 14/14 [00:00<00:00, 14523.93it/s]
↘️ Batch #8: 100%|██████████| 14/14 [00:00<00:00, 9226.94it/s]
↘️ Batch #9: 100%|██████████| 14/14 [00:00<00:00, 9439.04it/s]
↘️ Batch #10: 100%|██████████| 14/14 [00:00<00:00, 6324.21it/s]
⚠️ Batch Processing: 100%|██████████| 10/10 [00:00<00:00, 209.32it/s]


In [9]:
# Convert the list of dictionaries to a DataFrame. Naming the columns keeps
# the frame well-formed even when no combinations were generated.
all_combinations_df = pd.DataFrame(all_combinations_list, columns=['combination', 'length'])

# Remove duplicates: two combinations are the same when they contain the same
# devices, regardless of order. The key is a frozenset because, unlike a plain
# set, it is hashable, so duplicate detection does not depend on how pandas
# treats unhashable objects.
combination_key = all_combinations_df['combination'].apply(frozenset)

# Keep the first occurrence of every device set. The key lives in its own
# Series, so no helper column has to be added to and dropped from the frame,
# and the original row index of the kept rows is preserved.
all_combinations_df = all_combinations_df[~combination_key.duplicated(keep='first')]

In [10]:
# Count how many combinations are present for each combination length.
all_combinations_df["length"].value_counts()

length
2     10
3     10
4     10
5     10
6     10
7     10
8     10
9     10
10    10
11    10
12    10
13    10
14     7
15     1
Name: count, dtype: int64

In [11]:
# Write the combinations to CSV without the DataFrame index.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook on another machine.
output_csv_path = "C:/Users/fabio/Documents/GitHub/CompactProbes/data/train_test/10_combinations_test.csv"
all_combinations_df.to_csv(output_csv_path, index=False)