# Generate Test Subsets

In [11]:
import itertools
import random
from itertools import combinations
from random import sample

import numpy as np
import pandas as pd
import tqdm

In [12]:
string_df = pd.read_csv("../../data/interim/string_df.csv")

In [13]:
string_df

Unnamed: 0,label,concatenated
0,GooglePixel3A_L,0000000000000000000000000000000000000000000000...
1,GooglePixel3A_L,0000000000000000000000000000000000000000000000...
2,GooglePixel3A_L,0000000000000000000000000000000000000000000000...
3,GooglePixel3A_L,0000000000000000000000000000000000000000000000...
4,GooglePixel3A_L,0000000000000000000000000000000000000000000000...
...,...,...
951,iPhoneXSMax_M,0001101000101101000000000001101111111111000000...
952,iPhoneXSMax_M,0001101000101101010000000001101111111111000000...
953,iPhoneXSMax_M,0001101000101101000000000001101111111111000000...
954,iPhoneXSMax_M,0001101000101101010000000001101111111111000000...


In [14]:
# Identifying unique labels
unique_labels = string_df['label'].unique()

# Initializing lists to store matching and non-matching pairs
matching_pairs = []
non_matching_pairs = []

# Generating matching pairs
for label in unique_labels:
    # Getting all indices for the current label
    indices = string_df[string_df['label'] == label].index.tolist()
    if len(indices) == 1:
        # If only one row, duplicate it to create 50 pairs
        indices = indices * 2
    matching_indices = list(itertools.combinations(indices, 2))
    
    # Ensuring we have exactly 50 pairs
    if len(matching_indices) < 50:
        matching_indices = matching_indices * (50 // len(matching_indices) + 1)
    matching_pairs.extend([(i, j, label, label) for i, j in matching_indices[:50]])

# Generating non-matching pairs
all_indices = list(string_df.index)
random.seed(42)  # For reproducibility

for label in unique_labels:
    # Indices with the current label
    label_indices = string_df[string_df['label'] == label].index.tolist()
    # Indices without the current label
    other_indices = list(set(all_indices) - set(label_indices))
    random.shuffle(other_indices)
    
    # Creating diverse pairs between label indices and other indices
    non_matching_indices = list(itertools.product(label_indices, other_indices))
    
    # Ensuring we have exactly 50 pairs
    non_matching_pairs.extend([
        (i, j, label, string_df.loc[j, 'label']) 
        for i, j in non_matching_indices[:50]
    ])

# Creating the final dataframe
pairs_df = pd.DataFrame(matching_pairs + non_matching_pairs, columns=['index_1', 'index_2', 'label_1', 'label_2'])
print(pairs_df)

      index_1  index_2          label_1          label_2
0           0        1  GooglePixel3A_L  GooglePixel3A_L
1           0        2  GooglePixel3A_L  GooglePixel3A_L
2           0        3  GooglePixel3A_L  GooglePixel3A_L
3           0        4  GooglePixel3A_L  GooglePixel3A_L
4           0        5  GooglePixel3A_L  GooglePixel3A_L
...       ...      ...              ...              ...
3295      914      117    iPhoneXSMax_M   HuaweiHonor9_R
3296      914      366    iPhoneXSMax_M   XiaomiRedmi4_B
3297      914      101    iPhoneXSMax_M   HuaweiHonor9_R
3298      914      343    iPhoneXSMax_M   XiaomiRedmi4_B
3299      914      354    iPhoneXSMax_M   XiaomiRedmi4_B

[3300 rows x 4 columns]


In [16]:
string_df['label'].nunique()

33

In [17]:
pairs_df['Equality'] = np.where(pairs_df['label_1'] == pairs_df['label_2'], 1, -1)

In [18]:
pairs_df

Unnamed: 0,index_1,index_2,label_1,label_2,Equality
0,0,1,GooglePixel3A_L,GooglePixel3A_L,1
1,0,2,GooglePixel3A_L,GooglePixel3A_L,1
2,0,3,GooglePixel3A_L,GooglePixel3A_L,1
3,0,4,GooglePixel3A_L,GooglePixel3A_L,1
4,0,5,GooglePixel3A_L,GooglePixel3A_L,1
...,...,...,...,...,...
3295,914,117,iPhoneXSMax_M,HuaweiHonor9_R,-1
3296,914,366,iPhoneXSMax_M,XiaomiRedmi4_B,-1
3297,914,101,iPhoneXSMax_M,HuaweiHonor9_R,-1
3298,914,343,iPhoneXSMax_M,XiaomiRedmi4_B,-1


In [20]:
pairs_df.to_csv("../../data/train_test/random_pairs.csv")