# Generate Test Subsets

In [28]:
from itertools import combinations
from random import sample

import numpy as np
import pandas as pd
import tqdm

In [29]:
# Load the interim string table from CSV. The path is relative, so it is resolved
# against the kernel's current working directory.
string_df = pd.read_csv("../../data/interim/string_df.csv")

In [30]:
# Display the loaded frame (pandas abbreviates long frames to their head and tail rows).
string_df

Unnamed: 0,label,concatenated
0,GooglePixel3A_L,0000000000000000000000000000000000000000000000...
1,GooglePixel3A_L,0000000000000000000000000000000000000000000000...
2,GooglePixel3A_L,0000000000000000000000000000000000000000000000...
3,GooglePixel3A_L,0000000000000000000000000000000000000000000000...
4,GooglePixel3A_L,0000000000000000000000000000000000000000000000...
...,...,...
951,iPhoneXSMax_M,0001101000101101000000000001101111111111000000...
952,iPhoneXSMax_M,0001101000101101010000000001101111111111000000...
953,iPhoneXSMax_M,0001101000101101000000000001101111111111000000...
954,iPhoneXSMax_M,0001101000101101010000000001101111111111000000...


In [31]:
def create_pairs(df, num_pairs=50, random_state=None):
    """
    Create matching and non-matching pairs from the dataframe.

    For every distinct value in the 'label' column two kinds of pairs are sampled:

    * matching pairs: two *different* rows that both carry the label;
    * non-matching pairs: one row with the label and one row with any other label.

    Rows are identified in the result by their index value in ``df``. Sampling is
    done on row positions, so the index may be named, non-contiguous or contain
    duplicates, and ``df`` may contain a column called 'index'.

    Parameters:
    df (pd.DataFrame): The input dataframe. Only the 'label' column and the index are read.
    num_pairs (int): Number of pairs to generate per label for matching and non-matching cases.
        - Matching: exactly ``num_pairs`` pairs are produced for a label with at least two
          rows. If the label has at least ``2 * num_pairs`` rows, no row is used twice;
          otherwise rows are reused across pairs, but a row is never paired with itself.
          A label with a single row yields no matching pairs.
        - Non-matching: at most ``num_pairs`` pairs, capped by the number of rows with the
          label and the number of rows with other labels (rows are not reused).
    random_state (None, int or np.random.RandomState): Source of randomness. ``None``
        (default) uses NumPy's global random state, so ``np.random.seed`` still controls
        the result; an int seeds a private generator for a reproducible call.

    Returns:
    pd.DataFrame: A dataframe containing pairs with columns 'index1', 'index2', 'label1', 'label2'.

    Raises:
    ValueError: If ``num_pairs`` is negative.
    """
    if num_pairs < 0:
        raise ValueError("num_pairs must be non-negative")

    # Keep the historical default (global NumPy state) unless the caller opts in.
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    label_col = df['label']
    labels = label_col.to_numpy()
    row_ids = df.index.to_numpy()
    pairs = []

    for label in label_col.unique():
        own_pos = np.flatnonzero((label_col == label).to_numpy())
        n_own = own_pos.size
        if n_own == 0:
            # Happens for a NaN label (NaN never compares equal): nothing to pair.
            continue

        # Matching pairs
        if n_own >= 2 * num_pairs:
            # Enough rows: every row is used at most once.
            matched = rng.choice(own_pos, size=num_pairs * 2, replace=False)
            matched = matched.reshape((num_pairs, 2))
        elif n_own >= 2:
            # Not enough rows: reuse rows across pairs, but shift the second member by
            # 1..n_own-1 (mod n_own) so it can never coincide with the first.
            first = rng.randint(0, n_own, size=num_pairs)
            shift = rng.randint(1, n_own, size=num_pairs)
            matched = np.column_stack((own_pos[first], own_pos[(first + shift) % n_own]))
        else:
            # A single row cannot form a pair of two different samples.
            matched = np.empty((0, 2), dtype=int)
        for pos1, pos2 in matched:
            pairs.append((row_ids[pos1], row_ids[pos2], label, label))

        # Non-matching pairs
        other_pos = np.flatnonzero((label_col != label).to_numpy())
        n_cross = min(n_own, other_pos.size, num_pairs)
        if n_cross > 0:
            other_picks = rng.choice(other_pos, size=n_cross, replace=False)
            own_picks = rng.choice(own_pos, size=n_cross, replace=False)
            for pos1, pos2 in zip(own_picks, other_picks):
                pairs.append((row_ids[pos1], row_ids[pos2], label, labels[pos2]))

    pairs_df = pd.DataFrame(pairs, columns=['index1', 'index2', 'label1', 'label2'])
    return pairs_df

In [32]:
# NOTE(review): no visible cell assigns `pairs_df` — create_pairs is defined but never
# called in the cells shown here — so on a fresh kernel this cell would raise NameError
# unless the variable is created elsewhere. TODO confirm and add the missing assignment
# (presumably something like `pairs_df = create_pairs(string_df, ...)`).
pairs_df

Unnamed: 0,index1,index2,label1,label2
0,18,38,GooglePixel3A_L,GooglePixel3A_L
1,30,14,GooglePixel3A_L,GooglePixel3A_L
2,6,39,GooglePixel3A_L,GooglePixel3A_L
3,29,26,GooglePixel3A_L,GooglePixel3A_L
4,1,17,GooglePixel3A_L,GooglePixel3A_L
...,...,...,...,...
1245,929,621,iPhoneXSMax_M,XiaomiRedmiNote9S_T
1246,945,187,iPhoneXSMax_M,SamsungJ6_K
1247,946,129,iPhoneXSMax_M,OnePlusNord_O
1248,951,463,iPhoneXSMax_M,XiaomiRedmi5_J


In [33]:
# Number of distinct values in the 'label' column of string_df.
string_df['label'].nunique()

33