# Duplication issue
The following pollution functions are used to inject non-exact duplicates into the dataset, for a total of 10 different experiments.

### Imports

In [None]:
!pip install recordlinkage

In [None]:
import random as rnd
import pandas as pd
import numpy as np
import numpy.linalg as la
import string
from datetime import datetime, timedelta
from scipy.stats import pearsonr
import recordlinkage
# -- IMPORT SCRIPTS FROM DATADIQ REPO (@camillasancricca)
import A_data_collection as data_collection
import D_data_analysis as data_analysis
import E_plot_results as plot_results

In [None]:
# Fixed seed shared by the whole notebook.
SEED = 2023
# Number of experiments planned for this notebook.
NUM_EXPERIMENTS = 10

# Seed both generators: the pollution functions draw from numpy.random as
# well as from the standard-library random module (imported as rnd).
np.random.seed(SEED)
rnd.seed(SEED)

In [None]:
# Feature layout passed to the dataset generator: with these values every
# feature is informative, so the number of redundant features is zero.
INIT_FEATURES = 5
NUM_INFORMATIVE = 5
NUM_REDUNDANT = INIT_FEATURES - NUM_INFORMATIVE
# Fails if more informative features than total features are configured.
assert NUM_REDUNDANT >= 0

In [None]:
# dataset for usage example
# Both datasets use the notebook-wide SEED instead of a repeated literal, so
# changing the seed in one place keeps the data generation consistent.
example_dataset, example_labels = data_collection.make_dataset_for_classification(
    n_samples=5,
    n_features=INIT_FEATURES,
    n_informative=NUM_INFORMATIVE,
    n_redundant=NUM_REDUNDANT,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=2,
    weights=None,
    flip_y=0.01,
    class_sep=1.0,
    hypercube=True,
    seed=SEED,
)
# dataset for duplication experiments
X_dup, y_dup = data_collection.make_dataset_for_classification(
    n_samples=1000,
    n_features=INIT_FEATURES,
    n_informative=NUM_INFORMATIVE,
    n_redundant=NUM_REDUNDANT,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=2,
    weights=None,
    flip_y=0.01,
    class_sep=1.0,
    hypercube=True,
    seed=SEED,
)

### Pollution Functions

#### Rounding-off pollution function
This pollution function randomly picks a percentage of entries from the dataset, duplicates them, and, for a random number of features in each duplicate, rounds the value off to a randomly chosen number of decimal digits.

In [None]:
def pollute_round_off(dataset, labels, percentage):

  '''
  Create duplicates from the dataset of size (percentage * dataset.shape[0]) and
   apply rounding off to a random number of features with random digits.

  The rows to duplicate are sampled without replacement. At least one
  duplicate is always created, even when percentage * dataset.shape[0] < 1.
  The input arrays are not modified.

  Parameters:
    dataset (numpy.ndarray): The 2-D dataset to be duplicated and polluted.
    labels (numpy.ndarray): The labels of the dataset.
    percentage (float): The fraction of the dataset size to duplicate.

  Returns:
    tuple: (polluted_dataset, polluted_labels), i.e. the original rows and
    labels followed by the rounded duplicates and their labels.

  Raises:
    ValueError: If the dataset is empty or if the percentage is not between 0 and 1.
  '''

  if not (0 < percentage < 1):
    raise ValueError("Percentage must be between 0 and 1.")

  num_entries, num_features = dataset.shape

  if num_entries <= 0:
    raise ValueError("Dataset must be non-empty.")

  # Always pollute at least one entry
  num_duplicates = max(1, int(num_entries * percentage))

  # Select random entries to duplicate and create a deep copy
  duplicate_indices = rnd.sample(range(num_entries), num_duplicates)
  duplicate_data = dataset[duplicate_indices].copy()
  duplicate_labels = labels[duplicate_indices].copy()

  for entry in duplicate_data:
    num_round_features = rnd.randint(1, num_features)
    round_features = rnd.sample(range(num_features), num_round_features)

    for feature in round_features:
      # Count the decimal digits on the positional representation, so that
      # values printed in scientific notation (e.g. 1e-05), integers and
      # NaN/inf do not break the digit count.
      text = np.format_float_positional(float(entry[feature]), trim='-')
      decimal_digits = len(text.partition('.')[2])

      # Keep at least two digits away from the full precision to avoid the
      # chance of exact duplicates (extremes are included in randint)
      max_digits = decimal_digits - 2

      if max_digits >= 1:
        digits = rnd.randint(1, max_digits)
      else:
        # Too few decimals for a random choice: drop the last decimal digit
        # (values without decimals are left as they are).
        digits = max(0, decimal_digits - 1)

      entry[feature] = np.round(entry[feature], digits)

  return (np.append(dataset, duplicate_data, axis=0), np.append(labels, duplicate_labels, axis=0))

**Usage example**

In [None]:
# Round-off pollution on the small example dataset, duplicating half of it.
percentage = .5

polluted_dataset, polluted_labels = pollute_round_off(example_dataset, example_labels, percentage)

In [None]:
example_dataset, example_labels

(array([[-0.94920925,  0.33568484, -1.41548407, -1.78576295, -0.48758674],
        [-3.35471844, -1.4895879 ,  0.47891255, -2.63812044,  0.25589926],
        [ 2.8429093 ,  0.83633122, -1.89775432,  1.68994257, -0.59224401],
        [ 0.80460305,  2.24711467, -3.28064942, -1.0454157 , -1.63415213],
        [-0.89816726,  1.39373349, -0.79674393,  0.17159317,  0.02322873]]),
 array([1, 0, 1, 0, 0]))

In [None]:
polluted_dataset, polluted_labels

(array([[-0.94920925,  0.33568484, -1.41548407, -1.78576295, -0.48758674],
        [-3.35471844, -1.4895879 ,  0.47891255, -2.63812044,  0.25589926],
        [ 2.8429093 ,  0.83633122, -1.89775432,  1.68994257, -0.59224401],
        [ 0.80460305,  2.24711467, -3.28064942, -1.0454157 , -1.63415213],
        [-0.89816726,  1.39373349, -0.79674393,  0.17159317,  0.02322873],
        [-0.94920925,  0.33568484, -1.41548407, -1.78576295, -0.49      ],
        [-0.8982    ,  1.4       , -0.79674393,  0.1716    ,  0.02322873]]),
 array([1, 0, 1, 0, 0, 1, 0]))

#### Gaussian Noise pollution function
This pollution function randomly picks a percentage of entries from the dataset, duplicates them, and adds Gaussian noise to every feature of each duplicate.

In [None]:
def pollute_gaussian_noise(dataset, labels, percentage, mean=0, scaling_factor=0.1):

  '''
  Create duplicates from the dataset of size (percentage * dataset.shape[0]) and
   add Gaussian noise of mean MEAN to each feature of each duplicate. The noise
   standard deviation is SCALING_FACTOR times the standard deviation computed
   over all values of the dataset.

  Rows to duplicate are drawn with replacement, so the same row may be
  duplicated more than once. At least one duplicate is always created.

  Parameters:
    dataset (numpy.ndarray): The 2-D dataset to be duplicated and polluted.
    labels (numpy.ndarray): The labels of the dataset.
    percentage (float): The fraction of the dataset size to duplicate.
    mean (float): The mean of the Gaussian noise distribution.
    scaling_factor (float): Ratio between the noise standard deviation and
      the dataset standard deviation (default 0.1, i.e. 10%).

  Returns:
    tuple: (polluted_dataset, polluted_labels), i.e. the original rows and
    labels followed by the noisy duplicates and their labels.

  Raises:
    ValueError: If the dataset is empty, if the percentage is not between 0
    and 1, or if the scaling factor is negative.
  '''

  if not (0 < percentage < 1):
    raise ValueError("Percentage must be between 0 and 1.")

  if scaling_factor < 0:
    raise ValueError("Scaling factor must be non-negative.")

  num_entries, num_features = dataset.shape

  if num_entries <= 0:
    raise ValueError("Dataset must be non-empty.")

  # Always pollute at least one entry
  num_duplicates = max(1, int(num_entries * percentage))

  # Noise spread is a fraction of the spread of the whole dataset
  std_dev_noise = scaling_factor * np.std(dataset)

  # Select random entries to duplicate and create a deep copy
  duplicate_indices = np.random.choice(num_entries, size=num_duplicates)
  duplicate_data = dataset[duplicate_indices].copy()
  duplicate_labels = labels[duplicate_indices].copy()

  # One independent noise sample per feature of every duplicate
  duplicate_data += np.random.normal(mean, std_dev_noise, (num_duplicates, num_features))

  return (np.append(dataset, duplicate_data, axis=0), np.append(labels, duplicate_labels, axis=0))

**Usage example**

In [None]:
# Gaussian-noise pollution on the small example dataset, duplicating half of it.
percentage = .5

polluted_dataset, polluted_labels = pollute_gaussian_noise(example_dataset, example_labels, percentage)

In [None]:
example_dataset, example_labels

(array([[-0.94920925,  0.33568484, -1.41548407, -1.78576295, -0.48758674],
        [-3.35471844, -1.4895879 ,  0.47891255, -2.63812044,  0.25589926],
        [ 2.8429093 ,  0.83633122, -1.89775432,  1.68994257, -0.59224401],
        [ 0.80460305,  2.24711467, -3.28064942, -1.0454157 , -1.63415213],
        [-0.89816726,  1.39373349, -0.79674393,  0.17159317,  0.02322873]]),
 array([1, 0, 1, 0, 0]))

In [None]:
polluted_dataset, polluted_labels

(array([[-0.94920925,  0.33568484, -1.41548407, -1.78576295, -0.48758674],
        [-3.35471844, -1.4895879 ,  0.47891255, -2.63812044,  0.25589926],
        [ 2.8429093 ,  0.83633122, -1.89775432,  1.68994257, -0.59224401],
        [ 0.80460305,  2.24711467, -3.28064942, -1.0454157 , -1.63415213],
        [-0.89816726,  1.39373349, -0.79674393,  0.17159317,  0.02322873],
        [ 0.88981186,  2.150672  , -3.46835508, -0.70980104, -1.64897999],
        [-1.0437219 ,  1.25501095, -0.86133709,  0.09596303,  0.04895464]]),
 array([1, 0, 1, 0, 0, 0, 0]))

#### Scaling pollution function
This pollution function randomly picks a percentage of entries from the dataset, duplicates them, and scales them to a different, randomly chosen unit of measurement.

In [None]:
def pollute_scaling(dataset, labels, percentage, min_exp=-12, max_exp=12):

  '''
  Create duplicates from the dataset of size (percentage * dataset.shape[0]) and
   scale them to a different, randomly chosen unit of measurement (i.e.,
   multiply every item of a duplicated row by 10^exp, where exp is a random
   non-zero exponent between MIN_EXP and MAX_EXP, extremes included).

  The exponent 0 is never used, because a factor of 1 would produce an exact
  duplicate. Rows to duplicate are drawn with replacement, and at least one
  duplicate is always created.

  Parameters:
    dataset (numpy.ndarray): The 2-D dataset to be duplicated and polluted.
    labels (numpy.ndarray): The labels of the dataset.
    percentage (float): The fraction of the dataset size to duplicate.
    min_exp (int): The lower bound of powers of 10 for scaling pollution.
    max_exp (int): The upper bound of powers of 10 for scaling pollution.

  Returns:
    tuple: (polluted_dataset, polluted_labels), i.e. the original rows and
    labels followed by the rescaled duplicates and their labels.

  Raises:
    ValueError: If the dataset is empty, if the percentage is not between 0
    and 1, or if [min_exp, max_exp] contains no non-zero exponent.
  '''

  if not (0 < percentage < 1):
    raise ValueError("Percentage must be between 0 and 1.")

  # A zero exponent would leave the row untouched, so it is not a candidate
  exponents = [exp for exp in range(min_exp, max_exp + 1) if exp != 0]

  if not exponents:
    raise ValueError("Exponent range must contain at least one non-zero exponent.")

  num_entries, num_features = dataset.shape

  if num_entries <= 0:
    raise ValueError("Dataset must be non-empty.")

  # Always pollute at least one entry
  num_duplicates = max(1, int(num_entries * percentage))

  # Select random entries to duplicate and create a deep copy
  duplicate_indices = np.random.choice(num_entries, size=num_duplicates)
  duplicate_data = dataset[duplicate_indices].copy()
  duplicate_labels = labels[duplicate_indices].copy()

  for entry in duplicate_data:
    # pollute the elements of the row with the same scale
    entry *= 10.0 ** rnd.choice(exponents)

  return (np.append(dataset, duplicate_data, axis=0), np.append(labels, duplicate_labels, axis=0))

**Usage example**

In [None]:
# Scaling pollution on the small example dataset, duplicating half of it.
percentage = .5

polluted_dataset, polluted_labels = pollute_scaling(example_dataset, example_labels, percentage)

In [None]:
example_dataset, example_labels

(array([[-0.94920925,  0.33568484, -1.41548407, -1.78576295, -0.48758674],
        [-3.35471844, -1.4895879 ,  0.47891255, -2.63812044,  0.25589926],
        [ 2.8429093 ,  0.83633122, -1.89775432,  1.68994257, -0.59224401],
        [ 0.80460305,  2.24711467, -3.28064942, -1.0454157 , -1.63415213],
        [-0.89816726,  1.39373349, -0.79674393,  0.17159317,  0.02322873]]),
 array([1, 0, 1, 0, 0]))

In [None]:
polluted_dataset, polluted_labels

(array([[-9.49209246e-01,  3.35684837e-01, -1.41548407e+00,
         -1.78576295e+00, -4.87586737e-01],
        [-3.35471844e+00, -1.48958790e+00,  4.78912548e-01,
         -2.63812044e+00,  2.55899262e-01],
        [ 2.84290930e+00,  8.36331225e-01, -1.89775432e+00,
          1.68994257e+00, -5.92244007e-01],
        [ 8.04603049e-01,  2.24711467e+00, -3.28064942e+00,
         -1.04541570e+00, -1.63415213e+00],
        [-8.98167260e-01,  1.39373349e+00, -7.96743925e-01,
          1.71593173e-01,  2.32287279e-02],
        [ 8.04603049e+04,  2.24711467e+05, -3.28064942e+05,
         -1.04541570e+05, -1.63415213e+05],
        [-3.35471844e-06, -1.48958790e-06,  4.78912548e-07,
         -2.63812044e-06,  2.55899262e-07]]),
 array([1, 0, 1, 0, 0, 0, 0]))

#### Swapping pollution function
This pollution function randomly picks a percentage of entries from the dataset, duplicates them, and swaps the values of a random number of features within each duplicate.

In [None]:
def pollute_swapping(dataset, labels, percentage):

  '''
  Create duplicates from the dataset of size (percentage * dataset.shape[0]) and
   swap the values of a randomly chosen number of feature pairs in each of them.

  Every duplicate is a permutation of the row it was copied from and differs
  from it, except for rows whose features all hold the same value: no
  permutation can change such a row, so it is appended unchanged. Rows to
  duplicate are drawn with replacement, and at least one duplicate is always
  created.

  Parameters:
    dataset (numpy.ndarray): The 2-D dataset to be duplicated and polluted.
    labels (numpy.ndarray): The labels of the dataset.
    percentage (float): The fraction of the dataset size to duplicate.

  Returns:
    tuple: (polluted_dataset, polluted_labels), i.e. the original rows and
    labels followed by the permuted duplicates and their labels.

  Raises:
    ValueError: If the dataset is empty, if the dataset has just one feature,
    or if the percentage is not between 0 and 1.
  '''

  if not (0 < percentage < 1):
    raise ValueError("Percentage must be between 0 and 1.")

  num_entries, num_features = dataset.shape

  if num_entries <= 0:
    raise ValueError("Dataset must be non-empty.")

  if num_features <= 1:
    raise ValueError("Dataset must have at least two features.")

  # Always pollute at least one entry
  num_duplicates = max(1, int(num_entries * percentage))

  # Select random entries to duplicate and create a deep copy
  duplicate_indices = np.random.choice(num_entries, size=num_duplicates)
  duplicate_data = dataset[duplicate_indices].copy()
  duplicate_labels = labels[duplicate_indices].copy()

  for entry in duplicate_data:

    # A row made of identical values looks the same under every permutation;
    # retrying until it changes would never terminate.
    if np.all(entry == entry[0]):
      continue

    # randomly choose how many times to swap
    num_swap_features = rnd.randint(1, num_features - 1)

    old_entry = entry.copy()

    # Retry with a fresh permutation whenever the swaps cancel out or only
    # exchange equal values.
    while np.array_equal(old_entry, entry):
      new_features_order = np.arange(num_features)

      for _ in range(num_swap_features):
        # two distinct positions to exchange
        index1, index2 = np.random.choice(num_features, size=2, replace=False)
        new_features_order[[index1, index2]] = new_features_order[[index2, index1]]

      # apply the new order of features to the untouched copy of the row
      entry[:] = old_entry[new_features_order]

  return (np.append(dataset, duplicate_data, axis=0), np.append(labels, duplicate_labels, axis=0))

**Usage example**

In [None]:
# Swapping pollution on the small example dataset, duplicating half of it.
percentage = .5

polluted_dataset, polluted_labels = pollute_swapping(example_dataset, example_labels, percentage)

In [None]:
example_dataset, example_labels

(array([[-0.94920925,  0.33568484, -1.41548407, -1.78576295, -0.48758674],
        [-3.35471844, -1.4895879 ,  0.47891255, -2.63812044,  0.25589926],
        [ 2.8429093 ,  0.83633122, -1.89775432,  1.68994257, -0.59224401],
        [ 0.80460305,  2.24711467, -3.28064942, -1.0454157 , -1.63415213],
        [-0.89816726,  1.39373349, -0.79674393,  0.17159317,  0.02322873]]),
 array([1, 0, 1, 0, 0]))

In [None]:
polluted_dataset, polluted_labels

(array([[-0.94920925,  0.33568484, -1.41548407, -1.78576295, -0.48758674],
        [-3.35471844, -1.4895879 ,  0.47891255, -2.63812044,  0.25589926],
        [ 2.8429093 ,  0.83633122, -1.89775432,  1.68994257, -0.59224401],
        [ 0.80460305,  2.24711467, -3.28064942, -1.0454157 , -1.63415213],
        [-0.89816726,  1.39373349, -0.79674393,  0.17159317,  0.02322873],
        [-3.28064942, -1.63415213,  0.80460305, -1.0454157 ,  2.24711467],
        [-0.89816726,  1.39373349,  0.17159317, -0.79674393,  0.02322873]]),
 array([1, 0, 1, 0, 0, 0, 0]))

#### Similarity Pollution Functions

##### Helper Functions

**Cosine Similarity**

In [None]:
def generate_cosine_similar_vector(original_vector, similarity):
    """
    Generate a random vector with a given cosine similarity to the original one.

    The result has the same Euclidean norm as the original vector and forms
    with it an angle whose cosine is the requested similarity.

    Parameters:
    original_vector (numpy.ndarray): The 1-D, non-zero reference vector with at least two components.
    similarity (float): The target cosine similarity, between -1 and 1.

    Returns:
    numpy.ndarray: A new vector with the requested cosine similarity to the original vector.

    Raises:
    ValueError: If the vector is not 1-D, has fewer than two components or
    has zero norm, or if the similarity is outside [-1, 1].
    """
    original_vector = np.asarray(original_vector, dtype=float)

    if original_vector.ndim != 1 or original_vector.size < 2:
        raise ValueError("Original vector must be 1-D with at least two components.")

    if not -1 <= similarity <= 1:
        raise ValueError("Similarity must be between -1 and 1.")

    magnitude_original = la.norm(original_vector)

    # A zero vector has no direction, so no angle can be measured against it
    if magnitude_original == 0:
        raise ValueError("Original vector must be non-zero.")

    # Form the unit vector parallel to v:
    u = original_vector / magnitude_original

    # This is a vector which will most likely not be parallel with v, so we can
    # use it to calculate a vector perpendicular to v
    r = np.random.normal(0, 1, len(original_vector))

    # Remove the component along u to obtain a vector perpendicular to v,
    # then make it a unit vector:
    u_perp = r - r.dot(u) * u
    u_perp = u_perp / la.norm(u_perp)

    # w is the linear combination of u and u_perp with coefficients
    # cos(theta) = similarity and sin(theta) = sqrt(1 - similarity**2)
    w = similarity * u + np.sqrt(1 - similarity**2) * u_perp

    # Rescale vector to original size
    return w * (magnitude_original / la.norm(w))

**Pearson Correlation**

In [None]:
def generate_pearson_correlated_vector(original_vector, correlation_target, tolerance=0.01, max_iterations=1000):
    """
    Generate a vector whose Pearson correlation with the original vector is close to the specified target.

    Starting from a copy of the original vector, small Gaussian noise is added
    step by step; after every step the vector is rescaled to the norm of the
    original one. The first vector whose correlation falls within the
    tolerance is returned. If no step lands inside the tolerance window, the
    visited vector whose correlation was closest to the target is returned
    instead of the last one.

    Parameters:
    original_vector (numpy.ndarray): The original vector to which the generated vector should be correlated.
    correlation_target (float): The target Pearson correlation coefficient between the original and the generated vector.
    tolerance (float): The acceptable deviation from the target correlation for the generated vector.
    max_iterations (int): The maximum number of iterations to try to achieve the target correlation.

    Returns:
    numpy.ndarray: A vector with the same norm as the original one and a Pearson correlation as close to the target as the search could get.
    """
    current_vector = np.array(original_vector, dtype=float)  # Float copy, so the caller's vector is never modified
    original_norm = np.linalg.norm(original_vector)

    # Closest candidate seen so far; the unperturbed copy is only returned
    # when no iteration is performed at all.
    best_vector = current_vector.copy()
    best_error = np.inf

    for _ in range(max_iterations):
        # Incrementally adjust the vector by adding small random noise
        current_vector += np.random.normal(0, 0.1, len(original_vector))

        # Normalize the adjusted vector to have the same magnitude as the original vector
        current_vector_norm = np.linalg.norm(current_vector)
        current_vector = (current_vector / current_vector_norm) * original_norm

        # Calculate Pearson correlation with the original vector
        current_correlation, _p_value = pearsonr(original_vector, current_vector)
        error = abs(correlation_target - current_correlation)

        # Check if the current correlation is within the specified tolerance
        if error <= tolerance:
            return current_vector

        if error < best_error:
            best_error = error
            # Copy: the next iteration adds noise to current_vector in place
            best_vector = current_vector.copy()

    return best_vector


**(Addition) Tanimoto Similarity**

Tanimoto similarity for continuous variables is calculated as:
$T(A, B) = \frac{A \cdot B}{\|A\|^2 + \|B\|^2 - A \cdot B}$

The goal is to find a vector $B$ such that the Tanimoto similarity with $A$ is the specified value $T$.

In [None]:
def tanimoto_similarity(vector_a, vector_b):
    """
    Return the Tanimoto similarity of two vectors of continuous values:
    A.B / (|A|^2 + |B|^2 - A.B).
    """
    norm_a = np.linalg.norm(vector_a)
    norm_b = np.linalg.norm(vector_b)
    overlap = np.dot(vector_a, vector_b)
    return overlap / (norm_a * norm_a + norm_b * norm_b - overlap)

In [None]:
def generate_tanimoto_similar_vector(original_vector, similarity_target, tolerance=1e-5, max_iterations=1000):
    '''
    Generates a vector with specified Tanimoto similarity to an original vector.

    A random perturbation direction P is drawn once and the result is searched
    among the vectors A + s * P, with s >= 0. Along this line the similarity
    T(s) = (|A|^2 + s A.P) / (|A|^2 + s A.P + s^2 |P|^2) is continuous, equals
    1 for s = 0 and tends to 0 for large s. The scale is first doubled until
    the similarity is no longer above the target, then refined by bisection.

    The search stops when the similarity is within the tolerance of the target
    or when max_iterations steps (doubling plus bisection) have been spent; in
    the latter case the last scale tried is used.

    Parameters:
      original_vector (np.ndarray): The original, non-zero vector to which the generated vector should be similar.
      similarity_target (float): The target Tanimoto similarity, greater than 0 and at most 1.
      tolerance (float): The tolerance within which the generated vector should match the target similarity.
      max_iterations (int): The maximum number of iterations to perform.

    Returns:
      np.ndarray: A new vector whose Tanimoto similarity to the original vector
                  is close to the specified target.

    Raises:
      ValueError: If the similarity target is not in (0, 1] or the original vector is zero.
    '''
    if not 0 < similarity_target <= 1:
        raise ValueError("Similarity target must be greater than 0 and at most 1.")

    base_vector = np.asarray(original_vector, dtype=float)
    norm_squared = float(np.dot(base_vector, base_vector))

    # The similarity with a zero vector is never positive: nothing to search
    if norm_squared == 0:
        raise ValueError("Original vector must be non-zero.")

    perturbation = np.random.normal(0, 0.1, len(base_vector))
    cross_term = float(np.dot(base_vector, perturbation))
    perturbation_squared = float(np.dot(perturbation, perturbation))

    # Degenerate draw: without a direction to move along, return a plain copy
    if perturbation_squared == 0:
        return base_vector.copy()

    def similarity_at(scale):
        # Tanimoto similarity between A and A + scale * P in closed form
        numerator = norm_squared + scale * cross_term
        return numerator / (numerator + scale * scale * perturbation_squared)

    low, high = 0.0, 1.0
    iterations_left = max_iterations

    # Widen the bracket until the similarity at its upper end is not above the target
    while iterations_left > 0 and similarity_at(high) > similarity_target:
        low = high
        high *= 2.0
        iterations_left -= 1

    # Bisection: similarity is above the target at `low` and not above it at `high`
    scale = high
    while iterations_left > 0:
        scale = 0.5 * (low + high)
        current_similarity = similarity_at(scale)
        if abs(current_similarity - similarity_target) < tolerance:
            break
        if current_similarity > similarity_target:
            low = scale
        else:
            high = scale
        iterations_left -= 1

    return base_vector + scale * perturbation

##### Pollution Function

In [None]:
# Lower-case similarity name -> generator called as f(vector, target_similarity)
similarity_functions = {
    'cosine': generate_cosine_similar_vector,
    'pearson': generate_pearson_correlated_vector,
    'tanimoto': generate_tanimoto_similar_vector,
}

In [None]:
def pollute_similarity(dataset, labels, pollution_percentage, similarity, similarity_percentage=0.9):

  '''
  Create duplicates from the dataset of size
   (pollution_percentage * dataset.shape[0]) and replace each of them with a
   vector that has the requested similarity to the duplicated row.

  Rows to duplicate are drawn with replacement, and at least one duplicate is
  always created.

  Parameters:
    dataset (numpy.ndarray): The 2-D dataset to be duplicated and polluted.
    labels (numpy.ndarray): The labels of the dataset.
    pollution_percentage (float): The fraction of the dataset size to duplicate.
    similarity (str): Name of the similarity measure, case-insensitive; must
      be a key of similarity_functions.
    similarity_percentage (float): The target similarity between each
      duplicate and the row it was generated from.

  Returns:
    tuple: (polluted_dataset, polluted_labels), i.e. the original rows and
    labels followed by the similar duplicates and their labels.

  Raises:
    ValueError: If the dataset is empty, if the pollution percentage is not
    between 0 and 1, or if the similarity measure is not implemented.
  '''

  # Validate the argument itself, not a notebook-level variable that happens
  # to be called `percentage`
  if not (0 < pollution_percentage < 1):
    raise ValueError("Percentage must be between 0 and 1.")

  similarity_function = similarity_functions.get(similarity.lower())

  if similarity_function is None:
    raise ValueError(f"Similarity {similarity} not implemented.")

  num_entries, _ = dataset.shape

  if num_entries <= 0:
    raise ValueError("Dataset must be non-empty.")

  # Always pollute at least one entry
  num_duplicates = max(1, int(num_entries * pollution_percentage))

  # Select random entries to duplicate and create a deep copy
  duplicate_indices = np.random.choice(num_entries, size=num_duplicates)
  duplicate_data = dataset[duplicate_indices].copy()
  duplicate_labels = labels[duplicate_indices].copy()

  for idx, entry in enumerate(duplicate_data):
    duplicate_data[idx] = similarity_function(entry, similarity_percentage)

  return (np.append(dataset, duplicate_data, axis=0), np.append(labels, duplicate_labels, axis=0))

**Usage Examples**

In [None]:
# Cosine-similarity pollution on the small example dataset, with the default
# target similarity.
percentage = .5

polluted_dataset, polluted_labels = pollute_similarity(example_dataset, example_labels, percentage, 'cosine')

In [None]:
example_dataset, example_labels

(array([[-0.94920925,  0.33568484, -1.41548407, -1.78576295, -0.48758674],
        [-3.35471844, -1.4895879 ,  0.47891255, -2.63812044,  0.25589926],
        [ 2.8429093 ,  0.83633122, -1.89775432,  1.68994257, -0.59224401],
        [ 0.80460305,  2.24711467, -3.28064942, -1.0454157 , -1.63415213],
        [-0.89816726,  1.39373349, -0.79674393,  0.17159317,  0.02322873]]),
 array([1, 0, 1, 0, 0]))

In [None]:
polluted_dataset, polluted_labels

(array([[-0.94920925,  0.33568484, -1.41548407, -1.78576295, -0.48758674],
        [-3.35471844, -1.4895879 ,  0.47891255, -2.63812044,  0.25589926],
        [ 2.8429093 ,  0.83633122, -1.89775432,  1.68994257, -0.59224401],
        [ 0.80460305,  2.24711467, -3.28064942, -1.0454157 , -1.63415213],
        [-0.89816726,  1.39373349, -0.79674393,  0.17159317,  0.02322873],
        [ 0.94731617,  1.32125332, -4.02009833,  0.47922708, -1.09039017],
        [-1.24481628, -0.71120455, -1.13013486, -1.73025637, -0.34292331]]),
 array([1, 0, 1, 0, 0, 0, 1]))

In [None]:
# Same example with Pearson-correlated duplicates (default target).
polluted_dataset, polluted_labels = pollute_similarity(example_dataset, example_labels, percentage, 'pearson')

In [None]:
polluted_dataset, polluted_labels

(array([[-0.94920925,  0.33568484, -1.41548407, -1.78576295, -0.48758674],
        [-3.35471844, -1.4895879 ,  0.47891255, -2.63812044,  0.25589926],
        [ 2.8429093 ,  0.83633122, -1.89775432,  1.68994257, -0.59224401],
        [ 0.80460305,  2.24711467, -3.28064942, -1.0454157 , -1.63415213],
        [-0.89816726,  1.39373349, -0.79674393,  0.17159317,  0.02322873],
        [-1.34150324,  0.47910634, -1.70882098, -1.1788211 , -0.32419998],
        [ 1.68234228,  1.10272655, -3.3536629 , -1.99290274, -0.97873419]]),
 array([1, 0, 1, 0, 0, 1, 0]))

In [None]:
# Same example with Tanimoto-similar duplicates and an explicit target of 0.9.
polluted_dataset, polluted_labels = pollute_similarity(example_dataset, example_labels, percentage, 'tanimoto', similarity_percentage=0.9)

In [None]:
polluted_dataset, polluted_labels

(array([[-0.94920925,  0.33568484, -1.41548407, -1.78576295, -0.48758674],
        [-3.35471844, -1.4895879 ,  0.47891255, -2.63812044,  0.25589926],
        [ 2.8429093 ,  0.83633122, -1.89775432,  1.68994257, -0.59224401],
        [ 0.80460305,  2.24711467, -3.28064942, -1.0454157 , -1.63415213],
        [-0.89816726,  1.39373349, -0.79674393,  0.17159317,  0.02322873],
        [ 2.81616623,  1.74564245, -1.52533741,  1.24381089, -1.12319871],
        [ 1.98430088,  0.85260431, -2.31459639,  1.18162291, -0.34215541]]),
 array([1, 0, 1, 0, 0, 1, 1]))

#### Data pollution function
This function picks a percentage of entries from the dataset, applies the specified pollution function, and shuffles the result. It will be used in the following experiments.

In [None]:
def pollute_duplication(dataset, labels, percentage, pollution_function, **kwargs):
  '''
  Run a pollution function on the dataset and return its output with the rows
  shuffled, keeping every row paired with its label.

  Parameters:
    dataset (numpy.ndarray): The dataset to be duplicated and polluted.
    labels (numpy.ndarray): The labels of the dataset.
    percentage (float): The fraction of the dataset size to duplicate.
    pollution_function (object): Callable invoked as
      pollution_function(dataset, labels, percentage, **kwargs) and returning
      a (dataset, labels) pair.
    **kwargs: Additional keyword arguments for specific pollution functions.

  Returns:
    tuple: (shuffled_dataset, shuffled_labels). The labels come back as a
    column of shape (rows, 1) and share the dtype of the stacked array.

  Raises:
    ValueError: If the dataset violates the pollution_function conditions.
  '''

  features, targets = pollution_function(dataset, labels, percentage, **kwargs)

  # Glue the labels on as the last column so that one shuffle moves features
  # and labels together
  stacked = np.hstack((features, targets[:, None]))
  np.random.shuffle(stacked)

  # Cut the stacked array back into its feature columns and its label column
  feature_count = features.shape[1]
  shuffled_features, shuffled_targets = np.hsplit(stacked, [feature_count])

  return (shuffled_features, shuffled_targets)

**Usage example**

In [None]:
# Full pipeline on the small example dataset: swapping pollution on half of
# the rows, followed by the shuffle.
percentage = .5
pollution_function = pollute_swapping

polluted_dataset, polluted_labels = pollute_duplication(example_dataset, example_labels, percentage, pollution_function)

In [None]:
example_dataset, example_labels

(array([[-0.94920925,  0.33568484, -1.41548407, -1.78576295, -0.48758674],
        [-3.35471844, -1.4895879 ,  0.47891255, -2.63812044,  0.25589926],
        [ 2.8429093 ,  0.83633122, -1.89775432,  1.68994257, -0.59224401],
        [ 0.80460305,  2.24711467, -3.28064942, -1.0454157 , -1.63415213],
        [-0.89816726,  1.39373349, -0.79674393,  0.17159317,  0.02322873]]),
 array([1, 0, 1, 0, 0]))

In [None]:
polluted_dataset, polluted_labels

(array([[-3.35471844, -2.63812044, -1.4895879 ,  0.47891255,  0.25589926],
        [ 2.24711467, -1.63415213,  0.80460305, -3.28064942, -1.0454157 ],
        [-0.94920925,  0.33568484, -1.41548407, -1.78576295, -0.48758674],
        [-0.89816726,  1.39373349, -0.79674393,  0.17159317,  0.02322873],
        [ 2.8429093 ,  0.83633122, -1.89775432,  1.68994257, -0.59224401],
        [-3.35471844, -1.4895879 ,  0.47891255, -2.63812044,  0.25589926],
        [ 0.80460305,  2.24711467, -3.28064942, -1.0454157 , -1.63415213]]),
 array([[0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.]]))

### Experiments

#### Experiments dataset initialization

In [None]:
# Polluted datasets and labels, one entry appended by each experiment below.
X_duplication_experiments = []
y_duplication_experiments = []

#### Experiment \#1
The dataset will be **slightly** polluted with the **rounding-off** pollution function.

In [None]:
# Experiment 1: rounding-off pollution, duplicating 5% of X_dup.
dataset = X_dup
labels = y_dup
percentage = .05
pollution_function = pollute_round_off

X_polluted, y_polluted = pollute_duplication(dataset, labels, percentage, pollution_function)
# Store the result for the later analysis of all experiments.
X_duplication_experiments.append(X_polluted)
y_duplication_experiments.append(y_polluted)
X_polluted, y_polluted

(array([[-1.26362418,  1.72283532,  2.29646349,  0.86979494, -0.19620279],
        [-0.55128057,  1.82366901,  1.04789728, -1.43508087, -1.8044368 ],
        [-1.44794874,  0.94356927,  0.31017483, -1.58841984, -0.82191895],
        ...,
        [ 2.07126709,  0.84328634, -1.44131217, -0.9828732 ,  0.76882991],
        [-0.64406719,  1.97148781,  2.35837219,  1.76982818,  0.28841226],
        [-1.72869558, -0.66054133,  0.83074544, -0.29965793,  0.76912222]]),
 array([[1.],
        [1.],
        [1.],
        ...,
        [0.],
        [1.],
        [1.]]))

#### Experiment \#2
The dataset will be **heavily** polluted with the **rounding-off** pollution function.

In [None]:
# Experiment 2: rounding-off pollution, duplicating 50% of X_dup.
dataset = X_dup
labels = y_dup
percentage = .5
pollution_function = pollute_round_off

X_polluted, y_polluted = pollute_duplication(dataset, labels, percentage, pollution_function)
# Store the result for the later analysis of all experiments.
X_duplication_experiments.append(X_polluted)
y_duplication_experiments.append(y_polluted)
X_polluted, y_polluted

(array([[-1.28226799,  1.06877017, -0.38059387,  1.81219199,  3.03842247],
        [-0.40673444, -1.64746513,  0.62564852, -1.50362343,  1.06173768],
        [-1.63800694, -0.1154045 ,  1.68829322, -1.1079689 , -0.01318753],
        ...,
        [ 0.15467695, -0.19948894, -2.13626932, -2.03410944, -3.91796758],
        [ 2.02971912,  1.28201321,  2.0545469 ,  0.95644835,  1.6759338 ],
        [ 0.08835475, -1.33169351, -0.90085005, -0.82116032,  0.73641052]]),
 array([[1.],
        [0.],
        [0.],
        ...,
        [0.],
        [1.],
        [0.]]))

#### Experiment \#3
The dataset will be **slightly** polluted with the **Gaussian noise** pollution function.

In [None]:
# Experiment 3: Gaussian-noise pollution, duplicating 5% of X_dup.
dataset = X_dup
labels = y_dup
percentage = .05
pollution_function = pollute_gaussian_noise

X_polluted, y_polluted = pollute_duplication(dataset, labels, percentage, pollution_function)
# Store the result for the later analysis of all experiments.
X_duplication_experiments.append(X_polluted)
y_duplication_experiments.append(y_polluted)
X_polluted, y_polluted

(array([[ 1.16327706, -1.05028594,  0.65234844,  0.93632615,  2.97651156],
        [ 0.91014726, -1.39476408, -0.00675588, -1.66315581,  1.4806439 ],
        [ 3.43483512,  1.16804449, -1.4676696 , -1.20489941,  1.33947855],
        ...,
        [-1.67281789, -0.07856119,  2.16641976, -0.38347766, -0.29164429],
        [-1.73033603,  0.86148643,  1.96462253, -0.61253313, -1.51461633],
        [-2.71417002,  1.88932855, -1.71647742, -2.11834123,  1.12844336]]),
 array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [0.]]))

#### Experiment \#4
The dataset will be **heavily** polluted with the **Gaussian noise** pollution function.

In [None]:
# Experiment 4: Gaussian-noise pollution, duplicating 50% of X_dup.
dataset = X_dup
labels = y_dup
percentage = .5
pollution_function = pollute_gaussian_noise

X_polluted, y_polluted = pollute_duplication(dataset, labels, percentage, pollution_function)
# Store the result for the later analysis of all experiments.
X_duplication_experiments.append(X_polluted)
y_duplication_experiments.append(y_polluted)
X_polluted, y_polluted

(array([[-1.70804736,  1.25213318,  2.08602872, -1.24431503, -2.41561583],
        [ 1.33532555,  3.7805688 ,  0.16231271, -2.04894238, -3.07922867],
        [-1.90050182,  1.22551263,  1.42378072, -1.59741314, -1.88375507],
        ...,
        [-1.38362686,  2.00271224,  1.63428037, -0.6510599 , -2.98146146],
        [-0.78407297,  1.95310397,  2.17423237,  1.89704651,  0.44785354],
        [ 0.88226638, -1.11451841,  0.52829617,  0.34566745,  2.63255647]]),
 array([[1.],
        [1.],
        [1.],
        ...,
        [1.],
        [1.],
        [0.]]))

#### Experiment \#5
The dataset will be **slightly** polluted with the **scaling** pollution function.

In [None]:
# Experiment 5: scaling pollution, duplicating 5% of X_dup.
dataset = X_dup
labels = y_dup
percentage = .05
pollution_function = pollute_scaling

X_polluted, y_polluted = pollute_duplication(dataset, labels, percentage, pollution_function)
# Store the result for the later analysis of all experiments.
X_duplication_experiments.append(X_polluted)
y_duplication_experiments.append(y_polluted)
X_polluted, y_polluted

(array([[-1.57418806,  1.05993288,  0.53385234,  0.13397501,  1.13179834],
        [-1.99034914, -0.08653247,  1.53060829,  1.09700743, -0.64246656],
        [ 1.6366213 , -0.23566977, -1.17779344, -1.1118078 ,  0.98667669],
        ...,
        [ 1.30781299, -1.37981411, -1.56062445, -0.80202517,  1.13028903],
        [ 0.22835649,  2.33104715,  2.25816762,  0.44672085,  1.14652017],
        [-0.99554243,  0.5664292 ,  0.96044104, -0.31096013, -0.51153448]]),
 array([[1.],
        [1.],
        [0.],
        ...,
        [0.],
        [1.],
        [1.]]))

#### Experiment \#6
The dataset will be **heavily** polluted with the **scaling** pollution function.

In [None]:
# Experiment #6: heavy pollution (percentage = .5) using the scaling
# pollution function.
# X_dup / y_dup are the dataset and labels generated for the duplication
# experiments.
dataset = X_dup
labels = y_dup
# Presumably the fraction of rows that get duplicated and polluted -- confirm
# against the definition of pollute_duplication.
percentage = .5
pollution_function = pollute_scaling

# Build the polluted dataset and store it alongside the other experiments.
X_polluted, y_polluted = pollute_duplication(dataset, labels, percentage, pollution_function)
# NOTE: re-running this cell appends the result a second time.
X_duplication_experiments.append(X_polluted)
y_duplication_experiments.append(y_polluted)
# Bare expression so the notebook displays the polluted features and labels.
X_polluted, y_polluted

(array([[ 7.21617465e-02,  8.18653615e-03, -6.12299458e-02,
         -1.27659443e-01, -3.48661964e-01],
        [-1.47016681e+00,  6.04944359e-01,  1.85924642e+00,
          1.49491367e-01, -1.11689240e+00],
        [-1.21027842e+00,  2.37150593e+00,  1.41051880e+00,
          2.57274582e+00,  1.50165244e+00],
        ...,
        [-3.07265909e+01, -2.21699480e+00,  3.00032276e+01,
          7.34709247e+00, -1.40978286e+01],
        [-2.18897900e-05,  2.26398205e-05,  4.30963584e-05,
          1.64826781e-04,  1.37803680e-04],
        [-1.59857854e-01,  1.45972006e+00,  2.21790880e+00,
         -5.28451756e-01,  2.13062255e-01]]),
 array([[0.],
        [1.],
        [1.],
        ...,
        [1.],
        [1.],
        [1.]]))

#### Experiment \#7
The dataset will be **slightly** polluted with the **swapping** pollution function.

In [None]:
# Experiment #7: slight pollution (percentage = .05) using the swapping
# pollution function.
# X_dup / y_dup are the dataset and labels generated for the duplication
# experiments.
dataset = X_dup
labels = y_dup
# Presumably the fraction of rows that get duplicated and polluted -- confirm
# against the definition of pollute_duplication.
percentage = .05
pollution_function = pollute_swapping

# Build the polluted dataset and store it alongside the other experiments.
X_polluted, y_polluted = pollute_duplication(dataset, labels, percentage, pollution_function)
# NOTE: re-running this cell appends the result a second time.
X_duplication_experiments.append(X_polluted)
y_duplication_experiments.append(y_polluted)
# Bare expression so the notebook displays the polluted features and labels.
X_polluted, y_polluted

(array([[ 1.87073804, -0.52367569, -1.2353938 , -1.04128874,  1.21386213],
        [ 1.14422585, -0.40023995,  1.04788291,  2.27577602,  3.24797063],
        [-0.32451768,  1.97100488,  2.55645075,  3.19597299,  0.4822227 ],
        ...,
        [-0.82545885,  0.1631802 , -1.71975096, -0.06591352, -2.04844101],
        [ 0.75143652,  0.44646148, -0.08478494, -3.23481762,  1.58516593],
        [-1.90050182,  1.22551263,  1.42378072, -1.59741314, -1.88375507]]),
 array([[0.],
        [0.],
        [1.],
        ...,
        [0.],
        [1.],
        [1.]]))

#### Experiment \#8
The dataset will be **heavily** polluted with the **swapping** pollution function.

In [None]:
# Experiment #8: heavy pollution (percentage = .5) using the swapping
# pollution function.
# X_dup / y_dup are the dataset and labels generated for the duplication
# experiments.
dataset = X_dup
labels = y_dup
# Presumably the fraction of rows that get duplicated and polluted -- confirm
# against the definition of pollute_duplication.
percentage = .5
pollution_function = pollute_swapping

# Build the polluted dataset and store it alongside the other experiments.
X_polluted, y_polluted = pollute_duplication(dataset, labels, percentage, pollution_function)
# NOTE: re-running this cell appends the result a second time.
X_duplication_experiments.append(X_polluted)
y_duplication_experiments.append(y_polluted)
# Bare expression so the notebook displays the polluted features and labels.
X_polluted, y_polluted

(array([[-2.22753413, -2.95910976, -1.40963308, -1.39155788, -1.80300123],
        [-1.60467918, -0.4877585 , -0.19529525,  0.16920395,  1.05103003],
        [-1.2457026 , -0.12868055, -2.63781432, -1.085902  , -2.7673109 ],
        ...,
        [-1.57637239,  0.36012162,  0.20470767,  0.09888187,  0.23829669],
        [ 1.9494065 ,  0.29528   , -1.10607936,  0.10053975,  1.42604526],
        [-0.37448419,  2.5908915 , -2.91648514,  1.48902087, -0.89080027]]),
 array([[0.],
        [1.],
        [0.],
        ...,
        [1.],
        [0.],
        [1.]]))

#### Experiment \#9

The dataset will be **slightly** polluted with the **similarity** pollution function, using the *Tanimoto* similarity measure, with **75%** similarity.

In [None]:
# Experiment #9: slight pollution (percentage = .05) using the similarity
# pollution function.
# X_dup / y_dup are the dataset and labels generated for the duplication
# experiments.
dataset = X_dup
labels = y_dup
# Presumably the fraction of rows that get duplicated and polluted -- confirm
# against the definition of pollute_duplication.
percentage = .05
pollution_function = pollute_similarity

# The extra keyword arguments request the Tanimoto measure at 0.75 similarity;
# presumably they are forwarded to pollute_similarity -- confirm against the
# definition of pollute_duplication.
X_polluted, y_polluted = pollute_duplication(dataset, labels, percentage, pollution_function, similarity='tanimoto', similarity_percentage=0.75)
# NOTE: re-running this cell appends the result a second time.
X_duplication_experiments.append(X_polluted)
y_duplication_experiments.append(y_polluted)
# Bare expression so the notebook displays the polluted features and labels.
X_polluted, y_polluted

(array([[-3.36369582, -0.56942765,  0.77089656,  0.65329003, -0.59504281],
        [ 0.8479711 ,  1.29572189, -1.5157126 , -0.98433789, -2.83446981],
        [ 1.28623488, -0.60008139, -0.77648608, -1.92129938,  0.54265603],
        ...,
        [ 0.75143652,  0.44646148, -0.08478494, -3.23481762,  1.58516593],
        [ 2.5776916 ,  0.90093164, -0.94424682, -0.07889503,  1.62874676],
        [-0.56724982, -0.65374864, -3.43827305, -1.98789721, -4.56815193]]),
 array([[1.],
        [0.],
        [0.],
        ...,
        [1.],
        [0.],
        [0.]]))

#### Experiment \#10

The dataset will be **heavily** polluted with the **similarity** pollution function, using the *Tanimoto* similarity measure, with **75%** similarity.

In [None]:
# Experiment #10: heavy pollution (percentage = .5) using the similarity
# pollution function.
# X_dup / y_dup are the dataset and labels generated for the duplication
# experiments.
dataset = X_dup
labels = y_dup
# Presumably the fraction of rows that get duplicated and polluted -- confirm
# against the definition of pollute_duplication.
percentage = .5
pollution_function = pollute_similarity

# The extra keyword arguments request the Tanimoto measure at 0.75 similarity;
# presumably they are forwarded to pollute_similarity -- confirm against the
# definition of pollute_duplication.
X_polluted, y_polluted = pollute_duplication(dataset, labels, percentage, pollution_function, similarity='tanimoto', similarity_percentage=0.75)
# NOTE: re-running this cell appends the result a second time.
X_duplication_experiments.append(X_polluted)
y_duplication_experiments.append(y_polluted)
# Bare expression so the notebook displays the polluted features and labels.
X_polluted, y_polluted

(array([[-2.62362924,  2.54845081, -0.7149473 , -2.65296162, -1.44657024],
        [-2.22753413, -2.95910976, -1.40963308, -1.39155788, -1.80300123],
        [-2.09963073,  1.03158183, -0.10404743, -2.25832871,  1.46534062],
        ...,
        [ 1.46189474, -0.87546186,  0.09550884, -1.50931106,  1.57459298],
        [ 0.17042123, -2.04626726, -2.28031576, -1.25690438, -4.65496396],
        [-1.41513393,  1.44701509,  1.57860873, -0.94710872, -2.07719494]]),
 array([[1.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [1.]]))