# Variable Types issue
The following pollution functions add new features of different variable types (boolean, string, numeric, date) to the dataset. They are then combined in a total of 10 different experiments.

### Imports

In [None]:
!pip install recordlinkage

In [None]:
import random as rnd
import pandas as pd
import numpy as np
import numpy.linalg as la
import string
from datetime import datetime, timedelta
from scipy.stats import pearsonr
import recordlinkage
# -- IMPORT SCRIPTS FROM DATADIQ REPO (@camillasancricca)
import A_data_collection as data_collection
import D_data_analysis as data_analysis
import E_plot_results as plot_results

In [None]:
# Single seed shared by every random source used in this notebook.
SEED = 2023
# Number of pollution experiments; sizes the per-experiment label array below.
NUM_EXPERIMENTS = 10

# Both NumPy's global generator and the stdlib `random` module (imported as
# `rnd`) are used by the pollution functions, so both are seeded.
np.random.seed(SEED)
rnd.seed(SEED)

In [None]:
# Number of original feature columns; the pollution rules only read columns
# [0, INIT_FEATURES) of a dataset.
INIT_FEATURES = 5
# Passed as `n_informative` to the dataset generator.
NUM_INFORMATIVE = 5
# Passed as `n_redundant`; zero with the current settings.
NUM_REDUNDANT = INIT_FEATURES - NUM_INFORMATIVE
assert NUM_REDUNDANT >= 0  # fails if NUM_INFORMATIVE exceeds INIT_FEATURES

In [None]:
# dataset for usage example (small, so that it can be displayed in full)
example_dataset, example_labels = data_collection.make_dataset_for_classification(
    n_samples=5, n_features=INIT_FEATURES, n_informative=NUM_INFORMATIVE,
    n_redundant=NUM_REDUNDANT, n_repeated=0, n_classes=2, n_clusters_per_class=2,
    weights=None, flip_y=0.01, class_sep=1.0, hypercube=True, seed=SEED)
# dataset for the variable types experiments
X_var, y_var = data_collection.make_dataset_for_classification(
    n_samples=1000, n_features=INIT_FEATURES, n_informative=NUM_INFORMATIVE,
    n_redundant=NUM_REDUNDANT, n_repeated=0, n_classes=2, n_clusters_per_class=2,
    weights=None, flip_y=0.01, class_sep=1.0, hypercube=True, seed=SEED)

### Correlated Variable Types pollution

#### Helper Functions

In [None]:
def poly_trig_correlation(features_to_correlate, entry):
  """
  Combine three of the original features as ``a + c**2 * cos(b)``.

  ``a``, ``b`` and ``c`` are the columns named by the first, second and third
  element of ``features_to_correlate``. A 1-D ``entry`` is treated as a single
  row and yields a scalar; anything else is treated as a 2-D table and yields
  one value per row. Only the first INIT_FEATURES columns are read, and they
  are cast to float first.
  """
  linear_col = features_to_correlate[0]
  cosine_col = features_to_correlate[1]
  squared_col = features_to_correlate[2]

  if len(entry.shape) != 1:
    table = entry[:, :INIT_FEATURES].astype(float)
    squared_term = table[:, squared_col] ** 2
    return table[:, linear_col] + squared_term * np.cos(table[:, cosine_col])

  row = entry[:INIT_FEATURES].astype(float)
  squared_term = row[squared_col] ** 2
  return row[linear_col] + squared_term * np.cos(row[cosine_col])

In [None]:
def calc_percentiles(data):
    """
    Pick three distinct original features at random and return the 20th, 40th,
    60th and 80th percentiles of ``poly_trig_correlation`` evaluated row by row,
    together with the chosen feature indices.
    """
    chosen_features = np.random.choice(INIT_FEATURES, size=3, replace=False)
    per_row_values = []
    for row in data:
        per_row_values.append(poly_trig_correlation(chosen_features, row))
    cut_points = np.percentile(np.array(per_row_values), [20, 40, 60, 80])
    return cut_points, chosen_features

In [None]:
def convert_to_float(dataset):
  for i in range(INIT_FEATURES, dataset.shape[1]):
    try:
        # Try converting the first element of the column to float
        float(dataset[0, i])
        # If successful, convert the whole column to float
        dataset[:, i] = dataset[:, i].astype(float)
    except ValueError:
        # If conversion fails, leave the column as is
        continue
  return dataset

#### Boolean generator pollution function
This pollution function adds one new boolean feature computed from the original features, using a rule picked at random from a set of predefined rules.

In [None]:
# Number of boolean rules available; rule ids run from 1 to NUM_BOOLEAN_RULES.
NUM_BOOLEAN_RULES = 2

def generate_boolean(entry, rule_id):
  """Apply boolean rule 1 when ``rule_id`` equals 1, otherwise boolean rule 2."""
  selected_rule = boolean_rule_1 if rule_id == 1 else boolean_rule_2
  return selected_rule(entry)

def boolean_rule_1(entry):
    """
    Return whether the sum of squares exceeds the sum of cubes over a random
    subset (half, rounded down) of the original features of ``entry``.
    """
    subset_size = int(INIT_FEATURES / 2)
    picked_columns = rnd.sample(range(INIT_FEATURES), k=subset_size)

    # Accumulate both sums in the order the columns were drawn
    squares_total = 0
    cubes_total = 0
    for column in picked_columns:
        value = entry[column]
        squares_total = squares_total + value**2
        cubes_total = cubes_total + value**3

    return squares_total - cubes_total > 0

def boolean_rule_2(entry):
    """
    Return whether a noisy, non-linear mix of randomly chosen original features
    exceeds the median of the original features of ``entry``.

    The mix is ``sin(a) + log(|b| + 1)``, plus ``c * d`` with Gaussian noise
    (std 0.1), plus ``e`` scaled by a uniform factor in [0.5, 1.5). All five
    indices are drawn with replacement.
    """
    # Draw order matters for reproducibility: four indices, noise, fifth index, scale
    sin_idx, log_idx, left_idx, right_idx = (
        rnd.randint(0, INIT_FEATURES - 1) for _ in range(4)
    )
    noise = np.random.normal(0, 0.1)
    scaled_idx = rnd.randint(0, INIT_FEATURES - 1)
    scale = np.random.uniform(0.5, 1.5)

    smooth_part = np.sin(entry[sin_idx]) + np.log(abs(entry[log_idx]) + 1)
    product_part = entry[left_idx] * entry[right_idx] + noise
    rescaled_part = entry[scaled_idx] * scale

    threshold = np.median(entry[:INIT_FEATURES])
    return smooth_part + product_part + rescaled_part > threshold

In [None]:
# pollution function
def pollute_boolean(dataset):
  '''
  Appends one boolean column to the dataset. A single rule is drawn at
  random for the whole call and applied to every row.

  Parameters:
    dataset (numpy.ndarray): The 2-D dataset to be polluted.

  Returns:
    numpy.ndarray: A new array holding the dataset plus the boolean column
    (the input itself when it has no rows).
  '''
  row_count, _ = dataset.shape
  if row_count <= 0:
    return dataset

  # Kept as a one-element array; the dispatcher only compares it with 1
  rule_id = np.random.choice(list(range(1, NUM_BOOLEAN_RULES+1)), size=1)

  boolean_column = np.empty((row_count, 1), dtype=object)
  for row_idx in range(row_count):
    boolean_column[row_idx] = generate_boolean(dataset[row_idx], rule_id)

  return np.concatenate((dataset, boolean_column), axis=1)

**Usage example**

In [None]:
polluted_dataset = pollute_boolean(example_dataset)

In [None]:
example_dataset

array([[-0.94920925,  0.33568484, -1.41548407, -1.78576295, -0.48758674],
       [-3.35471844, -1.4895879 ,  0.47891255, -2.63812044,  0.25589926],
       [ 2.8429093 ,  0.83633122, -1.89775432,  1.68994257, -0.59224401],
       [ 0.80460305,  2.24711467, -3.28064942, -1.0454157 , -1.63415213],
       [-0.89816726,  1.39373349, -0.79674393,  0.17159317,  0.02322873]])

In [None]:
polluted_dataset

array([[-0.9492092460945891, 0.3356848367990344, -1.4154840690084136,
        -1.7857629481086632, -0.4875867372136996, True],
       [-3.354718444854246, -1.4895879002746253, 0.47891254801921634,
        -2.6381204410192556, 0.2558992623687477, True],
       [2.8429093047054455, 0.8363312246583592, -1.897754317684768,
        1.6899425739384784, -0.5922440065704158, True],
       [0.8046030487066642, 2.247114668718825, -3.2806494207123906,
        -1.045415698656595, -1.634152134151519, True],
       [-0.8981672595151875, 1.3937334851074585, -0.7967439254055066,
        0.17159317342400127, 0.02322872794907549, True]], dtype=object)

#### Categoric generator pollution function
This pollution function adds one new categorical (string) feature computed from the original features, using a rule picked at random from a set of predefined rules.

In [None]:
# Number of string rules available; rule ids run from 1 to NUM_STRING_RULES.
NUM_STRING_RULES = 3

# Uppercase ASCII alphabet, one single-character string per letter.
LETTERS = list(string.ascii_uppercase)
NUM_LETTERS = len(LETTERS)
# Ordered labels for the five percentile bins used by string rule 3.
CATEGORIES = ['Very Low', 'Low', 'Medium', 'High', 'Very High']

# Added to a digit before chr(), so 0..9 map to the characters ' ' .. ')'.
ASCII_OFFSET = 32

# predefined rules
def generate_string(entry, rule_id):
  """
  Dispatch to string rule 1, 2 or 3 (any other id falls back to rule 3).
  Rules 1 and 2 receive a single row; rule 3 receives the whole dataset.
  """
  if rule_id == 1:
    selected_rule = string_rule_1
  elif rule_id == 2:
    selected_rule = string_rule_2
  else:
    selected_rule = string_rule_3
  return selected_rule(entry)


def string_rule_1(entry):
    """
    Encode the original features of ``entry`` (in random order) as letters.

    The row's own [min, max] range is split by NUM_LETTERS evenly spaced
    thresholds; each value becomes the letter of the first threshold that is
    greater than or equal to it. A value with no such threshold adds nothing.
    """
    shuffled_order = rnd.sample(range(INIT_FEATURES), INIT_FEATURES)
    numeric_part = entry[:INIT_FEATURES].astype(float)
    shuffled_values = [numeric_part[position] for position in shuffled_order]

    lowest, highest = np.min(shuffled_values), np.max(shuffled_values)
    thresholds = np.linspace(lowest, highest, NUM_LETTERS)

    letters = []
    for value in shuffled_values:
        first_hit = next(
            (k for k, bound in enumerate(thresholds) if bound >= value), None
        )
        if first_hit is not None:
            letters.append(LETTERS[first_hit])
    return "".join(letters)

def string_rule_2(entry):
    """
    Encode the digits of the original features of ``entry`` (visited in random
    order) as characters: each decimal digit ``d`` of ``str(value)`` becomes
    ``chr(d + ASCII_OFFSET)``. Values that are not floats are skipped.

    Only digit characters are encoded. Sign and decimal point are dropped as
    before, and the 'e' of scientific notation (e.g. ``1e-05``) is now ignored
    instead of raising ValueError.
    """
    indices = rnd.sample(range(INIT_FEATURES), INIT_FEATURES)
    pieces = []
    for i in indices:
        value = entry[i]
        if not isinstance(value, float):
            continue
        pieces.extend(
            chr(int(char) + ASCII_OFFSET) for char in str(value) if char.isdigit()
        )
    return "".join(pieces)

def string_rule_3(data):
    """
    Label every row of ``data`` with one of the five CATEGORIES, according to
    where its ``poly_trig_correlation`` value falls among the 20/40/60/80th
    percentiles returned by ``calc_percentiles``.

    :param data: The whole dataset (2-D array); this rule is not row-wise.
    :return: Column vector of shape (n_rows, 1) with dtype object. A
             fixed-width string array would turn the numeric columns into
             strings when concatenated with a float dataset.
    """
    percentiles, features_to_correlate = calc_percentiles(data)
    results = []
    for entry in data:
        corr_value = poly_trig_correlation(features_to_correlate, entry)
        # Index of the first percentile that is >= corr_value; 4 when none is
        bin_index = int(np.searchsorted(percentiles, corr_value, side="left"))
        results.append(CATEGORIES[bin_index])
    return np.array(results, dtype=object)[:, np.newaxis]

In [None]:
def pollute_string(dataset):
  '''
  Appends one string column to the dataset. A single rule is drawn at
  random for the whole call; rules 1 and 2 are applied row by row, rule 3
  to the whole dataset at once.

  Parameters:
    dataset (numpy.ndarray): The 2-D dataset to be polluted.

  Returns:
    numpy.ndarray: A new object-dtype array holding the dataset plus the
    string column (the input itself when it has no rows).
  '''
  num_entries = dataset.shape[0]

  if num_entries <= 0:
    return dataset

  rule_id = rnd.sample(list(range(1, NUM_STRING_RULES+1)), 1)[0] # Go from list to single int

  if rule_id == 3:
    new_feature = generate_string(dataset, rule_id)
  else:
    new_feature = np.empty((num_entries, 1), dtype=object)
    for idx, entry in enumerate(dataset):
      new_feature[idx] = generate_string(entry, rule_id)

  # Force object dtype on both sides: concatenating a float array with a
  # fixed-width string array would stringify every numeric column.
  new_feature = np.asarray(new_feature, dtype=object).reshape(num_entries, 1)
  dataset = np.concatenate((dataset.astype(object), new_feature), axis=1)

  # Make sure the original features are stored as floats
  dataset[:, :INIT_FEATURES] = dataset[:, :INIT_FEATURES].astype(float)
  dataset = convert_to_float(dataset)

  return dataset

**Usage example**

In [None]:
polluted_dataset = pollute_string(example_dataset)

In [None]:
example_dataset

array([[-0.94920925,  0.33568484, -1.41548407, -1.78576295, -0.48758674],
       [-3.35471844, -1.4895879 ,  0.47891255, -2.63812044,  0.25589926],
       [ 2.8429093 ,  0.83633122, -1.89775432,  1.68994257, -0.59224401],
       [ 0.80460305,  2.24711467, -3.28064942, -1.0454157 , -1.63415213],
       [-0.89816726,  1.39373349, -0.79674393,  0.17159317,  0.02322873]])

In [None]:
polluted_dataset

array([[-0.9492092460945891, 0.3356848367990344, -1.4154840690084136,
        -1.7857629481086632, -0.4875867372136996, 'KQAFZ'],
       [-3.354718444854246, -1.4895879002746253, 0.47891254801921634,
        -2.6381204410192556, 0.2558992623687477, 'YNZFA'],
       [2.8429093047054455, 0.8363312246583592, -1.897754317684768,
        1.6899425739384784, -0.5922440065704158, 'TAZHP'],
       [0.8046030487066642, 2.247114668718825, -3.2806494207123906,
        -1.045415698656595, -1.634152134151519, 'AITLZ'],
       [-0.8981672595151875, 1.3937334851074585, -0.7967439254055066,
        0.17159317342400127, 0.02322872794907549, 'ALZMC']], dtype=object)

#### Numeric generator pollution function

In [None]:
# Number of numeric rules available; rule ids run from 1 to NUM_NUMERIC_RULES.
NUM_NUMERIC_RULES = 2

def generate_numeric(dataset, rule_id):
  """Apply numeric rule 1 when ``rule_id`` equals 1, otherwise numeric rule 2."""
  selected_rule = numeric_rule_1 if rule_id == 1 else numeric_rule_2
  return selected_rule(dataset)

def numeric_rule_1(dataset, n_features=3):
    """
    Add a polluted feature to the dataset: the product of ``sin(x**2 + 1)``
    over n randomly chosen original features.

    :param dataset: A 2-D NumPy array containing the dataset.
    :param n_features: Number of features to pick for creating the polluted
                       feature. Clamped to the number of features that can
                       actually be sampled.
    :return: A new array: the dataset with one additional polluted column.
    """
    # Only original features are sampled, so clamp against that population.
    # Clamping against the (possibly larger) column count made rnd.sample raise.
    population = range(min(INIT_FEATURES, dataset.shape[1]))
    n_features = min(n_features, len(population))

    selected_indices = rnd.sample(population, n_features)

    # Start from the neutral element of the product
    polluted_feature = np.ones(dataset.shape[0])
    for idx in selected_indices:
        polluted_feature *= np.sin(dataset[:, idx].astype(float) ** 2 + 1)

    # Append the new feature as a column
    polluted_feature = np.expand_dims(polluted_feature, axis=-1)
    return np.concatenate((dataset, polluted_feature), axis=1)

def numeric_rule_2(dataset, n_features=3):
    """
    Add another polluted feature to the dataset: the sum of
    ``log(f) * exp((f - 1) / 10)`` with ``f = |x| + 1e-5`` over n randomly
    chosen original features, rescaled so that its largest magnitude is at
    most 10.

    :param dataset: A 2-D NumPy array containing the dataset.
    :param n_features: Number of features to pick for creating the polluted
                       feature. Clamped to the number of features that can
                       actually be sampled.
    :return: A new array: the dataset with one additional polluted column.
    """
    # Only original features are sampled, so clamp against that population.
    # Clamping against the (possibly larger) column count made rnd.sample raise.
    population = range(min(INIT_FEATURES, dataset.shape[1]))
    n_features = min(n_features, len(population))

    selected_indices = rnd.sample(population, n_features)

    polluted_feature = np.zeros(dataset.shape[0])
    for idx in selected_indices:
        # Cast first so object/string-typed columns work; the offset keeps log() finite at 0
        feature = np.abs(dataset[:, idx].astype(float)) + 1e-5
        transformed_feature = np.log(feature) * np.exp((feature - 1) / 10)  # Scale down the exponential part
        polluted_feature += transformed_feature

    # Scale down the polluted feature to keep it within a reasonable range
    scale_factor = np.max(np.abs(polluted_feature))
    if scale_factor > 10:  # Arbitrary bound to avoid excessively large values
        polluted_feature /= scale_factor / 10

    # Append the new feature as a column
    polluted_feature = np.expand_dims(polluted_feature, axis=-1)
    return np.concatenate((dataset, polluted_feature), axis=1)

In [None]:
def pollute_numeric(dataset):
  '''
  Appends one numeric column to the dataset, using a rule drawn at random.
  Unlike the other generators, this function does not pollute row by row:
  the whole dataset is passed to the rule. This keeps the generated column
  consistent across rows, since numeric values are not limited to a small
  set as in the string/boolean/date case.

  Parameters:
    dataset (numpy.ndarray): The 2-D dataset to be polluted.

  Returns:
    numpy.ndarray: The polluted dataset with the new feature (the input
    itself when it has no rows).
  '''
  num_entries = dataset.shape[0]

  if num_entries <= 0:
    return dataset

  rule_id = rnd.sample(list(range(1, NUM_NUMERIC_RULES+1)), 1)[0] # Go from list to single int

  return generate_numeric(dataset, rule_id)

In [None]:
polluted_dataset = pollute_numeric(example_dataset)

In [None]:
example_dataset

array([[-0.94920925,  0.33568484, -1.41548407, -1.78576295, -0.48758674],
       [-3.35471844, -1.4895879 ,  0.47891255, -2.63812044,  0.25589926],
       [ 2.8429093 ,  0.83633122, -1.89775432,  1.68994257, -0.59224401],
       [ 0.80460305,  2.24711467, -3.28064942, -1.0454157 , -1.63415213],
       [-0.89816726,  1.39373349, -0.79674393,  0.17159317,  0.02322873]])

In [None]:
polluted_dataset

array([[-0.94920925,  0.33568484, -1.41548407, -1.78576295, -0.48758674,
        -0.37202059],
       [-3.35471844, -1.4895879 ,  0.47891255, -2.63812044,  0.25589926,
        -0.43230242],
       [ 2.8429093 ,  0.83633122, -1.89775432,  1.68994257, -0.59224401,
         1.45423497],
       [ 0.80460305,  2.24711467, -3.28064942, -1.0454157 , -1.63415213,
         1.80248078],
       [-0.89816726,  1.39373349, -0.79674393,  0.17159317,  0.02322873,
        -3.74079715]])

#### Date generator pollution function
This pollution function computes a new date feature out of the original features according to a predefined rule.

In [None]:
def generate_date(entry):
    """
    Derive a date string ('%d-%m-%Y', UTC) from a random subset of the
    original features of ``entry``.

    The selected values are combined as ``sum(cos(value) * sin(position))``
    with positions counted from 1. The first ten characters of the decimal
    representation of the absolute result (decimal point removed) are then
    read as a Unix timestamp.
    """
    # Randomly choose a subset of features (subset size is drawn first)
    subset_size = rnd.randint(1, INIT_FEATURES)
    random_indices = rnd.sample(range(INIT_FEATURES), k=subset_size)
    selected_features = [entry[idx] for idx in random_indices]

    # Positions start at 1: with position 0 the weight sin(0) == 0 discarded the
    # first selected feature, so every single-feature subset collapsed to the epoch.
    total = sum(
        np.cos(float(value)) * np.sin(position)
        for position, value in enumerate(selected_features, start=1)
    )

    magnitude = abs(float(total))
    text = str(magnitude)
    if "e" in text:
        # Scientific notation (tiny or huge values) would put an 'e' into int()
        text = format(magnitude, ".16f")
    timestamp_unix = int(text.replace(".", "")[:10])  # Truncate to prevent overly large numbers

    # Same result as the deprecated datetime.utcfromtimestamp() for timestamps >= 0
    date_time = datetime(1970, 1, 1) + timedelta(seconds=timestamp_unix)
    return date_time.strftime('%d-%m-%Y')

In [None]:
def pollute_date(dataset):
  '''
  Appends one date column (strings produced by generate_date, one per row)
  to the dataset.

  Parameters:
    dataset (numpy.ndarray): The 2-D dataset to be polluted.

  Returns:
    numpy.ndarray: A new array holding the dataset plus the date column
    (the input itself when it has no rows).
  '''
  row_count, _ = dataset.shape
  if row_count <= 0:
    return dataset

  date_column = np.empty((row_count, 1), dtype=object)
  for row_idx in range(row_count):
    date_column[row_idx] = generate_date(dataset[row_idx])

  return np.concatenate((dataset, date_column), axis=1)

**Usage example**

In [None]:
polluted_dataset = pollute_date(example_dataset)

In [None]:
example_dataset

array([[-0.94920925,  0.33568484, -1.41548407, -1.78576295, -0.48758674],
       [-3.35471844, -1.4895879 ,  0.47891255, -2.63812044,  0.25589926],
       [ 2.8429093 ,  0.83633122, -1.89775432,  1.68994257, -0.59224401],
       [ 0.80460305,  2.24711467, -3.28064942, -1.0454157 , -1.63415213],
       [-0.89816726,  1.39373349, -0.79674393,  0.17159317,  0.02322873]])

In [None]:
polluted_dataset

array([[-0.9492092460945891, 0.3356848367990344, -1.4154840690084136,
        -1.7857629481086632, -0.4875867372136996, '12-10-2011'],
       [-3.354718444854246, -1.4895879002746253, 0.47891254801921634,
        -2.6381204410192556, 0.2558992623687477, '05-10-1975'],
       [2.8429093047054455, 0.8363312246583592, -1.897754317684768,
        1.6899425739384784, -0.5922440065704158, '01-01-1970'],
       [0.8046030487066642, 2.247114668718825, -3.2806494207123906,
        -1.045415698656595, -1.634152134151519, '07-07-1988'],
       [-0.8981672595151875, 1.3937334851074585, -0.7967439254055066,
        0.17159317342400127, 0.02322872794907549, '13-11-2025']],
      dtype=object)

#### Pollution Function

In [None]:
# Registry: lower-case feature-type name -> pollution function that appends one
# column of that type derived from the original features.
# NOTE(review): the key here is 'boolean', while the non-correlated registry
# defined later in this notebook uses 'bool' — confirm the difference is intended.
correlated_feature_generators = {'boolean': pollute_boolean, 'string': pollute_string, 'date': pollute_date, 'numeric': pollute_numeric}

In [None]:
def pollute_correlated_data_types(dataset, generator_functions):
  '''
  Appends one correlated feature per requested generator, in the given order.

  Parameters:
    dataset (numpy.ndarray): The 2-D dataset to be polluted.
    generator_functions (str or iterable of str): Name(s) of the generators
      to apply (case-insensitive keys of correlated_feature_generators). A
      single name, a list, a tuple or any other iterable of names is accepted.

  Returns:
    numpy.ndarray: The dataset with one extra column per requested generator.

  Raises:
    ValueError: If a name is not a known generator or the dataset is empty.
  '''
  if isinstance(generator_functions, str):
    generator_functions = [generator_functions] # A single name is treated as a one-element list

  # Validate every name up front so nothing is applied when one is invalid
  generator_names = []
  for func in generator_functions:
    name = func.lower() if isinstance(func, str) else None
    if name not in correlated_feature_generators:
      raise ValueError(f"Function {func} not implemented.")
    generator_names.append(name)

  num_entries = dataset.shape[0]

  if num_entries <= 0:
    raise ValueError("Dataset must be non-empty.")

  for name in generator_names:
    dataset = correlated_feature_generators[name](dataset)

  return dataset

**Usage example**

In [None]:
polluted_dataset = pollute_correlated_data_types(example_dataset, ['boolean', 'date', 'boolean', 'boolean'])

In [None]:
example_dataset

array([[-0.94920925,  0.33568484, -1.41548407, -1.78576295, -0.48758674],
       [-3.35471844, -1.4895879 ,  0.47891255, -2.63812044,  0.25589926],
       [ 2.8429093 ,  0.83633122, -1.89775432,  1.68994257, -0.59224401],
       [ 0.80460305,  2.24711467, -3.28064942, -1.0454157 , -1.63415213],
       [-0.89816726,  1.39373349, -0.79674393,  0.17159317,  0.02322873]])

In [None]:
polluted_dataset

array([[-0.9492092460945891, 0.3356848367990344, -1.4154840690084136,
        -1.7857629481086632, -0.4875867372136996, True, '18-09-2005',
        True, True],
       [-3.354718444854246, -1.4895879002746253, 0.47891254801921634,
        -2.6381204410192556, 0.2558992623687477, True, '01-06-1978',
        True, True],
       [2.8429093047054455, 0.8363312246583592, -1.897754317684768,
        1.6899425739384784, -0.5922440065704158, False, '06-12-1983',
        False, True],
       [0.8046030487066642, 2.247114668718825, -3.2806494207123906,
        -1.045415698656595, -1.634152134151519, True, '23-03-2000', True,
        True],
       [-0.8981672595151875, 1.3937334851074585, -0.7967439254055066,
        0.17159317342400127, 0.02322872794907549, True, '05-03-2028',
        True, False]], dtype=object)

### Noncorrelated Variable Types pollution

#### Helper Functions

In [None]:
def generate_random_dates(n, start_year=1900, end_year=2100):
    """
    Return ``n`` random dates formatted as 'YYYY-MM-DD'.

    Each date is January 1st of ``start_year`` plus a random whole number of
    days, drawn from [0, number of days between that date and December 31st
    of ``end_year``).
    """
    first_day = datetime(start_year, 1, 1)
    span_days = (datetime(end_year, 12, 31) - first_day).days

    return [
        (first_day + timedelta(days=rnd.randrange(span_days))).strftime("%Y-%m-%d")
        for _ in range(n)
    ]

In [None]:
def generate_random_booleans(n):
    """Return a list of ``n`` booleans, each drawn uniformly from True/False."""
    flags = []
    for _ in range(n):
        flags.append(rnd.choice([True, False]))
    return flags

In [None]:
def generate_random_strings(n, length=10):
    """
    Return ``n`` random strings of ``length`` characters each, drawn with
    replacement from ASCII letters and digits.
    """
    alphabet = string.ascii_letters + string.digits
    return [''.join(rnd.choices(alphabet, k=length)) for _ in range(n)]

In [None]:
def generate_random_floats(dataset, scale=1.0):
    """
    Append a column of uniform random floats to ``dataset``.

    The sampling interval is the [min, max] range of the original features
    (first INIT_FEATURES columns), with both ends multiplied by ``scale``.
    Returns a new array; the input is not modified.
    """
    original_part = dataset[:, :INIT_FEATURES]
    lower_bound = np.min(original_part) * scale
    upper_bound = np.max(original_part) * scale

    row_count = dataset.shape[0]
    random_column = np.random.uniform(lower_bound, upper_bound, size=row_count)

    # Reshape to (row_count, 1) so it can be appended as a column
    return np.concatenate((dataset, np.expand_dims(random_column, axis=-1)), axis=1)

#### Pollution Function

In [None]:
# Registry: lower-case feature-type name -> generator of random values of that type.
# 'date', 'bool' and 'string' generators take a count n and return a list of n values;
# the 'numeric' generator takes the whole dataset and returns it with a column appended.
# NOTE(review): the key here is 'bool', while the correlated registry defined earlier
# in this notebook uses 'boolean' — confirm the difference is intended.
noncorrelated_feature_generators = {'date': generate_random_dates, 'bool': generate_random_booleans, 'string': generate_random_strings, 'numeric': generate_random_floats}

In [None]:
def pollute_noncorrelated_data_types(dataset, generator_functions):
  '''
  Appends one random (non-correlated) feature per requested generator, in
  the given order.

  Parameters:
    dataset (numpy.ndarray): The 2-D dataset to be polluted.
    generator_functions (str or iterable of str): Name(s) of the generators
      to apply (case-insensitive keys of noncorrelated_feature_generators). A
      single name, a list, a tuple or any other iterable of names is accepted.

  Returns:
    numpy.ndarray: The dataset with one extra column per requested generator.

  Raises:
    ValueError: If a name is not a known generator or the dataset is empty.
  '''
  if isinstance(generator_functions, str):
    generator_functions = [generator_functions] # A single name is treated as a one-element list

  # Validate every name up front so nothing is applied when one is invalid
  generator_names = []
  for func in generator_functions:
    name = func.lower() if isinstance(func, str) else None
    if name not in noncorrelated_feature_generators:
      raise ValueError(f"Function {func} not implemented.")
    generator_names.append(name)

  num_entries = dataset.shape[0]

  if num_entries <= 0:
    raise ValueError("Dataset must be non-empty.")

  for name in generator_names:
    pollution_function = noncorrelated_feature_generators[name]
    # Whole dataset required at input to guarantee similarly-scaled floats
    if name == 'numeric':
      dataset = pollution_function(dataset)
      continue

    # Object column so the generated Python values keep their type
    new_feature = np.empty((num_entries, 1), dtype=object)
    new_feature[:, 0] = pollution_function(num_entries)

    dataset = np.concatenate((dataset, new_feature), axis=1)

  return dataset


**Usage example**

In [None]:
polluted_dataset = pollute_noncorrelated_data_types(example_dataset, ['string', 'bool', 'numeric', 'date'])

In [None]:
example_dataset

array([[-0.94920925,  0.33568484, -1.41548407, -1.78576295, -0.48758674],
       [-3.35471844, -1.4895879 ,  0.47891255, -2.63812044,  0.25589926],
       [ 2.8429093 ,  0.83633122, -1.89775432,  1.68994257, -0.59224401],
       [ 0.80460305,  2.24711467, -3.28064942, -1.0454157 , -1.63415213],
       [-0.89816726,  1.39373349, -0.79674393,  0.17159317,  0.02322873]])

In [None]:
polluted_dataset

array([[-0.9492092460945891, 0.3356848367990344, -1.4154840690084136,
        -1.7857629481086632, -0.4875867372136996, 'TY8HhO6kGL', False,
        2.2809109123099898, '2092-06-14'],
       [-3.354718444854246, -1.4895879002746253, 0.47891254801921634,
        -2.6381204410192556, 0.2558992623687477, '75UQSyPx3B', True,
        -1.7508964498594348, '2060-12-27'],
       [2.8429093047054455, 0.8363312246583592, -1.897754317684768,
        1.6899425739384784, -0.5922440065704158, 'pdbIKaRdeb', True,
        0.7513480493596694, '1950-04-22'],
       [0.8046030487066642, 2.247114668718825, -3.2806494207123906,
        -1.045415698656595, -1.634152134151519, 'uFrEHS2JpD', True,
        -1.4228492298374003, '2051-05-25'],
       [-0.8981672595151875, 1.3937334851074585, -0.7967439254055066,
        0.17159317342400127, 0.02322872794907549, 'zfOvz1dOOd', True,
        -0.13153044514404844, '1965-11-06']], dtype=object)

### Experiments

#### Experiments dataset initialization

In [None]:
# One polluted feature matrix is appended per experiment cell below.
X_variable_types_experiments = []
# Pollution only adds feature columns, so every experiment gets its own copy of the same labels.
y_variable_types_experiments = np.array([y_var.copy() for _ in range(NUM_EXPERIMENTS)])

#### Experiment \#1
The dataset will be polluted with a **low** number of new **boolean** features that are **correlated** with the original features.

In [None]:
dataset = X_var
new_features = ['boolean'] * 2  # two correlated boolean columns

X_polluted = pollute_correlated_data_types(dataset, new_features)
X_variable_types_experiments.append(X_polluted)
X_polluted  # shown as the cell output

array([[-3.044852043021044, 0.40122525804329334, 1.8766526337265068, ...,
        -2.1329499478849936, True, False],
       [1.4457834775607874, 1.3307533080592129, 1.140285198046516, ...,
        2.422164688959354, True, False],
       [2.404213410027689, 1.6201611493238681, -0.710272995532069, ...,
        1.1839555818904257, False, False],
       ...,
       [0.17397334524745334, -1.0958619504628027, 0.14857090944972828,
        ..., 1.0491511265756497, True, True],
       [-1.4758554074184587, -0.7187457361162684, -2.1085391293481255,
        ..., -2.674929990187009, True, True],
       [-1.73033602580001, 0.8614864271967009, 1.9646225268133373, ...,
        -1.5146163278990659, True, True]], dtype=object)

#### Experiment \#2
The dataset will be polluted with a **low** number of new **numeric** features that are **correlated** with the original features.

In [None]:
dataset = X_var
new_features = ['numeric'] * 2  # two correlated numeric columns

X_polluted = pollute_correlated_data_types(dataset, new_features)
X_variable_types_experiments.append(X_polluted)
X_polluted  # shown as the cell output

array([[-3.04485204,  0.40122526,  1.87665263, ..., -2.13294995,
         2.90164848,  0.67442021],
       [ 1.44578348,  1.33075331,  1.1402852 , ...,  2.42216469,
         1.53847359,  0.0138496 ],
       [ 2.40421341,  1.62016115, -0.710273  , ...,  1.18395558,
         0.84915925, -0.22111506],
       ...,
       [ 0.17397335, -1.09586195,  0.14857091, ...,  1.04915113,
        -3.31293051,  0.59106784],
       [-1.47585541, -0.71874574, -2.10853913, ..., -2.67492999,
         2.40500531,  0.0271084 ],
       [-1.73033603,  0.86148643,  1.96462253, ..., -1.51461633,
         1.77065302,  0.73384115]])

#### Experiment \#3
The dataset will be polluted with a **low** number of new **string** features that are **correlated** with the original features.

In [None]:
dataset = X_var
new_features = ['string'] * 2  # two correlated string columns

X_polluted = pollute_correlated_data_types(dataset, new_features)
X_variable_types_experiments.append(X_polluted)
X_polluted  # shown as the cell output

array([[-3.044852043021044, 0.40122525804329334, 1.8766526337265068, ...,
        -2.1329499478849936, 'FZSYA', 'Very Low'],
       [1.4457834775607874, 1.3307533080592129, 1.140285198046516, ...,
        2.422164688959354, 'HLZKA', 'Very High'],
       [2.404213410027689, 1.6201611493238681, -0.710272995532069, ...,
        1.1839555818904257, 'ZWNAV', 'Very High'],
       ...,
       [0.17397334524745334, -1.0958619504628027, 0.14857090944972828,
        ..., 1.0491511265756497, 'QAZDQ', 'High'],
       [-1.4758554074184587, -0.7187457361162684, -2.1085391293481255,
        ..., -2.674929990187009, 'ZQAIJ', 'Very Low'],
       [-1.73033602580001, 0.8614864271967009, 1.9646225268133373, ...,
        -1.5146163278990659, 'CSZIA', 'Low']], dtype=object)

#### Experiment \#4
The dataset will be polluted with a **high** number of new **string** features that are **correlated** with the original features.

In [None]:
dataset = X_var
new_features = ['string'] * 5  # five correlated string columns

X_polluted = pollute_correlated_data_types(dataset, new_features)
X_variable_types_experiments.append(X_polluted)
X_polluted  # shown as the cell output

array([[-3.044852043021044, 0.40122525804329334, 1.8766526337265068, ...,
        ' $ !""%"%( $#")##$# $$(%" $# "! $$!%&&& %#" #&\'" )\'!(\'&&%"&##\'"&% &("!#")$))$\'(($))#&',
        'Very Low', 'FYSZA'],
       [1.4457834775607874, 1.3307533080592129, 1.140285198046516, ...,
        '!!$ "(%!)( $&%!&"$""!&$&(()%)#%$!$$%\'(#$\'\'%& \'(\'$!## \'%## ( %)"!") \' &"%"&\' \'!\'%"##',
        'Very High', 'KLAHZ'],
       [2.404213410027689, 1.6201611493238681, -0.710272995532069, ...,
        '#\')#$(\'!"!")$ )##"$ $"!#$!  "\'&()!&" !&!!$)#"#(&(! \'! "\'"))%%#" &)!!(#)%%%(!() $"%\'',
        'High', 'WVAZN'],
       ...,
       [0.17397334524745334, -1.0958619504628027, 0.14857090944972828,
        ...,
        '! $)!%!!"&%\'%&$)\'! )%(&!)% $&"( "\' !\'#)\'##$%"$\'$%##$ !$(%\' ) )$$)\'"("(!#!"!)$)"$ #%$\'"',
        'Low', 'QQADZ'],
       [-1.4758554074184587, -0.7187457361162684, -2.1085391293481255,
        ...,
        ' \'!(\'$%\'#&!!&"&($!)($#())\'#"%"!\'(&"! (%#)!")#$(!"%%"&\'$)")))

#### Experiment \#5
The dataset will be polluted with a **low** number of new **numeric** and **string** features that are **correlated** with the original features.

In [None]:
dataset = X_var
new_features = ['numeric'] * 2 + ['string'] * 2  # applied in this order: numeric columns first, then string columns

X_polluted = pollute_correlated_data_types(dataset, new_features)
X_variable_types_experiments.append(X_polluted)
X_polluted  # shown as the cell output

array([[-3.044852043021044, 0.40122525804329334, 1.8766526337265068, ...,
        1.3543442528114746, 'AZFSY',
        '!(\'&&%"&##\'"&% &(# $$(%" $# "! $$ $ !""%"%( $#")##$"!#")$))$\'(($))#&!%&&& %#" #&\'" )\''],
       [1.4457834775607874, 1.3307533080592129, 1.140285198046516, ...,
        1.7006939281876563, 'AHKLZ',
        '"$""!&$&(()%)#%$!$$%\'(#$\'\'%& \'(\'$!## \'%## ( %)"!")!!$ "(%!)( $&%!& \' &"%"&\' \'!\'%"##'],
       [2.404213410027689, 1.6201611493238681, -0.710272995532069, ...,
        1.6948867023646543, 'WNVAZ',
        '!!(#)%%%(!() $"%\'"$ $"!#$!  "\'&()#\')#$(\'!"!")$ )##!&" !&!!$)#"#(&(! \'! "\'"))%%#" &)'],
       ...,
       [0.17397334524745334, -1.0958619504628027, 0.14857090944972828,
        ..., -1.4694876720085936, 'ZDAQQ',
        '! )%(&!)% $&"( "\'! $)!%!!"&%\'%&$)\' !\'#)\'##$%"$\'$%##$ !$(%\' ) )$$)\'"("(!#!"!)$)"$ #%$\'"'],
       [-1.4758554074184587, -0.7187457361162684, -2.1085391293481255,
        ..., 1.2504740766925133, 'AJZQI',
        '!$\'

#### Experiment \#6
The dataset will be polluted with a **low** number of new **boolean**, **string**, **numeric**, and **date** features that are **correlated** with the original features.

In [None]:
dataset = X_var
# Two columns of each type, applied in this order: boolean, string, numeric, date
new_features = ['boolean'] * 2 + ['string'] * 2 + ['numeric'] * 2 + ['date'] * 2

X_polluted = pollute_correlated_data_types(dataset, new_features)
X_variable_types_experiments.append(X_polluted)
X_polluted  # shown as the cell output

array([[-3.044852043021044, 0.40122525804329334, 1.8766526337265068, ...,
        -0.2261735208424135, '05-10-2009', '28-04-1993'],
       [1.4457834775607874, 1.3307533080592129, 1.140285198046516, ...,
        0.03813236107058131, '01-01-1970', '16-02-1981'],
       [2.404213410027689, 1.6201611493238681, -0.710272995532069, ...,
        0.14850216212892647, '29-07-1977', '23-08-2012'],
       ...,
       [0.17397334524745334, -1.0958619504628027, 0.14857090944972828,
        ..., 0.29812448190569185, '16-04-1983', '27-02-1988'],
       [-1.4758554074184587, -0.7187457361162684, -2.1085391293481255,
        ..., -0.02646146012168066, '23-05-2009', '11-01-2002'],
       [-1.73033602580001, 0.8614864271967009, 1.9646225268133373, ...,
        0.7305478639642488, '26-12-1988', '30-08-2004']], dtype=object)

#### Experiment \#7
The dataset will be polluted with a **very high** number of new **date** and **numeric** features that are **correlated** with the original features.

In [None]:
# Experiment 7: request 50 correlated numeric features followed by
# 50 correlated date features (100 new feature requests in total).
new_features = ['numeric'] * 50
new_features.extend(['date'] * 50)
dataset = X_var

# Pollute the dataset, record the result for this experiment, then display it.
X_polluted = pollute_correlated_data_types(dataset, new_features)
X_variable_types_experiments.append(X_polluted)
X_polluted

array([[-3.044852043021044, 0.40122525804329334, 1.8766526337265068, ...,
        '25-04-1982', '13-06-1974', '01-01-1970'],
       [1.4457834775607874, 1.3307533080592129, 1.140285198046516, ...,
        '11-10-1984', '20-04-1986', '21-01-1990'],
       [2.404213410027689, 1.6201611493238681, -0.710272995532069, ...,
        '23-12-1986', '01-01-1970', '01-01-1971'],
       ...,
       [0.17397334524745334, -1.0958619504628027, 0.14857090944972828,
        ..., '15-05-1996', '20-01-2008', '01-01-1970'],
       [-1.4758554074184587, -0.7187457361162684, -2.1085391293481255,
        ..., '12-07-1972', '31-08-1975', '10-11-1983'],
       [-1.73033602580001, 0.8614864271967009, 1.9646225268133373, ...,
        '01-01-1970', '01-01-1970', '01-07-1971']], dtype=object)

#### Experiment \#8
The dataset will be polluted with **one** new **date** feature that is **noncorrelated** with the original features.

In [None]:
# Experiment 8: request a single date feature, generated by the
# non-correlated pollution helper.
new_features = ['date']
dataset = X_var

# Pollute the dataset, record the result for this experiment, then display it.
X_polluted = pollute_noncorrelated_data_types(dataset, new_features)
X_variable_types_experiments.append(X_polluted)
X_polluted

array([[-3.044852043021044, 0.40122525804329334, 1.8766526337265068,
        1.5666053203672097, -2.1329499478849936, '2076-12-20'],
       [1.4457834775607874, 1.3307533080592129, 1.140285198046516,
        0.7062526707175233, 2.422164688959354, '2006-07-27'],
       [2.404213410027689, 1.6201611493238681, -0.710272995532069,
        -3.7934871212940933, 1.1839555818904257, '2022-05-16'],
       ...,
       [0.17397334524745334, -1.0958619504628027, 0.14857090944972828,
        -1.312194924035472, 1.0491511265756497, '2089-06-26'],
       [-1.4758554074184587, -0.7187457361162684, -2.1085391293481255,
        -1.9843899732521786, -2.674929990187009, '2000-03-03'],
       [-1.73033602580001, 0.8614864271967009, 1.9646225268133373,
        -0.6125331271154684, -1.5146163278990659, '2034-09-25']],
      dtype=object)

#### Experiment \#9
The dataset will be polluted with a **low** number of new **date** and **string** features that are **noncorrelated** with the original features.

In [None]:
# Experiment 9: request one date feature and one string feature,
# generated by the non-correlated pollution helper.
new_features = ['date', 'string']
dataset = X_var

# Pollute the dataset, record the result for this experiment, then display it.
X_polluted = pollute_noncorrelated_data_types(dataset, new_features)
X_variable_types_experiments.append(X_polluted)
X_polluted

array([[-3.044852043021044, 0.40122525804329334, 1.8766526337265068, ...,
        -2.1329499478849936, '1928-04-01', 'eXlGLjFA30'],
       [1.4457834775607874, 1.3307533080592129, 1.140285198046516, ...,
        2.422164688959354, '1952-01-19', 'mnjZvWdaq3'],
       [2.404213410027689, 1.6201611493238681, -0.710272995532069, ...,
        1.1839555818904257, '1945-05-08', 'xUB3XqeiJT'],
       ...,
       [0.17397334524745334, -1.0958619504628027, 0.14857090944972828,
        ..., 1.0491511265756497, '1999-06-25', '5TPVFj9JRG'],
       [-1.4758554074184587, -0.7187457361162684, -2.1085391293481255,
        ..., -2.674929990187009, '2039-01-18', 'dItOTj55Wb'],
       [-1.73033602580001, 0.8614864271967009, 1.9646225268133373, ...,
        -1.5146163278990659, '1994-05-17', 'f7hrFCH8ju']], dtype=object)

#### Experiment \#10
The dataset will be polluted with a **low** number of new **string** and **numeric** features that are **noncorrelated** with the original features.

In [None]:
# Experiment 10: request two string features and one numeric feature,
# generated by the non-correlated pollution helper.
new_features = ['string', 'string', 'numeric']
dataset = X_var

# Pollute the dataset, record the result for this experiment, then display it.
X_polluted = pollute_noncorrelated_data_types(dataset, new_features)
X_variable_types_experiments.append(X_polluted)
X_polluted

array([[-3.044852043021044, 0.40122525804329334, 1.8766526337265068, ...,
        'YEHfi07GXD', 'nD6FPb5b4l', 0.3954236082807272],
       [1.4457834775607874, 1.3307533080592129, 1.140285198046516, ...,
        'WClvfGbftp', '5ics1C6yuB', -5.925728106178093],
       [2.404213410027689, 1.6201611493238681, -0.710272995532069, ...,
        'uKB6YpY4hY', 'i3JXNb4p4H', 1.562460141243573],
       ...,
       [0.17397334524745334, -1.0958619504628027, 0.14857090944972828,
        ..., 'srB9sOl5yq', 't9fpTXCIYC', 1.9244206652761449],
       [-1.4758554074184587, -0.7187457361162684, -2.1085391293481255,
        ..., 'az2XWKUkN5', 'z6MmNQs4Za', -1.927260346570959],
       [-1.73033602580001, 0.8614864271967009, 1.9646225268133373, ...,
        '5WNaYp5Vua', 'aIhLZRomrM', -1.985222930939731]], dtype=object)