In [2]:
import os
import pandas as pd
import numpy as np

dataset_path = "./dataset"

# os.listdir returns entries in arbitrary order, so sort them: positional
# indices into `dfs` / `file_list` then stay stable between runs.
# Names are kept as `str` (no fsencode round-trip) so they format cleanly when
# used in labels, and sub-directories are skipped because read_csv cannot
# open them.
file_list = sorted(
    entry
    for entry in os.listdir(dataset_path)
    if os.path.isfile(os.path.join(dataset_path, entry))
)

# One DataFrame per file, in the same order as `file_list`.
dfs = []
for filename in file_list:
    dfs.append(pd.read_csv(os.path.join(dataset_path, filename)))

In [3]:
# Report every loaded frame that contains at least one missing value.
for i, frame in enumerate(dfs):
    nan_number = frame.isna().sum().sum()
    if not nan_number > 0:
        continue
    print(f"We have {nan_number} nan values in df {i}.")

We have 1 nan values in df 11.
We have 1 nan values in df 25.


In [4]:
# Drop incomplete rows from every frame rather than from hard-coded positions:
# the positions depend on the directory listing order, and hard-coded indices
# can miss the frames that actually contain NaN values.
# dropna is a no-op for frames without missing values.
for frame in dfs:
    frame.dropna(inplace=True)

In [5]:
# Columns passed to df_transformation as `correlated_cols`, i.e. removed from
# every frame before scaling.
CORR_COLS = [
    "CD", "NII", "NL", "NLE", "TCD",
    "WarningBlocker", "WarningCritical", "WarningMajor", "WarningMinor",
    "Documentation Metric Rules",
    "TNOS", "TLOC",
    "Size Metric Rules",
    "McCC", "TCLOC", "NOS", "LOC", "TLLOC", "DLOC",
]

In [6]:
from sklearn.preprocessing import MinMaxScaler

def df_transformation(df: pd.DataFrame, correlated_cols) -> pd.DataFrame:
    """Drop identifier/correlated columns and min-max scale the rest, in place.

    Args:
        df: Frame to transform. It is modified in place.
        correlated_cols: Column labels to remove in addition to the fixed
            identifier/location columns listed below.

    Returns:
        The same ``df`` object, now holding only the remaining columns with
        their values replaced by the ``MinMaxScaler`` output.

    Raises:
        KeyError: If any column to drop is missing (e.g. when the frame has
            already been transformed once).
    """
    identifier_cols = ["Name", "LongName", "Parent", "Component", "Path",
                       "Line", "Column", "EndLine", "EndColumn", "ID"]
    df.drop(identifier_cols, axis=1, inplace=True)
    df.drop(correlated_cols, axis=1, inplace=True)

    scaler = MinMaxScaler()
    # fit_transform returns a new array and leaves its input untouched, so the
    # result has to be written back; otherwise the frame stays unscaled.
    df[df.columns] = scaler.fit_transform(df)
    return df

# df_transformation mutates its argument and hands the same object back, so
# the frames stored in `dfs` are updated without rebinding anything.
for df in dfs:
    df_transformation(df, CORR_COLS)

In [7]:
# Show the column labels of the first frame as they stand at this point.
dfs[0].columns

       'Complexity Metric Rules', 'Coupling Metric Rules'],
      dtype='object')

In [8]:
def split_df(df: pd.DataFrame, max_rows: int = 6000):
    """Recursively halve a frame until every piece has at most ``max_rows`` rows.

    Args:
        df: Frame to split. Rows keep their original order.
        max_rows: Largest number of rows a returned piece may have.
            Defaults to 6000.

    Returns:
        A list of frames in row order. If ``df`` already fits, the list
        contains ``df`` itself; otherwise it contains ``iloc`` slices of it.

    Raises:
        ValueError: If ``max_rows`` is smaller than 1 (the recursion could
            never terminate for a non-empty frame).
    """
    if max_rows < 1:
        raise ValueError("max_rows must be at least 1")

    if len(df) <= max_rows:
        return [df]

    midpoint = len(df) // 2
    first_half = split_df(df.iloc[:midpoint, :], max_rows)
    second_half = split_df(df.iloc[midpoint:, :], max_rows)
    return first_half + second_half

# Split oversized frames and give every resulting piece a readable label.
short_dfs = []
df_names = []
for i in range(len(dfs)):
    new_dfs = split_df(dfs[i])
    # file_list entries may be bytes (os.listdir on a bytes path); decode them
    # so labels read "name_1" instead of "b'name'_1". fsdecode leaves str
    # values unchanged.
    base_name = os.fsdecode(file_list[i])
    if len(new_dfs) <= 1:
        new_names = [base_name]
    else:
        new_names = [f"{base_name}_{j+1}" for j in range(len(new_dfs))]
    short_dfs.extend(new_dfs)
    df_names.extend(new_names)

In [9]:
import numpy as np

def initialize_population(num_food_sources, num_features, feature_count, rng=None):
    """Create random binary solutions with a fixed number of active features.

    Args:
        num_food_sources: Number of rows (solutions) to generate.
        num_features: Length of each binary vector.
        feature_count: Number of entries set to 1 in every row.
        rng: Optional ``numpy.random.Generator``. Pass a seeded generator to
            make the population reproducible; a fresh unseeded generator is
            created when omitted.

    Returns:
        Integer array of shape ``(num_food_sources, num_features)`` in which
        each row contains exactly ``feature_count`` ones.

    Raises:
        ValueError: Raised by ``rng.choice`` when ``feature_count`` exceeds
            ``num_features``.
    """
    if rng is None:
        rng = np.random.default_rng()

    population = np.zeros((num_food_sources, num_features), dtype=int)
    for row in population:
        # Each row is a view, so writing to it fills `population` directly.
        active_indices = rng.choice(num_features, size=feature_count, replace=False)
        row[active_indices] = 1
    return population

# Distance between two binary solutions; the mutation step builds on it.
def jaccard_dissimilarity(X1, X2):
    """Return one minus the Jaccard similarity of two binary vectors.

    Positions are compared by truthiness. When neither vector has an active
    position (empty union) the result is 1.
    """
    union = np.logical_or(X1, X2).sum()
    if not union > 0:
        return 1
    intersection = np.logical_and(X1, X2).sum()
    return 1 - intersection / union

In [10]:
def _closest_overlap_counts(ones_count, zeros_count, target_dissimilarity):
    """Search for the overlap counts whose Jaccard dissimilarity is nearest the target.

    ``shared`` is the number of active positions kept from the base vector,
    ``gained`` the number of its inactive positions switched on, and ``lost``
    the active positions that are not kept. Every combination is tried and the
    first one with the smallest absolute gap to ``target_dissimilarity`` wins.

    Returns:
        Tuple ``(shared, gained)``.
    """
    chosen_shared, chosen_gained = 0, 0
    smallest_gap = float('inf')

    for shared in range(ones_count + 1):
        lost = ones_count - shared
        for gained in range(zeros_count + 1):
            total = shared + gained + lost
            similarity = shared / total if total != 0 else 1
            gap = abs((1 - similarity) - target_dissimilarity)
            if gap < smallest_gap:
                chosen_shared, chosen_gained = shared, gained
                smallest_gap = gap

    return chosen_shared, chosen_gained


def differential_mutation(food_sources, feature_count, phi, rng):
    """Build one mutant binary vector per food source.

    For every source, three other sources are drawn without replacement. The
    Jaccard dissimilarity of the second and third, scaled by ``phi``, is the
    target distance between the mutant and the first (base) source. The mutant
    is then assembled from randomly chosen positions of the base and adjusted
    so that it has exactly ``feature_count`` active entries.

    Args:
        food_sources: 2-D array of binary solutions, one per row. At least
            four rows are needed to draw three distinct neighbours.
        feature_count: Required number of ones in every mutant.
        phi: Scale factor applied to the neighbour dissimilarity.
        rng: ``numpy.random.Generator`` used for all random draws.

    Returns:
        Array with the same shape and dtype as ``food_sources`` holding the
        mutants.
    """
    population_size = len(food_sources)
    mutant_sources = np.zeros_like(food_sources)

    for target_idx in range(population_size):
        # Draw three distinct neighbours, none of them the current source.
        candidates = [k for k in range(population_size) if k != target_idx]
        base_idx, first_idx, second_idx = rng.choice(candidates, size=3, replace=False)
        base = food_sources[base_idx]

        # Scaled Jaccard dissimilarity between the second and third neighbour.
        scaled_target = phi * jaccard_dissimilarity(food_sources[first_idx], food_sources[second_idx])

        ones_in_base = np.sum(base)
        zeros_in_base = len(base) - ones_in_base
        keep_count, gain_count = _closest_overlap_counts(ones_in_base, zeros_in_base, scaled_target)

        # Assemble the mutant from positions of the base vector.
        mutant = np.zeros(len(food_sources[target_idx]), dtype=int)

        on_positions = np.where(base == 1)[0]
        if keep_count <= len(on_positions):
            mutant[rng.choice(on_positions, size=keep_count, replace=False)] = 1

        off_positions = np.where(base == 0)[0]
        if gain_count <= len(off_positions):
            mutant[rng.choice(off_positions, size=gain_count, replace=False)] = 1

        # Randomly add or remove ones until exactly feature_count are active.
        active_total = np.sum(mutant)
        if active_total < feature_count:
            free_positions = np.where(mutant == 0)[0]
            extra = rng.choice(free_positions, size=feature_count - active_total, replace=False)
            mutant[extra] = 1
        elif active_total > feature_count:
            used_positions = np.where(mutant == 1)[0]
            surplus = rng.choice(used_positions, size=active_total - feature_count, replace=False)
            mutant[surplus] = 0

        mutant_sources[target_idx] = mutant

    return mutant_sources

In [11]:
def crossover(parent, mutant, crossover_rate, rng, num_features):
    """Mix a parent and a mutant position by position, then repair the count.

    Each position is taken from ``mutant`` when a uniform draw falls below
    ``crossover_rate`` and from ``parent`` otherwise. Random positions are then
    switched on or off until the offspring has exactly ``num_features`` ones.

    Args:
        parent: 1-D binary array.
        mutant: 1-D binary array of the same length.
        crossover_rate: Probability of taking a position from ``mutant``.
        rng: ``numpy.random.Generator`` used for all random draws.
        num_features: Number of ones the offspring must contain.

    Returns:
        A new binary array; ``parent`` and ``mutant`` are not modified.
    """
    take_from_mutant = rng.random(len(parent)) < crossover_rate
    offspring = np.where(take_from_mutant, mutant, parent)

    surplus = np.sum(offspring) - num_features
    if surplus < 0:
        # Too few active positions: switch on randomly chosen zeros.
        zero_positions = np.flatnonzero(offspring == 0)
        offspring[rng.choice(zero_positions, size=-surplus, replace=False)] = 1
    elif surplus > 0:
        # Too many active positions: switch off randomly chosen ones.
        one_positions = np.flatnonzero(offspring == 1)
        offspring[rng.choice(one_positions, size=surplus, replace=False)] = 0

    return offspring

In [12]:
from tqdm import tqdm
import cupy as cp

def optimized_sammon_error(high_distances, low_dim_data):
    """Compare precomputed pairwise distances with those of a reduced dataset.

    Args:
        high_distances: Square matrix of pairwise distances computed on the
            full feature set.
        low_dim_data: 2-D array (rows are samples) restricted to the selected
            features; its pairwise Euclidean distances are computed here.

    Returns:
        Sum of squared distance differences, each divided by the matching
        high-dimensional distance (plus 1e-9 to avoid dividing by zero),
        normalised by the sum of all high-dimensional distances.
    """
    pairwise_diff = low_dim_data[:, None] - low_dim_data
    low_distances = cp.linalg.norm(pairwise_diff, axis=2)
    weighted_squares = ((high_distances - low_distances) ** 2) / (high_distances + 1e-9)
    return cp.sum(weighted_squares) / cp.sum(high_distances)

def reduce_features(data, binary_vector):
    """Keep only the columns of ``data`` whose entry in ``binary_vector`` is 1."""
    column_mask = binary_vector == 1
    return data[:, column_mask]

def mdisabc(num_food_sources, crossover_rate, phi, MAX_LIMIT, max_iterations, feature_count, num_features, dataset):
    """Search for the feature subset with the lowest error from ``optimized_sammon_error``.

    Each iteration builds a mutant per food source, crosses it with the
    source, and keeps the candidate when its error is strictly lower. Sources
    that fail to improve ``MAX_LIMIT`` times in a row are replaced by a random
    vector.

    Args:
        num_food_sources: Population size.
        crossover_rate: Per-position probability of taking the mutant's value.
        phi: Scale factor forwarded to ``differential_mutation``.
        MAX_LIMIT: Consecutive non-improving trials after which a source is
            re-initialised.
        max_iterations: Number of iterations to run.
        feature_count: Number of active features in every solution.
        num_features: Total number of features (columns of ``dataset``).
        dataset: 2-D array of samples by features. It is indexed as an array
            and passed to ``cp.linalg.norm``, so it must be array-like rather
            than a DataFrame.

    Returns:
        ``(best_solution, error_history)``: an independent copy of the best
        binary vector found (``None`` if nothing was evaluated) and a list
        with the best error, as a Python float, after each iteration.
    """
    rng = np.random.default_rng()
    food_sources = initialize_population(num_food_sources, num_features, feature_count)
    limits = np.zeros(num_food_sources)
    best_solution = None
    best_error = float('inf')
    error_history = []
    dataset_norms = cp.linalg.norm(dataset[:, None] - dataset, axis=2)
    sammon_errors = {}

    def evaluate(solution):
        # Memoise by the solution's bit pattern; the same vector recurs often.
        key = tuple(solution)
        if key not in sammon_errors:
            subset_data = reduce_features(dataset, solution)
            # Stored as a plain float so comparisons and logging do not
            # depend on the array type returned by the error function.
            sammon_errors[key] = float(optimized_sammon_error(dataset_norms, subset_data))
        return sammon_errors[key]

    for _ in tqdm(range(max_iterations)):
        mutants = differential_mutation(food_sources, feature_count, phi, rng)
        for i in range(num_food_sources):
            current_error = evaluate(food_sources[i])

            candidate_solution = crossover(food_sources[i], mutants[i], crossover_rate, rng, feature_count)
            neighbor_error = evaluate(candidate_solution)

            if neighbor_error < current_error:
                food_sources[i] = candidate_solution
                # Track the error of the vector now stored in food_sources[i]
                # so the best-so-far update below pairs the right values.
                current_error = neighbor_error
                limits[i] = 0
            else:
                limits[i] += 1

            if current_error < best_error:
                # Copy: food_sources[i] is a view into the population and is
                # overwritten by later updates and by the reset step below.
                best_solution = food_sources[i].copy()
                best_error = current_error

        # Replace sources that stopped improving with fresh random vectors.
        for i in range(num_food_sources):
            if limits[i] >= MAX_LIMIT:
                pos_indices = rng.choice(num_features, size=feature_count, replace=False)
                fresh_source = np.zeros(num_features, dtype=int)
                fresh_source[pos_indices] = 1
                food_sources[i] = fresh_source
                limits[i] = 0

        error_history.append(best_error)

    return best_solution, error_history

In [13]:
# Make sure the log file exists; append mode creates it without truncating.
with open("abc.log", "a"):
    pass

In [14]:
import gc

# Run the search on every (possibly split) dataset for each target subset
# size and append the outcome to the log. Line buffering flushes each line as
# it is written, so progress survives an interrupted run.
with open("abc.log", "a", buffering=1) as f:
    for i in range(0, len(short_dfs)):
        f.write(f"Working on {df_names[i]}...\n\n")

        columns = short_dfs[i].columns
        # Convert the frame once per dataset instead of once per feature count.
        gpu_data = cp.array(short_dfs[i].values)

        for feature_count in range(2, len(columns) - 1):
            best_solution, error_history = mdisabc(
                num_food_sources=30,
                crossover_rate=0.25,
                phi=0.9,
                MAX_LIMIT=50,
                max_iterations=50,
                feature_count=feature_count,
                num_features=len(columns),
                dataset=gpu_data,
            )
            selected_subset = [columns[j] for j in np.where(best_solution == 1)[0]]
            # Adjacent literals instead of a backslash continuation, which
            # wrote the next source line's indentation into the log.
            f.write(
                f"Reduction to {feature_count} features.\n"
                f"Best error: {error_history[-1]}.\n"
                f"Error history: {error_history}.\n"
                f"Best solution: {best_solution}.\n"
                f"Selected subset: {selected_subset}.\n\n"
            )
            # No smaller error is possible once it reaches zero.
            if error_history[-1] == 0:
                break

        # Release the reference before collecting so the array can be freed
        # ahead of the next dataset's upload.
        del gpu_data
        gc.collect()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 16.73it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:03<00:00, 14.24it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:04<00:00, 10.81it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:03<00:00, 13.12it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 23.81it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 183.41it/s]
100%|█████████████████████████████████████████████████████████████████████████████