In [None]:
import os
import sys

sys.path.append("..")

import ase.atoms
import ase.io
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.ensemble
import sklearn.metrics
import sklearn.model_selection
from tqdm import tqdm

import config.paths as PATHS
import src.features.features_extractors as features_extractors
from src.features import input_parsers

# Apply the "seaborn-v0_8" matplotlib style sheet to all figures drawn in this notebook.
plt.style.use("seaborn-v0_8")

### Constants

In [None]:
# Seed constant for the experiments.
# NOTE(review): `seed` is not referenced by any cell visible in this section; the
# train/test split uses random_state=42 and the genetic-algorithm cell seeds NumPy
# with RANDOM_SEED = 42 — confirm which value is meant to drive the randomness.
seed = 0xCAFFE

### Single particle loading for experiments: 

In [None]:
# Read the structure stored at PATHS.SINGLE_PARTICLE_FILE with ASE; the bare name on
# the last line displays the loaded object as the cell output.
particle = ase.io.read(PATHS.SINGLE_PARTICLE_FILE)
particle

In [None]:
# %conda install -c conda-forge nglview

# Feature definition

In [None]:
# Atom index groups for the molecule.
# NOTE(review): presumably 0-based atom positions in the loaded structure — confirm.
# None of these five names is referenced by the other cells visible in this section;
# the arrays b1 / b2 defined further down repeat the two benzene lists.
sulfur_idxs = [10, 31]

# Atom indices of the two benzene rings (six entries each).
benzene1_idxs = [11, 14, 15, 16, 17, 20]
benzene2_idxs = [21, 24, 25, 26, 27, 30]

# Three atoms taken from each ring, presumably to define the ring plane — confirm.
benzene1_plane_idxs = [14, 15, 16]
benzene2_plane_idxs = [25, 26, 27]

# Apply to all particles:

In [None]:
# Load the dataset through the project parser, passing the particles file, the
# transport file and the features cache path.
df = input_parsers.read_raw_data(
    PATHS.PARTICLES_FILE, PATHS.TRANSPORT_FILE, PATHS.FEATURES_CACHE
)
# The target is modelled in log space; the evaluation code undoes this with np.exp.
df["y"] = np.log(df["y"])  # TODO: log applied to y here - refactor it
df.head()

In [None]:
# Atom groups used to build pairwise features. The au / s / h lists are written with
# 1-based atom numbers and shifted to 0-based indices by the trailing "- 1".
au = np.array([1, 6, 5, 10, 33, 38, 39, 42]) - 1
# Sulfur atoms: 1-based 11 and 32, i.e. 0-based 10 and 31, matching sulfur_idxs.
# The previous value ([31, 32] - 1 == [30, 31]) pointed at index 30, which is already
# the last atom of b2, so it dropped sulfur 10 and produced the self-pair (30, 30).
s = np.array([11, 32]) - 1
h = np.array([13, 14, 19, 20, 23, 24, 29, 30]) - 1
# The benzene lists are already 0-based, so intentionally no "- 1" here.
b1 = np.array([11, 14, 15, 16, 17, 20])
b2 = np.array([21, 24, 25, 26, 27, 30])

In [None]:
def get_all_triples(items_to_combine):
    """Return every unordered triple of elements from ``items_to_combine``.

    Triples are enumerated by increasing position (i < j < k), so the output order
    follows the input order and each combination of positions appears exactly once.

    Args:
        items_to_combine: Sequence (list or array) of items to combine.

    Returns:
        A list of NumPy arrays, each holding the three selected items.
    """
    count = len(items_to_combine)
    pool = np.array(items_to_combine)
    position_triples = [
        [first, second, third]
        for first in range(count)
        for second in range(first + 1, count)
        for third in range(second + 1, count)
    ]
    return [pool[positions] for positions in position_triples]


def get_all_pairs(items_to_combine):
    """Return every unordered pair of elements from ``items_to_combine``.

    Pairs are enumerated by increasing position (i < j), so the output order follows
    the input order and each combination of positions appears exactly once.

    Args:
        items_to_combine: Sequence (list or array) of items to combine.

    Returns:
        A list of NumPy arrays, each holding the two selected items.
    """
    count = len(items_to_combine)
    pool = np.array(items_to_combine)
    position_pairs = [
        [first, second]
        for first in range(count)
        for second in range(first + 1, count)
    ]
    return [pool[positions] for positions in position_pairs]


# Merge all atom groups into one index array. pd.unique drops repeated indices while
# keeping the order of first appearance, so an atom listed in two groups can neither
# be paired with itself nor produce the same atom pair twice.
important_atoms = pd.unique(np.concatenate([au, s, h, b1, b2]))
all_dst = get_all_pairs(important_atoms)
# Shape of the pair list: (number of pairs, 2).
np.array(all_dst).shape

In [None]:
# Add one distance feature to df for every atom pair (tqdm shows a progress bar).
# NOTE(review): add_dst_feature presumably writes a new column into df in place — confirm.
for idx1, idx2 in tqdm(all_dst):
    features_extractors.add_dst_feature(df, idx1, idx2)

In [None]:
# Add one angle feature per atom pair, always passing atom 22 as the middle argument.
# Pairs that already contain atom 22 are skipped.
# NOTE(review): the middle argument is presumably the angle vertex — confirm against
# features_extractors.add_ang_feature.
for idx1, idx2 in tqdm(all_dst):
    if idx1 == 22 or idx2 == 22:
        continue
    features_extractors.add_ang_feature(df, idx1, 22, idx2)

In [None]:
# Feature matrix: every column of df from position 2 onwards.
# NOTE(review): assumes the first two columns are not features (e.g. the target "y")
# — confirm against the frame returned by read_raw_data.
X = df.iloc[:, 2:]
# Target vector; it was log-transformed when the data was loaded.
y = df["y"]

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Drop every feature whose variance is not above 0.8 * (1 - 0.8) = 0.16.
var_sel = VarianceThreshold(threshold=(0.8 * (1 - 0.8)))
# Fit on the feature columns of df rather than on X itself. The result overwrites X
# with a plain ndarray, so fitting on X would make a re-run of this cell refit on the
# already-filtered array and lose the column names that get_feature_names_out() reports.
X = var_sel.fit_transform(df.iloc[:, 2:])

In [None]:
# Number of features seen by the variance filter vs. number of features it kept.
var_sel.n_features_in_, var_sel.get_feature_names_out().shape[0]

In [None]:
import numpy as np
from deap import algorithms, base, creator, tools
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Split the dataset into training and testing sets
# (20% of the rows are held out; random_state fixes the shuffle).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Test targets mapped back from log space with np.exp, used when computing the MAPE.
y_test_eval = np.exp(y_test)


# Define the evaluation function for the genetic algorithm
def evaluate(individual):
    """Score a feature subset encoded as a boolean chromosome.

    A decision tree regressor is trained on the selected columns of ``X_train`` and
    scored on the same columns of ``X_test``. Predictions are mapped back from log
    space with ``np.exp`` before the error is computed.

    NOTE(review): the fitness is measured on the test split, which is also used to
    report the final score, so that score is optimistically biased — consider a
    separate validation split.

    Args:
        individual: Sequence of truthy/falsy genes, one per feature column.

    Returns:
        Tuple ``(mape, features_ratio)``: the mean absolute percentage error on the
        test split and the fraction of features selected. An individual that selects
        no feature gets ``(inf, 0.0)``.
    """
    # Convert the individual's chromosome to a boolean mask
    mask = np.asarray(individual, dtype=bool)
    # A tree cannot be fitted on zero columns; give an empty selection the worst
    # possible error instead of letting the fit raise and abort the whole GA run.
    if not mask.any():
        return float("inf"), 0.0
    # Apply the mask to the training data to select the features
    X_train_masked = X_train[:, mask]
    # Train a decision tree regressor on the masked data
    regressor = DecisionTreeRegressor(random_state=42)
    regressor.fit(X_train_masked, y_train)
    # Apply the mask to the testing data to select the features
    X_test_masked = X_test[:, mask]
    # Calculate the mean absolute percentage error on the masked testing data,
    # in the original (non-log) scale of the target
    y_pred = np.exp(regressor.predict(X_test_masked))
    mape = sklearn.metrics.mean_absolute_percentage_error(y_test_eval, y_pred)
    features_ratio = np.mean(mask)
    return mape, features_ratio


# Define the genetic algorithm's parameters
POPULATION_SIZE = 50  # number of individuals created for the initial population
P_CROSSOVER = 0.5  # passed to eaSimple as cxpb
P_MUTATION = 0.9  # passed to eaSimple as mutpb
MAX_GENERATIONS = 10  # passed to eaSimple as ngen
HALL_OF_FAME_SIZE = 5  # capacity of the HallOfFame handed to eaSimple
RANDOM_SEED = 42  # seed given to np.random.seed before the population is created
FEATURE_PROB = 0.05  # probability that a gene of a new individual starts as True
TOURNAMENT_FRAC = 0.1  # tournament size as a fraction of POPULATION_SIZE


def mutate_swap(individual, indpb, swaps=5):
    """Mutate in place by swapping selected genes with unselected ones.

    Up to ``swaps`` attempts are made. Each attempt fires with probability ``indpb``
    and exchanges one randomly chosen truthy gene with one randomly chosen falsy gene,
    so the number of selected genes never changes.

    Args:
        individual: Mutable sequence of truthy/falsy genes; modified in place.
        indpb: Probability in [0, 1] that a single swap attempt is carried out.
        swaps: Number of swap attempts.

    Returns:
        A one-element tuple containing the (mutated) individual, as DEAP expects
        from mutation operators.
    """
    included = [idx for idx, is_included in enumerate(individual) if is_included]
    not_included = [idx for idx, is_included in enumerate(individual) if not is_included]

    # Nothing to swap when all genes are on or all are off (or the individual is empty).
    if not included or not not_included:
        return (individual,)

    for _ in range(swaps):
        # Swap with probability indpb (the previous ">" test used 1 - indpb).
        if np.random.random() < indpb:
            pos_in = np.random.randint(len(included))
            pos_out = np.random.randint(len(not_included))
            i, j = included[pos_in], not_included[pos_out]

            individual[i], individual[j] = individual[j], individual[i]
            # Keep both index lists in sync with the individual; otherwise a later
            # attempt could pick a stale index and do nothing or undo this swap.
            included[pos_in], not_included[pos_out] = j, i
    return (individual,)


def multi_mutate(individual, indpb):
    """Mutate ``individual`` with DEAP's bit-flip operator at one eighth of ``indpb``.

    Args:
        individual: The chromosome to mutate; handed to ``tools.mutFlipBit``.
        indpb: Base probability; each gene is flipped with probability ``indpb / 8``.

    Returns:
        A one-element tuple containing the mutated individual.
    """
    # TODO: write a custom operator with separate probabilities for True and False genes.
    # Alternative operators tried before and currently disabled:
    # tools.mutShuffleIndexes(individual, indpb) and mutate_swap(individual, indpb).
    flip_probability = indpb / 8
    mutated = tools.mutFlipBit(individual, flip_probability)[0]
    return (mutated,)


import random

# Define the genetic algorithm's toolbox
# Two minimised objectives (negative weights): the MAPE and, with a much smaller
# weight, the fraction of selected features.
creator.create("FitnessMin", base.Fitness, weights=(-1.0, -0.05))
creator.create("Individual", list, fitness=creator.FitnessMin)
toolbox = base.Toolbox()
# One gene: True with probability FEATURE_PROB, False otherwise.
toolbox.register(
    "attr_bool", np.random.choice, [True, False], p=[FEATURE_PROB, 1 - FEATURE_PROB]
)
# One individual: one gene per column of the filtered feature matrix.
toolbox.register(
    "individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=X.shape[1]
)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", multi_mutate, indpb=0.5)
toolbox.register(
    "select", tools.selTournament, tournsize=int(POPULATION_SIZE * TOURNAMENT_FRAC)
)

# Set the random seed for reproducibility
# The initial genes come from NumPy (attr_bool), but DEAP's built-in operators
# (crossover, bit-flip mutation, tournament selection, eaSimple's variation step)
# draw from Python's `random` module, so both generators have to be seeded.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Create the initial population of individuals
population = toolbox.population(n=POPULATION_SIZE)

# Define the statistics to collect during the genetic algorithm
# Each fitness is a (mape, feature ratio) pair; column 0 is the MAPE, column 1 the ratio.
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("min mape", lambda x: np.min(x, axis=0)[0])
stats.register("avg mape", lambda x: np.mean(x, axis=0)[0])
stats.register("min feature ratio", lambda x: np.min(x, axis=0)[1])
stats.register("avg feature ratio", lambda x: np.mean(x, axis=0)[1])

In [None]:
# Run the genetic algorithm
# Keep a named reference to the hall of fame: eaSimple updates it during the run, and
# an object created inline in the call cannot be reached once the call returns.
hall_of_fame = tools.HallOfFame(HALL_OF_FAME_SIZE)
# NOTE: `population` is overwritten with the evolved population, so re-running this
# cell continues from the previous result instead of starting from the initial one.
population, logbook = algorithms.eaSimple(
    population,
    toolbox,
    cxpb=P_CROSSOVER,
    mutpb=P_MUTATION,
    ngen=MAX_GENERATIONS,
    stats=stats,
    halloffame=hall_of_fame,
    verbose=True,
)

In [None]:
# Print the best individual found by the genetic algorithm
# NOTE(review): selBest only looks at the final population; the HallOfFame handed to
# eaSimple in the run cell presumably tracks the best individuals over all
# generations and may hold a better one — confirm.
best_individual = tools.selBest(population, k=1)[0]
best_mask = np.asarray(best_individual, dtype=bool)
# Map the boolean mask back to the feature names kept by the variance filter.
best_features = var_sel.get_feature_names_out()[best_mask]
# print("Best individual found: ", best_individual)
# evaluate() retrains the tree on the selected features and scores it on the test split.
print("Best mape: ", evaluate(best_individual)[0])
print("Best features: ", best_features)
# Number of selected features, shown as the cell output.
len(best_features)

In [None]:
import re


def feature_id_to_positions(indexes: list):
    """Extract the two atom indices encoded in each pairwise feature name.

    A name is expected to have the layout ``<letters><digits><letters><digits>``,
    for example ``"dstAu0C17"`` gives ``(0, 17)``.

    Args:
        indexes: Iterable of feature-name strings.

    Returns:
        A list of ``(int, int)`` tuples, one per name, in input order.

    Raises:
        ValueError: If a name does not follow the expected layout.
    """
    # \d+ rather than \d{1,2}, so atom indices of 100 and above are parsed as well.
    pattern = re.compile(r"^[a-zA-Z]+(\d+)[a-zA-Z]+(\d+)$")
    positions = []
    for index in indexes:
        match = pattern.search(index)
        if match is None:
            # Fail with the offending name instead of an AttributeError on None.
            raise ValueError(
                f"Feature name {index!r} does not match '<letters><digits><letters><digits>'"
            )
        pos1, pos2 = match.groups()
        positions.append((int(pos1), int(pos2)))
    return positions


# Hard-coded list of distance feature names to decode into atom index pairs.
# NOTE(review): presumably copied from the output of an earlier feature-selection run
# — confirm the source. "dstC30C30" pairs atom 30 with itself.
dst_features = [
    "dstAu0C17",
    "dstAu0C25",
    "dstAu5S31",
    "dstAu5H18",
    "dstAu4Au9",
    "dstAu4Au38",
    "dstAu4C30",
    "dstAu4H13",
    "dstAu9Au37",
    "dstAu9H12",
    "dstAu9C15",
    "dstAu9C24",
    "dstAu32Au41",
    "dstAu32C26",
    "dstAu41H18",
    "dstAu41H23",
    "dstAu41C25",
    "dstC30C30",
    "dstS31H18",
    "dstS31C14",
    "dstH12H22",
    "dstH13C17",
    "dstH18H22",
    "dstH19C17",
    "dstC11C14",
    "dstC11C17",
    "dstC11C21",
    "dstC21C26",
]

# Print the (first index, second index) tuple parsed from each name.
print(feature_id_to_positions(dst_features))