In [1]:
import sys
import os
import re
sys.path.insert(0, os.path.abspath(".."))
from utils import *
import pandas as pd
from basic_model_config import *
import copy
# --- Experiment setup: dataset, model variant and run options ---
dataset_name = 'pc4'
data_filepath = '../../data/'  # must keep the trailing slash: paths below are built by concatenation
# Work on a deep copy of the imported configuration template.
model_config = copy.deepcopy(basic_model_config_slim_plussig2)
# Alternative variant: copy.deepcopy(basic_model_config_slim_plussig1)
model_config['config']['log_path'] = 'interpretations/'  # where the run logs are written
model_config['config']['seed'] = 2  # fixed seed for this run
model_config['config']['verbose'] = True  # the verbose reporter table is printed during training
model_config = fill_config(
    model_config,
    scaling=True,
    oversampling=False,
    fitness_function='sigmoid_rmse',
    minimization=True,
    inflation_rate=0.1,
    ms_upper=10,
)

# --- Read the prepared dataset and its metadata row ---
data = pd.read_csv(f"{data_filepath}data_prepared/{dataset_name}.csv")
data_info = load_and_adapt_data_info(f"{data_filepath}data_info.csv")
# Select the metadata row(s) for this dataset once and reuse the selection for every field.
dataset_rows = data_info.loc[data_info['name'] == dataset_name]
train_indices = dataset_rows['train_indices'].values[0][0]
test_indices = dataset_rows['test_indices'].values[0][0]
categoricals = dataset_rows['categoricals'].values[0]

# --- Build the train/test tensors ---
X_train, y_train, X_test, y_test = return_train_test(
    df=data, train_indices=train_indices, test_indices=test_indices,
    scaling=model_config['scaling'], oversampling=model_config['oversampling'],
    categoricals=categoricals,
)

# Project helper (definition not visible here); presumably derives sample weights
# from the label tensors -- TODO confirm.
update_sample_weights(y_train, y_test)

In [2]:
# Sanity check: shapes of the train/test feature and label tensors.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

(torch.Size([940, 37]),
 torch.Size([404, 37]),
 torch.Size([940]),
 torch.Size([404]))

In [3]:
# Display the prepared dataset as loaded from CSV (pandas truncates the view).
data

Unnamed: 0,LOC_BLANK,BRANCH_COUNT,CALL_PAIRS,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CONDITION_COUNT,CYCLOMATIC_COMPLEXITY,CYCLOMATIC_DENSITY,DECISION_COUNT,DECISION_DENSITY,...,NODE_COUNT,NORMALIZED_CYLOMATIC_COMPLEXITY,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,NUMBER_OF_LINES,PERCENT_COMMENTS,LOC_TOTAL,target
0,17.0,11.0,5.0,2.0,8.0,20.0,6.0,0.25,10.0,2.0,...,25.0,0.11,53.0,49.0,23.0,12.0,57.0,31.25,24.0,0
1,2.0,9.0,3.0,0.0,1.0,16.0,5.0,0.56,6.0,2.0,...,14.0,0.36,13.0,24.0,7.0,14.0,14.0,10.00,9.0,0
2,2.0,5.0,1.0,1.0,1.0,6.0,3.0,0.17,2.0,3.0,...,7.0,0.13,16.0,28.0,9.0,14.0,23.0,10.53,18.0,0
3,4.0,5.0,1.0,0.0,0.0,8.0,3.0,0.30,4.0,2.0,...,10.0,0.19,13.0,16.0,10.0,9.0,16.0,0.00,10.0,0
4,7.0,5.0,1.0,3.0,0.0,0.0,3.0,0.15,0.0,0.0,...,10.0,0.11,26.0,46.0,7.0,7.0,28.0,15.00,20.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1339,2.0,1.0,1.0,0.0,0.0,0.0,1.0,0.14,0.0,0.0,...,3.0,0.09,8.0,15.0,5.0,8.0,11.0,0.00,7.0,0
1340,16.0,22.0,4.0,18.0,7.0,22.0,15.0,0.25,10.0,2.0,...,32.0,0.18,125.0,275.0,25.0,29.0,85.0,37.31,60.0,0
1341,12.0,3.0,8.0,0.0,6.0,4.0,2.0,0.11,2.0,2.0,...,14.0,0.05,28.0,40.0,19.0,13.0,39.0,24.00,19.0,0
1342,8.0,9.0,0.0,0.0,10.0,0.0,5.0,0.42,0.0,0.0,...,10.0,0.16,20.0,38.0,9.0,20.0,32.0,45.45,12.0,0


In [4]:
# Train on the split prepared above. The model name and every remaining option
# come from model_config; train_model is a project helper not shown in this notebook.
trained_model = train_model(
    dataset_name=dataset_name,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model=model_config['name'],
    **model_config['config']
)

Verbose Reporter
-----------------------------------------------------------------------------------------------------------------------------------------
|         Dataset         |  Generation  |     Train Fitness     |       Test Fitness       |        Timing          |      Nodes       |
-----------------------------------------------------------------------------------------------------------------------------------------
|     pc4                 |       0      |   0.34936395287513733 |   0.3453179597854614     |   0.06458044052124023  |      5           |
|     pc4                 |       1      |   0.34936395287513733 |   0.3453179597854614     |   0.018021106719970703 |      5           |
|     pc4                 |       2      |   0.34936395287513733 |   0.3453179597854614     |   0.0                  |      5           |
|     pc4                 |       3      |   0.34936395287513733 |   0.3453179597854614     |   0.015044212341308594 |      5           |
|     pc4        

In [5]:
# String form of the trained individual; features appear as positional
# placeholders such as 'x17' (see the output below).
rep = trained_model.get_tree_representation()
rep

"('add', ('subtract', 'constant__0.8', 'constant_2.2'), 'x17') + f(('divide', 'x19', 'constant_8.3'))"

In [6]:
# Column name -> positional placeholder ('x0', 'x1', ...), following the column
# order of `data`, plus a small table and the inverse lookup.
mapping = {col: f'x{i}' for i, col in enumerate(data.columns)}
mapping_table = pd.DataFrame(list(mapping.items()), columns=['Original', 'Mapped'])
reverse_mapping = {v: k for k, v in mapping.items()}

# Replace every placeholder in a single pass. A callable replacement inserts the
# column name literally (no backslash / group-reference interpretation), and text
# that has just been inserted is never scanned again by a later substitution.
# Placeholders without a matching column are left unchanged.
rep = re.sub(
    r'\bx\d+\b',
    lambda match: reverse_mapping.get(match.group(0), match.group(0)),
    rep,
)

rep

"('add', ('subtract', 'constant__0.8', 'constant_2.2'), 'HALSTEAD_CONTENT') + f(('divide', 'HALSTEAD_EFFORT', 'constant_8.3'))"

In [7]:
def expr_to_latex(expr):
    r"""Render a nested-tuple expression as a LaTeX string.

    Parameters
    ----------
    expr : tuple, str or any other value
        A tuple is read as ``(operator, left, right)`` where the operator is one
        of ``'add'``, ``'subtract'``, ``'multiply'`` or ``'divide'``. A tuple
        whose first element is callable is read as ``(function, inner)``.
        A string is treated as a single leaf name; it is NOT parsed, so passing
        the text of a tuple yields that text wrapped in ``\text{...}``.

    Returns
    -------
    str
        The LaTeX source. A tuple headed by one of the four known operators is
        returned bare; every other input is wrapped in ``f\left(...\right)``.
    """
    def to_latex(node):
        """Convert one node (tuple, leaf string or other value) recursively."""
        if isinstance(node, tuple):
            op = node[0]
            if op == 'add':
                return f"{to_latex(node[1])} + {to_latex(node[2])}"
            elif op == 'subtract':
                return f"{to_latex(node[1])} - {to_latex(node[2])}"
            elif op == 'multiply':
                return f"{to_latex(node[1])} \\times {to_latex(node[2])}"
            elif op == 'divide':
                return f"\\frac{{{to_latex(node[1])}}}{{{to_latex(node[2])}}}"
            else:
                # The backslash is written as "\\_": a bare "\_" is an invalid escape
                # sequence in a non-raw string and triggers a warning at compile time.
                return f"\\text{{UNKNOWN\\_OP}}({', '.join(map(str, node))})"
        elif isinstance(node, str):
            # Collapse '__' to '_' before looking for a subscript separator.
            # NOTE(review): if '__' encodes a sign in names such as 'constant__0.8',
            # that information is dropped here -- confirm against the tree generator.
            node = node.replace("__", "_")
            if "_" in node:
                parts = node.split("_")
                # Exactly one underscore -> name with subscript; otherwise keep the name whole.
                return f"\\text{{{parts[0]}}}_{{{parts[1]}}}" if len(parts) == 2 else f"\\text{{{node}}}"
            return f"\\text{{{node}}}"
        else:
            return str(node)

    # Known binary operator at the root: emit the expression without a wrapper.
    if isinstance(expr, tuple) and expr[0] in {'add', 'subtract', 'multiply', 'divide'}:
        return to_latex(expr)
    # (callable, inner): render only the inner expression inside f(...).
    elif isinstance(expr, tuple) and callable(expr[0]):
        return f"f\\left({to_latex(expr[1])}\\right)"
    # Anything else (leaf, unknown operator, non-tuple value) is wrapped in f(...).
    else:
        return f"f\\left({to_latex(expr)}\\right)"


In [8]:
# NOTE(review): `rep` is a str at this point, so expr_to_latex treats it as one leaf
# and wraps the whole text in f\left(\text{...}\right) instead of rendering the tree
# (see the displayed value of latex_rep in the next cell).
latex_rep = expr_to_latex(rep)

In [9]:
# Display the generated LaTeX source.
latex_rep

"f\\left(\\text{('add', ('subtract', 'constant_0.8', 'constant_2.2'), 'HALSTEAD_CONTENT') + f(('divide', 'HALSTEAD_EFFORT', 'constant_8.3'))}\\right)"

In [10]:
# Persist the column-named tree string `rep` (not `latex_rep`) to short.txt.
with open("short.txt", "w", encoding="utf-8") as file:
    file.write(rep)

In [11]:
# Node count reported by the trained model.
trained_model.nodes_count

16

In [12]:
# --- Second experiment setup: same dataset, plussig1 variant, different rates ---
dataset_name = 'pc4'
data_filepath = '../../data/'  # must keep the trailing slash: paths below are built by concatenation
# Work on a deep copy of the imported configuration template.
model_config = copy.deepcopy(basic_model_config_slim_plussig1)
model_config['config']['log_path'] = 'interpretations/'  # where the run logs are written
model_config['config']['seed'] = 2  # fixed seed for this run
model_config['config']['verbose'] = True  # the verbose reporter table is printed during training
model_config = fill_config(
    model_config,
    scaling=True,
    oversampling=False,
    fitness_function='sigmoid_rmse',
    minimization=True,
    inflation_rate=0.3,
    ms_upper=0.3,
)

# --- Read the prepared dataset and its metadata row ---
data = pd.read_csv(f"{data_filepath}data_prepared/{dataset_name}.csv")
data_info = load_and_adapt_data_info(f"{data_filepath}data_info.csv")
# Select the metadata row(s) for this dataset once and reuse the selection for every field.
dataset_rows = data_info.loc[data_info['name'] == dataset_name]
train_indices = dataset_rows['train_indices'].values[0][0]
test_indices = dataset_rows['test_indices'].values[0][0]
categoricals = dataset_rows['categoricals'].values[0]

# --- Build the train/test tensors ---
X_train, y_train, X_test, y_test = return_train_test(
    df=data, train_indices=train_indices, test_indices=test_indices,
    scaling=model_config['scaling'], oversampling=model_config['oversampling'],
    categoricals=categoricals)

In [13]:
# Train the second configuration on the same split. This rebinds `trained_model`,
# so the model from the earlier run is no longer reachable under that name.
trained_model = train_model(
    dataset_name=dataset_name,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model=model_config['name'],
    **model_config['config']
)

Verbose Reporter
-----------------------------------------------------------------------------------------------------------------------------------------
|         Dataset         |  Generation  |     Train Fitness     |       Test Fitness       |        Timing          |      Nodes       |
-----------------------------------------------------------------------------------------------------------------------------------------
|     pc4                 |       0      |   0.34936395287513733 |   0.3453179597854614     |   0.11126017570495605  |      5           |
|     pc4                 |       1      |   0.34936395287513733 |   0.3453179597854614     |   0.016098976135253906 |      5           |
|     pc4                 |       2      |   0.34936395287513733 |   0.3453179597854614     |   0.01794910430908203  |      5           |
|     pc4                 |       3      |   0.3484395742416382  |   0.34443798661231995    |   0.015549421310424805 |      16          |
|     pc4        

In [14]:
# String form of the second trained individual (placeholders still positional here).
rep = trained_model.get_tree_representation()

In [15]:
# Column name -> positional placeholder ('x0', 'x1', ...), following the column
# order of `data`, plus a small table and the inverse lookup.
mapping = {col: f'x{i}' for i, col in enumerate(data.columns)}
mapping_table = pd.DataFrame(list(mapping.items()), columns=['Original', 'Mapped'])
reverse_mapping = {v: k for k, v in mapping.items()}
# Replace every placeholder in a single pass. A callable replacement inserts the
# column name literally (no backslash / group-reference interpretation), and text
# that has just been inserted is never scanned again by a later substitution.
# Placeholders without a matching column are left unchanged.
rep = re.sub(
    r'\bx\d+\b',
    lambda match: reverse_mapping.get(match.group(0), match.group(0)),
    rep,
)
rep

"('add', ('subtract', 'constant__0.8', 'constant_2.2'), 'HALSTEAD_CONTENT') + f(('add', 'NORMALIZED_CYLOMATIC_COMPLEXITY', 'constant__9.0')) + f(('add', 'constant_7.2', 'PERCENT_COMMENTS'))"

In [16]:
# Re-display the mapped representation (same value as the previous cell's output).
rep

"('add', ('subtract', 'constant__0.8', 'constant_2.2'), 'HALSTEAD_CONTENT') + f(('add', 'NORMALIZED_CYLOMATIC_COMPLEXITY', 'constant__9.0')) + f(('add', 'constant_7.2', 'PERCENT_COMMENTS'))"

In [17]:
# Persist the column-named tree string `rep` of the second run to long.txt.
with open("long.txt", "w", encoding="utf-8") as file:
    file.write(rep)

In [18]:
# Node count reported by the second trained model.
trained_model.nodes_count

27

In [2]:
import torch

# Toy problem: a single random feature column with random 0/1 labels.
torch.manual_seed(1)  # fixed seed so the draws below are repeatable

# The draw order (X_train, X_test, y_train, y_test) is kept so the seeded values
# stay the same; each tuple's right-hand side is evaluated left to right.
X_train, X_test = torch.rand(10, 1), torch.rand(5, 1)
y_train, y_test = torch.randint(0, 2, (10,)), torch.randint(0, 2, (5,))

# Show the four shapes on one line.
print(*(tensor.shape for tensor in (X_train, X_test, y_train, y_test)))


torch.Size([10, 1]) torch.Size([5, 1]) torch.Size([10]) torch.Size([5])


In [3]:
# Train on the toy tensors defined in the previous cell.
# NOTE(review): `dataset_name` and `model_config` are not defined in this section;
# they are reused from earlier cells, so the reporter below still labels the run
# 'pc4' even though the data is random. This cell fails on a fresh kernel unless
# the earlier setup cells have been run.
trained_model = train_model(
    dataset_name=dataset_name,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model=model_config['name'],
    **model_config['config']
)

Verbose Reporter
-----------------------------------------------------------------------------------------------------------------------------------------
|         Dataset         |  Generation  |     Train Fitness     |       Test Fitness       |        Timing          |      Nodes       |
-----------------------------------------------------------------------------------------------------------------------------------------
|     pc4                 |       0      |   0.4472213387489319  |   0.3409407436847687     |   0.11632227897644043  |      15          |
|     pc4                 |       1      |   0.4472213387489319  |   0.3409407436847687     |   0.0                  |      15          |
|     pc4                 |       2      |   0.4472213387489319  |   0.3409407436847687     |   0.002723217010498047 |      15          |
|     pc4                 |       3      |   0.4472213387489319  |   0.3409407436847687     |   0.01504206657409668  |      15          |
|     pc4        

In [4]:
# Tree learned on the toy data; X has a single column, so only 'x0' appears.
trained_model.get_tree_representation()

"('multiply', ('add', ('multiply', 'x0', 'x0'), ('divide', 'x0', 'x0')), ('multiply', ('divide', 'x0', 'constant__4.8'), ('subtract', 'constant_10.0', 'x0'))) + f(('subtract', 'x0', ('multiply', 'x0', 'x0')) - ('multiply', 'x0', 'x0'))"

In [5]:
# Model outputs on the toy test inputs (the values shown below are unbounded
# scores, not 0/1 labels).
trained_model.predict(X_test)

tensor([-4.2016, -1.6816, -5.3732,  1.0025, -0.6459])

In [6]:
def custom_function(x):
    """Return sigmoid(x / x) - sigmoid(x / 4.5), element-wise, for a tensor x.

    x / x evaluates to 1 for non-zero entries and to NaN where x is 0, so a
    zero entry produces NaN in the result.
    """
    scaled_term = torch.sigmoid(x / 4.5)
    unit_term = torch.sigmoid(x / x)
    return unit_term - scaled_term


In [7]:
# Evaluate the hand-written expression on the toy test inputs
# (presumably for comparison with trained_model.predict(X_test) -- TODO confirm).
custom_function(X_test)

tensor([[0.1956],
        [0.2019],
        [0.1932],
        [0.2141],
        [0.2053]])

In [8]:
# Display the toy test inputs.
X_test

tensor([[0.6387],
        [0.5247],
        [0.6826],
        [0.3051],
        [0.4635]])

In [9]:
# NOTE(review): scratch arithmetic; the origin of 0.0481 and 7.9 is not shown
# anywhere in the visible notebook -- name these constants or remove the cell.
1 + 0.0481 * 7.9

1.37999