In [138]:
from neuron_predictor import NeuronPredictor

# import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor  
from sklearn.tree import DecisionTreeClassifier

import math

import numpy as np
from utils import probe_directions_list, tuple_to_label
import os

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

from create_dataset import get_filtered_dataset
from create_dataset import get_variable_names

import pandas as pd

import joblib
from typing import List, Tuple



'''def tree_to_cnf(tree, feature_names=None):
    recurse_count = 0
    def recurse(node, path):
        nonlocal recurse_count
        recurse_count += 1
        if tree.feature[node] != -2:  # not a leaf
            feature = feature_names[tree.feature[node]] if feature_names is not None else f"feature_{tree.feature[node]}"
            threshold = tree.threshold[node]

            if feature[3:].startswith("not"):
                left_feature = feature[:3] + feature[7:]
                right_feature = feature
            else:
                left_feature = feature[:3] + "not " + feature[3:]
                right_feature = feature
            
            left_path = path + [f"{left_feature}"]
            right_path = path + [f"{right_feature}"]
            
            yield from recurse(tree.children_left[node], left_path)
            yield from recurse(tree.children_right[node], right_path)
        else:  # leaf
            predicted_value = tree.value[node][0, 0]
            yield f"({' AND '.join(path)} => {predicted_value:.4f})"

    return list(recurse(0, [])), recurse_count'''

def tree_to_dnf(tree, feature_names=None):
    recurse_count = 0
    def recurse(node, path):
        nonlocal recurse_count
        recurse_count += 1
        if tree.feature[node] != -2:  # not a leaf
            feature = feature_names[tree.feature[node]] if feature_names is not None else f"feature_{tree.feature[node]}"
            threshold = tree.threshold[node]

            if feature[3:].startswith("not"): # The First Case probably never happens
                feature = feature[:3] + feature[7:]
                threshold = 1-threshold
            left_feature = (feature, "<=", threshold)
            right_feature = (feature, ">", threshold)
            
            left_path = path + [left_feature]
            right_path = path + [right_feature]
            
            yield from recurse(tree.children_left[node], left_path)
            yield from recurse(tree.children_right[node], right_path)
        else:  # leaf
            predicted_value = tree.value[node][0, -1]
            yield (path, predicted_value)

    rules = list(recurse(0, []))
    rules.sort(key=lambda x: x[-1], reverse=True) # sort by the predicted value
    return rules, recurse_count

def process_disjunction(disjunction: List[Tuple[str, str, float]]) -> str:
    # Step 1: Remove redundancies
    cleaned = remove_redundancies(disjunction)
    
    # Step 2: Remove certain "placed" variables
    result = remove_placed_variables(cleaned)
    
    # Step 3: Group literals by variable
    # grouped = group_literals(cleaned)
    
    # Step 4: Generate the final string
    # result = generate_string(grouped)
    
    return result

def remove_redundancies(disjunction: List[Tuple[str, str, float]]) -> List[Tuple[str, str, float]]:
    result = []
    for var, op, threshold in disjunction:
        if op == "<=":
            existing = next((x for x in result if x[0] == var and x[1] == "<="), None)
            if existing is None or threshold < existing[2]:
                result = [x for x in result if not (x[0] == var and x[1] == "<=")]
                result.append((var, op, threshold))
        elif op == ">":
            existing = next((x for x in result if x[0] == var and x[1] == ">"), None)
            if existing is None or threshold > existing[2]:
                result = [x for x in result if not (x[0] == var and x[1] == ">")]
                result.append((var, op, threshold))
    return result

def remove_placed_variables(disjunction: List[Tuple[str, str, float]]) -> List[Tuple[str, str, float]]:
    placed_vars = [var for var, op, _ in disjunction if "placed" in var and op == ">"]
    if len(placed_vars) > 0:
        return [x for x in disjunction if not ("placed" in x[0] and x[1] == "<=")]
    return disjunction

def group_literals(disjunction: List[Tuple[str, str, float]]) -> dict:
    grouped = {}
    for var, op, threshold in disjunction:
        if var not in grouped:
            grouped[var] = {">" : None, "<=" : None}
        grouped[var][op] = f"{threshold:.4f}"
    return grouped

def generate_string_from_rule(rule: List[Tuple[str, str, float]]) -> str:
    out_str = ""
    for var, op, threshold in rule:
        out_str += f"{var} {op} {threshold:.4f} "
    return out_str

def generate_string(grouped: dict) -> str:
    parts = []
    for var, ops in grouped.items():
        if ops[">"] is not None and ops["<="] is not None:
            parts.append(f"{ops['>']} < {var} <= {ops['<=']}")
        elif ops[">"] is not None:
            parts.append(f"{var}")
        elif ops["<="] is not None:
            parts.append(f"not {var}")
    return " AND ".join(parts)

def get_accuracy(y_pred, y_test):
    correct = 0
    for i in range(len(y_test)):
        if (y_pred[i] > 0 and y_test[i] > 0) or (y_pred[i] <= 0 and y_test[i] <= 0):
            correct += 1
    return correct / len(y_test)

EPSILON = 0.000001

def scorefunction(depth, accuracy):
    return math.log(accuracy + EPSILON, 2) * 2**depth

class DecisionTree(NeuronPredictor):
    def __init__(self, layer, neuron):
        # This is bad coupling
        self.column_names_input = get_variable_names(True)
        self.neuron = neuron
        self.layer = layer

    def predict(self, X):
        return self.regressor.predict(X)

    def fit(self, X, y, **kwargs):
        # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.regressor = DecisionTreeRegressor(random_state = 0, **kwargs)  
        self.regressor.fit(X, y)

    def get_top_rules(self, thresh = 0.9, column_names=None):
        if column_names is None:
            column_names = self.column_names_input
        cnf_rules, _ = tree_to_dnf(self.regressor.tree_, feature_names=column_names)
        rules = []
        preds = []
        cnf_rules = [(disjunction, pred_value) for (disjunction, pred_value) in cnf_rules if pred_value > thresh]
        for disjunction, pred_value in cnf_rules:
            disjunction = process_disjunction(disjunction)
            rules += [disjunction]
            preds += [pred_value]
        return rules, preds

    def get_clean_format(self, column_names=None):
        if column_names is None:
            column_names = self.column_names_input
        # cnf_rules = tree_to_cnf(self.regressor.tree_, feature_names=column_names)
        cnf_rules, variable_count = tree_to_dnf(self.regressor.tree_, feature_names=column_names)
        rules_nice = []
        for disjunction, pred_value in cnf_rules:
            disjunction = process_disjunction(disjunction)
            disjunction_str = generate_string_from_rule(disjunction)
            disjunction_str += f"=> {pred_value:.4f}"
            rules_nice.append(disjunction_str)
        return rules_nice, variable_count

    def get_variable_count(self):
        _, variable_count = self.get_clean_format(self.column_names_input)
        return variable_count

    def load(self, layer, neuron):
        # Save the decision tree to a file
        file_path = f"neuron_predictors/decision_trees/decision_tree_L{layer}_N{neuron}.joblib"
        if os.path.exists(file_path):
            self.regressor = joblib.load(file_path)
            self.max_depth = self.regressor.get_depth()
            self.layer = layer
            self.neuron = neuron
            return True
        return False

    def save(self):
        file_path = f"neuron_predictors/decision_trees/decision_tree_L{self.layer}_N{self.neuron}.joblib"#
        # Save the decision tree to a file
        joblib.dump(self.regressor, file_path)


class ClassifierDecisionTree(NeuronPredictor):
    def __init__(self, layer, neuron):
        # This is bad coupling
        self.column_names_input = get_variable_names(True)
        self.neuron = neuron
        self.layer = layer

    def predict(self, X):
        return self.regressor.predict(X)

    def fit(self, X, y, **kwargs):
        # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.regressor = DecisionTreeClassifier(**kwargs)  
        self.regressor.fit(X, y)

    def get_top_rules(self, thresh = 0.9, column_names=None):
        if column_names is None:
            column_names = self.column_names_input
        cnf_rules, _ = tree_to_dnf(self.regressor.tree_, feature_names=column_names)
        rules = []
        preds = []
        cnf_rules = [(disjunction, pred_value) for (disjunction, pred_value) in cnf_rules if pred_value > thresh]
        for disjunction, pred_value in cnf_rules:
            disjunction = process_disjunction(disjunction)
            rules += [disjunction]
            preds += [pred_value]
        return rules, preds

    def get_clean_format(self, column_names=None):
        if column_names is None:
            column_names = self.column_names_input
        # cnf_rules = tree_to_cnf(self.regressor.tree_, feature_names=column_names)
        cnf_rules, variable_count = tree_to_dnf(self.regressor.tree_, feature_names=column_names)
        rules_nice = []
        for disjunction, pred_value in cnf_rules:
            disjunction = process_disjunction(disjunction)
            disjunction_str = generate_string_from_rule(disjunction)
            disjunction_str += f"=> {pred_value:.4f}"
            rules_nice.append(disjunction_str)
        return rules_nice, variable_count

    def get_variable_count(self):
        _, variable_count = self.get_clean_format(self.column_names_input)
        return variable_count

    def load(self, layer, neuron):
        # Save the decision tree to a file
        file_path = f"neuron_predictors/decision_tree_classifiers/decision_tree_classifier_L{layer}_N{neuron}.joblib"
        if os.path.exists(file_path):
            self.regressor = joblib.load(file_path)
            self.max_depth = self.regressor.get_depth()
            self.layer = layer
            self.neuron = neuron
            return True
        return False

    def save(self):
        file_path = f"neuron_predictors/decision_tree_classifiers/decision_tree_classifier_L{self.layer}_N{self.neuron}.joblib"#
        # Save the decision tree to a file
        joblib.dump(self.regressor, file_path)

def new_weighted_f1_score(y, y_pred):
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    for i in range(len(y)):
        score = max(y[i], y_pred[i])
        if y_pred[i] > 0 and y[i] > 0:
            tp += score
        elif y_pred[i] <= 0 and y[i] <= 0:
            tn += 1
        elif y_pred[i] > 0 and y[i] <= 0:
            fp += score
        elif y_pred[i] <= 0 and y[i] > 0:
            fn += score
    precision = tp / (tp + fp + EPSILON)
    recall = tp / (tp + fn + EPSILON)
    f1 = 2 * (precision * recall) / (precision + recall + EPSILON)
    return f1.item()

import numpy as np

EPSILON = 1e-9  # Small constant to avoid division by zero

def very_new_weighted_f1_score(y, y_pred):
    y = np.asarray(y)
    y_pred = np.asarray(y_pred)

    sum_y = np.sum(y)
    len_y = len(y)
    
    # Score for each element
    score = y / sum_y * len_y
    
    # True positives (tp): y and y_pred are both positive
    tp = np.sum(score[(y_pred > 0) & (y > 0)])
    
    # True negatives (tn): both y and y_pred are non-positive
    # tn = np.sum((y_pred <= 0) & (y <= 0))
    
    # False positives (fp): y_pred is positive but y is non-positive
    fp = np.sum((y_pred > 0) & (y <= 0))
    
    # False negatives (fn): y_pred is non-positive but y is positive
    fn = np.sum(score[(y_pred <= 0) & (y > 0)])
    
    # Precision: tp / (tp + fp)
    precision = tp / (tp + fp + EPSILON)
    
    # Recall: tp / (tp + fn)
    recall = tp / (tp + fn + EPSILON)
    
    # F1 Score: harmonic mean of precision and recall
    f1 = 2 * (precision * recall) / (precision + recall + EPSILON)
    
    return f1


y_test = np.array([1, 10, 1, 0, 0, 1, 1, 0, 999, 1])
y_pred = np.array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1])
print(new_weighted_f1_score(y_test, y_pred))

def get_data(layer, neuron, dataset : pd.DataFrame | str = None, **kwargs):
    if type(dataset) == str:
        small_dataset_softmax = pd.read_csv(dataset)
    elif type(dataset) == pd.DataFrame:
        small_dataset_softmax = dataset
    else:
        small_dataset_softmax = pd.read_csv(f"data/neuron_datasets/logic_small_L{layer}.csv")
    dataset = get_filtered_dataset(small_dataset_softmax, layer, neuron, overfitting_strength=None, **kwargs)
    dataset_columns_new = dataset.columns.tolist()

    dataset_columns_new = [col_name for col_name in dataset_columns_new if col_name[3:6] != "not"]
    dataset = dataset[dataset_columns_new]

    column_names = dataset.columns
    column_names_input = column_names[:-1]
    column_names_output = [column_names[-1]]

    X = dataset[column_names_input].astype(float)
    y = dataset[column_names_output].astype(float)
    # turn into numpy arrays
    X = X.to_numpy()
    y = y.to_numpy()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test, column_names_input

def train_decision_tree(X_train, y_train, layer, neuron, **kwargs) -> DecisionTree:
    descision_tree = DecisionTree(layer, neuron)
    descision_tree.fit(X_train, y_train, **kwargs)
    return descision_tree

def evaluate_decision_tree(descision_tree : DecisionTree, X_test, y_test):
    column_names_input = descision_tree.column_names_input
    y_pred_test = descision_tree.predict(X_test)
    weighted_f1 = new_weighted_f1_score(y_test, y_pred_test)
    f1 = f1_score(y_test > 0, y_pred_test > 0)
    rules, variable_count = descision_tree.get_clean_format(column_names_input)
    return weighted_f1, f1, variable_count, rules, y_pred_test

def train_and_evaluate_decision_tree(layer : int, neuron : int, dataset : pd.DataFrame | str = None, **kwargs):
    X_train, X_test, y_train, y_test, column_names_input = get_data(layer, neuron, dataset)
    descision_tree = train_decision_tree(X_train, y_train, layer, neuron, **kwargs)
    descision_tree.column_names_input = column_names_input
    weighted_f1, f1, variable_count, rules, _ = evaluate_decision_tree(descision_tree, X_test, y_test)
    return descision_tree, weighted_f1, f1, variable_count, rules

0.9990147778241394


In [78]:
# from utils import *
from create_dataset import get_filtered_dataset
# from descision_trees import DecisionTree
from create_dataset import save_filtered_dataset_for_neurons
# import abstractmethod
from abc import ABC, abstractmethod
from dataclasses import dataclass

In [139]:
@dataclass
class NeuronPredictorArgs:
    dataset_kind : str
    neuron_predictor_type : str
    preprocess_train_type : str
    preprocess_eval_type : str
    layer : int
    neuron : int
    len_data : int
    dataset_train : pd.DataFrame = None
    dataset_test : pd.DataFrame = None

class PreprocessAbc(ABC):
    @abstractmethod
    def preprocess(self, dataset : pd.DataFrame, args : NeuronPredictorArgs) -> pd.DataFrame:
        pass

def my_round(x):
    if x < 0:
        return -0.1
    if x < 0.3:
        return 0.1
    if x < 1:
        return 0.7
    return 1.5

class PreproccessorTrain(PreprocessAbc):
    def __init__(self):
        pass

    def preprocess(self, dataset : pd.DataFrame, args : NeuronPredictorArgs) -> pd.DataFrame:
        input_columns = dataset.columns[:-1]
        input_columns = [col for col in input_columns if col[3:] == "flipped" or col[3:] == "placed"]
        dataset[input_columns] = dataset[input_columns].apply(lambda x: x.apply(lambda y : 0 if y < 0.5 else 1))
        dataset["neuron activation"] = dataset["neuron activation"].apply(lambda x: my_round(x))
        return dataset
    
class PreproccessorTrainRound(PreprocessAbc):
    def __init__(self):
        pass

    def preprocess(self, dataset : pd.DataFrame, args : NeuronPredictorArgs) -> pd.DataFrame:
        dataset["neuron activation"] = dataset["neuron activation"].apply(lambda x: -1 if x <= 0 else 1)
        return dataset

class PreproccessorEval(PreprocessAbc):
    def __init__(self):
        pass

    def preprocess(self, dataset : pd.DataFrame, args : NeuronPredictorArgs) -> pd.DataFrame:
        return dataset

def get_neuron_predictor(args : NeuronPredictorArgs, **kwargs) -> NeuronPredictor:
    if args.neuron_predictor_type == "decision_tree":
        return DecisionTree(args.layer, args.neuron, **kwargs)
    elif args.neuron_predictor_type == "decision_tree_classifier":
        return ClassifierDecisionTree(args.layer, args.neuron, **kwargs)
    else:
        raise ValueError(f"neuron_predictor_type {args.neuron_predictor_type} not recognized")
    
def get_preprocess_train(args : NeuronPredictorArgs, **kwargs) -> PreprocessAbc:
    if args.preprocess_train_type == "normal":
        return PreproccessorTrain(**kwargs)
    elif args.preprocess_train_type == "round":
        return PreproccessorTrainRound(**kwargs)
    else:
        raise ValueError(f"preprocess_train_type {args.preprocess_train_type} not recognized")

def get_preprocess_eval(args : NeuronPredictorArgs, **kwargs) -> PreprocessAbc:
    if args.preprocess_eval_type == "normal":
        return PreproccessorEval(**kwargs)
    else:
        raise ValueError(f"preprocess_eval_type {args.preprocess_eval_type} not recognized")

def get_dataset(dataset_kind : str, train_or_eval : str, layer : int):
    assert train_or_eval in ["train", "eval"]
    dataset = pd.read_csv(f"data/neuron_datasets/{dataset_kind}_{train_or_eval}_L{layer}.csv")
    return dataset

def dataset_to_X_y(dataset : pd.DataFrame):
    column_names = dataset.columns
    column_names_input = column_names[:-1]
    column_names_output = [column_names[-1]]

    X = dataset[column_names_input].astype(float)
    y = dataset[column_names_output].astype(float)
    # turn into numpy arrays
    X = X.to_numpy()
    y = y.to_numpy()
    return X, y

def evaluate_neuron_predictor(neuron_predictor : NeuronPredictor, X_test, y_test):
    y_pred_test = neuron_predictor.predict(X_test)
    weighted_f1 = very_new_weighted_f1_score(y_test, y_pred_test)
    f1 = f1_score(y_test > 0, y_pred_test > 0)
    rules, variable_count = neuron_predictor.get_clean_format()
    return weighted_f1, f1, variable_count, rules, y_pred_test

# This class get's a train / eval dataset AND/OR a layer and neuron
# AND a neuron predictor and 
class NeuronPredictorEvaluator:
    def get_dataset_train_filtered(self):
        # dataset_train = get_dataset(self.args.dataset_kind, "train", self.args.layer)
        dataset_train_filtered = get_filtered_dataset(self.dataset_train, self.args.layer, self.args.neuron, size=self.args.len_data, remove_negative_features=True)
        dataset_train_filtered = self.preprocessor_train.preprocess(dataset_train_filtered, self.args)
        return dataset_train_filtered

    def get_dataset_test_filtered(self):
        # dataset_test = get_dataset(self.args.dataset_kind, "eval", self.args.layer)
        dataset_test_filtered = get_filtered_dataset(self.dataset_test, self.args.layer, self.args.neuron, size=self.args.len_data)
        dataset_test_filtered = self.preprocessor_eval.preprocess(dataset_test_filtered, self.args)
        return dataset_test_filtered

    def __init__(
        self,
        args : NeuronPredictorArgs,
        **kwargs
    ):
        # initialize the neuron predictor
        # TODO: when initialized: big dataset is loaded, you can also pass big dataset as an argument
        if args.dataset_train is not None:
            self.dataset_train = args.dataset_train
        else:
            self.dataset_train = get_dataset(self.args.dataset_kind, "train", self.args.layer)
        if args.dataset_test is not None:
            self.dataset_test = args.dataset_test
        else:
            self.dataset_test = get_dataset(self.args.dataset_kind, "test", self.args.layer)

        self.neuron_predictor : NeuronPredictor = get_neuron_predictor(args, **kwargs)
        self.preprocessor_train : PreprocessAbc = get_preprocess_train(args)
        self.preprocessor_eval : PreprocessAbc = get_preprocess_eval(args)
        self.args = args

    def train_and_evaluate(self, **kwargs):
        dataset_train = self.get_dataset_train_filtered()
        X_train, y_train = dataset_to_X_y(dataset_train)
        self.neuron_predictor.fit(X_train, y_train, **kwargs)
        # TODO: save the neuron predictor
        # evaluate the neuron predictor
        # TODO: load the neuron predictor if it exists
        dataset_test = self.get_dataset_test_filtered()
        X_test, y_test = dataset_to_X_y(dataset_test)
        weighted_f1, f1, variable_count, rules, y_pred_test = evaluate_neuron_predictor(self.neuron_predictor, X_test, y_test)
        return weighted_f1, f1, variable_count, rules, y_pred_test

In [4]:
# TODO:
# 1. Make the Train and Test datasets DONE
# 2. Run the Decision Tree DONE
# 3. Implement a cool preprocessing step
# 4. Run the Decision Tree again and see if it's better
# 5. wandb
# 6. Timing

'''layer = 1
big_dataset = pd.read_csv(f"data/neuron_datasets/logic_train_L{layer}.csv")
save_filtered_dataset_for_neurons(big_dataset=big_dataset,dataset_name=f"small_train", layer = layer, neurons = [0, 1, 2, 3, 4, 421])
big_dataset = pd.read_csv(f"data/neuron_datasets/logic_eval_L{layer}.csv")
save_filtered_dataset_for_neurons(big_dataset=big_dataset,dataset_name=f"small_eval", layer = layer, neurons = [0, 1, 2, 3, 4, 421])'''

'layer = 1\nbig_dataset = pd.read_csv(f"data/neuron_datasets/logic_train_L{layer}.csv")\nsave_filtered_dataset_for_neurons(big_dataset=big_dataset,dataset_name=f"small_train", layer = layer, neurons = [0, 1, 2, 3, 4, 421])\nbig_dataset = pd.read_csv(f"data/neuron_datasets/logic_eval_L{layer}.csv")\nsave_filtered_dataset_for_neurons(big_dataset=big_dataset,dataset_name=f"small_eval", layer = layer, neurons = [0, 1, 2, 3, 4, 421])'

In [80]:
from utils import label_to_tuple
import torch as t
from utils import plot_boards_general

In [81]:
def visualize_rules(rules, preds, how_many = 10):
    # generate 10 random indices
    num_rules = min(how_many, len(rules))
    indices = np.random.choice(len(rules), num_rules, replace=False)
    # sort indices
    indices.sort()
    # num_options = 11 + 3
    num_options = 7
    boards = t.zeros((num_options, num_rules, 8, 8))
    # boards = t.zeros((how_many, 7, 8, 8))
    '''feature_indices = {
        "empty" : 0,
        "not empty" : 1,
        "yours" : 2,
        "not yours" : 3,
        "mine" : 4,
        "not mine" : 5,
        "flipped" : 3 + 3,
        "not flipped" : 4 + 3,
        "placed" : 5 + 3,
        "not placed" : 6 + 3,
        "legal" : 7 + 3,
        "not legal" : 8 + 3,
        "accessible" : 9 + 3,
        "not accessible" : 10 + 3
    }'''
    feature_indices = {
        "empty" : 0,
        "yours" : 1,
        "mine" : 2,
        "flipped" : 3,
        "placed" : 4,
        "legal" : 5,
        "accessible" : 6,
    }
    for i in indices:
        rule = rules[i]
        pred = preds[i]
        if pred < 0.5:
            continue
        for j in range(len(rule)):
            literal = rule[j]
            feature, op, threshold = literal
            # TODO: I need to change not A4 Placed to A4 not_placed
            label = feature[:2]
            tile_tuple = label_to_tuple(label)
            option_str = feature[3:]
            option_str = option_str.lower()
            if op == "<=":
                # option_str = "not " + option_str
                val = -1
            else:
                val = 1
            option = feature_indices[option_str]
            boards[option, i, tile_tuple[0], tile_tuple[1]] = val
    plot_boards_general(x_labels=list(feature_indices.keys()), y_labels=[f"Rule {i}" for i in indices], boards=boards)

In [141]:
layer = 1
neurons = [0, 1, 2, 3, 4, 421]
neurons = [2]

for neuron in neurons:
    args = NeuronPredictorArgs(
        dataset_kind = "big_argmax",
        neuron_predictor_type = "decision_tree_classifier",
        preprocess_train_type = "round",
        preprocess_eval_type = "normal",
        layer = layer,
        neuron = neuron,
        len_data=100000,
    )
    evaluator_round = NeuronPredictorEvaluator(args)
    # weighted_f1, f1, variable_count, rules, y_pred_test = evaluator_round.train_and_evaluate(min_impurity_decrease=0.00001)
    weighted_f1, f1, variable_count, rules, y_pred_test = evaluator_round.train_and_evaluate(min_impurity_decrease=0.0001)
    print(f"Neuron {neuron} has a weighted f1 of {weighted_f1}, f1 of {f1}, variable count of {variable_count}")
    print(rules)

# Got a weighted f1 of 0.93 on L1_N2. That's pretty sick ... (Rounding Helps, Data also helps)
# The Rule is a bit whacky (G4 Not yours AND G4 Yours meaning G4 is between x and y)
# Rounding the Input data to 0 / 1 makes the rules look nicer but worsens the f1 score
# I think the performance is really good if the there
# Weighted F1 is 94 for softmax and 85 for argmax (Argmax is much more interpretabil, so I think I will be usnig that ...)

# Neuron 2 has a weighted f1 of 0.853886536950849, f1 of 0.6273830155979203, variable count of 125 ()


AttributeError: 'NeuronPredictorEvaluator' object has no attribute 'args'

: 

In [140]:
rules, preds = evaluator_round.neuron_predictor.get_top_rules(thresh=0.8)
print(rules, preds)
visualize_rules(rules, preds, how_many=10)

[] []


ValueError: 
The 'rows' argument to make_subplots must be an int greater than 0.
    Received value of type <class 'int'>: 0

# Analyze Neuron Activation Distributions

In [101]:
import pandas as pd
from tqdm import tqdm
import torch as t
import plotly.express as px
import plotly.graph_objects as go

In [14]:
# load datasets
big_datasets = []
for layer in tqdm(range(8)):
    df = pd.read_csv(f"data/neuron_datasets/big_argmax_train_L{layer}.csv")
    big_datasets.append(df)

100%|██████████| 8/8 [05:04<00:00, 38.02s/it]


In [84]:
# load datasets
big_datasets_test = []
for layer in tqdm(range(8)):
    df = pd.read_csv(f"data/neuron_datasets/big_argmax_test_L{layer}.csv")
    big_datasets_test.append(df)

  0%|          | 0/8 [00:00<?, ?it/s]

In [18]:
# Analyze Neurons by there activation distribution
# TODO: load the dataset of every layer.
# Go over every layer and neuron, calculate of the positive cases calculate avereage, std, skewness, kurtosis and fraction of positive cases
# I think that I can spot interpretable neurons by near 0 skewness and maybe high kurtosis, I can plot some of the distributions
# Maybe I can do k-means and see if I can find two clusters
# Better: visualize top two / tree pca components and see if I can find clusters visualy

# Then I can get a representative sample of neurons and do some hyperparameter sweep

average = t.zeros((8, 2048))
std = t.zeros((8, 2048))
skewness = t.zeros((8, 2048))
kurtosis = t.zeros((8, 2048))
fraction = t.zeros((8, 2048))
for layer in tqdm(range(8)):
    df = big_datasets[layer]
    for neuron in range(2048):
        neuron_activation = df[f"L{layer}_N{neuron}"]
        neuron_activation_positive = neuron_activation[neuron_activation > 0]
        average[layer, neuron] = neuron_activation_positive.mean()
        std[layer, neuron] = neuron_activation_positive.std()
        skewness[layer, neuron] = neuron_activation_positive.skew()
        kurtosis[layer, neuron] = neuron_activation_positive.kurtosis()
        fraction[layer, neuron] = len(neuron_activation_positive) / len(neuron_activation)

100%|██████████| 8/8 [00:15<00:00,  1.90s/it]


In [97]:
# 5 neurons with skewness > 0.5 and 5 with skewness < 0.5
# Layer 1?. Go over min_impurity_decrease and plot variable count vs f1_score with hyperparameter value as label

skewness_layer_1 = skewness[1]
# get five random neurons with skewness > 0.5
skewness_layer_1_indices = t.argsort(skewness_layer_1)
for i, index in enumerate(skewness_layer_1_indices):
    if skewness_layer_1[index] >= 0.5:
        limit = i
        break
neurons_interpretabil = np.random.choice(skewness_layer_1_indices[:limit], 3, replace=False)
neurons_uninterpretabil = np.random.choice(skewness_layer_1_indices[-limit:], 3, replace=False)

In [98]:
neurons_interpretabil

array([ 619,  546, 1341])

In [99]:
np.stack([neurons_interpretabil, neurons_uninterpretabil]).flatten()

array([ 619,  546, 1341, 1414, 1773,  521])

In [108]:
min_impurity_decreases = [0.00001, 0.00003, 0.0001, 0.0003, 0.001, 0.003, 0.01]

In [100]:
# Now I can do a hyperparameter sweep
results = t.zeros((6, 7, 3))
rules_dict = {}
layer = 1
neurons = np.stack([neurons_interpretabil, neurons_uninterpretabil]).flatten()
for i, neuron in tqdm(enumerate(neurons)):
    args = NeuronPredictorArgs(
        dataset_kind = "big_argmax",
        neuron_predictor_type = "decision_tree_classifier",
        preprocess_train_type = "round",
        preprocess_eval_type = "normal",
        layer = layer,
        neuron = neuron,
        len_data = 100000,
        dataset_train = big_datasets[layer],
        dataset_test = big_datasets_test[layer],
    )
    evaluator_round = NeuronPredictorEvaluator(args)
    # weighted_f1, f1, variable_count, rules, y_pred_test = evaluator_round.train_and_evaluate(min_impurity_decrease=0.00001)
    for j, min_impurity_decrease in enumerate(min_impurity_decreases):
        weighted_f1, f1, variable_count, rules, y_pred_test = evaluator_round.train_and_evaluate(min_impurity_decrease=min_impurity_decrease)
        results[i, j, 0] = variable_count
        results[i, j, 1] = f1
        results[i, j, 2] = weighted_f1
        print(f"Neuron {neuron} has a weighted f1 of {weighted_f1}, f1 of {f1}, variable count of {variable_count}, min_impurity_decrease of {min_impurity_decrease}")
        rules_dict[(neuron, min_impurity_decrease)] = rules


0it [00:00, ?it/s]

Neuron 619 has a weighted f1 of 0.9615518092967672, f1 of 0.9241379310344827, variable count of 191, min_impurity_decrease of 1e-05
Neuron 619 has a weighted f1 of 0.9629187083920144, f1 of 0.9224904701397713, variable count of 93, min_impurity_decrease of 3e-05
Neuron 619 has a weighted f1 of 0.962872731412724, f1 of 0.9358582773365913, variable count of 27, min_impurity_decrease of 0.0001
Neuron 619 has a weighted f1 of 0.9350587287317694, f1 of 0.9059618930547019, variable count of 19, min_impurity_decrease of 0.0003
Neuron 619 has a weighted f1 of 0.8927914633451297, f1 of 0.8411843876177658, variable count of 11, min_impurity_decrease of 0.001
Neuron 619 has a weighted f1 of 0.8798319143287278, f1 of 0.8250825082508251, variable count of 9, min_impurity_decrease of 0.003
Neuron 619 has a weighted f1 of 0.0, f1 of 0.0, variable count of 1, min_impurity_decrease of 0.01
Neuron 546 has a weighted f1 of 0.9323537020852146, f1 of 0.7960618846694796, variable count of 191, min_impurity_

In [68]:
real_layer = -1

In [132]:
print("FAAAAAAAAAAAAAAAAAAAAAAAAAAAACK")

FAAAAAAAAAAAAAAAAAAAAAAAAAAAACK


In [131]:
# [ 619,  546, 1341, 1414, 1773,  521])
rule = rules_dict[(619, 0.0001)]
visualize_rules(rule, len(rule))

['B1 placed > 0.5000 B2 yours <= 0.5000 B2 flipped <= 0.5000 B2 empty <= 0.5000 B3 empty <= 0.5000 => 1.0000',
 'B1 placed > 0.5000 B2 yours <= 0.5000 B2 flipped > 0.5000 => 1.0000',
 'B1 placed > 0.5000 B2 yours > 0.5000 B3 empty <= 0.5000 => 0.9498',
 'B0 placed > 0.5000 B1 flipped > 0.5000 B3 accesible <= 0.5000 => 0.9341',
 'B0 placed > 0.5000 B1 flipped <= 0.5000 B1 yours > 0.5000 B2 empty <= 0.5000 B3 empty <= 0.5000 A2 mine > 0.5000 => 0.8533',
 'B0 placed > 0.5000 B1 flipped <= 0.5000 B1 yours > 0.5000 B2 empty <= 0.5000 B3 empty <= 0.5000 A2 mine <= 0.5000 => 0.5422',
 'B1 placed > 0.5000 B2 yours <= 0.5000 B2 flipped <= 0.5000 B2 empty <= 0.5000 B3 empty > 0.5000 => 0.1173',
 'B0 placed > 0.5000 B1 flipped <= 0.5000 B1 yours <= 0.5000 => 0.0749',
 'B0 placed > 0.5000 B1 flipped > 0.5000 B3 accesible > 0.5000 => 0.0135',
 'B1 placed > 0.5000 B2 yours <= 0.5000 B2 flipped <= 0.5000 B2 empty > 0.5000 => 0.0015',
 'B0 placed <= 0.5000 B1 placed <= 0.5000 => 0.0003',
 'B1 placed >

In [107]:
print("hello")

hello


In [119]:
fraction[1, 546]
print(fraction[1, 1414])

tensor(0.0782)


In [129]:
# array([ 619,  546, 1341, 1414, 1773,  521])
fig = go.Figure()
# neuron = neurons[0]
# for i, neuron in enumerate(neurons):
i = 3
neuron = neurons[i]
fig.add_trace(go.Scatter(x=results[i, :, 1], y=results[i, :, 0], mode="lines+markers", name=f"Neuron {neuron}", text=min_impurity_decreases))
fig.show()
print(results[i, :, 1])

tensor([0.5802, 0.6024, 0.5871, 0.6166, 0.4684, 0.0000, 0.0000])


In [75]:
# Make a plotly scatter plot of skewness vs average
# real_layer +=1
print(real_layer)
fig = go.Figure()
for layer in range(real_layer, real_layer+1):
    fig.add_trace(go.Scatter(x=average[layer], y=skewness[layer], mode="markers", name=f"Layer {layer}", text=[f"Neuron {i}" for i in range(2048)]))
fig.show()

2


In [95]:
layer = 1
print(layer)
neuron = 250

activations = big_datasets[layer][f"L{layer}_N{neuron}"]
activations = activations[activations > 0]
print(len(activations))
fig = px.histogram(x=activations)
fig.update_layout(
    title=f"Activations of Neurons in Layer {layer} of the MLP",
    xaxis_title="Activations",
    yaxis_title="Count",
)
fig.show()

1
1675


In [None]:
# 5 neurons with skewness > 0.5 and 5 with skewness < 0.5
# Layer 1?. Go over min_impurity_decrease and plot variable count vs f1_score with hyperparameter value as label



In [44]:
from utils import *

In [47]:
W_out = model.W_out
W_out.shape
print(W_out[2, 1370, :].norm())

tensor(0.7621, device='cuda:0', grad_fn=<LinalgVectorNormBackward0>)


In [15]:
# Visualize what the Neuron Predictor get's correct and what it gets wrong
layer = 1
neuron = 2
args = NeuronPredictorArgs(
    dataset_kind = "big_argmax",
    neuron_predictor_type = "decision_tree",
    preprocess_train_type = "normal",
    preprocess_eval_type = "normal",
    layer = layer,
    neuron = neuron,
    len_data=100000,
)
evaluator = NeuronPredictorEvaluator(args)
weighted_f1, f1, variable_count, rules, y_pred_test = evaluator.train_and_evaluate(min_impurity_decrease=0.00004)

In [87]:
evaluator.neuron_predictor.regressor.tree_.value[0]

(125, 1, 1)

In [67]:
my_evaluator = evaluator_round
neuron_predictor = my_evaluator.neuron_predictor

In [84]:
neuron_predictor.regressor.tree_.value[0, 0]

array([0.93122, 0.06878])

In [30]:
dataset_test = my_evaluator.get_dataset_test()
X_test, y_test = dataset_to_X_y(dataset_test)

In [71]:
dataset_test["correct_pred"] = (y_pred_test > 0) == (y_test[:, 0] > 0)
dataset_test

Unnamed: 0,A0 empty,A0 yours,A0 mine,A1 empty,A1 yours,A1 mine,A2 empty,A2 yours,A2 mine,A3 empty,...,H0 placed,H1 placed,H2 placed,H3 placed,H4 placed,H5 placed,H6 placed,H7 placed,neuron activation,correct_pred
0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.145008,True
1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.021936,True
2,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.004274,True
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.150995,True
4,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.022478,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.134221,True
49996,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.019271,True
49997,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021237,True
49998,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.020676,True


In [70]:
y_pred_test[y_pred_test < 0.5]

array([-1., -1., -1., ..., -1., -1., -1.])

In [57]:
y_test[:10, 0]

array([-0.14500807, -0.02193604, -0.00427393, -0.1509946 , -0.02247846,
       -0.00699016, -0.01876456, -0.13443109, -0.00935354, -0.02266369])

In [69]:
# make histogram of using plotly express of dataset_test 
import plotly.express as px
fig = px.histogram(dataset_test, x="neuron activation", color="correct_pred")
fig.update_yaxes(type="log")
fig.show()

In [53]:
neuron_predictor = evaluator.neuron_predictor
neuron_predictor.get_clean_format()

(['G4 yours AND H3 placed AND not G4 flipped AND not G5 mine AND not E6 mine AND not G5 yours => 1.1542',
  'G4 yours AND E6 placed AND G5 empty AND B4 empty AND F5 flipped => 1.1381',
  'G4 yours AND E2 placed AND G5 empty => 0.9710',
  'G4 yours AND D1 placed AND E2 flipped AND G3 empty => 0.9000',
  'G4 yours AND H4 placed AND not G4 flipped AND not H6 empty AND F5 empty => 0.8800',
  'G4 yours AND H5 placed AND not G4 flipped AND G3 empty AND not F3 empty => 0.8074',
  'G4 yours AND D7 placed AND F5 flipped AND not A4 yours => 0.8032',
  'G4 yours AND G2 placed AND G5 empty AND F6 empty => 0.7896',
  'G4 yours AND H3 placed AND not G4 flipped AND not G5 mine AND not E6 mine AND G5 yours => 0.7706',
  'G4 yours AND C0 placed AND not H5 mine AND E2 flipped AND G5 empty => 0.7424',
  'G4 yours AND E2 placed AND not G5 empty AND not G3 mine => 0.7137',
  'G4 yours AND E6 placed AND G5 empty AND B4 empty AND not F5 flipped => 0.6706',
  'G4 yours AND A4 placed AND D4 flipped AND E1 empt

In [35]:
cnf_rules, variable_count = tree_to_dnf(neuron_predictor.regressor.tree_, feature_names=get_variable_names(True))
cnf_rules

[([('G4 yours', '>', 0.5),
   ('E6 placed', '>', 0.5),
   ('G5 empty', '>', 0.5),
   ('B4 empty', '>', 0.5)],
  1.0730769230769233),
 ([('G4 yours', '>', 0.5),
   ('E6 placed', '<=', 0.5),
   ('E2 placed', '<=', 0.5),
   ('D1 placed', '>', 0.5),
   ('F3 flipped', '>', 0.5)],
  0.8400000000000005),
 ([('G4 yours', '>', 0.5),
   ('E6 placed', '<=', 0.5),
   ('E2 placed', '<=', 0.5),
   ('D1 placed', '<=', 0.5),
   ('D7 placed', '<=', 0.5),
   ('H3 placed', '<=', 0.5),
   ('C4 placed', '>', 0.5),
   ('E5 empty', '>', 0.5)],
  0.8189189189189185),
 ([('G4 yours', '>', 0.5),
   ('E6 placed', '<=', 0.5),
   ('E2 placed', '>', 0.5),
   ('H4 mine', '<=', 0.5)],
  0.7884210526315796),
 ([('G4 yours', '>', 0.5),
   ('E6 placed', '<=', 0.5),
   ('E2 placed', '<=', 0.5),
   ('D1 placed', '<=', 0.5),
   ('D7 placed', '>', 0.5),
   ('F5 flipped', '>', 0.5)],
  0.7500000000000001),
 ([('G4 yours', '>', 0.5),
   ('E6 placed', '<=', 0.5),
   ('E2 placed', '<=', 0.5),
   ('D1 placed', '<=', 0.5),
   ('D

In [7]:
from utils import get_focus_logits_and_cache
import plotly_express as px
import einops
from jaxtyping import Float
from torch import Tensor
import torch as t
focus_logits, focus_cache = get_focus_logits_and_cache()

In [8]:
layer = 1
neuron = 18

for neuron in range(10):
    dataset_train = get_dataset("small", "train", layer)
    dataset_train = get_filtered_dataset(dataset_train, layer, neuron, size=100000)
    # activations : Float[Tensor, "batch pos d_mlp"] = focus_cache["mlp_post", layer][:200, :, neuron].unsqueeze(2)
    # activations = einops.rearrange(activations, "batch pos d_mlp -> (batch pos d_mlp)")
    activations = t.Tensor(dataset_train[f"neuron activation"].values)
    activations = activations[activations > 0]
    print(len(activations))
    fig = px.histogram(x=activations.to("cpu"), nbins=50, range_x=[0, 4])
    fig.update_layout(
        title=f"Activations of Neurons in Layer {layer} of the MLP",
        xaxis_title="Activations",
        yaxis_title="Count",
    )
    fig.show()

2545


2480


6878


16008


3397


KeyError: 'L1_N5'

In [None]:
"""from typing import List, Tuple

def remove_redundancies(disjunction: List[Tuple[str, str, float]]) -> List[Tuple[str, str, float]]:
    result = []
    for var, op, threshold in disjunction:
        if op == "<=":
            existing = next((x for x in result if x[0] == var and x[1] == "<="), None)
            if existing is None or threshold < existing[2]:
                result = [x for x in result if not (x[0] == var and x[1] == "<=")]
                result.append((var, op, threshold))
        elif op == ">":
            existing = next((x for x in result if x[0] == var and x[1] == ">"), None)
            if existing is None or threshold > existing[2]:
                result = [x for x in result if not (x[0] == var and x[1] == ">")]
                result.append((var, op, threshold))
    return result

def remove_placed_variables(disjunction: List[Tuple[str, str, float]]) -> List[Tuple[str, str, float]]:
    placed_vars = [var for var, op, _ in disjunction if "placed" in var and op == ">"]
    if placed_vars:
        return [x for x in disjunction if not ("placed" in x[0] and x[0] not in placed_vars)]
    return disjunction

def group_literals(disjunction: List[Tuple[str, str, float]]) -> dict:
    grouped = {}
    for var, op, threshold in disjunction:
        if var not in grouped:
            grouped[var] = {">" : None, "<=" : None}
        grouped[var][op] = threshold
    return grouped

def generate_string(grouped: dict) -> str:
    parts = []
    for var, ops in grouped.items():
        if ops[">"] is not None and ops["<="] is not None:
            parts.append(f"{ops['>']} < {var} <= {ops['<=']}")
        elif ops[">"] is not None:
            parts.append(f"{var}")
        elif ops["<="] is not None:
            parts.append(f"not {var}")
    return " AND ".join(parts)

# Example usage
disjunction = [
    ("D4 mine", ">", 0.1),
    ("D4 mine", "<=", 0.4),
    ("B2 placed", ">", 0.0),
    ("C4 flipped", "<=", 1.0),
    ("A1 placed", "<=", 0.9),  # This should be removed
]

result = process_disjunction(disjunction)
print(result)"""