In [2]:
import numpy as np
import complete_random_forest.build_crf as crf
import complete_random_forest.crf_helpers as helper
from complete_random_forest.node_definition import Node, NodeValues

In [11]:
def check_label_sequence(labels) -> int:
    """Measure the distance from the first label change to the next one.

    Adjacent pairs are scanned left to right. Let ``a`` be the first index
    with ``labels[a] != labels[a + 1]`` and ``b`` the next such index after
    ``a``. The result is ``b - a``; when there is no second change, the last
    valid index ``len(labels) - 1`` stands in for ``b``.

    Returns 0 when the sequence contains no change at all, which includes
    sequences with fewer than two elements.
    """
    last = len(labels) - 1
    # Lazily yields every position whose successor carries a different label;
    # for sequences shorter than two the range is empty.
    change_points = (pos for pos in range(last) if labels[pos] != labels[pos + 1])

    first_change = next(change_points, None)
    if first_change is None:
        return 0

    # Fall back to the final index when the label never changes again.
    second_change = next(change_points, last)
    return second_change - first_change

def check_tree_leaf(sr_tree: Node) -> np.ndarray:
    """
    Collect one (subject ID, subresult) row per data item stored in the leaves
    below ``sr_tree``.

    A node is treated as internal when at least one of ``left_node`` /
    ``right_node`` is set; its result is the vertical stack of its children's
    results (left first). Otherwise the node is a leaf and contributes one row
    per entry of ``value.datas``.

    Returns: NumPy array of shape (k, 2)
            column 0 = datas (subject IDs)
            column 1 = subresult from check_label_sequence(labels)
    """
    # Treat as internal if either child exists (MATLAB used field count > 2)
    left = sr_tree.left_node
    right = sr_tree.right_node

    if left is not None or right is not None:
        parts = [check_tree_leaf(child) for child in (left, right) if child is not None]
        # Stack child results vertically; a lone child needs no stacking.
        return parts[0] if len(parts) == 1 else np.vstack(parts)

    # Leaf node: build [datas, subresult] where subresult repeats for each data item
    datas = np.atleast_1d(np.asarray(sr_tree.value.datas, dtype=int))
    # atleast_1d keeps a scalar ``datas`` from turning into a 0-d array, on
    # which check_label_sequence's len() would raise TypeError.
    # NOTE(review): the label sequence is read from ``value.datas`` (the
    # subject IDs), not from a dedicated label field - this looks unintended;
    # confirm against NodeValues and switch to the label attribute if one exists.
    labels = np.atleast_1d(np.asarray(sr_tree.value.datas, dtype=int))
    subresult = check_label_sequence(labels)
    return np.column_stack((datas, np.full(datas.shape[0], subresult, dtype=int)))

In [10]:
class CRF:
    """Label-noise filter that votes over two forests built by ``crf.build_crf``.

    ``crf_v1`` builds the forests and returns the rows that were not voted
    noisy; ``compute_nltc_sequence`` turns the stored forests into a 0/1 vote
    matrix (one column per tree).
    """

    def __init__(self, ntree:int, label_noise_threshold: int):
        # Forests from the most recent ``crf_v1`` call, one entry per tree.
        self.rf_1 = []
        self.rf_2 = []
        # Per-tree cut-off: a tree votes "noise" for a subject when that
        # subject's check_tree_leaf subresult is >= this value.
        self.label_noise_threshold = label_noise_threshold
        # Nominal forest size. The forests actually used are sized by the
        # ``ntree`` argument of ``crf_v1``.
        self.ntree = ntree

    def crf_v1(self, train_data: np.ndarray, ntree: int):
        """Build both forests on ``train_data`` and filter out noisy subjects.

        Parameters:
            train_data: 2-D array with one row per subject. Columns 1..end are
                passed to ``helper.is_continuous`` (column 0 is presumably the
                class label - TODO confirm).
            ntree: number of trees to build in each of the two forests.

        Returns a tuple ``(non_noise_data, non_noise_subject_ID, votes)``:
            non_noise_data: rows of ``train_data`` kept as clean; note that
                they include the appended 1-based subject-ID column.
            non_noise_subject_ID: 0-based row indices of the kept subjects.
            votes: 0/1 array of shape (subjects, 2 * ntree).
        """
        subjects_count = train_data.shape[0]
        is_continuous_data = helper.is_continuous(train_data[:, 1:])
        # Append a 1-based subject ID as the last column so leaves can be
        # mapped back to rows.
        train_data = np.hstack(
            (train_data, np.arange(1, subjects_count + 1).reshape(-1, 1))
        )

        # Start from empty forests on every call: appending to trees left over
        # from an earlier call would mix forests built on different data.
        forest_1, forest_2 = [], []
        for _ in range(ntree):
            # Builds stay interleaved so any random state consumed by
            # build_crf is drawn in the same order as before.
            forest_1.append(crf.build_crf(train_data, is_continuous_data, [], 1))
            forest_2.append(crf.build_crf(train_data, is_continuous_data, [], 2))
        self.rf_1, self.rf_2 = forest_1, forest_2

        subject_noise_decision = self.compute_nltc_sequence(train_data)

        # Majority vote: noisy when more than half of all trees say so. The
        # tree count comes from the vote matrix itself so it cannot disagree
        # with the number of columns being summed.
        majority = 0.5 * subject_noise_decision.shape[1]
        noise_subjects = (subject_noise_decision.sum(axis=1) > majority).astype(int)
        non_noise_subject_ID = np.where(noise_subjects == 0)[0]
        non_noise_data = train_data[non_noise_subject_ID, :]

        return non_noise_data, non_noise_subject_ID, subject_noise_decision

    @staticmethod
    def _forest_nltc(forest, subject_count: int) -> np.ndarray:
        """Return a (subject_count, len(forest)) array of per-tree subresults,
        rows ordered by ascending subject ID."""
        columns = np.zeros((subject_count, len(forest)), dtype=int)
        for tree_id, tree in enumerate(forest):
            tree_result = check_tree_leaf(tree)
            # Sort leaf rows by subject ID so row r lines up across trees.
            tree_result = tree_result[tree_result[:, 0].argsort()]
            columns[:, tree_id] = tree_result[:, 1]
        return columns

    def compute_nltc_sequence(self, train_data: np.ndarray):
        """Return the 0/1 noise votes of every stored tree.

        Only the row count of ``train_data`` is used. The result has one row
        per subject and one column per tree (all of ``rf_1`` first, then all
        of ``rf_2``); an entry is 1 when that tree's subresult for the subject
        is >= ``label_noise_threshold`` and 0 otherwise.
        """
        subject_count = train_data.shape[0]

        nltc_labels = np.hstack(
            (
                self._forest_nltc(self.rf_1, subject_count),
                self._forest_nltc(self.rf_2, subject_count),
            )
        )

        # A single comparison replaces two in-place assignments whose second
        # step would re-read values already overwritten by the first.
        return (nltc_labels >= self.label_noise_threshold).astype(int)