In [1]:
from sklearn.tree import _tree
import sys
sys.path.append('../')
sys.path
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot as plt

In [2]:
def train_and_optimize(X, y, param_grid, *, test_size=0.15, cv=6, scoring='accuracy',
                       n_jobs=12, split_random_state=42):
    """
    Grid-search a DecisionTreeClassifier and report hold-out metrics.

    The data is split once (stratified on ``y``) into a training part and a
    hold-out part. ``GridSearchCV`` is fitted on the training part only, and
    the refitted best estimator is then scored on both parts.

    Parameters:
    - X: feature matrix passed to ``train_test_split``
    - y: target vector; also used for stratification
    - param_grid: parameter grid forwarded unchanged to ``GridSearchCV``
    - test_size: fraction of the data held out for the final evaluation
    - cv: number of cross-validation folds used by the grid search
    - scoring: metric name the grid search optimises
    - n_jobs: number of parallel workers used by the grid search
    - split_random_state: seed of the train/hold-out split

    All keyword-only parameters default to the values that used to be
    hard-coded, so ``train_and_optimize(X, y, param_grid)`` behaves as before.

    Returns:
    - The best estimator found by the grid search (``best_estimator_``).

    Side effects:
    - Prints the hold-out score, the training score and the hold-out ROC AUC.
    """
    # Split the data into a training set and a hold-out test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=split_random_state
    )

    # Grid search over a plain decision tree; the grid supplies every tuned setting
    grid_search = GridSearchCV(
        DecisionTreeClassifier(), param_grid, cv=cv, scoring=scoring, n_jobs=n_jobs
    )

    # Fit on the training part only so the hold-out part stays unseen
    grid_search.fit(X_train, y_train)

    # Best model found by the search
    best_model = grid_search.best_estimator_

    # Evaluate the best model on the hold-out and on the training data
    test_score = best_model.score(X_test, y_test)
    train_score = best_model.score(X_train, y_train)

    # Column 1 of predict_proba is used as the score for ROC AUC.
    # NOTE(review): this assumes a binary target - confirm before reusing on multiclass data.
    pred_proba = best_model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, pred_proba)

    # First number is the hold-out score, second is the training score
    print(f"Best model test score: {test_score:.2f}, {train_score:.2f}")
    print(f"Best model AUC score: {auc}")
    return best_model

In [3]:
# Read the modelling table and separate the target column 'y' from the predictors.
df = pd.read_csv('./df_model.csv')
X = df.drop(columns='y')
y = df['y']

# Keep the distinct target values and the predictor column labels for later cells.
target_names = y.unique()
feature_names = X.columns

In [4]:
# Hyper-parameter grid handed to the grid search: fixed seed, splitter and
# depth; the split criterion and the two minimum-sample settings are varied.
param_grid = dict(
    random_state=[40],
    splitter=['random'],
    criterion=['gini', 'entropy', 'log_loss'],
    max_depth=[5],
    min_samples_split=[*range(2, 9)],
    min_samples_leaf=[*range(2, 9)],
)

# Train and optimize the model, then show the selected estimator
best_model = train_and_optimize(X, y, param_grid)
best_model

Best model test score: 0.70, 0.70
Best model AUC score: 0.7160493827160493


In [5]:
from extractor.process import process_decision_tree

ModuleNotFoundError: No module named 'extractor'

In [None]:
# Print every item yielded by process_decision_tree for the fitted tree.
# NOTE(review): process_decision_tree is imported from the local `extractor`
# package, and the recorded import raised ModuleNotFoundError, so this cell
# cannot run as recorded - confirm the package is reachable via sys.path.
for item in process_decision_tree(best_model, feature_names):
    print(item)

In [190]:
def extract_rules_from_tree(tree: "DecisionTreeClassifier", feature_names) -> list[tuple[list[str], int]]:
    """
    Extract one rule per leaf from a fitted DecisionTreeClassifier.

    Parameters:
    - tree: the trained DecisionTreeClassifier object (its ``tree_`` and
      ``classes_`` attributes are read)
    - feature_names: names of the features used to train the tree, indexable
      by the feature indices stored in the tree

    Returns:
    - A list of rules in left-to-right leaf order. Each rule is a tuple
      (conditions, class_label), where conditions is the list of
      "<name> <= <threshold>" / "<name> > <threshold>" strings on the path
      from the root to the leaf (thresholds formatted with two decimals) and
      class_label is the entry of ``tree.classes_`` with the highest value at
      that leaf. A tree consisting of a single leaf yields one rule with an
      empty condition list.

    NOTE(review): the argmax is taken over the whole per-node value array,
    which presumably assumes a single-output classifier - confirm before
    using this on multi-output trees.
    """
    # Flat array representation of the fitted tree
    fitted = tree.tree_
    children_left = fitted.children_left
    children_right = fitted.children_right
    feature = fitted.feature
    threshold = fitted.threshold
    values = fitted.value
    # argmax() gives a position in classes_, not the label itself, so the
    # position is translated back to the real class label below.
    classes = tree.classes_

    rules = []

    def traverse_tree(node, path_conditions):
        # A leaf stores the same sentinel in both child slots
        if children_left[node] == children_right[node]:
            class_index = int(values[node].argmax())
            rules.append((path_conditions, classes[class_index]))
            return

        name = feature_names[feature[node]]
        cut = threshold[node]

        # Left subtree: samples satisfying "feature <= threshold"
        traverse_tree(children_left[node], path_conditions + [f"{name} <= {cut:.2f}"])

        # Right subtree: samples satisfying "feature > threshold"
        traverse_tree(children_right[node], path_conditions + [f"{name} > {cut:.2f}"])

    # Start the traversal from the root with an empty path
    traverse_tree(0, [])

    return rules

In [191]:
# Extract one rule per leaf of the tuned tree and print them one per line.
extracted = extract_rules_from_tree(best_model, feature_names)
for item in extracted:
    print(item)

# Number of extracted rules (one per leaf)
print(len(extracted))

(['smoking <= 0.16', 'diabets <= 0.95', 'sex_male <= 0.19', 'ОХС <= 6.08', 'age <= 63.49'], 0)
(['smoking <= 0.16', 'diabets <= 0.95', 'sex_male <= 0.19', 'ОХС <= 6.08', 'age > 63.49'], 0)
(['smoking <= 0.16', 'diabets <= 0.95', 'sex_male <= 0.19', 'ОХС > 6.08', 'age <= 59.33'], 1)
(['smoking <= 0.16', 'diabets <= 0.95', 'sex_male <= 0.19', 'ОХС > 6.08', 'age > 59.33'], 1)
(['smoking <= 0.16', 'diabets <= 0.95', 'sex_male > 0.19', 'age <= 57.75', 'ОХС <= 5.23'], 0)
(['smoking <= 0.16', 'diabets <= 0.95', 'sex_male > 0.19', 'age <= 57.75', 'ОХС > 5.23'], 0)
(['smoking <= 0.16', 'diabets <= 0.95', 'sex_male > 0.19', 'age > 57.75', 'age <= 65.06'], 1)
(['smoking <= 0.16', 'diabets <= 0.95', 'sex_male > 0.19', 'age > 57.75', 'age > 65.06'], 1)
(['smoking <= 0.16', 'diabets > 0.95', 'sex_male <= 0.54', 'age <= 74.52', 'age <= 65.45'], 1)
(['smoking <= 0.16', 'diabets > 0.95', 'sex_male <= 0.54', 'age <= 74.52', 'age > 65.45'], 0)
(['smoking <= 0.16', 'diabets > 0.95', 'sex_male <= 0.54', 'a

In [192]:
def display_rules_as_tree(rules):
    """
    Render rules as indented text, one block per rule.

    Parameters:
    - rules: iterable of (conditions, label) pairs, where conditions is an
      iterable of condition strings ordered from the root to the leaf

    Returns:
    - A single string. Each condition is indented by two spaces per level,
      followed by a "=> label" line at the indentation of the last condition
      and a blank separator line. A rule without conditions renders as an
      unindented "=> label" line.
    """
    output = []
    for conditions, label in rules:
        # Reset per rule: an empty condition list must neither reuse the
        # previous rule's indentation nor leave the name unbound.
        indentation = ''
        for idx, condition in enumerate(conditions):
            indentation = '  ' * idx  # 2 spaces per level
            output.append(f"{indentation}{condition}")
        output.append(f"{indentation}=> {label}\n")

    return '\n'.join(output)

# Render the extracted rules as indented text and print the result.
tree_representation = display_rules_as_tree(extracted)
print(tree_representation)

smoking <= 0.16
  diabets <= 0.95
    sex_male <= 0.19
      ОХС <= 6.08
        age <= 63.49
        => 0

smoking <= 0.16
  diabets <= 0.95
    sex_male <= 0.19
      ОХС <= 6.08
        age > 63.49
        => 0

smoking <= 0.16
  diabets <= 0.95
    sex_male <= 0.19
      ОХС > 6.08
        age <= 59.33
        => 1

smoking <= 0.16
  diabets <= 0.95
    sex_male <= 0.19
      ОХС > 6.08
        age > 59.33
        => 1

smoking <= 0.16
  diabets <= 0.95
    sex_male > 0.19
      age <= 57.75
        ОХС <= 5.23
        => 0

smoking <= 0.16
  diabets <= 0.95
    sex_male > 0.19
      age <= 57.75
        ОХС > 5.23
        => 0

smoking <= 0.16
  diabets <= 0.95
    sex_male > 0.19
      age > 57.75
        age <= 65.06
        => 1

smoking <= 0.16
  diabets <= 0.95
    sex_male > 0.19
      age > 57.75
        age > 65.06
        => 1

smoking <= 0.16
  diabets > 0.95
    sex_male <= 0.54
      age <= 74.52
        age <= 65.45
        => 1

smoking <= 0.16
  diabets > 0.95
    s

# REMOVE REDUNDANT RULES V2

In [193]:
# from copy import deepcopy
# def remove_redundant_tree_rules(rules):
#     """Removes redundant rules by examining child nodes."""
#     rules = deepcopy(rules)
#     # First, sort rules by their length
#     sorted_rules = sorted(rules, key=lambda x: len(x[0]))
    
#     # Create a list of indices that should be deleted
#     to_delete = []
    
#     # Compare every rule to every other rule
#     for i, (rule1, _) in enumerate(sorted_rules):
#         for j, (rule2, _) in enumerate(sorted_rules):
#             # If rule1 is a subset of rule2 and they have the same outcome
#             if set(rule1).issubset(set(rule2)) and i != j:
#                 to_delete.append(j)
    
#     # Delete the redundant rules
#     optimized_rules = [rule for index, rule in enumerate(sorted_rules) if index not in to_delete]
    
#     return optimized_rules

# optimized_tree_rules = remove_redundant_tree_rules(extracted)

# for idx, item in enumerate(extracted):
#     print(item)
#     print(optimized_tree_rules[idx])
#     print('--')

In [194]:
# print(f'base_rules: {extracted}, your_rules: {optimized_tree_rules}')

In [195]:
# Display the extracted (conditions, label) rules before they are simplified.
extracted

[(['smoking <= 0.16',
   'diabets <= 0.95',
   'sex_male <= 0.19',
   'ОХС <= 6.08',
   'age <= 63.49'],
  0),
 (['smoking <= 0.16',
   'diabets <= 0.95',
   'sex_male <= 0.19',
   'ОХС <= 6.08',
   'age > 63.49'],
  0),
 (['smoking <= 0.16',
   'diabets <= 0.95',
   'sex_male <= 0.19',
   'ОХС > 6.08',
   'age <= 59.33'],
  1),
 (['smoking <= 0.16',
   'diabets <= 0.95',
   'sex_male <= 0.19',
   'ОХС > 6.08',
   'age > 59.33'],
  1),
 (['smoking <= 0.16',
   'diabets <= 0.95',
   'sex_male > 0.19',
   'age <= 57.75',
   'ОХС <= 5.23'],
  0),
 (['smoking <= 0.16',
   'diabets <= 0.95',
   'sex_male > 0.19',
   'age <= 57.75',
   'ОХС > 5.23'],
  0),
 (['smoking <= 0.16',
   'diabets <= 0.95',
   'sex_male > 0.19',
   'age > 57.75',
   'age <= 65.06'],
  1),
 (['smoking <= 0.16',
   'diabets <= 0.95',
   'sex_male > 0.19',
   'age > 57.75',
   'age > 65.06'],
  1),
 (['smoking <= 0.16',
   'diabets > 0.95',
   'sex_male <= 0.54',
   'age <= 74.52',
   'age <= 65.45'],
  1),
 (['smoking

In [196]:
# Define the tree structure
class Node:
    """
    Node of the binary tree that is rebuilt from the extracted rule paths.

    Attributes:
    - value: optional payload supplied by the caller (defaults to None)
    - left / right: child nodes, or None when the branch does not exist
    - parent: the node this one hangs from, or None
    - rule: always starts as None; other code assigns a (conditions, label)
      tuple to the node where a rule ends
    """

    def __init__(self, value=None, left=None, right=None, parent=None):
        self.value, self.parent = value, parent
        self.left, self.right = left, right
        # No rule is attached at construction time
        self.rule = None

# Build the tree using the rules
def _is_left_branch(condition):
    """Return True when ``condition`` describes a "<=" / "<" (left) branch."""
    # Read the operator as the second-to-last whitespace-separated token so a
    # '<' or '>' inside the feature name cannot influence the routing.
    parts = condition.rsplit(None, 2)
    if len(parts) == 3 and parts[1] in ("<=", "<", ">", ">="):
        return parts[1] in ("<=", "<")
    # Unrecognised layout: keep the historical substring test
    return "<" in condition


def build_tree(rules):
    """
    Rebuild a binary tree of Node objects from a list of rules.

    Parameters:
    - rules: iterable of (conditions, result) pairs; conditions is the list of
      "<name> <op> <threshold>" strings from the root to a leaf

    Returns:
    - The root Node. Following a "<=" / "<" condition moves to the left child
      and any other condition moves to the right child; missing children are
      created on the way. The node reached after the last condition gets
      ``rule = (conditions, result)``.
    """
    root = Node()
    for rule, result in rules:
        current_node = root
        for condition in rule:
            side = "left" if _is_left_branch(condition) else "right"
            child = getattr(current_node, side)
            if not child:
                # First time this branch is visited: create it
                child = Node(parent=current_node)
                setattr(current_node, side, child)
            current_node = child
        current_node.rule = (rule, result)
    return root

# Rebuild a Node tree from the extracted rules.
tree = build_tree(extracted)

In [197]:
def simplify_tree(node):
    """
    Merge sibling rules that predict the same result, bottom-up and in place.

    Children are simplified first. If afterwards both children carry a rule,
    the two results are equal and the two condition lists match apart from
    their last entry, the shared prefix becomes this node's rule and both
    child rules are cleared. Nothing is returned.
    """
    left, right = node.left, node.right

    # Leaf: nothing below to merge
    if not (left or right):
        return

    # Post-order: simplify whichever subtrees exist before looking at this node
    for child in (left, right):
        if child:
            simplify_tree(child)

    # A merge needs two children that both still carry a rule
    if not (left and right and left.rule and right.rule):
        return

    left_path, left_label = left.rule
    right_path, right_label = right.rule

    # Same outcome and same path up to the final split: the split is redundant
    if left_label == right_label and left_path[:-1] == right_path[:-1]:
        node.rule = (left_path[:-1], left_label)
        left.rule = None
        right.rule = None

# Merge redundant sibling rules; this mutates `tree` in place and returns nothing.
simplify_tree(tree)

In [200]:
def extract_rules(node):
    """
    Collect the rules stored in a Node tree, left subtree before right.

    The walk stops descending at the first node that carries a rule, so
    anything below such a node is ignored. A missing node (None) contributes
    nothing. Returns a list of the stored rule objects.
    """
    collected = []
    pending = [node]
    while pending:
        current = pending.pop()
        if not current:
            continue
        if current.rule:
            collected.append(current.rule)
            continue
        # Push right first so the left child is processed first
        pending.append(current.right)
        pending.append(current.left)
    return collected

# Collect the rules left on the simplified tree and print them one per line.
simplified_rules = extract_rules(tree)
for item in simplified_rules:
    print(item)

# Number of rules that remain after simplification
print(len(simplified_rules))

(['smoking <= 0.16', 'diabets <= 0.95', 'sex_male <= 0.19', 'ОХС <= 6.08'], 0)
(['smoking <= 0.16', 'diabets <= 0.95', 'sex_male <= 0.19', 'ОХС > 6.08'], 1)
(['smoking <= 0.16', 'diabets <= 0.95', 'sex_male > 0.19', 'age <= 57.75'], 0)
(['smoking <= 0.16', 'diabets <= 0.95', 'sex_male > 0.19', 'age > 57.75'], 1)
(['smoking <= 0.16', 'diabets > 0.95', 'sex_male <= 0.54', 'age <= 74.52', 'age <= 65.45'], 1)
(['smoking <= 0.16', 'diabets > 0.95', 'sex_male <= 0.54', 'age <= 74.52', 'age > 65.45'], 0)
(['smoking <= 0.16', 'diabets > 0.95', 'sex_male <= 0.54', 'age > 74.52'], 1)
(['smoking <= 0.16', 'diabets > 0.95', 'sex_male > 0.54', 'age <= 68.22', 'age <= 64.06'], 1)
(['smoking <= 0.16', 'diabets > 0.95', 'sex_male > 0.54', 'age <= 68.22', 'age > 64.06'], 0)
(['smoking <= 0.16', 'diabets > 0.95', 'sex_male > 0.54', 'age > 68.22'], 1)
(['smoking > 0.16', 'sex_male <= 0.76', 'age <= 51.90'], 0)
(['smoking > 0.16', 'sex_male <= 0.76', 'age > 51.90'], 1)
(['smoking > 0.16', 'sex_male > 0.76

In [199]:
# Number of rules that remain after simplification
len(simplified_rules)

16