In [49]:
!pip install category_encoders
!pip install ucimlrepo



In [50]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier, _tree
import category_encoders as ce

In [51]:
from ucimlrepo import fetch_ucirepo
import pandas as pd

# Fetch the UCI "Credit Approval" dataset (repository id 27).
credit_approval = fetch_ucirepo(id=27)

# Features and targets are exposed separately (as pandas dataframes).
X = credit_approval.data.features
y = credit_approval.data.targets

# Dataset-level metadata.
print(credit_approval.metadata)

# Per-variable information.
print(credit_approval.variables)

# NOTE: from this line on, `credit_approval` no longer refers to the fetched
# dataset object; it is rebound to one DataFrame holding the feature columns
# followed by the target column(s). Later cells rely on this DataFrame.
credit_approval= pd.concat([X, y], axis=1)

{'uci_id': 27, 'name': 'Credit Approval', 'repository_url': 'https://archive.ics.uci.edu/dataset/27/credit+approval', 'data_url': 'https://archive.ics.uci.edu/static/public/27/data.csv', 'abstract': 'This data concerns credit card applications; good mix of attributes', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 690, 'num_features': 15, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': [], 'target_col': ['A16'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1987, 'last_updated': 'Wed Aug 23 2023', 'dataset_doi': '10.24432/C5FS30', 'creators': ['J. R. Quinlan'], 'intro_paper': None, 'additional_info': {'summary': 'This file concerns credit card applications.  All attribute names and values have been changed to meaningless symbols to protect confidentiality of the data.\r\n  \r\nThis dataset is interesting because there is a good mix of attributes --

In [52]:
# Columns passed to ActionRulesDiscovery as stable attributes: rules on these
# are only used for grouping and never produce an action.
stable_attributes = ['A14','A11','A5', 'A4', 'A3']
# Decision (target) column(s).
decision_attributes = ['A16']
# Every remaining column is flexible. Derived from the two lists above instead
# of repeating the column names, so the three lists cannot drift apart.
flexible_attributes = list(
    credit_approval.drop(columns=decision_attributes + stable_attributes).columns
)

print("Stable Attributes:",stable_attributes)
print("Flexible Attributes:",flexible_attributes)
print("Decision Attributes:",decision_attributes)

Stable Attributes: ['A14', 'A11', 'A5', 'A4', 'A3']
Flexible Attributes: ['A15', 'A13', 'A12', 'A10', 'A9', 'A8', 'A7', 'A6', 'A2', 'A1']
Decision Attributes: ['A16']


In [53]:
class ActionRulesDiscovery:
    """Mine single-condition classification rules and pair them into action rules.

    Attributes:
        data: DataFrame holding the attribute columns and the decision column.
        stable_attrs: list of column names treated as stable (never acted on).
        flexible_attrs: list of column names whose value may be changed.
        decision_attr: name of the decision column.
        rules: classification rules produced by ``extract_classification_rules``.
        action_rules: action rules accumulated by ``generate_action_rules``.
    """

    def __init__(self, data, stable_attrs, flexible_attrs, decision_attr):
        self.data = data
        self.stable_attrs = stable_attrs
        self.flexible_attrs = flexible_attrs
        self.decision_attr = decision_attr
        self.rules = []
        self.action_rules = []

    def extract_classification_rules(self, min_support=0.1, min_confidence=0.4):
        """Extract one-condition rules ``attr == value -> decision`` into ``self.rules``.

        For every decision value and every flexible/stable attribute, each
        attribute value is turned into a rule when

        * support    = count(attr == value and decision) / len(data) >= min_support
        * confidence = count(attr == value and decision) / count(attr == value)
                       >= min_confidence

        Each rule is a dict with keys ``conditions`` (a one-element list of
        ``(attr, value)``), ``decision``, ``support``, ``confidence`` and ``lift``.

        ``self.rules`` is reset on every call, so calling this method again
        (e.g. with other thresholds) does not duplicate previously found rules.
        """
        self.rules = []

        total_records = len(self.data)
        attributes = self.flexible_attrs + self.stable_attrs
        # Loop-invariant: how often each value occurs in the whole dataset.
        overall_counts = {attr: self.data[attr].value_counts() for attr in attributes}

        for decision_value in self.data[self.decision_attr].unique():
            target_records = self.data[self.data[self.decision_attr] == decision_value]
            consequent_support = len(target_records) / total_records

            for attr in attributes:
                # Occurrences of each value among records of this decision class.
                target_counts = target_records[attr].value_counts()
                support = target_counts / total_records

                for value in support[support >= min_support].index:
                    support_count = int(target_counts.loc[value])
                    total_with_value = int(overall_counts[attr].loc[value])
                    confidence = support_count / total_with_value
                    if confidence < min_confidence:
                        continue

                    antecedent_support = total_with_value / total_records
                    lift = support.loc[value] / (antecedent_support * consequent_support)

                    self.rules.append({
                        'conditions': [(attr, value)],
                        'decision': decision_value,
                        'support': support.loc[value],
                        'confidence': confidence,
                        'lift': lift
                    })

    def build_d_tree(self, decision_value):
        """Group the rules predicting ``decision_value`` by their stable conditions.

        Returns a ``defaultdict(list)`` keyed by the tuple of ``(attr, value)``
        conditions whose attribute is stable; rules without any stable
        condition end up under the empty tuple ``()``.
        """
        d_tree = defaultdict(list)
        for rule in self.rules:
            if rule['decision'] != decision_value:
                continue
            stable_conditions = tuple(
                (attr, val) for (attr, val) in rule['conditions']
                if attr in self.stable_attrs
            )
            d_tree[stable_conditions].append(rule)
        return d_tree

    def generate_action_rules(self, desired_effect):
        """Pair rules of every other decision class with rules predicting ``desired_effect``.

        Only rules that share the same stable-condition key are compared.
        Resulting action rules are appended to ``self.action_rules``; the list
        is not cleared, so results of repeated calls accumulate.
        """
        target_d_tree = self.build_d_tree(desired_effect)

        for source_decision in self.data[self.decision_attr].unique():
            if source_decision == desired_effect:
                continue
            source_d_tree = self.build_d_tree(source_decision)

            for stable_key, source_rules in source_d_tree.items():
                if stable_key not in target_d_tree:
                    continue
                for source_rule in source_rules:
                    for target_rule in target_d_tree[stable_key]:
                        action_rule = self._compare_rules(source_rule, target_rule)
                        if action_rule:
                            self.action_rules.append(action_rule)

    def _compare_rules(self, source_rule, target_rule):
        """Build an action rule turning ``source_rule`` into ``target_rule``, or None.

        A source rule that contains any stable attribute yields no action rule.
        Otherwise every flexible attribute of the source rule that also occurs
        in the target rule with a different value becomes an action
        ``(attr, "source → target")``. Support, confidence and lift of the
        action rule are the minimum of the two underlying rules.
        """
        if any(attr in self.stable_attrs for attr, _ in source_rule['conditions']):
            return None

        actions = []
        for attr, source_value in source_rule['conditions']:
            if attr not in self.flexible_attrs:
                continue
            target_value = next((v for a, v in target_rule['conditions'] if a == attr), None)
            # Compare against None explicitly: a plain truthiness test would
            # silently drop legitimate falsy target values such as 0.
            if target_value is not None and source_value != target_value:
                actions.append((attr, f"{source_value} → {target_value}"))

        if not actions:
            return None
        return {
            'actions': actions,
            'source_decision': source_rule['decision'],
            'target_decision': target_rule['decision'],
            'support': min(source_rule['support'], target_rule['support']),
            'confidence': min(source_rule['confidence'], target_rule['confidence']),
            'lift': min(source_rule['lift'], target_rule['lift'])
        }

In [54]:
def _coerce_to_column_type(column, text):
    """Turn a value parsed from an action string into something comparable with ``column``.

    Action strings store values as text; for numeric (non-boolean) columns the
    text is converted back to a number, otherwise it is returned unchanged.
    """
    dtype = column.dtype
    if pd.api.types.is_bool_dtype(dtype) or not pd.api.types.is_numeric_dtype(dtype):
        return text
    try:
        return pd.to_numeric(text)
    except (TypeError, ValueError):
        # Not a number after all: fall back to the raw text (matches nothing numeric).
        return text


def apply_action_rules(action_rules, original_data, confidence_threshold=0.1, decision_attr='A16'):
    """
    Applies action rules to transform the dataset by modifying flexible attributes and decision values.

    Rules are applied in list order. A record is changed by at most one rule:
    the first sufficiently confident rule whose source decision and all source
    attribute values match it. ``original_data`` itself is never modified.

    Args:
        action_rules (list): List of action rules (dictionaries with 'actions',
            'confidence', 'source_decision', 'target_decision'); each action is
            ``(attr, "source → target")``.
        original_data (pd.DataFrame): Original dataset to transform
        confidence_threshold (float): Minimum confidence required to apply a rule
        decision_attr (str): Name of the decision column (defaults to 'A16').

    Returns:
        pd.DataFrame: Transformed dataset with applied changes
    """
    transformed_data = original_data.copy()
    # Positional flags for records that some earlier rule already changed.
    already_modified = np.zeros(len(transformed_data), dtype=bool)

    for rule in action_rules:
        if not (rule['confidence'] >= confidence_threshold):
            continue

        # Records must currently carry the rule's source decision ...
        mask = (transformed_data[decision_attr] == rule['source_decision']).to_numpy(
            dtype=bool, na_value=False
        )

        # ... and match every source value of the rule's actions.
        action_changes = []
        for attr, value_change in rule['actions']:
            source_text, target_text = value_change.split(' → ')
            column = transformed_data[attr]
            # Values were stringified when the rule was built; restore the
            # column's numeric type so numeric attributes can match at all.
            source_val = _coerce_to_column_type(column, source_text)
            target_val = _coerce_to_column_type(column, target_text)
            mask &= (column == source_val).to_numpy(dtype=bool, na_value=False)
            action_changes.append((attr, target_val))

        eligible = mask & ~already_modified
        if not eligible.any():
            continue

        for attr, target_val in action_changes:
            transformed_data.loc[eligible, attr] = target_val
        transformed_data.loc[eligible, decision_attr] = rule['target_decision']
        already_modified |= eligible

    return transformed_data

In [55]:
def train_evaluate_model(original_data, transformed_data, decision_attr='A16',
                         test_size=0.2, random_state=42):
    """Train a Random Forest on the original data and score it on original and transformed features.

    A single model is fitted on the training split of ``original_data``. It is
    then used to predict on the original and the transformed features (test
    split and full dataset). All predictions are scored against the ORIGINAL
    labels, so the "transformed" metrics show how the predictions move once
    the action rules have changed the features.

    Args:
        original_data (pd.DataFrame): Dataset including the decision column.
        transformed_data (pd.DataFrame): Same dataset after applying action rules.
        decision_attr (str): Name of the decision column (defaults to 'A16').
        test_size (float): Fraction of records held out for testing.
        random_state (int): Seed used for both train/test splits.

    Returns:
        dict with keys 'original', 'transformed', 'full transformed',
        'full original' (each ``{'accuracy', 'confusion_matrix'}``) and
        'decision_distribution'.

        NOTE: for backward compatibility the two "full" keys keep their
        historical, swapped contents: 'full transformed' holds the metrics for
        the full ORIGINAL features and 'full original' those for the full
        TRANSFORMED features.
    """
    # Split original data
    X = original_data.drop(decision_attr, axis=1)
    y = original_data[decision_attr]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Split transformed data with the same seed and test size; only the test
    # features are needed. This relies on both frames having the same number of
    # rows so that the split selects the same row positions as above.
    X_t = transformed_data.drop(decision_attr, axis=1)
    y_t = transformed_data[decision_attr]
    _, X_t_test, _, _ = train_test_split(
        X_t, y_t, test_size=test_size, random_state=random_state
    )

    # Initialize encoder and model
    encoder = ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True)
    clf = RandomForestClassifier(max_depth=24, random_state=0)

    # The encoder is fitted on the original training features only and reused
    # for every other feature set.
    X_train_encoded = encoder.fit_transform(X_train)
    X_test_encoded = encoder.transform(X_test)
    X_t_encoded = encoder.transform(X_t)
    X_encoded = encoder.transform(X)
    X_t_test_encoded = encoder.transform(X_t_test)

    clf.fit(X_train_encoded, y_train)

    # Test split, original features
    y_pred = clf.predict(X_test_encoded)
    original_accuracy = accuracy_score(y_test, y_pred)
    original_confusion_pred = confusion_matrix(y_test, y_pred)

    # Test split, transformed features (scored against the original labels)
    y_t_pred = clf.predict(X_t_test_encoded)
    transformed_accuracy = accuracy_score(y_test, y_t_pred)
    transformed_confusion_pred = confusion_matrix(y_test, y_t_pred)

    # Full dataset: predict once per feature set and reuse the predictions.
    full_t_pred = clf.predict(X_t_encoded)
    full_t_accuracy = accuracy_score(y, full_t_pred)
    full_t_confusion = confusion_matrix(y, full_t_pred)

    full_pred = clf.predict(X_encoded)
    full_accuracy = accuracy_score(y, full_pred)
    full_confusion = confusion_matrix(y, full_pred)

    return {
        'original': {'accuracy': original_accuracy, 'confusion_matrix': original_confusion_pred},
        'transformed': {'accuracy': transformed_accuracy, 'confusion_matrix': transformed_confusion_pred},
        # Swapped on purpose (see docstring): existing callers read these keys this way.
        'full transformed': {'accuracy': full_accuracy, 'confusion_matrix': full_confusion},
        'full original': {'accuracy': full_t_accuracy, 'confusion_matrix': full_t_confusion},
        'decision_distribution': {'original': y.value_counts(), 'transformed': y_t.value_counts()}
    }


In [56]:
def print_action_rules(action_rules):
    """
    Pretty-print a list of action rules to standard output.

    Each rule is shown as a numbered section listing its attribute changes,
    the source and target decisions, and its support, confidence and lift
    rounded to three decimals, followed by a dashed separator line.

    Args:
        action_rules: List of action rule dictionaries from ActionRulesDiscovery
    """
    if not action_rules:
        print("No action rules found.")
        return

    separator = "-" * 40
    decision_fields = (("Source Decision", 'source_decision'),
                       ("Target Decision", 'target_decision'))
    metric_fields = (("Support", 'support'),
                     ("Confidence", 'confidence'),
                     ("Lift", 'lift'))

    print("\n=== Action Rules ===")
    for number, entry in enumerate(action_rules, start=1):
        print(f"\nRule {number}:")
        print("Actions:")
        for attribute, transition in entry['actions']:
            print(f"  • {attribute}: {transition}")

        # Decisions are printed verbatim, metrics with three decimals.
        for label, key in decision_fields:
            print(f"{label}: {entry[key]}")
        for label, key in metric_fields:
            print(f"{label}: {entry[key]:.3f}")
        print(separator)

In [57]:
# Build the discovery object over the combined features + target DataFrame,
# using the stable/flexible attribute lists defined in an earlier cell.
action_discovery = ActionRulesDiscovery(
    data=credit_approval,
    stable_attrs=stable_attributes,
    flexible_attrs=flexible_attributes,
    decision_attr='A16'
)

# Mine classification rules with the default support/confidence thresholds,
# then pair rules of the other decision class(es) with rules predicting '+'.
action_discovery.extract_classification_rules()
action_discovery.generate_action_rules(desired_effect='+')

print_action_rules(action_discovery.action_rules)


=== Action Rules ===

Rule 1:
Actions:
  • A12: f → t
Source Decision: -
Target Decision: +
Support: 0.212
Confidence: 0.462
Lift: 1.026
----------------------------------------

Rule 2:
Actions:
  • A12: t → f
Source Decision: -
Target Decision: +
Support: 0.233
Confidence: 0.430
Lift: 0.968
----------------------------------------

Rule 3:
Actions:
  • A10: f → t
Source Decision: -
Target Decision: +
Support: 0.303
Confidence: 0.708
Lift: 1.355
----------------------------------------

Rule 4:
Actions:
  • A9: f → t
Source Decision: -
Target Decision: +
Support: 0.412
Confidence: 0.787
Lift: 1.676
----------------------------------------

Rule 5:
Actions:
  • A7: v → h
Source Decision: -
Target Decision: +
Support: 0.126
Confidence: 0.576
Lift: 1.038
----------------------------------------

Rule 6:
Actions:
  • A1: b → a
Source Decision: -
Target Decision: +
Support: 0.142
Confidence: 0.467
Lift: 1.009
----------------------------------------

Rule 7:
Actions:
  • A1: a → b
Source 

In [58]:
# Apply only action rules whose confidence is at least 0.75; the returned
# DataFrame is a modified copy of `credit_approval`.
transformed_dataset = apply_action_rules(
    action_rules=action_discovery.action_rules,
    original_data=credit_approval,
    confidence_threshold=0.75
)

results = train_evaluate_model(credit_approval, transformed_dataset)

# Decision value counts before and after applying the action rules
print("Decision distribution in original data:\n", results['decision_distribution']['original'])
print("Decision distribution in transformed data:\n", results['decision_distribution']['transformed'])

print("Original data accuracy:", results['original']['accuracy'])
# print("Transformed data accuracy:", results['transformed']['accuracy'])

print("Original data confusion_matrix:\n", results['original']['confusion_matrix'])
print("Transformed data confusion_matrix:\n", results['transformed']['confusion_matrix'])

# NOTE: the 'full transformed' / 'full original' keys returned by
# train_evaluate_model hold each other's contents, so the lookups below are
# deliberately crossed to pair each label with the matching data.
# print("Full orginal data accuracy:", results['full transformed']['accuracy'])
print("Full orginal data confusion_matrix:\n", results['full transformed']['confusion_matrix'])

# print("Full transformed data accuracy:", results['full original']['accuracy'])
print("Full transformed data confusion_matrix:\n", results['full original']['confusion_matrix'])


Decision distribution in original data:
 A16
-    383
+    307
Name: count, dtype: int64
Decision distribution in transformed data:
 A16
+    613
-     77
Name: count, dtype: int64
Original data accuracy: 0.8623188405797102
Original data confusion_matrix:
 [[59 11]
 [ 8 60]]
Transformed data confusion_matrix:
 [[59 11]
 [37 31]]
Full orginal data confusion_matrix:
 [[296  11]
 [  8 375]]
Full transformed data confusion_matrix:
 [[296  11]
 [164 219]]
