In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Empty frame that fixes the expected column order: the label
# ('build_Failed') comes first, followed by the candidate features.
df = pd.DataFrame(
    columns=['build_Failed', 'gh_is_pr', 'git_prev_commit_resolution_status',
       'gh_team_size', 'gh_num_commit_comments', 'git_diff_src_churn',
       'git_diff_test_churn', 'gh_diff_files_added', 'gh_diff_files_deleted',
       'gh_diff_files_modified', 'gh_diff_tests_added',
       'gh_diff_tests_deleted', 'gh_diff_src_files', 'gh_diff_doc_files',
       'gh_diff_other_files', 'gh_sloc', 'gh_test_lines_per_kloc',
       'gh_test_cases_per_kloc', 'gh_asserts_cases_per_kloc', 'tr_build_id',
       'gh_build_started_at'],
    dtype='object')

# Dataset root. The WSL equivalent is '/mnt/d/PFE/Code/dataset'; the Windows
# location below is the one in effect. A raw string is used so that the
# backslashes are not parsed as (invalid) escape sequences.
path = r'D:\PFE\Code'

# Read every CSV found under `path` (recursively) and append them all to the
# schema frame in a single concat, instead of re-copying the accumulated
# frame once per file.
frames = []
for dirname, _, filenames in os.walk(path):
    for filename in filenames:
        if filename.endswith(".csv"):
            frames.append(pd.read_csv(os.path.join(dirname, filename)))
df = pd.concat([df, *frames])

# Columns 1..18 are the features and column 0 is the label; with the column
# order declared above this leaves out 'tr_build_id' and
# 'gh_build_started_at'.
# NOTE(review): assumes the CSV files use the same column names - confirm.
X = df.iloc[:, 1:19]
y = df.iloc[:, 0].astype(int)

from sklearn.model_selection import train_test_split

# 80/20 shuffled split; `stratify=y` keeps the ratio of classes in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    np.array(X),
    np.array(y),
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,
)


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.tree import _tree
from sklearn.metrics import f1_score 

class modDecisionTree:
    """
    Represents the classification model.

    Based on the sklearn implementation (DecisionTreeClassifier), with added
    methods for modifying single nodes of the fitted tree.

    Attributes set by `fit`:
        tree: the fitted estimator's `tree_` object.
        n_nodes: number of nodes in the tree.
        threshold: per-node split thresholds, taken from `tree.threshold`.
        feature: per-node feature (column) indices, taken from `tree.feature`.
        columns_names: names of the training columns, in column order.
        nodes_type: per-node boolean mask, True where the node is a leaf.
        features_names: per-node name of the split feature, None for leaves.
    """

    def __init__(self, max_depth=3, random_state=42):
        """Create the underlying estimator (random splitter, fixed seed)."""
        # add init of hyper-param
        self.model = DecisionTreeClassifier(max_depth=max_depth, splitter='random', random_state=random_state)

    def fit(self, X_train, y_train, columns_names):
        """
        Fit the tree and cache its structure for later inspection/modification.

        X_train, y_train: training samples and labels, passed to sklearn as-is.
        columns_names: names of the columns of X_train, in column order.
        """
        self.model.fit(X_train, y_train)
        self.tree = self.model.tree_
        self.n_nodes = self.tree.node_count   # nbr nodes
        # NOTE(review): the setters below rely on these two arrays sharing
        # memory with the fitted tree, so that writes change the model.
        self.threshold = self.tree.threshold     # all the thresholds
        self.feature = self.tree.feature         # one feature index for each node
        self.columns_names = columns_names
        self.nodes_type = self.get_nodes_type()
        # Leaves do not split on anything, so their feature index is only a
        # placeholder; give them None instead of indexing the column names
        # with it.
        names = list(self.columns_names)
        self.features_names = [
            None if is_leaf else names[feat_id]
            for feat_id, is_leaf in zip(self.feature, self.nodes_type)
        ]

    def evaluate(self, X_val, y_val):
        """Return a dict of metrics (currently only 'F1') on the validation set."""
        y_pred = self.model.predict(X_val)
        metrics = {}
        # f1_score expects the ground truth first, then the predictions.
        metrics['F1'] = f1_score(y_val, y_pred)
        return metrics

    def feature_importance(self):
        """
        Return (column name, importance) pairs, most important first.

        `feature_importances_` holds one value per training column, so it is
        paired with `columns_names` rather than with the per-node
        `features_names`.
        """
        feat_imp = list(zip(self.columns_names, self.model.feature_importances_))
        feat_imp.sort(key=lambda t: t[1], reverse=True)
        return feat_imp

    def plot_tree(self):
        """Draw the fitted tree with matplotlib."""
        plt.figure(figsize=(15,10))  # set plot size (denoted in inches)
        tree.plot_tree(self.model, fontsize=10, class_names=['pass','fail'])
        plt.show()

    def get_nodes_type(self):
        """
        Return a boolean array with one entry per node, True for leaves.

        A node whose left and right children are different is a split node;
        otherwise it is a leaf.
        """
        children_left = np.asarray(self.tree.children_left)
        children_right = np.asarray(self.tree.children_right)
        return children_left == children_right

    def node_is_leaf(self, node):
        """Node is either a leaf (terminal node) or a split node"""
        return bool(self.nodes_type[node])

    def set_node_threshold(self, node, value):
        """Overwrite the split threshold of `node`; leaves are refused with a message."""
        if self.node_is_leaf(node):
            print("Error: can't change a leaf node's threshold.")
            return
        self.threshold[node] = value

    def set_node_feature(self, node, feat_name):
        """
        Make `node` split on the column called `feat_name`.

        Leaves are refused with a message. Raises ValueError when `feat_name`
        is not one of `columns_names`.
        """
        if self.node_is_leaf(node):
            print("Error: can't change a leaf node's feature.")
            return
        # The tree stores column indices, so look the name up among the
        # training columns (not among the per-node names).
        feat_id = list(self.columns_names).index(feat_name)
        self.feature[node] = feat_id
        # Keep the per-node names in sync with the modified tree.
        self.features_names[node] = feat_name