In [None]:
"""Class that represents the solution to be evolved."""
import random
class Solution():
    """One candidate hyper-parameter assignment evolved by the genetic algorithm.

    Instance fields:
        all_possible_params: mapping from parameter name to its candidate values.
        params: the concrete value chosen for each parameter.
        score: fitness recorded by train_model (0. until a training call stores one).
        model: the "model" item returned by the training function.
        entry: the "entry" item returned by the training function.
    """

    def __init__(self, all_possible_params):
        """Create an untrained solution with no parameters picked yet."""
        self.entry = {}
        self.score = 0.
        self.all_possible_params = all_possible_params
        # Model parameters; filled by create_random() or set_params().
        self.params = {}
        self.model = None

    def create_random(self):
        """Pick a random candidate value for every parameter."""
        for key in self.all_possible_params:
            self.params[key] = random.choice(self.all_possible_params[key])

    def set_params(self, params):
        """Replace the chosen parameters with the given dict."""
        self.params = params

    def train_model(self, fn_train, params_fn):
        """Train the model and record the score.

        Args:
            fn_train: callable invoked as fn_train(self.params, params_fn); it
                must return a dict with a "model" item and an "entry" dict
                that contains an "F1" value.
            params_fn: extra argument forwarded unchanged to fn_train.

        Training is skipped when a non-zero score is already stored, so
        solutions carried over between generations are not retrained.
        NOTE: a solution whose F1 is exactly 0 is indistinguishable from an
        untrained one and is therefore trained again on the next call.
        """
        if self.score == 0.:
            res = fn_train(self.params, params_fn)
            self.score = res["entry"]["F1"]
            self.model = res["model"]
            self.entry = res['entry']

    def print_solution(self):
        """Print the chosen parameters together with the recorded score."""
        print("for params ", self.params , "the score in the train = ",self.score)

In [None]:
"""
Class that holds a genetic algorithm for evolving a population of params.
"""
from functools import reduce
from operator import add
import random
"""Class that implements genetic algorithm for Hyper-parameter tuning"""
class Optimizer():
    
    def __init__(self, GA_params, all_possible_params):
        """Create an optimizer."""
        self.random_select = GA_params["random_select"]
        self.mutate_chance = GA_params["mutate_chance"]
        self.retain = GA_params["retain"]
        self.all_possible_params = all_possible_params
    
    def create_population(self, count):
        """Create a population of random solutions."""
        pop = []
        for _ in range(0, count):
            # Create a random solution.
            solution = Solution(self.all_possible_params)
            solution.create_random()
            # Add the solution to our population.
            pop.append(solution)
        return pop

    @staticmethod
    def fitness(solution):
        """Return the score, which is our fitness function."""
        return solution.score

    def grade(self, pop):
        """Find average fitness for a population. """
        summed = reduce(add, (self.fitness(solution) for solution in pop))
        return summed / float((len(pop)))

    def crossover(self, mother, father):
        """Make two children as parts of their parents.
        Args:
            mother (dict): parameters
            father (dict): parameters
        Returns:
            (list): combined params
        """
        children = []
        for _ in range(2):
            child = {}
            # Loop through the parameters and pick params for the kid.
            for param in self.all_possible_params:
                child[param] = random.choice([mother.params[param], father.params[param]] )

            solution = Solution(self.all_possible_params)
            solution.set_params(child)
            # Randomly mutate some of the children.
            if self.mutate_chance > random.random():
                solution = self.mutate(solution)
            children.append(solution)
        return children
    
    
    def mutate(self, solution):
        """Randomly mutate one part of the solution."""
        # Choose a random key.
        mutation = random.choice(list(self.all_possible_params.keys()))
        # Mutate one of the params.
        solution.params[mutation] = random.choice(self.all_possible_params[mutation])
        return solution
    
    """Evolve a population of solutions."""
    def evolve(self, pop):
        #Get scores for each solution.
        graded = [(self.fitness(solution), solution) for solution in pop]
        #"Sort on the scores.
        graded = [x[1] for x in sorted(graded, key=lambda x: x[0], reverse=True)]
        #Get the number we want to keep for the next gen.
        retain_length = int(len(graded)*self.retain)
        # define what we want to keep.
        parents = graded[:retain_length]
        # For those we aren't keeping, randomly keep some anyway.
        for individual in graded[retain_length:]:
            if self.random_select > random.random():
                parents.append(individual)
        # Now find out how many spots we have left to fill.
        parents_length = len(parents)
        desired_length = len(pop) - parents_length
        
        # Add children, which are bred from two remaining solutions.
        if parents_length > 1 and desired_length> 0:
            children = []
            while len(children) < desired_length:
                if parents_length==2:
                    male_index = 1
                    female_index = 0
                else:
                    male_index = random.randint(0, parents_length-1)
                    female_index = random.randint(0, parents_length-1)
                
                # Assuming they aren't the same solutions...
                if male_index != female_index:
                    print("Get a random mom and dad.")
                    male = parents[male_index]
                    female = parents[female_index]
                    # crossover them.
                    babies = self.crossover(male, female)
                    # Add the children one at a time.
                    for baby in babies:
                        # Don't grow larger than desired length.
                        if len(children) < desired_length:
                            children.append(baby)
            parents.extend(children)
        return parents

In [None]:
from tqdm import tqdm
import threading
def train_sol_thread(solution,fn_train,params_fn,i):
    """Thread target: train one solution, then report its index on stdout.

    Args:
        solution: object exposing train_model(fn_train, params_fn).
        fn_train: training function forwarded to the solution.
        params_fn: extra argument forwarded to the solution.
        i: label printed once training has returned.
    """
    solution.train_model(fn_train, params_fn)
    print(f"solution  {i}  trained")
    
def train_population(pop, fn_train,params_fn):
    """Train every solution of a population, one thread per solution.

    Args:
        pop: iterable of solutions; each is handed to train_sol_thread.
        fn_train: training function forwarded to every solution.
        params_fn: extra argument forwarded to every solution.

    All threads are started first and then joined. The progress bar advances
    as each thread is joined, so it reflects finished trainings rather than
    merely started ones, and it is closed even if starting or joining raises.
    """
    workers = []
    with tqdm(total=len(pop)) as pbar:
        # Labels start at 1 to keep the printed "solution i trained" numbering.
        for i, solution in enumerate(pop, start=1):
            worker = threading.Thread(
                target=train_sol_thread,
                args=(solution, fn_train, params_fn, i),
            )
            workers.append(worker)
            worker.start()

        for worker in workers:
            worker.join()
            pbar.update(1)


def get_average_score(pop):
    """Return the mean `score` over a group of solutions.

    Raises ZeroDivisionError when `pop` is empty.
    """
    return sum(member.score for member in pop) / len(pop)

"""Generate the optimal params with the genetic algorithm."""
""" Args:
        GA_params: Params for GA
        all_possible_params (dict): Parameter choices for the model
        train_set : training dataset
        fn_train : a function used to compute the prediction accuracy
"""
def generate(all_possible_params, fn_train , params_fn):
   
    GA_params = {
            "population_size": nbr_sol,
            "max_generations": nbr_gen,
            "retain": 0.7,
            "random_select":0.1,
            "mutate_chance":0.1
            }
    
    print("params of GA" , GA_params)
    optimizer = Optimizer(GA_params ,all_possible_params)
    pop = optimizer.create_population(GA_params['population_size'])
    # Evolve the generation.
    for i in range(GA_params['max_generations']):
        print("*********************************** REP(GA) ",(i+1))
        # Train and get accuracy for solutions.
        train_population(pop,fn_train,params_fn)
        # Get the average accuracy for this generation.
        average_accuracy = get_average_score(pop)
        # Print out the average accuracy each generation.
        print("Generation average: %.2f%%" % (average_accuracy * 100))
        # Evolve, except on the last iteration.
        if i != (GA_params['max_generations']):
            print("Generation evolving..")
            evolved = optimizer.evolve(pop)
            if(len(evolved)!=0):
                pop=evolved
        else:
            pop = sorted(pop, key=lambda x: x.score, reverse=True)
    # Print out the top 2 solutions.
    size = len(pop)
    if size < 3:
        print_pop(pop[:size])
    else:
        print_pop(pop[:3])
    return pop[0].params ,pop[0].model,pop[0].entry

def print_pop(pop):
    """Call print_solution() on every member of `pop`, in order."""
    for member in pop:
        member.print_solution()


In [None]:

from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.preprocessing import StandardScaler
import warnings # "do not disturb" mode: the filter below ignores all warnings
warnings.filterwarnings('ignore')
# Module-level scaler instance; nothing in the visible code reads `sc`.
sc = StandardScaler()
from numpy import arange
from numpy import argmax

# Experiment sizing: nbr_rep bounds the repetition loop, nbr_gen is the number
# of GA generations and nbr_sol the GA population size.
nbr_rep, nbr_gen, nbr_sol = 6, 2, 2
max_eval = nbr_sol * nbr_gen

# hybrid_option means SMOTE plus threshold moving, so it forces with_smote on.
with_smote = False
hybrid_option = False
with_smote = with_smote or hybrid_option


def getDataset(file_name):
    """Load `dataset/<file_name>` and return it ordered by build start time.

    The `gh_build_started_at` column is parsed as dates and used as the index;
    rows are sorted on it in ascending order.
    """
    csv_path = "dataset/" + file_name
    frame = pd.read_csv(
        csv_path,
        index_col="gh_build_started_at",
        parse_dates=['gh_build_started_at'],
    )
    return frame.sort_values(by=['gh_build_started_at'])
    
# apply threshold to positive probabilities to create labels
def to_labels(pos_probs, threshold):
    """Return integer labels: 1 where `pos_probs >= threshold`, otherwise 0."""
    is_positive = pos_probs >= threshold
    return is_positive.astype('int')

def getBestThreshold(probs, y_train):
    """Return the cut-off in [0, 1), step 0.001, with the highest ROC AUC.

    Each candidate cut-off turns `probs` into labels via to_labels, and the
    labels are scored against `y_train` with roc_auc_score. The first
    candidate reaching the maximum wins (argmax semantics).
    """
    candidates = arange(0, 1, 0.001)
    auc_per_candidate = []
    for cutoff in candidates:
        auc_per_candidate.append(roc_auc_score(y_train, to_labels(probs, cutoff)))
    best_index = argmax(auc_per_candidate)
    return candidates[best_index]


def failureInfo(dataset):
    """Return `(rate, size)`: share of rows with `build_Failed > 0`, and row count."""
    failed_rows = dataset[dataset['build_Failed'] > 0]
    size = dataset.shape[0]
    return failed_rows.shape[0] / size, size

def getEntry(y, predicted_builds):
    """Return a dict with the "AUC", "accuracy" and "F1" of the predictions against `y`."""
    return {
        "AUC": roc_auc_score(y, predicted_builds),
        "accuracy": accuracy_score(y, predicted_builds),
        "F1": f1_score(y, predicted_builds),
    }

def predict_model(classifier,X,y):
    """Predict on `X`, binarise with a decision threshold and score against `y`.

    The threshold is a fixed 0.5 when the module-level `with_smote` flag is
    set without `hybrid_option`; otherwise it is the value chosen by
    getBestThreshold on these same predictions and labels.

    Returns:
        dict: the metrics entry built by getEntry.
    """
    raw_predictions = classifier.predict(X)

    use_fixed_cutoff = with_smote and not hybrid_option
    decision_threshold = 0.5 if use_fixed_cutoff else getBestThreshold(raw_predictions, y)

    return getEntry(y, raw_predictions >= decision_threshold)

def isInt(n):
    """Return True when `int(n)` succeeds, False otherwise.

    Only the errors `int()` raises for unconvertible input are treated as
    "not an int": TypeError (unsupported type), ValueError (bad string, NaN)
    and OverflowError (infinity). Anything else, such as KeyboardInterrupt,
    propagates instead of being swallowed.
    """
    try:
        int(n)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

def frange(start, stop=None, step=None):
    """Generate an arithmetic progression, yielded as "%g"-formatted strings.

    Mirrors range(): with a single argument the sequence runs from 0.0 up to
    (but excluding) that value; `step` defaults to 1.0 and may be negative.

    Note that the yielded items are strings (e.g. "0.25"), not floats.

    Raises:
        ValueError: when `step` is zero (raised on the first iteration, since
            this is a generator); previously this looped forever.
    """
    if stop is None:
        stop = start + 0.0
        start = 0.0

    if step is None:
        step = 1.0

    if step == 0:
        raise ValueError("frange() step must not be zero")

    while (step > 0 and start < stop) or (step < 0 and start > stop):
        yield "%g" % start
        start = start + step
        
def frange_int(start, stop=None, step=None):
    """Generate an arithmetic progression of numbers, like range().

    With a single argument the sequence runs from 0 up to (but excluding)
    that value; `step` defaults to 1 and may be negative. Values are yielded
    as-is, so integer arguments give integers.

    Raises:
        ValueError: when `step` is zero (raised on the first iteration, since
            this is a generator); previously this looped forever.
    """
    if stop is None:
        stop = start
        start = 0

    if step is None:
        step = 1

    if step == 0:
        raise ValueError("frange_int() step must not be zero")

    while (step > 0 and start < stop) or (step < 0 and start > stop):
        yield start
        start = start + step
 

In [None]:
from hyperopt import hp,Trials,STATUS_OK ,fmin,tpe,rand
from keras.models import Sequential
from keras.layers import Dense,LSTM,Dropout
from keras.callbacks import EarlyStopping
import optunity
import optunity.metrics
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
import ConfigSpace as CS
from hpbandster.core.worker import Worker
from hpbandster.optimizers import BOHB as BOHB
from timeit import default_timer as timer

# Rebinds the module-level `with_smote` flag assigned earlier in this file to
# a falsy value; train_preprocess and predict_model read it at call time.
with_smote=0

def train_preprocess(dataset_train):
    """Split a training frame into features and labels, optionally oversampled.

    Column 0 is taken as the label and columns 1..18 as the features. When
    the module-level `with_smote` flag is truthy, both are resampled with
    SMOTE.

    The SMOTE branch used to read a local `training_set` before assigning it,
    which raised UnboundLocalError whenever the flag was set; those unused
    locals are gone.

    Returns:
        tuple: (X_train, y_train)
    """
    X_train = dataset_train.iloc[:, 1:19]
    y_train = dataset_train.iloc[:, 0]

    if with_smote:
        X_train, y_train = SMOTE().fit_resample(X_train, y_train)

    return X_train, y_train

def test_preprocess(dataset_test):
    """Split a test frame into NumPy arrays of features and integer labels.

    Column 0 is taken as the label (cast to int) and columns 1..18 as the
    features.

    Returns:
        tuple: (X_test, y_test)
    """
    features = np.array(dataset_test.iloc[:, 1:19])
    labels = np.array(dataset_test.iloc[:, 0]).astype(int)
    return features, labels

def get_threshold_list(dataset):
    """Return the frange values from 0.01 in steps of 0.1 up to an upper bound.

    The items are whatever frange yields (strings formatted with "%g").
    """
    failed_count = dataset[dataset['build_Failed'] > 0].shape[0]
    failure_rate = failed_count / dataset.shape[0]
    # NOTE(review): failure_rate is a fraction of rows (<= 1), so this bound is
    # always 1 and the rate never changes the result; confirm whether min()
    # was intended.
    upper_bound = max(1, failure_rate)
    return list(frange(0.01, upper_bound, 0.1))

class LSTMWorker(Worker):
    """HpBandSter worker that scores one configuration with construct_model."""

    def __init__(self, train_set, **kwargs):
        """Store the training set; other keyword arguments go to Worker."""
        super().__init__(**kwargs)
        self.train_set = train_set

    def compute(self, config, *args, **kwargs):
        """Evaluate `config` and return the result dict HpBandSter expects.

        Returns:
            dict: 'loss' (mandatory, minimised by HpBandSter) and 'info'
            (mandatory, free-form; here the metrics entry).
        """
        res = construct_model(config, self.train_set)
        return {
            # HpBandSter always minimizes, so report 1 - AUC. Returning the
            # raw AUC made the optimizer prefer the worst configurations.
            'loss': 1.0 - float(res["validation_auc"]),
            'info': res["entry"],
        }

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split

def construct_model(model_params, train_set):
    """Fit a decision tree for one parameter set and report its scores.

    The preprocessed data is split 80/20 into training and validation parts
    (no fixed random state). The validation AUC is computed from the
    predicted labels of the validation part; the metrics entry is computed by
    predict_model on the training part.

    Args:
        model_params: mapping with "ccp_alpha", "max_depth" and "max_features".
        train_set: frame handed to train_preprocess.

    Returns:
        dict: 'validation_auc', 'model' (the fitted classifier) and "entry"
        (the metrics dict, which also carries 'validation_auc').
    """
    features, labels = train_preprocess(train_set)
    X_train, X_val, y_train, y_val = train_test_split(features, labels, test_size=0.2)

    # Only these three hyper-parameters are tuned; the rest keep sklearn defaults.
    tree_kwargs = {
        name: model_params[name]
        for name in ("ccp_alpha", "max_depth", "max_features")
    }
    classifier = DecisionTreeClassifier(**tree_kwargs)
    classifier.fit(X_train, y_train)

    # Area under the ROC curve of the validation predictions.
    val_predictions = classifier.predict(X_val)
    fpr, tpr, _ = roc_curve(y_val, val_predictions)
    roc_auc = auc(fpr, tpr)

    # Metrics on the training part; their "F1" is what the GA uses as score.
    entry = predict_model(classifier, X_train, y_train)
    entry['validation_auc'] = roc_auc

    # All three keys are required by the GA.
    return {
        'validation_auc': roc_auc,
        'model': classifier,
        "entry": entry,
    }
# NOTE: `global` statements at module level have no effect, since names bound
# at top level are already module globals. Of these four names only `data` is
# assigned in the visible code (inside evaluate_tuner).
global data
global global_params
global global_model
global global_entry


def evaluate_tuner(tuner_option, train_set):
    """Run the GA search on `train_set` and return the best solution's entry.

    Args:
        tuner_option: accepted but not read by this function.
        train_set: training data; also stored in the module-level `data`.

    Returns:
        dict: the metrics entry of the best solution, extended in place with
        "time" (seconds spent in the search), "params" and "model".
    """
    global data
    data = train_set

    started_at = timer()

    # Search space: only these three decision-tree parameters are explored.
    search_space = {
        'max_depth': [30, 50, 70],
        'ccp_alpha': [0.0],
        'max_features': [5, 10, 15],
    }
    best_params, best_model, entry_train = generate(search_space, construct_model, data)

    elapsed = timer() - started_at
    entry_train["time"] = elapsed
    entry_train["params"] = best_params
    entry_train["model"] = best_model
    return entry_train
  

In [None]:
import pandas as pd
import os

# Cross-project experiment: tune a decision tree on the bellwether project,
# then score the tuned model on every other CSV file in the dataset folder.
global columns_res,columns_comp
columns_res = ["proj"]+["algo"]+["iter"]+["AUC"]+["accuracy"]+["F1"]+["exp"]


def _rows_to_frame(rows):
    """Build a result frame from row dicts, with the columns_res columns first."""
    frame = pd.DataFrame(rows)
    extra_columns = [col for col in frame.columns if col not in columns_res]
    return frame.reindex(columns=columns_res + extra_columns)


tuner = "ga"
# DataFrame.append was removed in pandas 2.0, so the rows are collected in
# plain lists and turned into frames once, after the loop.
result_rows = []
train_rows = []
bellwether="jruby.csv"
trainset = getDataset(bellwether)
# NOTE(review): range(1, nbr_rep) performs nbr_rep - 1 repetitions; confirm
# that this is the intended count.
for iteration in range (1,nbr_rep):
    entry_train  = evaluate_tuner(tuner,trainset)
    best_params = entry_train["params"]
    best_model = entry_train["model"]
    print(iteration,"*************************************** TRAIN",bellwether)
    entry_train["iter"] = iteration
    entry_train["proj"] = bellwether
    entry_train["algo"] = "DT"
    entry_train["params"] = best_params
    # Store a snapshot so later changes to the dict cannot alter the row.
    train_rows.append(dict(entry_train))
    print("entry_train",entry_train)
    for file_name in os.listdir("dataset"):
        if file_name!=bellwether:
            testset = getDataset(file_name)
            X,y = test_preprocess(testset)
            entry= predict_model(best_model,X,y)
            entry["iter"] = iteration
            entry["proj"] = file_name
            entry["exp"] =  1
            entry["algo"] = "DT"
            result_rows.append(entry)
results = _rows_to_frame(result_rows)
results_train = _rows_to_frame(train_rows)
results.to_excel("corss_proj_paramf_"+str(hybrid_option)+str(with_smote)+"_result_crossProj_"+tuner+"_DT.xlsx")
results_train.to_excel("cross_paramf"+str(hybrid_option)+str(with_smote)+"_train_crossProj_"+tuner+"_DT.xlsx")


Displaying a decision tree

In [None]:
from sklearn.tree import _tree

def get_rules(tree, feature_names, class_names):
    """Turn a fitted decision tree into one readable rule per leaf.

    Args:
        tree: fitted estimator exposing the low-level `tree_` structure.
        feature_names: names indexed by the feature ids stored in the tree.
        class_names: names indexed by class position, or None to report the
            leaf value as a response instead of a class.

    Returns:
        list of str: rules of the form "if (...) and (...) then ... | based
        on N samples", ordered by decreasing leaf sample count.
    """
    tree_ = tree.tree_
    node_feature_names = [
        "undefined!" if idx == _tree.TREE_UNDEFINED else feature_names[idx]
        for idx in tree_.feature
    ]

    leaf_paths = []

    def walk(node, conditions):
        # A node without a split feature is a leaf: close the path with its
        # (value, sample count) pair.
        if tree_.feature[node] == _tree.TREE_UNDEFINED:
            leaf_paths.append(
                conditions + [(tree_.value[node], tree_.n_node_samples[node])]
            )
            return
        name = node_feature_names[node]
        cutoff = np.round(tree_.threshold[node], 3)
        walk(tree_.children_left[node], conditions + [f"({name} <= {cutoff})"])
        walk(tree_.children_right[node], conditions + [f"({name} > {cutoff})"])

    walk(0, [])

    # Sort by samples count, largest leaf first.
    order = list(np.argsort([leaf_path[-1][1] for leaf_path in leaf_paths]))
    leaf_paths = [leaf_paths[i] for i in reversed(order)]

    rules = []
    for leaf_path in leaf_paths:
        *conditions, (value, n_samples) = leaf_path
        rule = "if " + " and ".join(str(cond) for cond in conditions) + " then "
        if class_names is None:
            rule += "response: "+str(np.round(value[0][0],3))
        else:
            classes = value[0]
            top = np.argmax(classes)
            rule += f"class: {class_names[top]} (proba: {np.round(100.0*classes[top]/np.sum(classes),2)}%)"
        rule += f" | based on {n_samples:,} samples\n"
        rules.append(rule)

    return rules

# Extract and print the rules, labelling the two classes 'pass' and 'fail'.
# NOTE(review): `dt` and `df` are not defined anywhere in the visible source;
# presumably a fitted decision tree and the frame it was trained on. Confirm.
rules = get_rules(dt, list(df.columns), ['pass','fail'])
for r in rules:
    print(r)