In [112]:
import random
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [113]:
class Model:
    def __init__(self, model_id, binary_features):
        self.model_id = model_id
        self.binary_features = binary_features
        self.error = None
        self.fitness = None

In [114]:
def initialization(feature_list, n_models, gen):
    """ Initialize n models with 1 or 0 for each feature """
    model_list = []
    for i in range(n_models):
        model_id = f"{gen}_{i}"
        binary_features = [random.getrandbits(1) for i in range(len(feature_list))]
        model_list.append(Model(model_id, binary_features))
    return model_list

In [129]:
def fitness_assignment(model_list, input_data, feature_list):
    """ Get the fitness of each model """

    for model_obj in model_list:
        chosen_cols = [feat for feat,bin in zip(feature_list,model_obj.binary_features) if bin]
        
        X = input_data[chosen_cols] # Features
        y = input_data.label # Target variable
        # split X and y into training and testing sets
        X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)    

        logreg = LogisticRegression(solver='lbfgs')

        # fit the model with data
        logreg.fit(X_train,y_train)

        #
        y_pred=logreg.predict(X_test)


        y_pred_proba = logreg.predict_proba(X_test)[::,1]
        fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
        auc = metrics.roc_auc_score(y_test, y_pred_proba)
        
        model_obj.error = 1-auc

    # Selective pressure (between 1 and 2)
    k = 1.5
    # Highest error first
    model_list.sort(key=lambda x: x.error, reverse=True)
    # Assign each model the fitness corresponding to the rank
    for rank, model_obj in enumerate(model_list):
        model_obj.fitness = k*rank
    
    model_list.sort(key=lambda x: x.fitness, reverse=True)
    
    return model_list

In [130]:
def weighted_random_choice(model_list):
    """ Randomly pick a model, giving precedent to those with higher fitness """
    max = sum(model.fitness for model in model_list)
    pick = random.uniform(0, max)
    current = 0
    for model in model_list:
        current += model.fitness
        if current > pick:
            return model

In [131]:
def selection(N, model_list):
    """ Select N/2 individuals according to fitness. Assumes model_list has highest fitness first """
    elitism = 1
    n_selected_individuals = N/2
    selected_individuals = []
    # First select elite individuals
    for i in range(elitism):
        selected_individuals.append(model_list[i])
    
    # Then select individuals randomly based on their proportion of fit
    fitness_list = None
    
    for i in range(round(n_selected_individuals - elitism)):
        selected_model = weighted_random_choice(model_list)
#         print(f"Selected Model Error: {selected_model.error}")
        selected_individuals.append(selected_model)
        
    return selected_individuals

In [132]:
def compute_crossover(parent_models):
    """ Computes the actual crossover of features """
    parent_features = [(a,b) for a,b in zip(parent_models[0].binary_features, 
                                            parent_models[1].binary_features)]
    output_features = []
    for feature in parent_features:
        pick = random.uniform(0, 1)
        output_features.append(feature[0] if pick < 0.5 else feature[1])
#     print(f"Left Parent: {parent_models[0].binary_features}, Right Parent: {parent_models[1].binary_features}")
#     print(f"Child: {output_features}")
    return output_features

In [133]:
def crossover(model_list, N, gen):
    """ Creates children models from parents """
    parent_models = random.sample(model_list, 2)
    child_models = []
    for i in range(N):
        new_features = compute_crossover(parent_models)
        model_id = f"{gen}_{i}"
        child_models.append(Model(model_id, new_features))
    return child_models

In [134]:
def mutation(model_list):
    """ Mutates one gene of each model """
    feature_len = len(model_list[0].binary_features)
    mutation_rate = 1/feature_len
    for model_obj in model_list:
        new_binary_features = []
#         print(model_obj.model_id)
        for feature in model_obj.binary_features:
            pick = random.uniform(0, 1)
#             print(f"Pick: {pick}")
#             print(f"mutation_rate {mutation_rate}")
            if pick < mutation_rate:
                new_fetaure = int(not feature)
#                 print(f"Old: {feature}, New: {new_fetaure}")
                new_binary_features.append(new_fetaure)
            else:
#                 print("\t PASS")
                new_binary_features.append(feature)
#         print(f"Old: {model_obj.binary_features}, New: {new_binary_features}")
        model_obj.binary_features = new_binary_features
    return model_list

In [135]:
def test_functions():
    
    print("\nTest Initialisation\n")
    for model_obj in initialization(["a","b","c"],4, 0):
        print(model_obj.binary_features)
    
    
    print("\nTest Selection\n")
    model_one = Model("0_1",[0,1,1])
    model_two = Model("0_2",[1,0,1])
    model_three = Model("0_3",[0,0,1])
    model_four = Model("0_4",[1,0,0])
    setattr(model_one, "fitness", 0.2)
    setattr(model_two, "fitness", 0.4)
    setattr(model_three, "fitness", 0.6)
    setattr(model_four, "fitness", 0.8)
    selected_models = selection(4, [model_one, model_two, model_three, model_four])
    for model_obj in selected_models:
        print(model_obj.binary_features)
        
    model_list = [Model("0_1",[1,1,1]),Model("0_2",[0,0,0])]
    
    print("\nTest Crossover\n")
    child_models = crossover(model_list, 4, 1)
    for model_obj in child_models:
        print(model_obj.binary_features)
  
    print("\nTest Mutation\n")
    mutated_models = mutation(model_list)
    for model_obj in mutated_models:
        print(model_obj.binary_features)

In [136]:
test_functions()


Test Initialisation

[0, 1, 1]
[1, 1, 0]
[0, 0, 0]
[1, 1, 0]

Test Selection

[0, 1, 1]
[0, 0, 1]

Test Crossover

[1, 0, 1]
[0, 0, 1]
[0, 0, 1]
[0, 1, 1]

Test Mutation

[1, 1, 0]
[0, 0, 0]


In [137]:
def run_feature_selection():
    """ Run the feature selection algorithm """
    col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
    # load dataset
    diabetes = pd.read_csv("diabetes.csv", header=0, names=col_names)
    feature_list = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age']

    n_models = 4
    model_list = initialization(feature_list, n_models, 0)
    generations = 100
    model_errors = [[model_obj.error for model_obj in model_list]]
    for gen in range(generations):
        print(f"Gen: {gen}")
        model_list = fitness_assignment(model_list, diabetes, feature_list)
        model_list = selection(n_models, model_list)
        model_list = crossover(model_list, n_models, gen)
        model_list = mutation(model_list)
        model_errors.append([model_obj.error for model_obj in model_list])
    print(model_errors)

In [138]:
run_feature_selection()

Gen: 0
Gen: 1
Gen: 2
Gen: 3




Gen: 4
Gen: 5
Gen: 6




Gen: 7
Gen: 8




Gen: 9
Gen: 10
Gen: 11
Gen: 12
Gen: 13
Gen: 14
Gen: 15
Gen: 16
Gen: 17
Gen: 18
Gen: 19
Gen: 20
Gen: 21
Gen: 22
Gen: 23




Gen: 24
Gen: 25
Gen: 26
Gen: 27
Gen: 28
Gen: 29
Gen: 30
Gen: 31
Gen: 32
Gen: 33
Gen: 34
Gen: 35
Gen: 36
Gen: 37
Gen: 38
Gen: 39
Gen: 40




Gen: 41
Gen: 42
Gen: 43
Gen: 44




Gen: 45




Gen: 46
Gen: 47




Gen: 48
Gen: 49




Gen: 50
Gen: 51




Gen: 52
Gen: 53
Gen: 54
Gen: 55




Gen: 56
Gen: 57
Gen: 58
Gen: 59
Gen: 60
Gen: 61
Gen: 62
Gen: 63
Gen: 64
Gen: 65




Gen: 66
Gen: 67
Gen: 68
Gen: 69
Gen: 70
Gen: 71
Gen: 72
Gen: 73
Gen: 74
Gen: 75




Gen: 76
Gen: 77




Gen: 78
Gen: 79
Gen: 80
Gen: 81




Gen: 82
Gen: 83
Gen: 84
Gen: 85
Gen: 86
Gen: 87
Gen: 88
Gen: 89
Gen: 90
Gen: 91
Gen: 92




Gen: 93
Gen: 94
Gen: 95
Gen: 96
Gen: 97
Gen: 98




Gen: 99
[[None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [No