In [48]:
import numpy as np
import random
import pandas as pd

In [49]:
students = pd.read_csv('dataset_full.csv')

# Group Hyperparameters
groupsize = 5

Darwinian Natural Selection

- Variation
- Selection
- Heredity

1. Variation (Create a population)

In [50]:
students.head()

Unnamed: 0,ID,Name,Gender,Preferred language,Majors,Level of ambition,Preferred meeting place,Personality type,Best friend,Preferred day
0,1,Daniel Neudorf,Male,Any,"('PHIL', 'NI')",Low,In person,ENFJ,Dominik Faust,Tuesday
1,2,Lea Austerlitz,Female,Any,"('PHIL', 'CP')",Very high,In person,ESFJ,Lea Grunwald,Thursday
2,3,Vanessa Abendroth,Female,German,"('NS', 'AI')",High,In person,INTJ,Dennis Cole,Wednesday
3,4,Martina Gaertner,Female,Any,"('NS', 'AI')",Low,In person,ESFJ,Annett Herzog,Monday
4,5,Frank Sankt,Male,Any,"('PHIL', 'CP')",Medium,In person,ENTP,Daniela Kortig,Wednesday


In [51]:
class Population():
    def __init__(self, students, groupsize):
        super(Population, self).__init__()
        
        self.students = students
        self.n_individuals = len(students)
        self.ids = students.ID.tolist()
        self.groupsize = groupsize

        if self.n_individuals % groupsize != 0:
            raise ValueError(f'Received unexpected group size: {self.n_individuals} groups % {groupsize} individuals has to be 0.')

    def create_random_individual(self):
        individual = self.ids.copy()
        random.shuffle(individual)

        return individual

    def create_initial_population(self):
        population = []
        for _ in range(self.n_individuals):
            population.append(self.create_random_individual())

        return np.array(population, dtype=int)

    def show_groups_from_individual(self, individual):

        groups = []
        for i in individual:
            groups.append(self.students['Name'][i-1])
        
        for n,j in enumerate(range(0, len(groups), self.groupsize)):
            print(f'Group {n+1} :',groups[j:j+5])

        return None

In [52]:
population = Population(students, groupsize)

test_population = population.create_initial_population()

test_population

array([[47, 45, 18, ..., 32, 10, 49],
       [22, 43, 33, ..., 11, 23, 28],
       [25, 35, 14, ..., 19, 46, 28],
       ...,
       [21, 12, 31, ..., 34, 17, 11],
       [42,  3, 31, ..., 48,  6, 41],
       [26, 47, 16, ..., 17, 30, 41]])

Show Groups from the IDs

In [53]:
population.show_groups_from_individual(test_population[0])

Group 1 : ['Lea Grunwald', 'Ralf Sankt', 'Nadine Feierabend', 'Dirk Lehrer', 'Daniel Neudorf']
Group 2 : ['Dominik Faust', 'Monika Faerber', 'Ulrike Muench', 'Leon Meister', 'Martina Gaertner']
Group 3 : ['Jan Schroder', 'Vanessa Abendroth', 'Matthias Kaufmann', 'Torsten Traugott', 'Yvonne Sanger']
Group 4 : ['Yvonne Bergmann', 'Stefanie Eichel', 'Sarah Frey', 'Mario Brauer', 'Annett Herzog']
Group 5 : ['Daniela Drescher', 'Ines Herman', 'Ralph Abt', 'Maria Cole', 'Doreen Krueger']
Group 6 : ['Kevin Neumann', 'Lisa Wurfel', 'Torsten Busch', 'Lea Austerlitz', 'René Naumann']
Group 7 : ['Dennis Cole', 'Thomas Beckenbauer', 'Anna Frueh', 'Lucas Eisenhauer', 'Gabriele Herz']
Group 8 : ['Frank Sankt', 'Swen Wolf', 'Daniela Kortig', 'Jürgen Furst', 'Ines Baer']
Group 9 : ['Thorsten Theissen', 'Monika Koenig', 'Jonas Dresdner', 'Torsten Kastner', 'Jessika Muller']
Group 10 : ['Heike Schulze', 'Kristian Kastner', 'Martin Engel', 'Lucas Wulf', 'Barbara Fassbinder']


________

2. Selection (Evaluate the fitness of each group, find fittest)

In [54]:
class Fitness():
    def __init__(self):
        super(Fitness, self).__init__()

    def evaluate_fitness(self, individual, students):
        """ Evaluates the fitness of an individual. """

        # split individual into student groups of the groupsize
        groups = np.array_split(individual, (len(individual)/groupsize))

        # iterate over groups and calculate scores for the different parameters
        scores = []
        for group_ids in groups:
            # get full data for students in this group from pd dataframe
            group = students.loc[students['ID'].isin(group_ids)]

            # get individual scores for parameters
            language_score = self.evaluate_language(group)
            if language_score == -1:
                # if we really want to use a hard constraint we would need to return -1 here, this makes for really bad (initial) results though
                # return -1
                scores.append(language_score)
                continue

            major_score = self.evaluate_majors(group)
            ambition_score = self.evaluate_ambition(group)
            place_score = self.evaluate_meeting_place(group)
            gender_score = self.evaluate_gender(group)
            friend_score = self.evaluate_friends(group)
            personality_score = self.evaluate_personality(group)
            day_score = self.evaluate_meeting_day(group)

            # formula for adding and weighting different scores
            scores.append(language_score+major_score+ambition_score+place_score+gender_score+friend_score+personality_score+day_score)

        #Convert to series to calculate mean more easily
        return pd.Series(scores).mean()

    def evaluate_language(self, group):
        # number of groupmembers per language
        counts = group['Preferred language'].value_counts()

        # hard constraints, if languages are conflicting, return -1
        if 'German' in counts.index and 'English' in counts.index:
            return -1

        return groupsize

    def evaluate_majors(self, group):
        majors = group['Majors'].tolist()

        # preprocess majors from dataset notation to list with all majors
        group_majors = []
        for pair in majors:
            pair = pair[1:-1].split(", ")

            group_majors.append(pair[0][1:-1])
            group_majors.append(pair[1][1:-1])

        #convert to Series for easier handling
        group_majors = pd.Series(group_majors)
        #get value counts
        group_major_values = group_majors.value_counts()
        #remove majors only one person takes (as they provide no synergy to the group)
        group_major_values = group_major_values[group_major_values > 1]

        #add number of shared majors and divide by 2; Formula is kinda arbitrary
        return group_major_values.sum() /2

    def evaluate_ambition(self, group):
        # get pd Series of ambitions
        ambitions = group['Level of ambition']
        # get int value mappings for ambitions

        mapping = {
            'Very low': 1,
            'Low': 2,
            'Medium': 3,
            'High': 4,
            'Very high': 5
        } 

        ambitions = ambitions.map(mapping)

        # fitness is groupsize - variance in group motivation (so less variance = more fitness)
        return groupsize - ambitions.var()

    def evaluate_meeting_place(self, group):
        # number of groupmembers for each preferred meeting place
        meeting_place = group['Preferred meeting place'].value_counts()

        # if all prefer the same meeting place return 5, else 0
        if meeting_place[0] == groupsize:
            return 5

        return 0

    def evaluate_gender(self, group):
        # evaluate by variance
        genders = group['Gender'].value_counts()

        # add 0 entry for missing genders
        for gender in ['Male', 'Female', 'Indeterminate']:
            if gender not in genders.index:
                genders[gender] = 0

        # return groupsize - variance
        return groupsize - genders.var()

    def evaluate_friends(self, group):
        #for each member +1 if friend is also in group
        group_member_names = group['Name'].tolist()
        best_friends_name = group['Best friend'].tolist()

        # get intersection between both lists
        friends_in_group = list(set(group_member_names).intersection(best_friends_name))

        # fitness += 1 for every pair of friends
        return len(friends_in_group)

    def evaluate_personality(self, group):
        #information about compatible personality types is taken from
        # Montequín, Vicente Rodríguez, et al. "Using Myers-Briggs type indicator (MBTI) as a tool for setting up student teams for information technology projects." Journal of Information Technology and Application in Education 1.1 (2012): 28-34.

        #count existing personality types in each group
        personalities = group['Personality type']
        types = personalities.value_counts()

        #fitness function starts with 0 and gets better
        # with every good group member
        fitness = 0

        #its good if there is a group leader like an ISTJ or an ESTJ, but only one
        try:
            if (types['ISTJ'] + types['ESTJ'] == 1):
                fitness+=5
            elif (types['ISTJ'] + types['ESTJ'] >= 2):
                fitness-=5
        except KeyError:
            pass

        #compare compatibility of group members
        for i, personality_a in enumerate(personalities.tolist()):
            for j, personality_b in enumerate(personalities.tolist()):
                # skip same group member and members already compared
                if i <= j:
                    continue

                # increase fitness if
                if (personality_a[1] != personality_b[1]) ^ (personality_a[2] != personality_b[2]):
                    if (personality_a[0] != personality_b[0]) or (personality_a[3] != personality_b[3]):
                        fitness+=1

        return fitness

    def evaluate_meeting_day(self, group):
        # number of groupmembers for each preferred meeting day
        meeting_day = group['Preferred day'].value_counts()

        # if all prefer the same meeting day return 5, else 0
        if meeting_day[0] == groupsize:
            return 5

        return 0

    def mean_fitness(self, population):
        # get list of fitness scores for all individuals in this population
        fitness_scores = [self.evaluate_fitness(individual, students) for individual in population]

        # convert to series to calculate mean more easily
        return round(pd.Series(fitness_scores).mean(), 3)

    def best_fitness(self, population):
        # get list of fitness scores for all individuals in this population
        fitness_scores = [self.evaluate_fitness(individual, students) for individual in population]

        # sort by best first
        scores_sorted = sorted(fitness_scores, reverse=True)

        # return fitness of best individual
        return round(scores_sorted[0], 3)

    def indices_sorted_by_fitness(self, population):
        fitness_scores = [self.evaluate_fitness(individual, students) for individual in population]

        indices = np.argsort(fitness_scores)

        return indices.tolist()

In [55]:
best_fitness = Fitness().best_fitness(test_population)
best_fitness

21.937

3. Heredity (Pass on the fittest to the next generation)

In [56]:
class Crossover():
    def __init__(self):
        super(Crossover, self).__init__()

    def uniform_order_crossover(self, p1, p2):

        template = self.get_crossover_template(len(p1), crossover_rate=0.2)
        # create 'empty' child
        child = np.zeros((len(p1),),dtype=int)
        # where the template is true, take values from p1
        child[template] = p1[template]
        # store genes used from p1
        used_genes = p1[template]

        # get all genes from p2
        remaining_genes = p2.tolist()
        # add genes from p2 (that were not used from p1) to the empty spots of the child
        for i, value in enumerate(child):
            # if this spot is already filled, continue
            if value != 0:
                continue

            # do while:  pop(get and remove) next gene from p2 until one is found that is not yet in the genome of the child, then add that
            while True:
                next_gene = remaining_genes.pop(0)
                if next_gene not in used_genes:
                    child[i] = next_gene
                    break

        return child

    def get_crossover_template(self,length, crossover_rate = 0.2):
        # initialize template with false values
        template = np.zeros((length,),dtype=bool)
        # get random indices of the amount #of genes * crossover rate
        random_indices = np.random.choice(template.shape[0], int(length*crossover_rate), replace=False)
        #set these indices to true
        template[random_indices] = True

        return template

    def tournament_selection(self, popuplation, tournament_size = 8):
        # get random indices to select random individuals from population
        random_indices = np.random.choice(popuplation.shape[0], tournament_size*2, replace=False)

        # get individuals from random indices and split into two tournaments
        tournament1 = popuplation[random_indices[:tournament_size]]
        tournament2 = popuplation[random_indices[tournament_size:]]

        parents = []
        # tournament is won by fittest individual in each tournament, those become the two parents
        for tournament in (tournament1,tournament2):
            # get fitness scores for every individual in the tournament
            fitness_scores = [Fitness().evaluate_fitness(individual, students) for individual in tournament]
            # get indices ordered by highest fitness first
            idx = np.argsort(fitness_scores)[::-1]
            # add individual with highest fitness to parents
            parents.append(tournament[idx[0]])

        return parents

In [57]:
random_population = Population(students, groupsize).create_initial_population()

random_parent1, random_parent2 = Crossover().tournament_selection(random_population)

print('First parent:')
print(random_parent1, '\n')
print('Second parent:')
print(random_parent2, '\n')

child = Crossover().uniform_order_crossover(random_parent1, random_parent2)

print('Child after crossover:')
print(child)

First parent:
[31  6 19 17 28 14  2 47 10 26 45 33 34 18  1 12 46 16  8 21 37 36 20  4
 44  7 30  5 42 15 32 49 24 48 22 40 11 43 13 29 35 39  9 38 41 50 25 27
  3 23] 

Second parent:
[ 2 34 28 30  1 19 22 12 36 32 23 18 50 26 42 13 14 48  9 41 15 10 20 17
 43 11 25 31 37 35  7 38 47 16 21  4 24 46 27 40  8 29 49 44 33  5 45  6
 39  3] 

Child after crossover:
[34 28 30  1 19 14  2 22 36 26 32 23 18 50 42 12 13 48  9 10 20 17 43 11
 25 31 37 35  7 15 47 49 24 16 21  4 46 40  8 29 44 33  5 38 41 45  6 27
 39  3]


We also add a mutation function: 

In [58]:
def mutation(individual, mutation_rate = 0.2):
    # if random percentage is lower than the mutation rate, switch two random genes
    if random.uniform(0, 1) < mutation_rate:
        idx1 = random.randint(0, len(individual)-1)
        idx2 = random.randint(0, len(individual)-1)

        individual[idx1], individual[idx2] = individual[idx2], individual[idx1]

    return individual

Finally, our executable GA class:

In [61]:
class Genetic_Algorithm():
    def __init__(self, students, groupsize):
        super(Genetic_Algorithm, self).__init__()

        self.students = students
        self.groupsize = groupsize
        self.num_individuals = len(students)
        self.ids = students.ID.tolist()

    def run(self, episodes, replace='all', mutation_rate=0.2):

        self.epidodes = episodes
        self.population = Population(students, groupsize).create_initial_population()

        print("episode " + str(0) + ": mean fitness score: " + str(Fitness().mean_fitness(self.population)) + "; best individual fitness: " + str(Fitness().best_fitness(self.population)))

        if replace == 'all':
            for episode in range(episodes):
                new_pop = []
                # we get two new individuals by each step, so half the pop size
                for _ in range(len(self.population)//2):
                    # find two parents by tournament selection
                    p1, p2 = Crossover().tournament_selection(self.population,8)

                    # create two children by uniform order crossover
                    c1 = Crossover().uniform_order_crossover(p1,p2)
                    c2 = Crossover().uniform_order_crossover(p2,p1)
                    # do mutation
                    c1 = mutation(c1, mutation_rate)
                    c2 = mutation(c2, mutation_rate)
                    # add children to new population
                    new_pop.append(c1)
                    new_pop.append(c2)

                self.population = np.array(new_pop,dtype=int)

                print("episode " + str(episode+1) + ": mean fitness score: " + str(Fitness().mean_fitness(self.population)) + "; best individual fitness: " + str(Fitness().best_fitness(self.population)))

        elif list(replace):
            for episode in range(episodes):
                pop_indices_sorted_by_fitness = Fitness().indices_sorted_by_fitness(self.population)
                
                # we get two new individuals by each step, so half the pop size
                for _ in range(replace//2):
                    # find two parents by tournament selection
                    p1, p2 = Crossover().tournament_selection(self.population,8)

                    # create two children by uniform order crossover
                    c1 = Crossover().uniform_order_crossover(p1,p2)
                    c2 = Crossover().uniform_order_crossover(p2,p1)
                    # do mutation
                    c1 = mutation(c1)
                    c2 = mutation(c2)
                    # add children to new population
                    next_idx = pop_indices_sorted_by_fitness.pop(0)
                    self.population[next_idx] = c1
                    next_idx = pop_indices_sorted_by_fitness.pop(0)
                    self.population[next_idx] = c2

                print("episode " + str(episode+1) + ": mean fitness score: " + str(Fitness().mean_fitness(self.population)) + "; best individual fitness: " + str(Fitness().best_fitness(self.population)))

        return self.population

In [62]:
best_population = Genetic_Algorithm(students, groupsize).run(episodes=5, replace='all', mutation_rate=0.2)

episode 0: mean fitness score: 16.337; best individual fitness: 19.707
episode 1: mean fitness score: 17.004; best individual fitness: 20.807
episode 2: mean fitness score: 18.024; best individual fitness: 21.197
episode 3: mean fitness score: 18.258; best individual fitness: 21.217
episode 4: mean fitness score: 18.936; best individual fitness: 21.257
episode 5: mean fitness score: 19.705; best individual fitness: 22.077


In [64]:
Population(students, groupsize).show_groups_from_individual(np.reshape(best_population, -1))

Group 1 : ['Dirk Lehrer', 'Vanessa Abendroth', 'Matthias Kaufmann', 'Lucas Eisenhauer', 'Barbara Fassbinder']
Group 2 : ['Martina Gaertner', 'Leon Meister', 'Yvonne Bergmann', 'Thomas Beckenbauer', 'Torsten Kastner']
Group 3 : ['Yvonne Sanger', 'Frank Sankt', 'Heike Schulze', 'Torsten Busch', 'Annett Herzog']
Group 4 : ['Ines Herman', 'Kevin Neumann', 'Jan Schroder', 'Mario Brauer', 'Monika Koenig']
Group 5 : ['Anna Frueh', 'Swen Wolf', 'René Naumann', 'Sarah Frey', 'Ines Baer']
Group 6 : ['Daniel Neudorf', 'Dennis Cole', 'Doreen Krueger', 'Ralph Abt', 'Dominik Faust']
Group 7 : ['Gabriele Herz', 'Stefanie Eichel', 'Kristian Kastner', 'Daniela Drescher', 'Lea Grunwald']
Group 8 : ['Lisa Wurfel', 'Torsten Traugott', 'Martin Engel', 'Maria Cole', 'Ulrike Muench']
Group 9 : ['Lucas Wulf', 'Monika Faerber', 'Jonas Dresdner', 'Jürgen Furst', 'Nadine Feierabend']
Group 10 : ['Thorsten Theissen', 'Ralf Sankt', 'Jessika Muller', 'Daniela Kortig', 'Lea Austerlitz']
Group 11 : ['Dirk Lehrer', 'V