In [49]:
import numpy as np
import random
import pandas as pd
from typing import List

In [50]:
students = pd.read_csv('dataset_full.csv')

# hyperparameters
num_individuals = 50
groupsize = 5

Darwinian Natural Selection

- Variation
- Selection
- Heredity

1. Variation (Create a population)

In [51]:
students.head()

Unnamed: 0,ID,Name,Gender,Preferred language,Majors,Level of ambition,Preferred meeting place,Personality type,Best friend,Preferred day
0,1,Marko Pfaff,Male,English,"('CP', 'AI')",High,In person,INTP,Mathias Durr,Friday
1,2,Christian Kuester,Indeterminate,Any,"('CL', 'PHIL')",Very high,In person,ESTJ,Phillipp Koertig,Thursday
2,3,Johanna Schreiber,Female,Any,"('CL', 'NI')",Very high,In person,ENFJ,Marie Kohler,Wednesday
3,4,Tim Krueger,Male,Any,"('NI', 'CP')",Medium,In person,ESFJ,Marina Kalb,Tuesday
4,5,Jessica Fried,Female,Any,"('CP', 'PHIL')",Very low,In person,ESFP,Florian Bader,Wednesday


In [52]:
class Population():
    def __init__(self, students, n_individuals, groupsize):
        super(Population, self).__init__()

        if n_individuals % groupsize != 0:
            raise ValueError(f'Received unexpected group size: {n_individuals} groups % {groupsize} individuals has to be 0.')
        
        self.students = students
        self.n_individuals = n_individuals
        self.ids = students.ID.tolist()
        self.groupsize = groupsize

    def create_random_individual(self):
        individual = self.ids.copy()
        random.shuffle(individual)

        return individual

    def create_initial_population(self) -> np.ndarray:
        """ Creates an initial populations and returns it"""
        population = []
        for _ in range(self.n_individuals):
            population.append(self.create_random_individual())

        return np.array(population, dtype=int)

    def show_groups_from_individual(self, individual: np.ndarray) -> None:
        """ Prints all Groups of given $individual$"""
        groups = []
        for i in individual:
            groups.append(self.students['Name'][i-1])
        
        for n,j in enumerate(range(0, len(groups), self.groupsize)):
            print(f'Group {n+1} :',groups[j:j+5])

        return None

    def get_groups_from_individual(self, individual: np.ndarray) -> List[np.ndarray]:
        """ Returns a Python List of all Groups in an individual(as numpy ndarrays) """
        groups = []
        for i in individual:
            groups.append(self.students.loc[i-1])

        nested_groups_list = []
        for n,j in enumerate(range(0, len(groups), self.groupsize)):
            nested_groups_list.append(np.ndarray(groups[j:j+5]))

        return nested_groups_list

    def get_groups_by_person_fullname(self, fullname: str, individual: np.ndarray) -> List[np.ndarray]:
        """ Returns all Groups that got a Person with $fullname$ in given $individual$"""
        nested_groups_list = self.get_groups_from_individual(individual)

        groups_with_person = []
        for group in nested_groups_list:
            if fullname in group["Name"].values:
                groups_with_person.append(group)
        return groups_with_person

    def get_group_by_person_ID(self, ID: int, individual: np.ndarray) -> np.ndarray:
        """ Returns Group that has Person with given $ID$ in given $individual$"""
        nested_groups_list = self.get_groups_from_individual(individual)

        groups_with_person = []
        for group in nested_groups_list:
            if ID in group["ID"].values:
                return group
        return None

In [53]:
population_manager = Population(students, num_individuals, groupsize)

test_population = population_manager.create_initial_population()

test_population

array([[62, 49, 75, ..., 94, 38, 28],
       [19, 70, 77, ..., 73, 85, 91],
       [27, 80, 50, ..., 69, 15, 44],
       ...,
       [91,  1, 63, ..., 98, 95, 50],
       [62, 53, 56, ..., 64,  7,  3],
       [66, 26, 38, ..., 85, 50, 62]])

Show Groups from the IDs

In [54]:
population_manager.show_groups_from_individual(test_population[0])

Group 1 : ['Jennifer Zimmer', 'Phillipp Schroeder', 'Sandra Metzger', 'Tim Krueger', 'Jonas Zimmer']
Group 2 : ['Ines Ebersbacher', 'Jessica Fried', 'Tim Neustadt', 'Max Osterhagen', 'Ursula Burger']
Group 3 : ['Angelika Eggers', 'Martina Weisz', 'Juliane Frei', 'Benjamin Schmitz', 'Dominik Kohl']
Group 4 : ['Dennis Reiniger', 'Christian Kuester', 'Stefan Freud', 'Max Meyer', 'Maximilian Fiedler']
Group 5 : ['Anke Richter', 'Stefan Beyer', 'René Dresdner', 'Jessika Wagner', 'Sara Papst']
Group 6 : ['Petra Zweig', 'Dieter Schweitzer', 'Kathrin Scherer', 'Felix Wirtz', 'Christian Werfel']
Group 7 : ['Mathias Durr', 'Dominik Hoch', 'Florian Pfeiffer', 'Sabrina Fink', 'Sarah Hoover']
Group 8 : ['Juliane Urner', 'Luca Wexler', 'Jan Richter', 'Laura Wulf', 'Sven Holzman']
Group 9 : ['Michael Bohm', 'Mike Freud', 'Erik Schroeder', 'Ralph Busch', 'Maria Weisz']
Group 10 : ['Martina Baumgartner', 'Marie Kohler', 'Jörg Strauss', 'Andreas Fisher', 'Marina Kalb']
Group 11 : ['Anke Lehmann', 'Simon

________

2. Selection (Evaluate the fitness of each group, find fittest)

In [55]:
class Fitness():
    def __init__(self):
        super(Fitness, self).__init__()

    def evaluate_fitness(self, individual, students):
        """ Evaluates the fitness of an individual. """

        # split individual into student groups of the groupsize
        groups = np.array_split(individual, (len(individual)/groupsize))

        # iterate over groups and calculate scores for the different parameters
        scores = []
        for group_ids in groups:
            # get full data for students in this group from pd dataframe
            group = students.loc[students['ID'].isin(group_ids)]

            # get individual scores for parameters
            language_score = self.evaluate_language(group)
            if language_score == -1:
                # if we really want to use a hard constraint we would need to return -1 here, this makes for really bad (initial) results though
                # return -1
                scores.append(language_score)
                continue

            major_score = self.evaluate_majors(group)
            ambition_score = self.evaluate_ambition(group)
            place_score = self.evaluate_meeting_place(group)
            gender_score = self.evaluate_gender(group)
            friend_score = self.evaluate_friends(group)
            personality_score = self.evaluate_personality(group)
            day_score = self.evaluate_meeting_day(group)

            # formula for adding and weighting different scores
            scores.append(language_score+major_score+ambition_score+place_score+gender_score+friend_score+personality_score+day_score)

        #Convert to series to calculate mean more easily
        return pd.Series(scores).mean()

    def evaluate_language(self, group):
        # number of groupmembers per language
        counts = group['Preferred language'].value_counts()

        # hard constraints, if languages are conflicting, return -1
        if 'German' in counts.index and 'English' in counts.index:
            return -1

        return groupsize

    def evaluate_majors(self, group):
        majors = group['Majors'].tolist()

        # preprocess majors from dataset notation to list with all majors
        group_majors = []
        for pair in majors:
            pair = pair[1:-1].split(", ")

            group_majors.append(pair[0][1:-1])
            group_majors.append(pair[1][1:-1])

        #convert to Series for easier handling
        group_majors = pd.Series(group_majors)
        #get value counts
        group_major_values = group_majors.value_counts()
        #remove majors only one person takes (as they provide no synergy to the group)
        group_major_values = group_major_values[group_major_values > 1]

        #add number of shared majors and divide by 2; Formula is kinda arbitrary
        return group_major_values.sum() /2

    def evaluate_ambition(self, group):
        # get pd Series of ambitions
        ambitions = group['Level of ambition']
        # get int value mappings for ambitions

        mapping = {
            'Very low': 1,
            'Low': 2,
            'Medium': 3,
            'High': 4,
            'Very high': 5
        } 

        ambitions = ambitions.map(mapping)

        # fitness is groupsize - variance in group motivation (so less variance = more fitness)
        return groupsize - ambitions.var()

    def evaluate_meeting_place(self, group):
        # number of groupmembers for each preferred meeting place
        meeting_place = group['Preferred meeting place'].value_counts()

        # if all prefer the same meeting place return 5, else 0
        if meeting_place[0] == groupsize:
            return 5

        return 0

    def evaluate_gender(self, group):
        # evaluate by variance
        genders = group['Gender'].value_counts()

        # add 0 entry for missing genders
        for gender in ['Male', 'Female', 'Indeterminate']:
            if gender not in genders.index:
                genders[gender] = 0

        # return groupsize - variance
        return groupsize - genders.var()

    def evaluate_friends(self, group):
        #for each member +1 if friend is also in group
        group_member_names = group['Name'].tolist()
        best_friends_name = group['Best friend'].tolist()

        # get intersection between both lists
        friends_in_group = list(set(group_member_names).intersection(best_friends_name))

        # fitness += 1 for every pair of friends
        return len(friends_in_group)

    def evaluate_personality(self, group):
        #information about compatible personality types is taken from
        # Montequín, Vicente Rodríguez, et al. "Using Myers-Briggs type indicator (MBTI) as a tool for setting up student teams for information technology projects." Journal of Information Technology and Application in Education 1.1 (2012): 28-34.

        #count existing personality types in each group
        personalities = group['Personality type']
        types = personalities.value_counts()

        #fitness function starts with 0 and gets better
        # with every good group member
        fitness = 0

        #its good if there is a group leader like an ISTJ or an ESTJ, but only one
        try:
            if (types['ISTJ'] + types['ESTJ'] == 1):
                fitness+=5
            elif (types['ISTJ'] + types['ESTJ'] >= 2):
                fitness-=5
        except KeyError:
            pass

        #compare compatibility of group members
        for i, personality_a in enumerate(personalities.tolist()):
            for j, personality_b in enumerate(personalities.tolist()):
                # skip same group member and members already compared
                if i <= j:
                    continue

                # increase fitness if
                if (personality_a[1] != personality_b[1]) ^ (personality_a[2] != personality_b[2]):
                    if (personality_a[0] != personality_b[0]) or (personality_a[3] != personality_b[3]):
                        fitness+=1

        return fitness

    def evaluate_meeting_day(self, group):
        # number of groupmembers for each preferred meeting day
        meeting_day = group['Preferred day'].value_counts()

        # if all prefer the same meeting day return 5, else 0
        if meeting_day[0] == groupsize:
            return 5

        return 0

    def mean_fitness(self, population):
        # get list of fitness scores for all individuals in this population
        fitness_scores = [self.evaluate_fitness(individual, students) for individual in population]

        # convert to series to calculate mean more easily
        return round(pd.Series(fitness_scores).mean(), 3)

    def best_fitness(self, population):
        # get list of fitness scores for all individuals in this population
        fitness_scores = [self.evaluate_fitness(individual, students) for individual in population]

        # sort by best first
        scores_sorted = sorted(fitness_scores, reverse=True)

        # return fitness of best individual
        return round(scores_sorted[0], 3)

    def get_fittest_individual(self, population, students):
        # get list of fitness scores for all individuals in this population
        highest_fitness = float("-inf")
        fittest_individual = None
        for individual in population:
            fitness = self.evaluate_fitness(individual, students)
            if highest_fitness < self.evaluate_fitness(individual, students):
                highest_fitness = fitness
                fittest_individual = individual
        return fittest_individual

    def indices_sorted_by_fitness(self, population):
        fitness_scores = [self.evaluate_fitness(individual, students) for individual in population]

        indices = np.argsort(fitness_scores)

        return indices.tolist()

In [56]:
best_fitness = Fitness().best_fitness(test_population)
best_fitness

19.193

3. Heredity (Pass on the fittest to the next generation)

In [57]:
class Crossover():
    def __init__(self):
        super(Crossover, self).__init__()

    def uniform_order_crossover(self, p1, p2):

        template = self.get_crossover_template(len(p1), crossover_rate=0.2)
        # create 'empty' child
        child = np.zeros((len(p1),),dtype=int)
        # where the template is true, take values from p1
        child[template] = p1[template]
        # store genes used from p1
        used_genes = p1[template]

        # get all genes from p2
        remaining_genes = p2.tolist()
        # add genes from p2 (that were not used from p1) to the empty spots of the child
        for i, value in enumerate(child):
            # if this spot is already filled, continue
            if value != 0:
                continue

            # do while:  pop(get and remove) next gene from p2 until one is found that is not yet in the genome of the child, then add that
            while True:
                next_gene = remaining_genes.pop(0)
                if next_gene not in used_genes:
                    child[i] = next_gene
                    break

        return child

    def get_crossover_template(self,length, crossover_rate = 0.2):
        # initialize template with false values
        template = np.zeros((length,),dtype=bool)
        # get random indices of the amount #of genes * crossover rate
        random_indices = np.random.choice(template.shape[0], int(length*crossover_rate), replace=False)
        #set these indices to true
        template[random_indices] = True

        return template

    def tournament_selection(self, popuplation, tournament_size = 8):
        # get random indices to select random individuals from population
        random_indices = np.random.choice(popuplation.shape[0], tournament_size*2, replace=False)

        # get individuals from random indices and split into two tournaments
        tournament1 = popuplation[random_indices[:tournament_size]]
        tournament2 = popuplation[random_indices[tournament_size:]]

        parents = []
        # tournament is won by fittest individual in each tournament, those become the two parents
        for tournament in (tournament1,tournament2):
            # get fitness scores for every individual in the tournament
            fitness_scores = [Fitness().evaluate_fitness(individual, students) for individual in tournament]
            # get indices ordered by highest fitness first
            idx = np.argsort(fitness_scores)[::-1]
            # add individual with highest fitness to parents
            parents.append(tournament[idx[0]])

        return parents

In [58]:
random_population = Population(students, num_individuals, groupsize).create_initial_population()

print(random_population, '\n')

random_parent1, random_parent2 = Crossover().tournament_selection(random_population)

print(random_parent1, '\n')
print(random_parent2, '\n')

child = Crossover().uniform_order_crossover(random_parent1, random_parent2)

print(child)

[[ 31  17  83 ...   2  65  46]
 [ 66  19 100 ...  12  42  26]
 [ 19  16  83 ...  85  68   9]
 ...
 [ 48  66  58 ...  94  75  15]
 [ 89   9  25 ...  88  15  72]
 [ 52  20  25 ...   9  96  36]] 

[ 43  49  91  79  55  72  22  38  67  56  63  12  88   7  81  99  52  97
  54  87  76  65  47  11  29  26  17  10  25   4   1  34  84  16  98  27
  60  14  50  94  66  23  82  89  40  78  57  31  92  75  58   9  35  80
  93  20  42  46  59   8  24   6  39  21  64  37  68  48  95  70  28  83
  32  41  90  36  30  45 100  77  44   3   5  15  73  61  51  96  85  62
  19  74  71  69   2  86  33  13  18  53] 

[ 94  34  75  69  90   8  46  28  93  86  99  44  65  96  20  81  72  26
   5  30  58  29  89  67  45  88  14  54  71  19   1  38  51  59  13  87
  17  82  23  22  18  31  66  84  40  41  74  78 100   6  37  62  55  57
  53  64  16  97  52  73  61  95  15  32  79  11  91  56  49   7  43  35
  92  25  50  80  76  60  83  12  10   9  42  21  48  24   3   4  47   2
  27  85  63  98  77  33  70  39

We also add a mutation function: 

In [59]:
def mutation(individual, mutation_rate = 0.2):
    # if random percentage is lower than the mutation rate, switch two random genes
    if random.uniform(0, 1) < mutation_rate:
        idx1 = random.randint(0, len(individual)-1)
        idx2 = random.randint(0, len(individual)-1)

        individual[idx1], individual[idx2] = individual[idx2], individual[idx1]

    return individual

Finally, our executable GA class:

In [60]:
class Genetic_Algorithm():
    def __init__(self, students, num_individuals, groupsize):
        super(Genetic_Algorithm, self).__init__()

        self.students = students
        self.groupsize = groupsize
        self.num_individuals = num_individuals
        self.ids = students.ID.tolist()

    def run(self, episodes, replace='all', mutation_rate=0.2):

        self.epidodes = episodes
        self.population = Population(students, num_individuals, groupsize).create_initial_population()

        print("episode " + str(0) + ": mean fitness score: " + str(Fitness().mean_fitness(self.population)) + "; best individual fitness: " + str(Fitness().best_fitness(self.population)))

        if replace == 'all':
            for episode in range(episodes):
                new_pop = []
                # we get two new individuals by each step, so half the pop size
                for _ in range(len(self.population)//2):
                    # find two parents by tournament selection
                    p1, p2 = Crossover().tournament_selection(self.population,8)

                    # create two children by uniform order crossover
                    c1 = Crossover().uniform_order_crossover(p1,p2)
                    c2 = Crossover().uniform_order_crossover(p2,p1)
                    # do mutation
                    c1 = mutation(c1, mutation_rate)
                    c2 = mutation(c2, mutation_rate)
                    # add children to new population
                    new_pop.append(c1)
                    new_pop.append(c2)

                self.population = np.array(new_pop,dtype=int)

                print("episode " + str(episode+1) + ": mean fitness score: " + str(Fitness().mean_fitness(self.population)) + "; best individual fitness: " + str(Fitness().best_fitness(self.population)))

        elif list(replace):
            for episode in range(episodes):
                pop_indices_sorted_by_fitness = Fitness().indices_sorted_by_fitness(self.population)
                
                # we get two new individuals by each step, so half the pop size
                for _ in range(replace//2):
                    # find two parents by tournament selection
                    p1, p2 = Crossover().tournament_selection(self.population,8)

                    # create two children by uniform order crossover
                    c1 = Crossover().uniform_order_crossover(p1,p2)
                    c2 = Crossover().uniform_order_crossover(p2,p1)
                    # do mutation
                    c1 = mutation(c1)
                    c2 = mutation(c2)
                    # add children to new population
                    next_idx = pop_indices_sorted_by_fitness.pop(0)
                    self.population[next_idx] = c1
                    next_idx = pop_indices_sorted_by_fitness.pop(0)
                    self.population[next_idx] = c2

                print("episode " + str(episode+1) + ": mean fitness score: " + str(Fitness().mean_fitness(self.population)) + "; best individual fitness: " + str(Fitness().best_fitness(self.population)))

        return self.population

In [61]:
population = Genetic_Algorithm(students, num_individuals, groupsize).run(episodes=10, replace='all', mutation_rate=0.2)

episode 0: mean fitness score: 16.143; best individual fitness: 19.608
episode 1: mean fitness score: 17.154; best individual fitness: 19.442
episode 2: mean fitness score: 17.607; best individual fitness: 20.252
episode 3: mean fitness score: 17.655; best individual fitness: 20.282
episode 4: mean fitness score: 18.321; best individual fitness: 20.392
episode 5: mean fitness score: 18.669; best individual fitness: 20.912
episode 6: mean fitness score: 19.104; best individual fitness: 21.102
episode 7: mean fitness score: 19.267; best individual fitness: 20.702
episode 8: mean fitness score: 19.49; best individual fitness: 21.042
episode 9: mean fitness score: 19.873; best individual fitness: 21.217
episode 10: mean fitness score: 20.35; best individual fitness: 21.247


In [74]:
# Select the fittest individiual from our population
fittest_individual = Fitness().get_fittest_individual(population, students)

# Convert the Gene String to a List of Groups
groups_df = population_manager.get_groups_from_individual(fittest_individual)

# Plot an example group of the list
groups_df[0]

Unnamed: 0,ID,Name,Gender,Preferred language,Majors,Level of ambition,Preferred meeting place,Personality type,Best friend,Preferred day
80,81,Juliane Drescher,Female,Any,"('CL', 'CP')",Very low,In person,INTP,Jan Dietrich,Wednesday
67,68,Anne Brandt,Female,Any,"('CL', 'CP')",Low,In person,ENTJ,Patrick Shuster,Friday
89,90,Martina Baumgartner,Female,English,"('PHIL', 'NS')",Low,In person,ISFJ,Max Meyer,Monday
84,85,Florian Bader,Male,Any,"('PHIL', 'NS')",Very low,In person,ISFP,Jessica Fried,Tuesday
96,97,Sara Papst,Female,Any,"('NI', 'AI')",Very low,In person,ESFP,Sarah Mehler,Thursday


*Let's find your group or the one of a friend*

In [80]:
# User Input for the Full Name - Please be accurate!
fullname = str(input("Please the Full-Name you want to look up: "))
# Get the group by your Full Name - If there are multiple Persons with your name you will get all of them
my_group = population_manager.get_groups_by_person_fullname(fullname, fittest_individual)
# Plotting the first group found
if my_group:
    print(my_group)
else:
    print("Didn't find the requested Person with given Fullname")

[    ID              Name  Gender Preferred language          Majors  \
14  15  Marco Eisenhauer    Male                Any    ('NI', 'CL')   
19  20         Max Meyer    Male                Any    ('NI', 'CL')   
45  46    Gabriele Weiss  Female                Any    ('AI', 'NI')   
28  29       Jan Richter    Male                Any    ('NI', 'AI')   
98  99      Marie Kohler  Female                Any  ('NI', 'PHIL')   

   Level of ambition Preferred meeting place Personality type  \
14              High               In person             ISTJ   
19            Medium               In person             INTJ   
45         Very high               In person             ISFJ   
28               Low                  Online             ISTP   
98         Very high               In person             ENFJ   

            Best friend Preferred day  
14   Maximilian Fiedler       Tuesday  
19  Martina Baumgartner     Wednesday  
45          Sophie Bohm      Thursday  
28    Dieter Schweitz

In [84]:
# Find
my_group = population_manager.get_group_by_person_ID(33, fittest_individual)
# Plotting the group, if one found with given ID
if my_group is not None:
    print(my_group)
else:
    print("Didn't find the requested Person with given ID")

    ID               Name  Gender Preferred language          Majors  \
26  27         Mike Freud    Male                Any    ('NS', 'AI')   
2    3  Johanna Schreiber  Female                Any    ('CL', 'NI')   
10  11    Angelika Eggers  Female                Any  ('PHIL', 'CL')   
33  34    Nadine Reiniger  Female                Any    ('NI', 'AI')   
32  33        Niklas Frey    Male             German    ('NI', 'NS')   

   Level of ambition Preferred meeting place Personality type  \
26          Very low                  Online             INTP   
2          Very high               In person             ENFJ   
10         Very high               In person             ESTP   
33               Low               In person             ENFP   
32            Medium                  Online             ENTJ   

         Best friend Preferred day  
26      Jonas Zimmer       Tuesday  
2       Marie Kohler     Wednesday  
10  Florian Pfeiffer        Friday  
33    Christine Baum     Wed