In [132]:
import numpy as np
import random
import pandas as pd
from typing import List

In [133]:
students = pd.read_csv('dataset_full.csv')

# hyperparameters
num_individuals = 100
groupsize = 5

Darwinian Natural Selection

- Variation
- Selection
- Heredity

1. Variation (Create a population)

In [134]:
students.head()

Unnamed: 0,ID,Name,Gender,Preferred language,Majors,Level of ambition,Preferred meeting place,Personality type,Best friend,Preferred day
0,1,Marko Pfaff,Male,English,"('CP', 'AI')",High,In person,INTP,Mathias Durr,Friday
1,2,Christian Kuester,Indeterminate,Any,"('CL', 'PHIL')",Very high,In person,ESTJ,Phillipp Koertig,Thursday
2,3,Johanna Schreiber,Female,Any,"('CL', 'NI')",Very high,In person,ENFJ,Marie Kohler,Wednesday
3,4,Tim Krueger,Male,Any,"('NI', 'CP')",Medium,In person,ESFJ,Marina Kalb,Tuesday
4,5,Jessica Fried,Female,Any,"('CP', 'PHIL')",Very low,In person,ESFP,Florian Bader,Wednesday


In [135]:
class Population():
    def __init__(self, students, n_individuals, groupsize):
        super(Population, self).__init__()

        if n_individuals % groupsize != 0:
            raise ValueError(f'Received unexpected group size: {n_individuals} groups % {groupsize} individuals has to be 0.')
        
        self.students = students
        self.n_individuals = n_individuals
        self.ids = students.ID.tolist()
        self.groupsize = groupsize

    def create_random_individual(self):
        individual = self.ids.copy()
        random.shuffle(individual)

        return individual

    def create_initial_population(self) -> np.ndarray:
        """ Creates an initial populations and returns it"""
        population = []
        for _ in range(self.n_individuals):
            population.append(self.create_random_individual())

        return np.array(population, dtype=int)

    def show_groups_from_individual(self, individual: np.ndarray) -> None:
        """ Prints all Groups of given $individual$"""
        groups = []
        for i in individual:
            groups.append(self.students['Name'][i-1])
        
        for n,j in enumerate(range(0, len(groups), self.groupsize)):
            print(f'Group {n+1} :',groups[j:j+5])

        return None

    def get_groups_from_individual(self, individual: np.ndarray) -> List[np.ndarray]:
        """ Returns a Python List of all Groups in an individual(as numpy ndarrays) """
        groups = []
        for i in individual:
            groups.append(self.students.loc[i-1])

        nested_groups_list = []
        for n,j in enumerate(range(0, len(groups), self.groupsize)):
            nested_groups_list.append(pd.DataFrame(groups[j:j+5]))

        return nested_groups_list

    def get_groups_by_person_fullname(self, fullname: str, individual: np.ndarray) -> List[np.ndarray]:
        """ Returns all Groups that got a Person with $fullname$ in given $individual$"""
        nested_groups_list = self.get_groups_from_individual(individual)

        groups_with_person = []
        for group in nested_groups_list:
            if fullname in group["Name"].values:
                groups_with_person.append(group)
        return groups_with_person

    def get_group_by_person_ID(self, ID: int, individual: np.ndarray) -> np.ndarray:
        """ Returns Group that has Person with given $ID$ in given $individual$"""
        nested_groups_list = self.get_groups_from_individual(individual)

        groups_with_person = []
        for group in nested_groups_list:
            if ID in group["ID"].values:
                return group
        return None

In [136]:
population_manager = Population(students, num_individuals, groupsize)

test_population = population_manager.create_initial_population()

test_population

array([[ 16,   4,  78, ...,  82,  91,   1],
       [ 92,  47,  77, ...,  90,   5,  48],
       [ 54,  13,   7, ...,  19,  90,  60],
       ...,
       [ 41,  65,  70, ...,  25,   6,  75],
       [ 59,  85,  97, ..., 100,  70,  31],
       [ 88,  13,  57, ...,  48,   5,  38]])

Show Groups from the IDs

In [137]:
population_manager.show_groups_from_individual(test_population[0])

Group 1 : ['Stefan Beyer', 'Tim Krueger', 'Marie Muench', 'Luca Wexler', 'Karin Pfaff']
Group 2 : ['Leon Fisher', 'René Dresdner', 'Sabrina Diederich', 'Lukas Kaiser', 'Marco Eisenhauer']
Group 3 : ['Jan Richter', 'Felix Gruenewald', 'Maria Weisz', 'Andreas Fisher', 'Lukas Sommer']
Group 4 : ['Michael Bohm', 'Phillipp Schroeder', 'Petra Schwab', 'Dominik Kohl', 'Sarah Hoover']
Group 5 : ['Claudia Abendroth', 'Erik Schroeder', 'Juliane Urner', 'Eric Kirsch', 'Jessica Fried']
Group 6 : ['Brigitte Müller', 'David Schweitzer', 'Max Meyer', 'Juliane Drescher', 'Diana Traugott']
Group 7 : ['Erik Schäfer', 'René Kuefer', 'Anke Lehmann', 'Sophie Bohm', 'Florian Bader']
Group 8 : ['Sarah Boehm', 'Patrick Shuster', 'Juliane Freeh', 'Jessika Wagner', 'Max Osterhagen']
Group 9 : ['Kathrin Scherer', 'Florian Pfeiffer', 'Marina Kalb', 'Mike Freud', 'Jennifer König']
Group 10 : ['Sarah Mehler', 'Angelika Brauer', 'Anke Richter', 'Jennifer Zimmer', 'Sabrina Fink']
Group 11 : ['Maximilian Schäfer', 'Pe

________

2. Selection (Evaluate the fitness of each group, find fittest)

In [138]:
class Fitness():
    def __init__(self):
        super(Fitness, self).__init__()

    def evaluate_fitness(self, individual, students):
        """ Evaluates the fitness of an individual. """

        # split individual into student groups of the groupsize
        groups = np.array_split(individual, (len(individual)/groupsize))

        # iterate over groups and calculate scores for the different parameters
        scores = []
        for group_ids in groups:
            # get full data for students in this group from pd dataframe
            group = students.loc[students['ID'].isin(group_ids)]

            # get individual scores for parameters
            language_score = self.evaluate_language(group)
            if language_score == -1:
                # if we really want to use a hard constraint we would need to return -1 here, this makes for really bad (initial) results though
                # return -1
                scores.append(language_score)
                continue

            major_score = self.evaluate_majors(group)
            ambition_score = self.evaluate_ambition(group)
            place_score = self.evaluate_meeting_place(group)
            gender_score = self.evaluate_gender(group)
            friend_score = self.evaluate_friends(group)
            personality_score = self.evaluate_personality(group)
            day_score = self.evaluate_meeting_day(group)

            # formula for adding and weighting different scores
            scores.append(language_score+major_score+ambition_score+place_score+gender_score+friend_score+personality_score+day_score)

        #Convert to series to calculate mean more easily
        return pd.Series(scores).mean()

    def evaluate_language(self, group):
        # number of groupmembers per language
        counts = group['Preferred language'].value_counts()

        # hard constraints, if languages are conflicting, return -1
        if 'German' in counts.index and 'English' in counts.index:
            return -1

        return groupsize

    def evaluate_majors(self, group):
        majors = group['Majors'].tolist()

        # preprocess majors from dataset notation to list with all majors
        group_majors = []
        for pair in majors:
            pair = pair[1:-1].split(", ")

            group_majors.append(pair[0][1:-1])
            group_majors.append(pair[1][1:-1])

        #convert to Series for easier handling
        group_majors = pd.Series(group_majors)
        #get value counts
        group_major_values = group_majors.value_counts()
        #remove majors only one person takes (as they provide no synergy to the group)
        group_major_values = group_major_values[group_major_values > 1]

        #add number of shared majors and divide by 2; Formula is kinda arbitrary
        return group_major_values.sum() /2

    def evaluate_ambition(self, group):
        # get pd Series of ambitions
        ambitions = group['Level of ambition']
        # get int value mappings for ambitions

        mapping = {
            'Very low': 1,
            'Low': 2,
            'Medium': 3,
            'High': 4,
            'Very high': 5
        } 

        ambitions = ambitions.map(mapping)

        # fitness is groupsize - variance in group motivation (so less variance = more fitness)
        return groupsize - ambitions.var()

    def evaluate_meeting_place(self, group):
        # number of groupmembers for each preferred meeting place
        meeting_place = group['Preferred meeting place'].value_counts()

        # if all prefer the same meeting place return 5, else 0
        if meeting_place[0] == groupsize:
            return 5

        return 0

    def evaluate_gender(self, group):
        # evaluate by variance
        genders = group['Gender'].value_counts()

        # add 0 entry for missing genders
        for gender in ['Male', 'Female', 'Indeterminate']:
            if gender not in genders.index:
                genders[gender] = 0

        # return groupsize - variance
        return groupsize - genders.var()

    def evaluate_friends(self, group):
        #for each member +1 if friend is also in group
        group_member_names = group['Name'].tolist()
        best_friends_name = group['Best friend'].tolist()

        # get intersection between both lists
        friends_in_group = list(set(group_member_names).intersection(best_friends_name))

        # fitness += 1 for every pair of friends
        return len(friends_in_group)

    def evaluate_personality(self, group):
        #information about compatible personality types is taken from
        # Montequín, Vicente Rodríguez, et al. "Using Myers-Briggs type indicator (MBTI) as a tool for setting up student teams for information technology projects." Journal of Information Technology and Application in Education 1.1 (2012): 28-34.

        #count existing personality types in each group
        personalities = group['Personality type']
        types = personalities.value_counts()

        #fitness function starts with 0 and gets better
        # with every good group member
        fitness = 0

        #its good if there is a group leader like an ISTJ or an ESTJ, but only one
        try:
            if (types['ISTJ'] + types['ESTJ'] == 1):
                fitness+=5
            elif (types['ISTJ'] + types['ESTJ'] >= 2):
                fitness-=5
        except KeyError:
            pass

        #compare compatibility of group members
        for i, personality_a in enumerate(personalities.tolist()):
            for j, personality_b in enumerate(personalities.tolist()):
                # skip same group member and members already compared
                if i <= j:
                    continue

                # increase fitness if
                if (personality_a[1] != personality_b[1]) ^ (personality_a[2] != personality_b[2]):
                    if (personality_a[0] != personality_b[0]) or (personality_a[3] != personality_b[3]):
                        fitness+=1

        return fitness

    def evaluate_meeting_day(self, group):
        # number of groupmembers for each preferred meeting day
        meeting_day = group['Preferred day'].value_counts()

        # if all prefer the same meeting day return 5, else 0
        if meeting_day[0] == groupsize:
            return 5

        return 0

    def mean_fitness(self, population):
        # get list of fitness scores for all individuals in this population
        fitness_scores = [self.evaluate_fitness(individual, students) for individual in population]

        # convert to series to calculate mean more easily
        return round(pd.Series(fitness_scores).mean(), 3)

    def best_fitness(self, population):
        # get list of fitness scores for all individuals in this population
        fitness_scores = [self.evaluate_fitness(individual, students) for individual in population]

        # sort by best first
        scores_sorted = sorted(fitness_scores, reverse=True)

        # return fitness of best individual
        return round(scores_sorted[0], 3)

    def get_fittest_individual(self, population, students):
        # get list of fitness scores for all individuals in this population
        highest_fitness = float("-inf")
        fittest_individual = None
        for individual in population:
            fitness = self.evaluate_fitness(individual, students)
            if highest_fitness < self.evaluate_fitness(individual, students):
                highest_fitness = fitness
                fittest_individual = individual
        return fittest_individual

    def indices_sorted_by_fitness(self, population):
        fitness_scores = [self.evaluate_fitness(individual, students) for individual in population]

        indices = np.argsort(fitness_scores)

        return indices.tolist()

In [139]:
best_fitness = Fitness().best_fitness(test_population)
best_fitness

20.357

3. Heredity (Pass on the fittest to the next generation)

In [140]:
class Crossover():
    def __init__(self):
        super(Crossover, self).__init__()

    def uniform_order_crossover(self, p1, p2):

        template = self.get_crossover_template(len(p1), crossover_rate=0.2)
        # create 'empty' child
        child = np.zeros((len(p1),),dtype=int)
        # where the template is true, take values from p1
        child[template] = p1[template]
        # store genes used from p1
        used_genes = p1[template]

        # get all genes from p2
        remaining_genes = p2.tolist()
        # add genes from p2 (that were not used from p1) to the empty spots of the child
        for i, value in enumerate(child):
            # if this spot is already filled, continue
            if value != 0:
                continue

            # do while:  pop(get and remove) next gene from p2 until one is found that is not yet in the genome of the child, then add that
            while True:
                next_gene = remaining_genes.pop(0)
                if next_gene not in used_genes:
                    child[i] = next_gene
                    break

        return child

    def get_crossover_template(self,length, crossover_rate = 0.2):
        # initialize template with false values
        template = np.zeros((length,),dtype=bool)
        # get random indices of the amount #of genes * crossover rate
        random_indices = np.random.choice(template.shape[0], int(length*crossover_rate), replace=False)
        #set these indices to true
        template[random_indices] = True

        return template

    def tournament_selection(self, population, tournament_size = 8):
        # get random indices to select random individuals from population
        random_indices = np.random.choice(population.shape[0], tournament_size*2, replace=False)

        # get individuals from random indices and split into two tournaments
        tournament1 = population[random_indices[:tournament_size]]
        tournament2 = population[random_indices[tournament_size:]]

        parents = []
        # tournament is won by fittest individual in each tournament, those become the two parents
        for tournament in (tournament1,tournament2):
            # get fitness scores for every individual in the tournament
            fitness_scores = [Fitness().evaluate_fitness(individual, students) for individual in tournament]
            # get indices ordered by highest fitness first
            idx = np.argsort(fitness_scores)[::-1]
            # add individual with highest fitness to parents
            parents.append(tournament[idx[0]])

        return parents

In [141]:
random_population = Population(students, num_individuals, groupsize).create_initial_population()

print(random_population, '\n')

random_parent1, random_parent2 = Crossover().tournament_selection(random_population)

print(random_parent1, '\n')
print(random_parent2, '\n')

child = Crossover().uniform_order_crossover(random_parent1, random_parent2)

print(child)

[[71 45 61 ... 65 14 97]
 [58 23 55 ... 72 61 91]
 [61 85 99 ... 94 44 62]
 ...
 [97 12 46 ...  6 75 47]
 [25 77 57 ... 84 12 73]
 [56 38 45 ... 78 64 44]] 

[ 92  87  49  27  89  44  53  82  61  43  71  55  15  76   6  84  47  74
  88  45  72  11  77  50  36  56   4   9  81  52  75  30  54  28  78  86
  34  94  13  35  67  31  42  29  10  24  64  23   7  25  90 100  16  19
  95  48   3  46  41  26  98  85  20  66   1  21  40  68  37  65  38  69
  63  93  60  97  62  58   2  17  33  96  51  14  99  18  80  59  39  12
  91  73   8  70   5  32  22  83  79  57] 

[ 75  60  62  78  69  40  31  85  94  92  14  29  21  38  83   4  41  25
  66  87  10   5  73   3  45  13  52  91  19  58  93  99  35  16  24  98
  18  63  82  50  23  30  15  86   1  34  11  39  64  28  49  51 100  22
  47  77  79  96  95  70  74  76   2  36  72  20  33  48  53  32  67  27
  68  81  46   7  56   9  55  43  84   8  54  71  17  59  44  88  61  12
  57  26  42  97  80  65   6  89  90  37] 

[ 75  60  62  78  69  31

We also add a mutation function: 

In [142]:
def mutation(individual, mutation_rate = 0.2):
    # if random percentage is lower than the mutation rate, switch two random genes
    if random.uniform(0, 1) < mutation_rate:
        idx1 = random.randint(0, len(individual)-1)
        idx2 = random.randint(0, len(individual)-1)

        individual[idx1], individual[idx2] = individual[idx2], individual[idx1]

    return individual

Finally, our executable GA class:

In [143]:
class Genetic_Algorithm():
    def __init__(self, students, num_individuals, groupsize):
        super(Genetic_Algorithm, self).__init__()

        self.students = students
        self.groupsize = groupsize
        self.num_individuals = num_individuals
        self.ids = students.ID.tolist()

    def run(self, episodes, replace='all', mutation_rate=0.2):

        self.epidodes = episodes
        self.population = Population(students, num_individuals, groupsize).create_initial_population()

        print("episode " + str(0) + ": mean fitness score: " + str(Fitness().mean_fitness(self.population)) + "; best individual fitness: " + str(Fitness().best_fitness(self.population)))

        if replace == 'all':
            for episode in range(episodes):
                new_pop = []
                # we get two new individuals by each step, so half the pop size
                for _ in range(len(self.population)//2):
                    # find two parents by tournament selection
                    p1, p2 = Crossover().tournament_selection(self.population,8)

                    # create two children by uniform order crossover
                    c1 = Crossover().uniform_order_crossover(p1,p2)
                    c2 = Crossover().uniform_order_crossover(p2,p1)
                    # do mutation
                    c1 = mutation(c1, mutation_rate)
                    c2 = mutation(c2, mutation_rate)
                    # add children to new population
                    new_pop.append(c1)
                    new_pop.append(c2)

                self.population = np.array(new_pop,dtype=int)

                print("episode " + str(episode+1) + ": mean fitness score: " + str(Fitness().mean_fitness(self.population)) + "; best individual fitness: " + str(Fitness().best_fitness(self.population)))

        elif list(replace):
            for episode in range(episodes):
                pop_indices_sorted_by_fitness = Fitness().indices_sorted_by_fitness(self.population)
                
                # we get two new individuals by each step, so half the pop size
                for _ in range(replace//2):
                    # find two parents by tournament selection
                    p1, p2 = Crossover().tournament_selection(self.population,8)

                    # create two children by uniform order crossover
                    c1 = Crossover().uniform_order_crossover(p1,p2)
                    c2 = Crossover().uniform_order_crossover(p2,p1)
                    # do mutation
                    c1 = mutation(c1)
                    c2 = mutation(c2)
                    # add children to new population
                    next_idx = pop_indices_sorted_by_fitness.pop(0)
                    self.population[next_idx] = c1
                    next_idx = pop_indices_sorted_by_fitness.pop(0)
                    self.population[next_idx] = c2

                print("episode " + str(episode+1) + ": mean fitness score: " + str(Fitness().mean_fitness(self.population)) + "; best individual fitness: " + str(Fitness().best_fitness(self.population)))

        return self.population

In [None]:
population = Genetic_Algorithm(students, num_individuals, groupsize).run(episodes=10, replace='all', mutation_rate=0.2)

episode 0: mean fitness score: 16.445; best individual fitness: 19.187
episode 1: mean fitness score: 16.878; best individual fitness: 19.607
episode 2: mean fitness score: 17.19; best individual fitness: 20.137
episode 3: mean fitness score: 17.738; best individual fitness: 20.822
episode 4: mean fitness score: 17.938; best individual fitness: 20.847
episode 5: mean fitness score: 18.63; best individual fitness: 20.797
episode 6: mean fitness score: 18.854; best individual fitness: 21.307
episode 7: mean fitness score: 19.163; best individual fitness: 20.967
episode 8: mean fitness score: 19.379; best individual fitness: 20.877
episode 9: mean fitness score: 19.362; best individual fitness: 21.212


In [None]:
# Select the fittest individiual from our population
fittest_individual = Fitness().get_fittest_individual(population, students)

# Convert the Gene String to a List of Groups
groups_df = population_manager.get_groups_from_individual(fittest_individual)

# Plot an example group of the list
groups_df[0]

*Let's find your group or the one of a friend*

In [None]:
# User Input for the Full Name - Please be accurate!
fullname = str(input("Please the Full-Name you want to look up: "))
# Get the group by your Full Name - If there are multiple Persons with your name you will get all of them
my_group = population_manager.get_groups_by_person_fullname(fullname, fittest_individual)
# Plotting the first group found
if my_group:
    print(my_group)
else:
    print("Didn't find the requested Person with given Fullname")

In [None]:
# Find your group by your ID
my_group = population_manager.get_group_by_person_ID(33, fittest_individual)
# Plotting the group, if one found with given ID
if my_group is not None:
    print(my_group)
else:
    print("Didn't find the requested Person with given ID")

## Manual Data Analysis
In the next code cells you can look at the distributions of people in groups. Feel free to add more!

In [None]:
def print_groups_distribution(column_name, groups):
    for group_index, group in enumerate(groups):
        statistics = group[column_name].value_counts() / group[column_name].value_counts().sum()
        print(f"Group {group_index}: Ratio of {column_name} is {sorted([i + ': ' + str(statistics[i]) for i in statistics.index])}")

In [None]:
def print_groups_friends_match(groups):

    for group_index, group in enumerate(groups):
        print(f"Group {group_index}: Friends matching: {Fitness().evaluate_friends(groups_df[0])}")

In [None]:
print_groups_distribution("Gender", groups_df)

In [None]:
print_groups_distribution("Preferred language", groups_df)

In [None]:
print_groups_distribution("Preferred meeting place", groups_df)

In [None]:
print_groups_distribution("Preferred day", groups_df)

In [None]:
print_groups_distribution("Personality type", groups_df)

In [None]:
print_groups_distribution("Level of ambition", groups_df)

In [None]:
print_groups_friends_match(groups_df)
