In [1]:
import numpy as np
import random
import pandas as pd
from typing import List

In [3]:
students = pd.read_csv('dataset_full.csv')

# Groupsize hyperparameter
groupsize = 5

Darwinian Natural Selection

- Variation
- Selection
- Heredity

1. Variation (Create a population)

In [4]:
students.head()

Unnamed: 0,ID,Name,Gender,Preferred language,Majors,Level of ambition,Preferred meeting place,Personality type,Best friend,Preferred day
0,1,Stephan Ritter,Male,Any,"('NS', 'CL')",Very high,In person,ENTJ,Kerstin König,Thursday
1,2,Barbara Wolf,Female,Any,"('CL', 'NI')",High,In person,INTP,Lucas Kuester,Friday
2,3,Kristian Richter,Indeterminate,German,"('PHIL', 'NI')",Very low,In person,ENFP,Laura Hoover,Tuesday
3,4,Brigitte Austerlitz,Female,Any,"('PHIL', 'AI')",Low,In person,ENTJ,Stefan Lange,Thursday
4,5,Mandy Roth,Female,Any,"('PHIL', 'NI')",Very low,In person,ISTJ,Philipp Frankfurter,Thursday


In [5]:
class Population():
    def __init__(self, students, groupsize):
        super(Population, self).__init__()
        
        self.students = students
        self.n_individuals = len(students)
        self.ids = students.ID.tolist()
        self.groupsize = groupsize

        if self.n_individuals % groupsize != 0:
            raise ValueError(f'Received unexpected group size: {self.n_individuals} groups % {groupsize} individuals has to be 0.')

    def create_random_individual(self):
        individual = self.ids.copy()
        random.shuffle(individual)

        return individual

    def create_initial_population(self) -> np.ndarray:
        """ Creates an initial populations and returns it"""
        population = []
        for _ in range(self.n_individuals):
            population.append(self.create_random_individual())

        return np.array(population, dtype=int)

    def show_groups_from_individual(self, individual: np.ndarray) -> None:
        """ Prints all Groups of given $individual$"""
        groups = []
        for i in individual:
            groups.append(self.students['Name'][i-1])
        
        for n,j in enumerate(range(0, len(groups), self.groupsize)):
            print(f'Group {n+1} :',groups[j:j+5])

        return None

    def get_groups_from_individual(self, individual: np.ndarray) -> List[np.ndarray]:
        """ Returns a Python List of all Groups in an individual(as numpy ndarrays) """
        groups = pd.DataFrame()
        for i in individual:
            groups = pd.concat((groups, self.students.loc[self.students['ID'] == i]))

        groups = groups.reset_index(drop=True)

        nested_groups_list = []
        for n,j in enumerate(range(0, len(groups), self.groupsize)):
            nested_groups_list.append(pd.DataFrame(groups[j:j+groupsize]))

        return nested_groups_list

    def get_groups_by_person_fullname(self, fullname: str, individual: np.ndarray) -> List[np.ndarray]:
        """ Returns all Groups that got a Person with $fullname$ in given $individual$"""
        nested_groups_list = self.get_groups_from_individual(individual)

        groups_with_person = []
        for group in nested_groups_list:
            if fullname in group["Name"].values:
                groups_with_person.append(group)
        return groups_with_person

    def get_group_by_person_ID(self, ID: int, individual: np.ndarray) -> np.ndarray:
        """ Returns Group that has Person with given $ID$ in given $individual$"""
        nested_groups_list = self.get_groups_from_individual(individual)

        for group in nested_groups_list:
            if ID in group["ID"].values:
                return group
        return None

In [6]:
population_manager = Population(students, groupsize)

test_population = population_manager.create_initial_population()

test_population

array([[13, 24, 52, ..., 94, 29, 66],
       [53, 24, 63, ..., 81, 32, 59],
       [51, 55, 29, ..., 60, 76, 90],
       ...,
       [ 8, 25, 32, ..., 65, 11,  2],
       [84, 13, 21, ..., 41,  2, 53],
       [97, 42, 16, ..., 61, 65, 38]])

Show Groups from the IDs

In [7]:
population_manager.show_groups_from_individual(test_population[0])

Group 1 : ['Andrea Kastner', 'Susanne Kaestner', 'Wolfgang Berg', 'Sebastian Wurfel', 'Kristin Feierabend']
Group 2 : ['Mandy Roth', 'Petra Kaufmann', 'Christina Hoffmann', 'René Kaufmann', 'Sophie Bohm']
Group 3 : ['Marko Hoch', 'Erik Engel', 'Jens Hoffmann', 'Diana Traugott', 'Sven Kalb']
Group 4 : ['Ulrike Hoover', 'Marco Junker', 'Nicole Burger', 'Barbara Wolf', 'Janina Kruger']
Group 5 : ['Michael Kaiser', 'Michael Eiffel', 'Stefanie Wurfel', 'Marco Schaefer', 'Martin Kalb']
Group 6 : ['Lisa Eiffel', 'Uwe Naumann', 'Paul Pfaff', 'Christian Moench', 'Tim Reinhard']
Group 7 : ['Tom Mueller', 'Mike Fuchs', 'Mandy Himmel', 'Katrin Lang', 'Stefan Lange']
Group 8 : ['Matthias Schreiber', 'Christian Austerlitz', 'Kerstin Fenstermacher', 'Jessika Huber', 'Kathrin Schwab']
Group 9 : ['Gabriele Krüger', 'Torsten Freeh', 'Alexander Egger', 'Lucas Kuester', 'Ulrich Scholz']
Group 10 : ['Jonas Fassbinder', 'Sophia Schmitt', 'Swen Metzger', 'Laura Hoover', 'Sabine Theissen']
Group 11 : ['Dennis

________

2. Selection (Evaluate the fitness of each group, find fittest)

In [8]:
class Fitness():
    def __init__(self):
        super(Fitness, self).__init__()

    def evaluate_fitness(self, individual, students):
        """ Evaluates the fitness of an individual. """

        # split individual into student groups of the groupsize
        groups = np.array_split(individual, (len(individual)/groupsize))

        # iterate over groups and calculate scores for the different parameters
        scores = []
        for group_ids in groups:
            # get full data for students in this group from pd dataframe
            group = students.loc[students['ID'].isin(group_ids)]

            # get individual scores for parameters
            language_score = self.evaluate_language(group)
            if language_score == -1:
                # if we really want to use a hard constraint we would need to return -1 here, this makes for really bad (initial) results though
                # return -1
                scores.append(language_score)
                continue

            major_score = self.evaluate_majors(group)
            ambition_score = self.evaluate_ambition(group)
            place_score = self.evaluate_meeting_place(group)
            gender_score = self.evaluate_gender(group)
            friend_score = self.evaluate_friends(group)
            personality_score = self.evaluate_personality(group)
            day_score = self.evaluate_meeting_day(group)

            # formula for adding and weighting different scores
            scores.append(language_score+major_score+ambition_score+place_score+gender_score+friend_score+personality_score+day_score)

        #Convert to series to calculate mean more easily
        return pd.Series(scores).mean()

    def evaluate_language(self, group):
        # number of groupmembers per language
        counts = group['Preferred language'].value_counts()

        # hard constraints, if languages are conflicting, return -1
        if 'German' in counts.index and 'English' in counts.index:
            return -1

        return groupsize

    def evaluate_majors(self, group):
        majors = group['Majors'].tolist()

        # preprocess majors from dataset notation to list with all majors
        group_majors = []
        for pair in majors:
            pair = pair[1:-1].split(", ")

            group_majors.append(pair[0][1:-1])
            group_majors.append(pair[1][1:-1])

        #convert to Series for easier handling
        group_majors = pd.Series(group_majors)
        #get value counts
        group_major_values = group_majors.value_counts()
        #remove majors only one person takes (as they provide no synergy to the group)
        group_major_values = group_major_values[group_major_values > 1]

        #add number of shared majors and divide by 2; Formula is kinda arbitrary
        return group_major_values.sum() /2

    def evaluate_ambition(self, group):
        # get pd Series of ambitions
        ambitions = group['Level of ambition']
        # get int value mappings for ambitions

        mapping = {
            'Very low': 1,
            'Low': 2,
            'Medium': 3,
            'High': 4,
            'Very high': 5
        } 

        ambitions = ambitions.map(mapping)

        # fitness is groupsize - variance in group motivation (so less variance = more fitness)
        return groupsize - ambitions.var()

    def evaluate_meeting_place(self, group):
        # number of groupmembers for each preferred meeting place
        meeting_place = group['Preferred meeting place'].value_counts()

        # if all prefer the same meeting place return 5, else 0
        if meeting_place[0] == groupsize:
            return 5

        return 0

    def evaluate_gender(self, group):
        # evaluate by variance
        genders = group['Gender'].value_counts()

        # add 0 entry for missing genders
        for gender in ['Male', 'Female', 'Indeterminate']:
            if gender not in genders.index:
                genders[gender] = 0

        # return groupsize - variance
        return groupsize - genders.var()

    def evaluate_friends(self, group):
        #for each member +1 if friend is also in group
        group_member_names = group['Name'].tolist()
        best_friends_name = group['Best friend'].tolist()

        # get intersection between both lists
        friends_in_group = list(set(group_member_names).intersection(best_friends_name))

        # fitness += 1 for every pair of friends
        return len(friends_in_group)

    def evaluate_personality(self, group):
        #information about compatible personality types is taken from
        # Montequín, Vicente Rodríguez, et al. "Using Myers-Briggs type indicator (MBTI) as a tool for setting up student teams for information technology projects." Journal of Information Technology and Application in Education 1.1 (2012): 28-34.

        #count existing personality types in each group
        personalities = group['Personality type']
        types = personalities.value_counts()

        #fitness function starts with 0 and gets better
        # with every good group member
        fitness = 0

        #its good if there is a group leader like an ISTJ or an ESTJ, but only one
        try:
            if (types['ISTJ'] + types['ESTJ'] == 1):
                fitness+=5
            elif (types['ISTJ'] + types['ESTJ'] >= 2):
                fitness-=5
        except KeyError:
            pass

        #compare compatibility of group members
        for i, personality_a in enumerate(personalities.tolist()):
            for j, personality_b in enumerate(personalities.tolist()):
                # skip same group member and members already compared
                if i <= j:
                    continue

                # increase fitness if
                if (personality_a[1] != personality_b[1]) ^ (personality_a[2] != personality_b[2]):
                    if (personality_a[0] != personality_b[0]) or (personality_a[3] != personality_b[3]):
                        fitness+=1

        return fitness

    def evaluate_meeting_day(self, group):
        # number of groupmembers for each preferred meeting day
        meeting_day = group['Preferred day'].value_counts()

        # if all prefer the same meeting day return 5, else 0
        if meeting_day[0] == groupsize:
            return 5

        return 0

    def mean_fitness(self, population):
        # get list of fitness scores for all individuals in this population
        fitness_scores = [self.evaluate_fitness(individual, students) for individual in population]

        # convert to series to calculate mean more easily
        return round(pd.Series(fitness_scores).mean(), 3)

    def best_fitness(self, population):
        # get list of fitness scores for all individuals in this population
        fitness_scores = [self.evaluate_fitness(individual, students) for individual in population]

        # sort by best first
        scores_sorted = sorted(fitness_scores, reverse=True)

        # return fitness of best individual
        return round(scores_sorted[0], 3)

    def get_fittest_individual(self, population, students):
        # get list of fitness scores for all individuals in this population
        highest_fitness = float("-inf")
        fittest_individual = None
        for individual in population:
            fitness = self.evaluate_fitness(individual, students)
            if highest_fitness < self.evaluate_fitness(individual, students):
                highest_fitness = fitness
                fittest_individual = individual
        return fittest_individual

    def indices_sorted_by_fitness(self, population):
        fitness_scores = [self.evaluate_fitness(individual, students) for individual in population]

        indices = np.argsort(fitness_scores)

        return indices.tolist()

In [9]:
best_fitness = Fitness().best_fitness(test_population)
best_fitness

18.75

3. Heredity (Pass on the fittest to the next generation)

In [10]:
class Crossover():
    def __init__(self):
        super(Crossover, self).__init__()

    def uniform_order_crossover(self, p1, p2):

        template = self.get_crossover_template(len(p1), crossover_rate=0.2)
        # create 'empty' child
        child = np.zeros((len(p1),),dtype=int)
        # where the template is true, take values from p1
        child[template] = p1[template]
        # store genes used from p1
        used_genes = p1[template]

        # get all genes from p2
        remaining_genes = p2.tolist()
        # add genes from p2 (that were not used from p1) to the empty spots of the child
        for i, value in enumerate(child):
            # if this spot is already filled, continue
            if value != 0:
                continue

            # do while:  pop(get and remove) next gene from p2 until one is found that is not yet in the genome of the child, then add that
            while True:
                next_gene = remaining_genes.pop(0)
                if next_gene not in used_genes:
                    child[i] = next_gene
                    break

        return child

    def get_crossover_template(self,length, crossover_rate = 0.2):
        # initialize template with false values
        template = np.zeros((length,),dtype=bool)
        # get random indices of the amount #of genes * crossover rate
        random_indices = np.random.choice(template.shape[0], int(length*crossover_rate), replace=False)
        #set these indices to true
        template[random_indices] = True

        return template

    def tournament_selection(self, population, tournament_size = 8):
        # get random indices to select random individuals from population
        random_indices = np.random.choice(population.shape[0], tournament_size*2, replace=False)

        # get individuals from random indices and split into two tournaments
        tournament1 = population[random_indices[:tournament_size]]
        tournament2 = population[random_indices[tournament_size:]]

        parents = []
        # tournament is won by fittest individual in each tournament, those become the two parents
        for tournament in (tournament1,tournament2):
            # get fitness scores for every individual in the tournament
            fitness_scores = [Fitness().evaluate_fitness(individual, students) for individual in tournament]
            # get indices ordered by highest fitness first
            idx = np.argsort(fitness_scores)[::-1]
            # add individual with highest fitness to parents
            parents.append(tournament[idx[0]])

        return parents

In [11]:
random_population = Population(students, groupsize).create_initial_population()

random_parent1, random_parent2 = Crossover().tournament_selection(random_population)

print('First parent:')
print(random_parent1, '\n')
print('Second parent:')
print(random_parent2, '\n')

child = Crossover().uniform_order_crossover(random_parent1, random_parent2)

print('Child after crossover:')
print(child)

First parent:
[ 30 100  44  67  86  41  39   9  59  52  68  17  36  62  29  33  11  19
  90  56  53  84  81  87  28  16  89  23  10  73  21  98  94   4  99  79
  43  66  50   2  20  45   3  91  85  83  46  31  32  63  27  26   7  65
  61  15  88  75  14  70  42   5   1  74  34  76  97  72  37  82  49  40
  48  22  35  55  54  69  12  58  92  18  71  57   6  25  95  80  60  93
  13  78  64  47  24  51  77   8  38  96] 

Second parent:
[ 29  72  94  82  49  33  44  17  15  48  38  40  85  79  60 100   2  63
  13  68   6  99  45  96  53  98  30  59  58  62  66  16  10  64  43   3
  39  90  31  27  32  55   5  36  52  18  28  22   8  71  54  75  97  83
  65  73  81  95  93  77  80  14  88  23  41  89  56  76  24  35  12  92
  21  61  34   7  57   4  70  50   1   9  74  91  26  69  19  11  84  87
  51  42  37  67  20  78  86  47  46  25] 

Child after crossover:
[ 29  82  44  49  86  33  17  15  48  52  38  40  85  79 100   2  63  13
  68   6  99  45  81  87  96  53  98  30  59  73  58  62 

We also add a mutation function: 

In [12]:
def mutation(individual, mutation_rate = 0.2):
    # if random percentage is lower than the mutation rate, switch two random genes
    if random.uniform(0, 1) < mutation_rate:
        idx1 = random.randint(0, len(individual)-1)
        idx2 = random.randint(0, len(individual)-1)

        individual[idx1], individual[idx2] = individual[idx2], individual[idx1]

    return individual

Finally, our executable GA class:

In [14]:
class Genetic_Algorithm():
    def __init__(self, students, groupsize):
        super(Genetic_Algorithm, self).__init__()

        self.students = students
        self.groupsize = groupsize
        self.num_individuals = len(students)
        self.ids = students.ID.tolist()

    def run(self, episodes, replace='all', mutation_rate=0.05):

        self.epidodes = episodes
        self.population = Population(students, groupsize).create_initial_population()

        print("episode " + str(0) + ": mean fitness score: " + str(Fitness().mean_fitness(self.population)) + "; best individual fitness: " + str(Fitness().best_fitness(self.population)))

        if replace == 'all':
            for episode in range(episodes):
                new_pop = []
                # we get two new individuals by each step, so half the pop size
                for _ in range(len(self.population)//2):
                    # find two parents by tournament selection
                    p1, p2 = Crossover().tournament_selection(self.population,8)

                    # create two children by uniform order crossover
                    c1 = Crossover().uniform_order_crossover(p1,p2)
                    c2 = Crossover().uniform_order_crossover(p2,p1)
                    # do mutation
                    c1 = mutation(c1, mutation_rate)
                    c2 = mutation(c2, mutation_rate)
                    # add children to new population
                    new_pop.append(c1)
                    new_pop.append(c2)

                self.population = np.array(new_pop,dtype=int)

                print("episode " + str(episode+1) + ": mean fitness score: " + str(Fitness().mean_fitness(self.population)) + "; best individual fitness: " + str(Fitness().best_fitness(self.population)))

        elif int(replace):
            for episode in range(episodes):
                pop_indices_sorted_by_fitness = Fitness().indices_sorted_by_fitness(self.population)
                
                # we get two new individuals by each step, so half the pop size
                for _ in range(replace//2):
                    # find two parents by tournament selection
                    p1, p2 = Crossover().tournament_selection(self.population,8)

                    # create two children by uniform order crossover
                    c1 = Crossover().uniform_order_crossover(p1,p2)
                    c2 = Crossover().uniform_order_crossover(p2,p1)
                    # do mutation
                    c1 = mutation(c1)
                    c2 = mutation(c2)
                    # add children to new population
                    next_idx = pop_indices_sorted_by_fitness.pop(0)
                    self.population[next_idx] = c1
                    next_idx = pop_indices_sorted_by_fitness.pop(0)
                    self.population[next_idx] = c2

                print("episode " + str(episode+1) + ": mean fitness score: " + str(Fitness().mean_fitness(self.population)) + "; best individual fitness: " + str(Fitness().best_fitness(self.population)))

        return self.population

In [15]:
best_population = Genetic_Algorithm(students, groupsize).run(episodes=10, replace='all', mutation_rate=0.1)

episode 0: mean fitness score: 16.459; best individual fitness: 19.657
episode 1: mean fitness score: 17.217; best individual fitness: 19.713


_____________________

Evaluations

In [42]:
# Select the fittest individiual from our population
fittest_individual = Fitness().get_fittest_individual(best_population, students)

# Convert the Gene String to a List of Groups
groups_df = population_manager.get_groups_from_individual(fittest_individual)

# Plot an example group of the list
groups_df[0]

Unnamed: 0,ID,Name,Gender,Preferred language,Majors,Level of ambition,Preferred meeting place,Personality type,Best friend,Preferred day
0,8,Sabrina Diederich,Female,Any,"('PHIL', 'AI')",Low,In person,ENTP,Sandra Metzger,Monday
1,67,Doreen Freud,Female,Any,"('NS', 'NI')",Medium,In person,INTP,Felix Wirtz,Monday
2,68,Anne Brandt,Female,Any,"('CL', 'CP')",Low,In person,ENTJ,Patrick Shuster,Friday
3,39,Antje Schroder,Female,Any,"('NS', 'CP')",Medium,Online,ESTP,Dennis Reiniger,Friday
4,76,Andreas Fisher,Male,German,"('NI', 'PHIL')",Low,In person,ESTP,Ralph Weissmuller,Friday
5,35,Jan Dietrich,Male,German,"('AI', 'NI')",High,In person,ISTP,Juliane Drescher,Wednesday
6,78,Marie Muench,Female,Any,"('NI', 'AI')",Very high,In person,INFP,Erik Schäfer,Tuesday
7,30,Christian Werfel,Male,Any,"('CP', 'AI')",Very low,Online,ISTJ,Diana Traugott,Thursday
8,57,Maximilian Fiedler,Male,German,"('AI', 'NI')",Very low,In person,ISTP,Marco Eisenhauer,Tuesday
9,58,Jonas Zimmer,Male,Any,"('AI', 'CP')",High,Online,ENTP,Mike Freud,Thursday


*Let's find your group or the one of a friend*

In [45]:
# User Input for the Full Name - Please be accurate!
fullname = str(input("Please the Full-Name you want to look up: "))
# Get the group by your Full Name - If there are multiple Persons with your name you will get all of them
my_group = population_manager.get_groups_by_person_fullname(fullname, fittest_individual)
# Plotting the first group found
if my_group:
    print(my_group)
else:
    print("Didn't find the requested Person with given Fullname")

[     ID                 Name  Gender Preferred language          Majors  \
60   92         Sarah Hoover  Female                Any    ('NS', 'NI')   
61   16         Stefan Beyer    Male                Any    ('CP', 'NI')   
62   73         Jörg Strauss    Male            English  ('NS', 'PHIL')   
63   10          René Kuefer    Male                Any    ('NS', 'NI')   
64  100  Juliane Schultheiss  Female            English    ('AI', 'CP')   
65   45      Brigitte Müller  Female            English    ('AI', 'CP')   
66   72   Maximilian Schäfer    Male            English    ('AI', 'NI')   
67    3    Johanna Schreiber  Female                Any    ('CL', 'NI')   
68   93          Marina Kalb  Female                Any  ('NI', 'PHIL')   
69   15     Marco Eisenhauer    Male                Any    ('NI', 'CL')   

   Level of ambition Preferred meeting place Personality type  \
60              High               In person             ESTJ   
61               Low               In perso

In [46]:
# Find your group by your ID
my_group = population_manager.get_group_by_person_ID(33, fittest_individual)
# Plotting the group, if one found with given ID
if my_group is not None:
    print(my_group)
else:
    print("Didn't find the requested Person with given ID")

    ID              Name  Gender Preferred language          Majors  \
70  97        Sara Papst  Female                Any    ('NI', 'AI')   
71  64   Angelika Brauer  Female                Any    ('CP', 'NS')   
72  33       Niklas Frey    Male             German    ('NI', 'NS')   
73  82  Phillipp Koertig    Male                Any    ('CP', 'NS')   
74  63       Karin Pfaff  Female                Any    ('NS', 'CP')   
75  65         Sven Weiß    Male                Any    ('AI', 'CP')   
76  21      Sven Holzman    Male                Any    ('CL', 'NI')   
77  95    Diana Traugott  Female                Any    ('NI', 'NS')   
78  44   Ursula Reiniger  Female                Any    ('CL', 'NS')   
79  12       Luca Wexler    Male             German  ('NI', 'PHIL')   

   Level of ambition Preferred meeting place Personality type  \
70          Very low               In person             ESFP   
71         Very high                  Online             ISFP   
72            Medium   

## Manual Data Analysis
In the next code cells you can look at the distributions of people in groups. Feel free to add more!

In [None]:
def print_groups_distribution(column_name, groups):
    for group_index, group in enumerate(groups):
        statistics = group[column_name].value_counts() / group[column_name].value_counts().sum()
        print(f"Group {group_index}: Ratio of {column_name} is {sorted([i + ': ' + str(statistics[i]) for i in statistics.index])}")

In [None]:
def print_groups_friends_match(groups):

    for group_index, group in enumerate(groups):
        print(f"Group {group_index}: Friends matching: {Fitness().evaluate_friends(groups_df[0])}")

In [None]:
print_groups_distribution("Gender", groups_df)

In [None]:
print_groups_distribution("Preferred language", groups_df)

In [None]:
print_groups_distribution("Preferred meeting place", groups_df)

In [None]:
print_groups_distribution("Preferred day", groups_df)

In [None]:
print_groups_distribution("Personality type", groups_df)

In [None]:
print_groups_distribution("Level of ambition", groups_df)

In [None]:
print_groups_friends_match(groups_df)