In [1]:
import numpy as np
import random
import pandas as pd

In [2]:
students = pd.read_csv(r'../dataset.csv')

# TODO: dataset IDs should start with 1 to make handling easier
# Until the dataset itself is fixed, shift the IDs in the DataFrame (not only
# in the derived list): the fitness evaluation looks students up through the
# `ID` column, so the IDs stored in an individual must match that column.
# Re-running this cell is safe because the CSV is re-read right above.
students['ID'] = students['ID'] + 1
student_ids = students['ID'].tolist()

# hyperparameters
# number of individuals (candidate groupings) per generation
num_individuals = 50
# number of students per group
groupsize = 5
# probability (between 0 and 1) that a child gets mutated
mutation_rate = 0.05

In [3]:
def create_random_individual(ids):
    """Return a randomly ordered copy of `ids`.

    The input is left untouched: it is copied via its own `.copy()` method
    and the copy is shuffled in place with `random.shuffle`.
    """
    shuffled = ids.copy()
    random.shuffle(shuffled)
    return shuffled

In [4]:
def create_initial_population(ids, num_individuals):
    """Build the starting population as a 2-D integer array.

    Each of the `num_individuals` rows is produced by one call to
    `create_random_individual(ids)`.
    """
    genomes = [create_random_individual(ids) for _ in range(num_individuals)]
    return np.array(genomes, dtype=int)

In [5]:
def tournament_selection(popuplation, tournament_size = 8):
    """Pick two parents by running two independent tournaments.

    `2 * tournament_size` distinct rows of `popuplation` are drawn without
    replacement; the first half forms one tournament, the second half the
    other. Each tournament is won by its highest-scoring member according to
    `evaluate_fitness(individual, students)` (`students` is the notebook-level
    DataFrame).

    Returns a list with the two winners. `np.random.choice` raises if the
    population has fewer than `2 * tournament_size` rows.
    """
    # one draw for both tournaments guarantees they never share a contestant
    drawn = np.random.choice(popuplation.shape[0], tournament_size*2, replace=False)
    brackets = (
        popuplation[drawn[:tournament_size]],
        popuplation[drawn[tournament_size:]],
    )

    winners = []
    for contestants in brackets:
        scores = [evaluate_fitness(contestant, students) for contestant in contestants]
        # ascending argsort: the last position holds the index of the best score
        best = np.argsort(scores)[-1]
        winners.append(contestants[best])

    return winners

In [6]:
def uniform_order_crossover(p1, p2, template):
    """Create one child from two parent permutations (uniform order crossover).

    Positions selected by `template` are copied from `p1`. All remaining
    positions are filled, left to right, with the genes of `p2` in the order
    they appear in `p2`, skipping genes already taken from `p1`.

    Parameters
    ----------
    p1, p2 : array-like of int
        Parent genomes; expected to be permutations of the same gene set.
    template : boolean mask (or index array) into the genome
        Marks the positions inherited from `p1`.

    Returns
    -------
    numpy.ndarray of int with the same length as `p1`.

    Raises
    ------
    ValueError
        If `p2` does not provide enough unused genes to fill the open slots.
    """
    p1 = np.asarray(p1)
    p2 = np.asarray(p2)
    genome_length = len(p1)

    child = np.zeros((genome_length,), dtype=int)
    inherited = p1[template]
    child[template] = inherited

    # Track the filled slots explicitly instead of testing `value != 0`,
    # so that 0 is a perfectly legal gene value.
    filled = np.zeros((genome_length,), dtype=bool)
    filled[template] = True

    # set membership keeps the filter linear in the genome length
    used_genes = set(inherited.tolist())
    remaining_genes = [gene for gene in p2.tolist() if gene not in used_genes]

    open_slots = ~filled
    num_open = int(open_slots.sum())
    if len(remaining_genes) < num_open:
        raise ValueError(
            "p2 does not contain enough unused genes to fill the child "
            "(are p1 and p2 permutations of the same genes?)"
        )

    # boolean-mask assignment fills the open slots in ascending position order
    child[open_slots] = remaining_genes[:num_open]

    return child

In [7]:
def get_crossover_template(crossover_rate = 0.1, genome_length = None):
    """Build a random boolean mask that marks the genes taken from parent 1.

    Parameters
    ----------
    crossover_rate : float
        Fraction of positions set to True; `int(genome_length * crossover_rate)`
        positions are chosen without replacement.
    genome_length : int, optional
        Length of the mask. When omitted, the length of the first individual
        of the notebook-global population `pop` is used, which requires `pop`
        to exist already.

    Returns
    -------
    numpy.ndarray of bool with shape `(genome_length,)`.
    """
    if genome_length is None:
        # backward-compatible fallback for callers that pass no length
        genome_length = len(pop[0])

    # initialize template with false values
    template = np.zeros((genome_length,), dtype=bool)
    # choose distinct positions; their number is genome length * crossover rate
    num_selected = int(genome_length * crossover_rate)
    random_indices = np.random.choice(genome_length, num_selected, replace=False)
    # set these indices to true
    template[random_indices] = True

    return template

In [8]:
def mutation(individual, rate=None):
    """Swap two distinct, randomly chosen genes with probability `rate`.

    Parameters
    ----------
    individual : mutable sequence (list or 1-D numpy array)
        The genome; it is modified IN PLACE and also returned.
    rate : float, optional
        Mutation probability between 0 and 1. Defaults to the notebook-global
        `mutation_rate` when omitted.

    Returns
    -------
    The same `individual` object, possibly with two genes swapped.
    """
    if rate is None:
        rate = mutation_rate

    # a swap needs at least two positions; shorter genomes are returned as-is
    if len(individual) < 2:
        return individual

    # if random percentage is lower than the mutation rate, switch two random genes
    if random.uniform(0, 1) < rate:
        # sample without replacement so a triggered mutation is never a no-op
        idx1, idx2 = random.sample(range(len(individual)), 2)

        individual[idx1], individual[idx2] = individual[idx2], individual[idx1]

    return individual

In [9]:
def evaluate_fitness(individual, students):
    """Return the mean group score of one individual.

    The genome is cut into `len(individual) / groupsize` (rounded down)
    consecutive chunks with `np.array_split`; `groupsize` is the notebook-level
    hyperparameter. For every chunk the matching rows of `students` are
    selected through the `ID` column and scored as the unweighted sum of the
    language, majors, ambition and meeting-place scores.
    """
    num_groups = len(individual) // groupsize

    group_scores = []
    for member_ids in np.array_split(individual, num_groups):
        # full records of the students whose ID appears in this chunk
        members = students.loc[students['ID'].isin(member_ids)]

        language = evaluate_language(members)
        majors = evaluate_majors(members)
        ambition = evaluate_ambition(members)
        place = evaluate_meeting_place(members)

        # all four criteria currently carry the same weight
        group_scores.append(language+majors+ambition+place)

    # pandas' mean is used for the average (it skips NaN scores by default)
    return pd.Series(group_scores).mean()

In [10]:
def evaluate_language(group):
    """Score how well the group's preferred languages fit together.

    Starts from the notebook-level `groupsize` as the best possible score.
    Only the labels 'German' and 'English' are looked at: when both occur in
    the group, the number of members preferring either of them is subtracted.
    Any other value in the column never lowers the score.
    """
    per_language = group['Preferred language'].value_counts()

    # a missing label simply counts as zero members
    german = per_language.get('German', 0)
    english = per_language.get('English', 0)

    if german and english:
        return groupsize - (german + english)

    return groupsize

In [11]:
def evaluate_majors(group):
    """Score the overlap of majors inside a group.

    Every entry of the 'Majors' column is a string; the code strips its first
    and last character, splits on ", " and strips one more character from each
    side of the first two parts (presumably a stringified pair such as
    "('A', 'B')" -- verify against the dataset).

    Majors named by only one entry are ignored; the score is the total number
    of mentions of the remaining (shared) majors divided by 2.
    """
    flat_majors = []
    for raw in group['Majors'].tolist():
        parts = raw[1:-1].split(", ")
        # only the first two parts are used; each loses its surrounding quotes
        flat_majors.extend(parts[position][1:-1] for position in (0, 1))

    mentions = pd.Series(flat_majors).value_counts()
    # a major taken by a single member provides no synergy to the group
    shared = mentions[mentions > 1]

    # the halving is an arbitrary scaling choice
    return shared.sum() /2

In [12]:
def ambition_to_int(ambition):
    """Translate an ambition label into its rank from 1 ('Very low') to 5 ('Very high').

    Raises KeyError for any label that is not one of the five known ones.
    """
    # ordered from lowest to highest; the rank is the 1-based position
    ordered_levels = ('Very low', 'Low', 'Medium', 'High', 'Very high')
    ranks = {label: rank for rank, label in enumerate(ordered_levels, start=1)}

    return ranks[ambition]

def evaluate_ambition(group):
    """Score how similar the members' ambition levels are.

    The 'Level of ambition' labels are converted with `ambition_to_int`, and
    the variance of the resulting numbers (pandas `Series.var`) is subtracted
    from the notebook-level `groupsize`: less variance means a higher score.
    """
    numeric_levels = group['Level of ambition'].apply(ambition_to_int)
    spread = numeric_levels.var()

    return groupsize - spread

In [13]:
def evaluate_meeting_place(group):
    """Reward groups whose members all prefer the same meeting place.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of the group members; must contain the column
        'Prefered meeting place' (spelling as in the dataset).

    Returns
    -------
    5 if every member of the group named the same meeting place, else 0.
    An empty group scores 0.
    """
    # number of groupmembers for each preferred meeting place (most frequent first)
    meeting_place = group['Prefered meeting place'].value_counts()

    # nothing to agree on in an empty group
    if meeting_place.empty:
        return 0

    # Positional access via .iloc: the index holds place labels, so a plain
    # `[0]` would rely on the deprecated positional fallback of Series.__getitem__.
    # Comparing against len(group) instead of a literal 5 keeps "all members
    # agree" correct for groups of any size.
    if meeting_place.iloc[0] == len(group):
        return 5

    return 0

In [14]:
def evaluate_pop(population):
    """Return the mean fitness over all individuals of `population`.

    Every individual is scored with `evaluate_fitness` against the
    notebook-level `students` DataFrame; the average is taken with pandas.
    """
    per_individual = pd.Series(
        [evaluate_fitness(member, students) for member in population]
    )

    return per_individual.mean()

In [15]:
# Run the genetic algorithm: start from a random population and evolve it for
# a fixed number of generations ("episodes"), printing the mean fitness of
# each new generation.
pop = create_initial_population(student_ids, num_individuals)

# 25 generations; each generation completely replaces the previous one
# (no individuals are carried over unchanged)
for episode in range(25):
    new_pop = []
    # every step yields two children, so len(pop)//2 steps rebuild the population
    # (with an odd population size the next generation is one individual smaller)
    for _ in range(len(pop)//2):
        # find two parents by tournament selection
        p1, p2 = tournament_selection(pop)

        # get random binary template for crossover; called without arguments,
        # so its length is taken from the global `pop`
        crossover_template = get_crossover_template()

        # create two children by uniform order crossover: the same template is
        # used twice with the parent roles swapped
        c1 = uniform_order_crossover(p1,p2,crossover_template)
        c2 = uniform_order_crossover(p2,p1,crossover_template)
        # mutate each child with the configured mutation rate
        c1 = mutation(c1)
        c2 = mutation(c2)
        # add children to new population
        new_pop.append(c1)
        new_pop.append(c2)

    # the children become the population of the next generation
    pop = np.array(new_pop,dtype=int)

    # report progress: mean fitness over the whole new generation
    print("episode " + str(episode) + ": mean fitness score: " + str(evaluate_pop(pop)))

episode 0: mean fitness score: 13.055416666666666
episode 1: mean fitness score: 13.355966666666664
episode 2: mean fitness score: 13.350099999999998
episode 3: mean fitness score: 13.623483333333338
episode 4: mean fitness score: 14.11245
episode 5: mean fitness score: 14.547366666666665
episode 6: mean fitness score: 14.694966666666664
episode 7: mean fitness score: 14.751700000000001
episode 8: mean fitness score: 14.858133333333333
episode 9: mean fitness score: 14.999216666666669
episode 10: mean fitness score: 15.113933333333332
episode 11: mean fitness score: 15.31583333333333
episode 12: mean fitness score: 15.364933333333335
episode 13: mean fitness score: 15.366333333333344
episode 14: mean fitness score: 15.322366666666671
episode 15: mean fitness score: 15.442133333333338
episode 16: mean fitness score: 15.410633333333326
episode 17: mean fitness score: 15.547633333333321
episode 18: mean fitness score: 15.592816666666652
episode 19: mean fitness score: 15.588133333333325
e