In [22]:
import numpy as np
import random
import pandas as pd

In [23]:
students = pd.read_csv (r'../dataset_full.csv')
student_ids = students.ID.tolist()

# hyperparameters
num_individuals = 50
groupsize = 5
# between 0 and 1
mutation_rate = 0.05

In [24]:
def create_random_individual(ids):
    individual = ids.copy()
    random.shuffle(individual)

    return individual

In [25]:
def create_initial_population(ids, num_individuals):
    population = []
    for _ in range(num_individuals):
        population.append(create_random_individual(ids))

    return np.array(population,dtype=int)

In [26]:
def tournament_selection(popuplation, tournament_size = 8):
    # get random indices to select random individuals from population
    random_indices = np.random.choice(popuplation.shape[0], tournament_size*2, replace=False)

    # get individuals from random indices and split into two tournaments
    tournament1 = popuplation[random_indices[:tournament_size]]
    tournament2 = popuplation[random_indices[tournament_size:]]

    parents = []
    # tournament is won by fittest individual in each tournament, those become the two parents
    for tournament in (tournament1,tournament2):
        # get fitness scores for every individual in the tournament
        fitness_scores = [evaluate_fitness(individual, students) for individual in tournament]
        # get indices ordered by highest fitness first
        idx = np.argsort(fitness_scores)[::-1]
        # add individual with highest fitness to parents
        parents.append(tournament[idx[0]])

    return parents

In [27]:
def uniform_order_crossover(p1, p2, template):
    # create 'empty' child
    child = np.zeros((len(p1),),dtype=int)
    # where the template is true, take values from p1
    child[template] = p1[template]
    # store genes used from p1
    used_genes = p1[template]

    # get all genes from p2
    remaining_genes = p2.tolist()
    # add genes from p2 (that were not used from p1) to the empty spots of the child
    for i, value in enumerate(child):
        # if this spot is already filled, continue
        if value != 0:
            continue

        # do while:  pop(get and remove) next gene from p2 until one is found that is not yet in the genome of the child, then add that
        while True:
            next_gene = remaining_genes.pop(0)
            if next_gene not in used_genes:
                child[i] = next_gene
                break

    return child

In [28]:
def get_crossover_template(crossover_rate = 0.1):
    # initialize template with false values
    template = np.zeros((len(pop[0]),),dtype=bool)
    # get random indices of the amount #of genes * crossover rate
    random_indices = np.random.choice(template.shape[0], int(len(pop[0])*crossover_rate), replace=False)
    #set these indices to true
    template[random_indices] = True

    return template

In [29]:
def mutation(individual):
    # if random percentage is lower than the mutation rate, switch two random genes
    if random.uniform(0, 1) < mutation_rate:
        idx1 = random.randint(0, len(individual)-1)
        idx2 = random.randint(0, len(individual)-1)

        individual[idx1], individual[idx2] = individual[idx2], individual[idx1]

    return individual

In [30]:
def evaluate_fitness(individual, students):
    # split individual into student groups of the groupsize
    groups = np.array_split(individual, (len(individual)/groupsize))

    # iterate over groups and calculate scores for the different parameters
    scores = []
    for group_ids in groups:
        # get full data for students in this group from pd dataframe
        group = students.loc[students['ID'].isin(group_ids)]

        # get individual scores for parameters
        language_score = evaluate_language(group)
        major_score = evaluate_majors(group)
        ambition_score = evaluate_ambition(group)
        place_score = evaluate_meeting_place(group)
        gender_score = evaluate_gender(group)
        friend_score = evaluate_friends(group)
        personality_score = evaluate_personality(group)
        day_score = evaluate_meeting_day(group)

        # formula for adding and weighting different scores
        scores.append(language_score+major_score+ambition_score+place_score+gender_score+friend_score+personality_score+day_score)

    #Convert to series to calculate mean more easily
    return pd.Series(scores).mean()

In [31]:
def evaluate_language(group):
    # number of groupmembers per language
    counts = group['Preferred language'].value_counts()

    # max fitness here is number of members
    fitness = groupsize

    # if there are conflicting languages (any is ignored) decrease fitness for every conflict
    try:
        if counts['German'] and counts['English']:
            fitness -= counts['German'] + counts['English']
    except KeyError:
        pass

    return fitness

In [32]:
def evaluate_majors(group):
    majors = group['Majors'].tolist()

    # preprocess majors from dataset notation to list with all majors
    group_majors = []
    for pair in majors:
        pair = pair[1:-1].split(", ")

        group_majors.append(pair[0][1:-1])
        group_majors.append(pair[1][1:-1])

    #convert to Series for easier handling
    group_majors = pd.Series(group_majors)
    #get value counts
    group_major_values = group_majors.value_counts()
    #remove majors only one person takes (as they provide no synergy to the group)
    group_major_values = group_major_values[group_major_values > 1]

    #add number of shared majors and divide by 2; Formula is kinda arbitrary
    return group_major_values.sum() /2

In [33]:
def ambition_to_int(ambition):
    mapping = {
        'Very low': 1,
        'Low': 2,
        'Medium': 3,
        'High': 4,
        'Very high': 5
    }

    return mapping[ambition]

def evaluate_ambition(group):
    # get pd Series of ambitions
    ambitions = group['Level of ambition']
    # get int value mappings for ambitions
    ambitions = ambitions.apply(lambda val: ambition_to_int(val))

    # fitness is groupsize - variance in group motivation (so less variance = more fitness)
    return groupsize - ambitions.var()

In [34]:
def evaluate_meeting_place(group):
    # number of groupmembers for each preferred meeting place
    meeting_place = group['Preferred meeting place'].value_counts()

    # if all prefer the same meeting place return 5, else 0
    if meeting_place[0] == groupsize:
        return 5

    return 0

In [35]:
def evaluate_gender(group):
    # evaluate by variance
    genders = group['Gender'].value_counts()

    # add 0 entry for missing genders
    for gender in ['Male', 'Female', 'Indeterminate']:
        if gender not in genders.index:
            genders[gender] = 0

    # return groupsize - variance
    return groupsize - genders.var()

In [36]:
def evaluate_friends(group):
    #for each member +1 if friend is also in group
    group_member_names = group['Name'].tolist()
    best_friends_name = group['Best friend'].tolist()

    # get intersection between both lists
    friends_in_group = list(set(group_member_names).intersection(best_friends_name))

    # fitness += 1 for every pair of friends
    return len(friends_in_group)

In [37]:
def evaluate_personality(group):
    #information about compatible personality types is taken from
    # Montequín, Vicente Rodríguez, et al. "Using Myers-Briggs type indicator (MBTI) as a tool for setting up student teams for information technology projects." Journal of Information Technology and Application in Education 1.1 (2012): 28-34.

    #count existing personality types in each group
    personalities = group['Personality type']
    types = personalities.value_counts()

    #fitness function starts with 0 and gets better
    # with every good group member
    fitness = 0

    #its good if there is a group leader like an ISTJ or an ESTJ, but only one
    try:
        if (types['ISTJ'] + types['ESTJ'] == 1):
            fitness+=5
        elif (types['ISTJ'] + types['ESTJ'] >= 2):
            fitness-=5
    except KeyError:
        pass

    #compare compatibility of group members
    for i, personality_a in enumerate(personalities.tolist()):
        for j, personality_b in enumerate(personalities.tolist()):
            # skip same group member and members already compared
            if i <= j:
                continue

            # increase fitness if
            if (personality_a[1] != personality_b[1]) ^ (personality_a[2] != personality_b[2]):
                if (personality_a[0] != personality_b[0]) or (personality_a[3] != personality_b[3]):
                    fitness+=1

    return fitness

In [38]:
def evaluate_meeting_day(group):
    # number of groupmembers for each preferred meeting day
    meeting_day = group['Preferred day'].value_counts()

    # if all prefer the same meeting day return 5, else 0
    if meeting_day[0] == groupsize:
        return 5

    return 0

In [39]:
def get_pop_mean_fitness(population):
    # get list of fitness scores for all individuals in this population
    fitness_scores = [evaluate_fitness(individual, students) for individual in population]

    # convert to series to calculate mean more easily
    return round(pd.Series(fitness_scores).mean(), 3)

In [40]:
def get_pop_best_fitness(population):
    # get list of fitness scores for all individuals in this population
    fitness_scores = [evaluate_fitness(individual, students) for individual in population]

    # sort by best first
    scores_sorted = sorted(fitness_scores, reverse=True)

    # return fitness of best individual
    return round(scores_sorted[0], 3)

In [41]:
pop = create_initial_population(student_ids, num_individuals)

print("episode " + str(0) + ": mean fitness score: " + str(get_pop_mean_fitness(pop)) + "; best individual fitness: " + str(get_pop_best_fitness(pop)))

for episode in range(5):
    new_pop = []
    # we get two new individuals by each step, so half the pop size
    for _ in range(len(pop)//2):
        # find two parents by tournament selection
        p1, p2 = tournament_selection(pop)

        # get random binary template for crossover
        crossover_template = get_crossover_template()

        # create two children by uniform order crossover
        c1 = uniform_order_crossover(p1,p2,crossover_template)
        c2 = uniform_order_crossover(p2,p1,crossover_template)
        # do mutation
        c1 = mutation(c1)
        c2 = mutation(c2)
        # add children to new population
        new_pop.append(c1)
        new_pop.append(c2)

    pop = np.array(new_pop,dtype=int)

    print("episode " + str(episode+1) + ": mean fitness score: " + str(get_pop_mean_fitness(pop)) + "; best individual fitness: " + str(get_pop_best_fitness(pop)))

episode 0: mean fitness score: 18.736; best individual fitness: 20.357
episode 1: mean fitness score: 19.319; best individual fitness: 21.212
episode 2: mean fitness score: 19.762; best individual fitness: 20.812
episode 3: mean fitness score: 20.028; best individual fitness: 20.872
episode 4: mean fitness score: 20.205; best individual fitness: 21.202
episode 5: mean fitness score: 20.348; best individual fitness: 21.492
