### To do: 
    - Implement timetable attribute
    - Name generator is not very sophisticated (maybe include foreign names as well, and choose first names of 
    newer generations)
    - Percentages and the calculations that follow might be problematic depending on whether n is divisible by 2
    - Solution for getting unique majors is not all that pretty
    - Best friends calculation not really stable

### Questions:
    - Does it even make sense to encode the attributes? Would it be better to use words instead of numbers to make 
    everything more comprehensible?


In [1]:
import pandas as pd
import random
import os  
import copy

In [2]:
class Error(Exception):
    """Raised when the dataset generator is given an invalid configuration."""

#### Basic attributes

In [3]:
# 0: Sample size. It has to be even and may not exceed 2700, which is
# roughly the number of unique names available.
n = 20

# Guard clauses: the parity check runs first, then the upper bound.
if n % 2:
    raise Error("Please provide an even number of participants")
if n > 2700:
    raise Error("Maximum number of participants is 2700")

# 1: One sequential integer ID per participant, starting at 0.
l_id = [*range(n)]

# 2: create names

# Load the candidate names, one per line. The paths are assembled with
# os.path.join so they resolve on every platform; a literal backslash path
# only works on Windows and contains the invalid escape sequence "\g".
with open(os.path.join("names", "german-names-female.txt")) as f:
    names_female = f.read().splitlines()

with open(os.path.join("names", "german-names-male.txt")) as f:
    names_male = f.read().splitlines()

# set percentages (must add up to 100)
p_female = 50
p_male = 50

if p_female + p_male != 100:
    raise ValueError("p_female and p_male must add up to 100")

# Group sizes. Multiplying before dividing and rounding avoids losing a
# participant to float error (int(58/100 * 50) is 28, not 29). The last
# group takes the remainder so that exactly n names are produced.
n_female = round(n * p_female / 100)
n_male = n - n_female

# create sublists of unique names (random.sample draws without replacement)
l_female = random.sample(names_female, k=n_female)
l_male = random.sample(names_male, k=n_male)

# unify and shuffle so that the genders are not grouped together
l_name = l_female + l_male
random.shuffle(l_name)

# 3: create language preferences

# set percentages (must add up to 100)
p_any = 80
p_en = 10
p_ger = 10

if p_any + p_en + p_ger != 100:
    raise ValueError("p_any, p_en and p_ger must add up to 100")

# Group sizes. Truncating n/100 * p for every group can add up to fewer
# than n entries (e.g. n = 22 gives 17 + 2 + 2 = 21), which would shorten
# the final table. The largest group takes the remainder instead, so the
# list always holds exactly n entries.
n_en = round(n * p_en / 100)
n_ger = round(n * p_ger / 100)
n_any = n - n_en - n_ger

# create sublists
l_any = ["Any"] * n_any
l_en = ["English"] * n_en
l_ger = ["German"] * n_ger

# unify and shuffle
l_lang = l_any + l_en + l_ger
random.shuffle(l_lang)

# 4: create majors

maj = ["AI", "NP", "PHIL", "CL", "NI", "NB", "DS"]

# Every participant gets an ordered pair of two different majors.
# random.sample draws without replacement, so the two entries of a pair can
# never be equal and exactly n pairs are produced. Oversampling random pairs
# and filtering out the duplicates offers no such guarantee.
l_maj = [tuple(random.sample(maj, 2)) for _ in range(n)]

# 5: create ambitions
# One ambition level per participant, drawn independently with replacement.
amb = [
    "Very low",
    "Low",
    "Medium",
    "High",
    "Very high",
]
l_amb = random.choices(population=amb, k=n)


#### Extra attributes

In [47]:
# 1: Meeting place

# set percentages (must add up to 100)
p_online = 20
p_ip = 80

if p_online + p_ip != 100:
    raise ValueError("p_online and p_ip must add up to 100")

# Group sizes. Truncating n/100 * p can drop a participant through float
# error or fractional shares, so the first group is rounded and the second
# takes the remainder. The list then always holds exactly n entries.
n_online = round(n * p_online / 100)
n_ip = n - n_online

# create sublists
l_online = ["Online"] * n_online
l_ip = ["In person"] * n_ip

# unify and shuffle
l_meet = l_online + l_ip
random.shuffle(l_meet)

# 2: Personality type
# The sixteen four-letter type codes; one is drawn per participant,
# independently and with replacement.
pers = [
    "ESTJ", "ENTJ", "ESFJ", "ENFJ",
    "ISTJ", "ISFJ", "INTJ", "INFJ",
    "ESTP", "ESFP", "ENTP", "ENFP",
    "ISTP", "ISFP", "INTP", "INFP",
]

l_pers = random.choices(population=pers, k=n)

# 3: Best friend(s)
# Independent copy of the name list (the entries are plain strings, so a
# shallow copy is sufficient).
# NOTE(review): the copy is in the same order as l_name, so every
# participant is currently listed as their own best friend - confirm
# whether this is only a placeholder.
l_bf = l_name.copy()
    
# 4: Openness towards new people

# set percentages (must add up to 100)
p_rel = 20
p_neu = 40
p_con = 40

if p_rel + p_neu + p_con != 100:
    raise ValueError("p_rel, p_neu and p_con must add up to 100")

# Group sizes. Truncating n/100 * p for every group can add up to fewer
# than n entries (e.g. n = 6 gives 1 + 2 + 2 = 5), which would shorten the
# final table. The last group takes the remainder instead, so the list
# always holds exactly n entries.
n_rel = round(n * p_rel / 100)
n_neu = round(n * p_neu / 100)
n_con = n - n_rel - n_neu

# create sublists
l_rel = ["Reluctant"] * n_rel
l_neu = ["Neutral"] * n_neu
l_con = ["Confident"] * n_con

# unify and shuffle
l_open = l_rel + l_neu + l_con
random.shuffle(l_open)

# 5: Timetable



#### Build the dataframe

In [48]:
# create dataframe from lists

# Each output column is mapped to its list of per-participant values.
# Building the frame from a dict makes pandas raise a ValueError when the
# lists differ in length; zipping them would silently cut every column down
# to the shortest list and drop participants without any warning.
# The column labels are kept exactly as they are (including the spelling
# "Prefered") because they end up as the header of the exported file.
df = pd.DataFrame(
    {
        'ID': l_id,
        'Name': l_name,
        'Preferred language': l_lang,
        'Majors': l_maj,
        'Level of ambition': l_amb,
        'Prefered meeting place': l_meet,
        'Personality type': l_pers,
        'Best friend(s)': l_bf,
        'Openness': l_open,
    }
)

df

Unnamed: 0,ID,Name,Preferred language,Majors,Level of ambition,Prefered meeting place,Personality type,Best friend(s),Openness
0,0,Niklas Drescher,English,"(NI, AI)",Very high,In person,INTJ,Niklas Drescher,Confident
1,1,Jessika Schultz,German,"(DS, NI)",Very high,In person,ISTP,Jessika Schultz,Confident
2,2,Antje Urner,Any,"(NB, AI)",Low,Online,ESFP,Antje Urner,Neutral
3,3,Markus Ritter,Any,"(NB, AI)",Very high,In person,ISFP,Markus Ritter,Neutral
4,4,Anna Pabst,Any,"(PHIL, DS)",Very high,In person,ENFP,Anna Pabst,Confident
5,5,Max Pfeffer,Any,"(CL, NP)",Medium,In person,ISFJ,Max Pfeffer,Confident
6,6,Nadine Cole,Any,"(NI, CL)",Very low,Online,INTP,Nadine Cole,Neutral
7,7,Jessica Schiffer,Any,"(CL, DS)",Low,Online,ENFP,Jessica Schiffer,Neutral
8,8,Stephanie Winkel,Any,"(PHIL, NP)",Medium,In person,ENFJ,Stephanie Winkel,Neutral
9,9,Ralf Neustadt,German,"(PHIL, CL)",Very high,In person,ISFJ,Ralf Neustadt,Reluctant


#### Export to CSV

In [6]:
# Write the generated dataset to "dataset.csv" in the working directory;
# the DataFrame index is left out of the file.
df.to_csv(path_or_buf="dataset.csv", index=False)