In [56]:
from faker import Faker
import numpy as np
import pandas as pd
import random

In [57]:
faker = Faker(seed=666)
np.random.seed(666)

In [58]:
keywords = [
    'archeology',
    'museums',
    'music',
    'art',
    'cinema',
    'countryside',
    'tracking',
    'rafting',
    'history',
    'literature',
    'sport',
    'food',
    'wine',
    'beer'
]

In [97]:
# guide profile generation
def generate_guide():
    guide = {}
    guide['gender'] = np.random.choice(['male', 'female'], size=1).item()
    guide['name'] = (faker.first_name_male() if guide['gender'] == 'male' else faker.first_name_female()) + ' ' + faker.last_name()
    guide['birth_date'] = faker.date_of_birth()
    guide['now_available'] = bool(np.random.binomial(1, 0.8))
    guide['languages_spoken'] = list(np.random.choice(['english', 'italian', 'french', 'spanish', 'deutsche', 'dutch', 'bulgarian', 'chinese'], size=1+np.random.poisson(lam=1, size=1).item(), replace=False))
    guide['price'] = int(max(10, np.abs(np.random.normal(30, 5))))
    guide['education'] = np.random.choice(['elementary', 'high-school', 'bachelor', 'master', 'phd'], size=1).item()
    guide['biography'] = faker.profile()['job']
    guide['keywords'] = list(np.random.choice(keywords, size=np.random.poisson(2, size=1).item()))
    guide['current_location'] = {
        "lat": np.random.normal(40.3524, 0.01),
        "lon": np.random.normal(18.1732, 0.01)
    }
    guide['experience'] = random.randint(1,10)
    
    return guide

In [98]:
# user profile generation
def generate_tourist():

    tourist = {}

    tourist['languages'] = list(np.random.choice(['english', 'italian', 'french', 'spanish', 'deutsche', 'dutch', 'bulgarian', 'chinese'], size=1+np.random.binomial(1, 0.2), replace=False))

    tourist['keywords'] = list(np.random.choice(keywords, size=np.random.poisson(2, size=1).item()))
    
    return tourist

In [99]:
import random 

# rating generation
def generate_rating (tourists, guides):
    ratings = []
    for i in range(tourists):
        
        # random choose 3 guides without duplicates
        random_guides = random.sample(range(guides),3)
        
        # random choose 1 or 2 guides to rate
        for j in range(np.random.randint(1,3)):

            rating = round(np.random.normal(loc=4, scale=0.5, size=1).item(),0)
            ratings.append((i,random_guides[j],rating))

        # random choose if rate badly
        if(np.random.binomial(1, 0.2)):
            bad_rating = round(np.random.normal(loc=1.5, scale=0.3, size=1).item(),0)
            # rate the last guide with bad rating
            ratings.append((i,random_guides[len(random_guides)-1],bad_rating))
    
    return ratings

#### Data generation

In [100]:
n_tourists = 100
n_guides = 20

In [101]:
guides = [generate_guide() for _ in range(n_guides)]
tourists = [generate_tourist() for _ in range(n_tourists)]

In [102]:
ratings = generate_rating(n_tourists,n_guides)

In [103]:
pd.DataFrame(guides)

Unnamed: 0,gender,name,birth_date,now_available,languages_spoken,price,education,biography,keywords,current_location,experience
0,female,Lindsey Smith,1926-08-23,True,"[bulgarian, english, italian, spanish]",21,master,"Research officer, trade union",[literature],"{'lat': 40.360068103137486, 'lon': 18.17450940...",9
1,male,Erik Pope,1970-01-12,False,"[english, dutch, french]",26,phd,Health visitor,"[literature, countryside, beer]","{'lat': 40.33247366502541, 'lon': 18.159053475...",9
2,female,Kristina Snyder,1941-01-09,True,"[bulgarian, deutsche, english]",29,elementary,"Psychotherapist, dance movement",[tracking],"{'lat': 40.36928308827176, 'lon': 18.177608454...",3
3,male,Stanley Lynch,1955-01-26,True,[french],27,bachelor,Financial planner,"[music, rafting]","{'lat': 40.334967155955326, 'lon': 18.15857588...",5
4,female,Brittany Robinson,1929-12-29,True,[french],35,elementary,Occupational psychologist,"[countryside, beer]","{'lat': 40.37570855664476, 'lon': 18.185105273...",9
5,female,Tonya Acosta,2000-10-18,True,"[bulgarian, italian, deutsche, french, chinese]",49,master,Artist,"[museums, tracking]","{'lat': 40.351964807531346, 'lon': 18.16532598...",4
6,male,Adam Mccoy,1986-02-03,True,[dutch],29,master,Museum/gallery conservator,[],"{'lat': 40.35237213552843, 'lon': 18.168331584...",4
7,male,David Rivera,1992-02-21,True,[chinese],30,master,Land/geomatics surveyor,"[music, tracking, music]","{'lat': 40.363622949494506, 'lon': 18.18723154...",10
8,female,Heather Gordon,1990-10-23,False,[bulgarian],30,bachelor,Health and safety adviser,"[wine, music, rafting, museums, museums]","{'lat': 40.36772965801416, 'lon': 18.173538719...",2
9,female,Jacqueline Pineda,1934-09-28,True,"[french, english]",19,high-school,Jewellery designer,"[music, rafting]","{'lat': 40.34824757569885, 'lon': 18.165995261...",4


In [104]:
pd.DataFrame(tourists)

Unnamed: 0,languages,keywords
0,[italian],"[cinema, food, music]"
1,"[spanish, chinese]","[wine, sport]"
2,[bulgarian],"[music, cinema, countryside]"
3,[bulgarian],"[countryside, food]"
4,[dutch],[wine]
...,...,...
95,[bulgarian],"[cinema, art, tracking, cinema, archeology, sp..."
96,[spanish],"[beer, history]"
97,[french],"[countryside, art, tracking, beer]"
98,[deutsche],[tracking]


In [105]:
pd.DataFrame(ratings)

Unnamed: 0,0,1,2
0,0,10,4.0
1,1,4,4.0
2,1,16,4.0
3,2,18,5.0
4,2,15,4.0
...,...,...,...
170,98,4,4.0
171,98,11,4.0
172,99,8,5.0
173,99,3,4.0
