In [56]:
from faker import Faker
import numpy as np
import pandas as pd
import random

In [57]:
faker = Faker(seed=666)
np.random.seed(666)

In [58]:
keywords = [
    'archeology',
    'museums',
    'music',
    'art',
    'cinema',
    'countryside',
    'tracking',
    'rafting',
    'history',
    'literature',
    'sport',
    'food',
    'wine',
    'beer'
]

In [59]:
# function for guide profile generation
def generate_guide():
    guide = {}
    guide['gender'] = np.random.choice(['male', 'female'], size=1).item()
    guide['name'] = (faker.first_name_male() if guide['gender'] == 'male' else faker.first_name_female()) + ' ' + faker.last_name()
    guide['birth_date'] = faker.date_of_birth()
    guide['now_available'] = bool(np.random.binomial(1, 0.3))
    guide['languages_spoken'] = list(np.random.choice(['english', 'italian', 'french', 'spanish', 'deutsche', 'dutch', 'bulgarian', 'chinese'], size=1+np.random.poisson(lam=1, size=1).item(), replace=False))
    guide['price'] = int(max(10, np.abs(np.random.normal(30, 5))))
    guide['education'] = np.random.choice(['elementary', 'high-school', 'bachelor', 'master', 'phd'], size=1).item()
    guide['biography'] = faker.profile()['job']
    guide['keywords'] = list(np.random.choice(keywords, size=np.random.poisson(2, size=1).item()))
    guide['current_location'] = {
        "lat": np.random.normal(40.3524, 0.01),
        "lon": np.random.normal(18.1732, 0.01)
    }
    guide['experience'] = random.randint(1,10)
    
    return guide

In [63]:
# function for user profile generation
def generate_tourist():

    tourist = {}

    tourist['languages'] = list(np.random.choice(['english', 'italian', 'french', 'spanish', 'deutsche', 'dutch', 'bulgarian', 'chinese'], size=1+np.random.poisson(lam=1, size=1).item(), replace=False))

    tourist['keywords'] = list(np.random.choice(keywords, size=np.random.poisson(2, size=1).item()))
    
    return tourist

In [64]:
n_tourists = 100
n_guides = 20

In [65]:
guides = [generate_guide() for _ in range(n_guides)]
tourists = [generate_tourist() for _ in range(n_tourists)]

In [67]:
pd.DataFrame(guides)

Unnamed: 0,gender,name,birth_date,now_available,languages_spoken,price,education,biography,keywords,current_location,experience
0,female,Nicole Byrd,1980-12-30,False,[spanish],30,elementary,"Conservator, furniture","[art, archeology, food, beer, countryside, tra...","{'lat': 40.3565812768762, 'lon': 18.1717473689...",8
1,male,Timothy Phillips,1945-12-21,False,"[english, dutch]",30,elementary,Retail merchandiser,"[cinema, beer, sport, literature, archeology, ...","{'lat': 40.34419200985469, 'lon': 18.168285913...",3
2,male,Clifford Sanders,1986-02-22,False,"[spanish, deutsche, french, chinese]",28,high-school,Environmental health practitioner,"[art, archeology]","{'lat': 40.35299097623107, 'lon': 18.172878191...",9
3,female,Amy Rush,2009-03-02,True,[italian],28,master,Nutritional therapist,"[history, museums, museums]","{'lat': 40.35166448290281, 'lon': 18.176024683...",4
4,female,Kiara Tucker,2000-12-26,False,"[chinese, french, spanish, dutch]",28,master,Special educational needs teacher,"[museums, museums, literature]","{'lat': 40.36295370549671, 'lon': 18.187571535...",7
5,female,Tamara Freeman,1983-04-19,True,"[bulgarian, french]",30,bachelor,"Engineer, broadcasting (operations)",[],"{'lat': 40.35768039876316, 'lon': 18.169815970...",9
6,male,Carlos Haley,1977-04-22,False,"[spanish, deutsche, dutch]",32,elementary,"Loss adjuster, chartered","[beer, archeology]","{'lat': 40.34035900510048, 'lon': 18.182877248...",7
7,male,Alejandro Lopez,1947-09-15,False,[deutsche],23,bachelor,Radio broadcast assistant,"[history, tracking, tracking]","{'lat': 40.35290740754565, 'lon': 18.158952998...",7
8,male,Joshua Valencia,1974-11-08,False,[english],29,master,Solicitor,"[history, wine]","{'lat': 40.37291430111815, 'lon': 18.200550498...",10
9,female,Beth Harris,2018-11-15,False,"[dutch, deutsche, spanish]",32,phd,"Doctor, hospital",[rafting],"{'lat': 40.34629795790367, 'lon': 18.177240333...",6


In [68]:
pd.DataFrame(tourists)

Unnamed: 0,languages,keywords
0,"[deutsche, italian, spanish, french]",[beer]
1,[french],"[art, museums]"
2,"[bulgarian, chinese]",[archeology]
3,"[english, chinese]",[]
4,"[deutsche, dutch, chinese, spanish]",[museums]
...,...,...
95,"[italian, chinese]","[wine, sport, beer]"
96,"[dutch, french]","[countryside, art, cinema]"
97,"[spanish, english]","[countryside, countryside, museums, countryside]"
98,"[bulgarian, spanish]","[cinema, cinema]"


In [368]:
import random 

def generate_rating (tourists, guides):
    ratings = []
    for i in range(tourists):

        # random choose 1 or 2 guides to rate
        for j in range(np.random.randint(1,3)):

            #random choose 3 guides
            random_guides = random.sample(range(guides),3)
            rating = round(np.random.normal(loc=4, scale=0.5, size=1)[0],0)
            ratings.append((i,random_guides[j],rating))
        
        if(np.random.binomial(1, 0.2)):
            bad_rating = round(np.random.normal(loc=1.5, scale=0.3, size=1)[0],0)
            # rate the last guide with bad rating
            ratings.append((i,random_guides[len(random_guides)-1],bad_rating))
    
    return ratings

In [369]:
ratings = generate_rating(n_tourists,n_guides)

In [371]:
import pandas as pd
pd.DataFrame(ratings)

Unnamed: 0,0,1,2
0,0,2,5.0
1,0,8,4.0
2,1,5,4.0
3,2,14,4.0
4,2,16,4.0
...,...,...,...
159,98,13,4.0
160,98,14,4.0
161,98,8,1.0
162,99,12,4.0
