In [56]:
from faker import Faker
import numpy as np
import pandas as pd
import random

In [57]:
faker = Faker(seed=666)
np.random.seed(666)

In [58]:
keywords = [
    'archeology',
    'museums',
    'music',
    'art',
    'cinema',
    'countryside',
    'tracking',
    'rafting',
    'history',
    'literature',
    'sport',
    'food',
    'wine',
    'beer'
]

In [106]:
# guide profile generation
def generate_guide():
    guide = {}
    guide['gender'] = np.random.choice(['male', 'female'], size=1).item()
    guide['name'] = (faker.first_name_male() if guide['gender'] == 'male' else faker.first_name_female()) + ' ' + faker.last_name()
    guide['birth_date'] = faker.date_of_birth()
    guide['now_available'] = True
    guide['languages_spoken'] = list(np.random.choice(['english', 'italian', 'french', 'spanish', 'deutsche', 'dutch', 'bulgarian', 'chinese'], size=1+np.random.poisson(lam=1, size=1).item(), replace=False))
    guide['price'] = int(max(10, np.abs(np.random.normal(30, 5))))
    guide['education'] = np.random.choice(['elementary', 'high-school', 'bachelor', 'master', 'phd'], size=1).item()
    guide['biography'] = faker.profile()['job']
    guide['keywords'] = list(np.random.choice(keywords, size=np.random.poisson(2, size=1).item()))
    guide['current_location'] = {
        "lat": np.random.normal(40.3524, 0.01),
        "lon": np.random.normal(18.1732, 0.01)
    }
    guide['experience'] = random.randint(1,10)
    
    return guide

In [107]:
# user profile generation
def generate_tourist():

    tourist = {}

    tourist['languages'] = list(np.random.choice(['english', 'italian', 'french', 'spanish', 'deutsche', 'dutch', 'bulgarian', 'chinese'], size=1+np.random.binomial(1, 0.2), replace=False))

    tourist['keywords'] = list(np.random.choice(keywords, size=np.random.poisson(2, size=1).item()))
    
    return tourist

In [108]:
import random 

# rating generation
def generate_rating (tourists, guides):
    ratings = []
    for i in range(tourists):
        
        # random choose 3 guides without duplicates
        random_guides = random.sample(range(guides),3)
        
        # random choose 1 or 2 guides to rate
        for j in range(np.random.randint(1,3)):

            rating = round(np.random.normal(loc=4, scale=0.5, size=1).item(),0)
            ratings.append((i,random_guides[j],rating))

        # random choose if rate badly
        if(np.random.binomial(1, 0.2)):
            bad_rating = round(np.random.normal(loc=1.5, scale=0.3, size=1).item(),0)
            # rate the last guide with bad rating
            ratings.append((i,random_guides[len(random_guides)-1],bad_rating))
    
    return ratings

#### Data generation

In [109]:
n_tourists = 100
n_guides = 20

In [110]:
guides = [generate_guide() for _ in range(n_guides)]
tourists = [generate_tourist() for _ in range(n_tourists)]

In [111]:
ratings = generate_rating(n_tourists,n_guides)

In [112]:
pd.DataFrame(guides)

Unnamed: 0,gender,name,birth_date,now_available,languages_spoken,price,education,biography,keywords,current_location,experience
0,female,Chelsea Martinez,1937-02-01,True,"[dutch, chinese]",29,bachelor,"Designer, textile",[wine],"{'lat': 40.360583803214006, 'lon': 18.17052799...",10
1,male,Daniel Ibarra,1916-10-03,True,[bulgarian],29,phd,Social researcher,[archeology],"{'lat': 40.351701701073104, 'lon': 18.17094077...",6
2,female,Wendy Anderson,1945-02-24,True,"[french, italian]",28,master,English as a second language teacher,"[museums, food, wine]","{'lat': 40.35688994865342, 'lon': 18.178920648...",4
3,male,Robert Adams,1915-04-10,True,[english],31,elementary,"Librarian, public",[],"{'lat': 40.352684858988965, 'lon': 18.17184990...",4
4,female,Grace Sparks,1940-04-08,True,"[spanish, bulgarian]",33,master,Copy,[literature],"{'lat': 40.353073531674546, 'lon': 18.17082903...",4
5,male,Michael Christian,1995-03-25,True,[chinese],28,high-school,Marine scientist,"[music, history]","{'lat': 40.36338197601472, 'lon': 18.181614978...",6
6,male,George Reeves,2022-07-16,True,"[english, chinese]",29,high-school,Public relations account executive,[],"{'lat': 40.345630619548274, 'lon': 18.16613237...",3
7,female,Cindy Molina,1954-05-16,True,[chinese],36,bachelor,Insurance account manager,"[literature, museums]","{'lat': 40.36147220160234, 'lon': 18.172627891...",5
8,female,Amanda Roberts,1977-09-25,True,[bulgarian],33,bachelor,Land,"[cinema, museums, literature, history]","{'lat': 40.344907525808964, 'lon': 18.18945843...",7
9,female,Alyssa Hill,2018-03-27,True,[deutsche],32,bachelor,"Scientist, physiological","[food, music, cinema, literature, museums, lit...","{'lat': 40.33631630303364, 'lon': 18.175822508...",1


In [104]:
pd.DataFrame(tourists)

Unnamed: 0,languages,keywords
0,[italian],"[cinema, food, music]"
1,"[spanish, chinese]","[wine, sport]"
2,[bulgarian],"[music, cinema, countryside]"
3,[bulgarian],"[countryside, food]"
4,[dutch],[wine]
...,...,...
95,[bulgarian],"[cinema, art, tracking, cinema, archeology, sp..."
96,[spanish],"[beer, history]"
97,[french],"[countryside, art, tracking, beer]"
98,[deutsche],[tracking]


In [105]:
pd.DataFrame(ratings)

Unnamed: 0,0,1,2
0,0,10,4.0
1,1,4,4.0
2,1,16,4.0
3,2,18,5.0
4,2,15,4.0
...,...,...,...
170,98,4,4.0
171,98,11,4.0
172,99,8,5.0
173,99,3,4.0
