In [56]:
from faker import Faker
import numpy as np
import pandas as pd
import random

In [57]:
faker = Faker(seed=666)
np.random.seed(666)

In [58]:
keywords = [
    'archeology',
    'museums',
    'music',
    'art',
    'cinema',
    'countryside',
    'tracking',
    'rafting',
    'history',
    'literature',
    'sport',
    'food',
    'wine',
    'beer'
]

In [81]:
# guide profile generation
def generate_guide():
    guide = {}
    guide['gender'] = np.random.choice(['male', 'female'], size=1).item()
    guide['name'] = (faker.first_name_male() if guide['gender'] == 'male' else faker.first_name_female()) + ' ' + faker.last_name()
    guide['birth_date'] = faker.date_of_birth()
    guide['now_available'] = bool(np.random.binomial(1, 0.3))
    guide['languages_spoken'] = list(np.random.choice(['english', 'italian', 'french', 'spanish', 'deutsche', 'dutch', 'bulgarian', 'chinese'], size=1+np.random.poisson(lam=1, size=1).item(), replace=False))
    guide['price'] = int(max(10, np.abs(np.random.normal(30, 5))))
    guide['education'] = np.random.choice(['elementary', 'high-school', 'bachelor', 'master', 'phd'], size=1).item()
    guide['biography'] = faker.profile()['job']
    guide['keywords'] = list(np.random.choice(keywords, size=np.random.poisson(2, size=1).item()))
    guide['current_location'] = {
        "lat": np.random.normal(40.3524, 0.01),
        "lon": np.random.normal(18.1732, 0.01)
    }
    guide['experience'] = random.randint(1,10)
    
    return guide

In [85]:
# user profile generation
def generate_tourist():

    tourist = {}

    tourist['languages'] = list(np.random.choice(['english', 'italian', 'french', 'spanish', 'deutsche', 'dutch', 'bulgarian', 'chinese'], size=1+np.random.binomial(1, 0.2), replace=False))

    tourist['keywords'] = list(np.random.choice(keywords, size=np.random.poisson(2, size=1).item()))
    
    return tourist

In [90]:
import random 

# rating generation
def generate_rating (tourists, guides):
    ratings = []
    for i in range(tourists):
        
        # random choose 3 guides without duplicates
        random_guides = random.sample(range(guides),3)
        
        # random choose 1 or 2 guides to rate
        for j in range(np.random.randint(1,3)):

            rating = round(np.random.normal(loc=4, scale=0.5, size=1).item(),0)
            ratings.append((i,random_guides[j],rating))

        # random choose if rate badly
        if(np.random.binomial(1, 0.2)):
            bad_rating = round(np.random.normal(loc=1.5, scale=0.3, size=1).item(),0)
            # rate the last guide with bad rating
            ratings.append((i,random_guides[len(random_guides)-1],bad_rating))
    
    return ratings

#### Data generation

In [91]:
n_tourists = 100
n_guides = 20

In [92]:
guides = [generate_guide() for _ in range(n_guides)]
tourists = [generate_tourist() for _ in range(n_tourists)]

In [93]:
ratings = generate_rating(n_tourists,n_guides)

In [94]:
pd.DataFrame(guides)

Unnamed: 0,gender,name,birth_date,now_available,languages_spoken,price,education,biography,keywords,current_location,experience
0,male,Patrick Nguyen,1930-12-26,False,[italian],30,elementary,Public affairs consultant,[countryside],"{'lat': 40.33750169036123, 'lon': 18.177781042...",6
1,female,Maria Williams,1931-05-08,False,"[english, deutsche]",31,bachelor,Radiation protection practitioner,"[music, literature, history]","{'lat': 40.34893567558408, 'lon': 18.175629977...",7
2,male,Jason Gibbs,1963-04-05,False,[bulgarian],27,master,Podiatrist,[],"{'lat': 40.33999729500657, 'lon': 18.159843118...",7
3,male,Andrew Moore,1923-11-08,False,[english],29,high-school,Clinical molecular geneticist,"[archeology, music, tracking]","{'lat': 40.340416521002666, 'lon': 18.17484978...",2
4,male,Patrick Wolf,1970-08-29,False,"[chinese, english]",27,elementary,Dispensing optician,"[tracking, literature, wine]","{'lat': 40.35084261510491, 'lon': 18.188645995...",9
5,male,John Wilson,1975-06-02,False,"[bulgarian, deutsche, spanish]",19,phd,Insurance underwriter,"[history, music, literature]","{'lat': 40.37097973911368, 'lon': 18.164000245...",2
6,female,Madison Maxwell,1961-10-11,False,"[chinese, italian]",25,phd,Amenity horticulturist,"[sport, sport]","{'lat': 40.370260668438604, 'lon': 18.18951471...",2
7,male,William Mcdaniel,2007-12-22,False,[english],43,master,Film/video editor,[rafting],"{'lat': 40.35694101726511, 'lon': 18.174314925...",4
8,female,Lisa Davis,1926-06-19,False,[bulgarian],28,master,Consulting civil engineer,[archeology],"{'lat': 40.34358320024683, 'lon': 18.187350339...",2
9,female,Laura Stein,1942-09-05,False,[deutsche],31,high-school,Local government officer,[art],"{'lat': 40.33647717848226, 'lon': 18.180656731...",2


In [95]:
pd.DataFrame(tourists)

Unnamed: 0,languages,keywords
0,[english],"[museums, sport, rafting, literature]"
1,[deutsche],"[literature, art]"
2,[italian],[]
3,"[spanish, english]","[sport, art, sport, cinema, archeology]"
4,[italian],"[rafting, history]"
...,...,...
95,[bulgarian],[literature]
96,[italian],[tracking]
97,[english],"[museums, sport]"
98,"[spanish, dutch]",[food]


In [96]:
pd.DataFrame(ratings)

Unnamed: 0,0,1,2
0,0,16,3.0
1,1,19,4.0
2,2,14,4.0
3,3,7,4.0
4,3,15,1.0
...,...,...,...
166,98,2,4.0
167,98,16,4.0
168,99,14,4.0
169,99,2,4.0
