## Fake data generation

In [1]:
from faker import Faker
import numpy as np
import pandas as pd
import random

In [2]:
faker = Faker(seed=666)
np.random.seed(666)

In [3]:
keywords = [
    'archeology',
    'museums',
    'music',
    'art',
    'cinema',
    'countryside',
    'tracking',
    'rafting',
    'history',
    'literature',
    'sport',
    'food',
    'wine',
    'beer'
]

In [4]:
# guide profile generation
def generate_guide():
    guide = {}
    guide['gender'] = np.random.choice(['male', 'female'], size=1).item()
    guide['name'] = (faker.first_name_male() if guide['gender'] == 'male' else faker.first_name_female()) + ' ' + faker.last_name()
    guide['birth_date'] = faker.date_of_birth(maximum_age=65, minimum_age=20)
    guide['now_available'] = True
    guide['languages_spoken'] = list(np.random.choice(['english', 'italian', 'french', 'spanish', 'deutsche', 'dutch', 'bulgarian', 'chinese'], size=1+np.random.poisson(lam=1, size=1).item(), replace=False))
    guide['price'] = int(max(10, np.abs(np.random.normal(30, 5))))
    guide['education'] = np.random.choice(['middle-school', 'high-school', 'bachelor', 'master', 'phd'], size=1).item()
    guide['biography'] = faker.profile()['job']
    guide['keywords'] = list(np.random.choice(keywords, size=np.random.poisson(2, size=1).item(), replace=False))
    guide['current_location'] = {
        "lat": np.random.normal(40.3524, 0.01),
        "lon": np.random.normal(18.1732, 0.01)
    }
    guide['experience'] = random.randint(1,2024-guide['birth_date'].year-18)
    
    return guide

In [5]:
# user profile generation
def generate_tourist():

    tourist = {}

    tourist['languages'] = list(np.random.choice(['english', 'italian', 'french', 'spanish', 'deutsche', 'dutch', 'bulgarian', 'chinese'], size=1+np.random.binomial(1, 0.2), replace=False))

    tourist['keywords'] = list(np.random.choice(keywords, size=np.random.poisson(2, size=1).item(), replace=False))
    
    return tourist

In [6]:
import random 

# rating generation
def generate_rating (tourists, guides):
    ratings = []
    for i in range(tourists):
        
        # random choose 3 guides without duplicates
        random_guides = random.sample(range(guides),3)
        
        # random choose 1 or 2 guides to rate
        for j in range(np.random.randint(1,3)):

            rating = round(np.random.normal(loc=4, scale=0.5, size=1).item(),0)
            ratings.append((i,random_guides[j],rating))

        # random choose if rate badly
        if(np.random.binomial(1, 0.2)):
            bad_rating = round(np.random.normal(loc=1.5, scale=0.3, size=1).item(),0)
            # rate the last guide with bad rating
            ratings.append((i,random_guides[len(random_guides)-1],bad_rating))
    
    return ratings

#### Data generation

In [7]:
n_tourists = 200
n_guides = 40

In [8]:
guides = [generate_guide() for _ in range(n_guides)]
tourists = [generate_tourist() for _ in range(n_tourists)]

In [9]:
ratings = generate_rating(n_tourists,n_guides)

Dataframes

In [10]:
guide_df = pd.DataFrame(guides)
guide_df

Unnamed: 0,gender,name,birth_date,now_available,languages_spoken,price,education,biography,keywords,current_location,experience
0,male,Sean Lewis,1982-10-10,True,[english],25,high-school,Multimedia programmer,[museums],"{'lat': 40.342693584880706, 'lon': 18.16438078...",8
1,female,Leah Blackwell,1959-10-31,True,"[italian, dutch]",36,middle-school,"Education officer, museum","[cinema, rafting, history, wine]","{'lat': 40.3367551413547, 'lon': 18.1569120995...",26
2,female,Tracey Lopez,1963-09-25,True,"[chinese, french, english]",34,phd,Child psychotherapist,"[food, archeology, art]","{'lat': 40.362447049660204, 'lon': 18.14225129...",31
3,male,Charles Young,1959-06-19,True,[bulgarian],46,bachelor,Scientific laboratory technician,"[countryside, rafting, art]","{'lat': 40.36584086550253, 'lon': 18.183002910...",16
4,male,David Burgess,1966-09-09,True,"[deutsche, french]",30,master,Equality and diversity officer,"[countryside, tracking, beer]","{'lat': 40.354724889812935, 'lon': 18.20308322...",5
5,male,Connor Davis,1997-06-04,True,"[deutsche, dutch, bulgarian]",27,middle-school,Purchasing manager,[],"{'lat': 40.35854623058413, 'lon': 18.183459879...",3
6,male,Michael Lee,1966-12-10,True,"[english, chinese, french]",34,middle-school,"Journalist, broadcasting","[rafting, art, sport]","{'lat': 40.35774331848761, 'lon': 18.163273185...",3
7,male,Derrick Davidson,1998-03-17,True,"[spanish, dutch, french, italian]",33,bachelor,Mental health nurse,"[museums, art, countryside, wine]","{'lat': 40.365193043282794, 'lon': 18.18548169...",2
8,male,Christopher Smith,1958-10-09,True,"[bulgarian, english, chinese]",31,middle-school,Ergonomist,"[literature, cinema, music]","{'lat': 40.35265552781134, 'lon': 18.151723396...",42
9,male,Albert Moore,1973-07-20,True,"[chinese, bulgarian, dutch, english]",37,phd,International aid/development worker,[museums],"{'lat': 40.33774657696426, 'lon': 18.182064664...",11


In [11]:
tourist_df = pd.DataFrame(tourists)
tourist_df

Unnamed: 0,languages,keywords
0,"[italian, english]","[cinema, music]"
1,[chinese],[beer]
2,"[bulgarian, spanish]","[history, museums, beer]"
3,"[deutsche, dutch]","[tracking, literature, music]"
4,[spanish],"[tracking, art]"
...,...,...
195,[bulgarian],[]
196,"[french, spanish]","[tracking, cinema]"
197,"[chinese, spanish]","[music, literature, art]"
198,[italian],[]


In [12]:
rating_df = pd.DataFrame(ratings)

In [13]:
rating_df

Unnamed: 0,0,1,2
0,0,7,4.0
1,0,3,3.0
2,1,23,5.0
3,2,4,4.0
4,2,27,4.0
...,...,...,...
351,197,23,2.0
352,198,35,4.0
353,199,18,4.0
354,199,21,4.0


In [14]:
tourist_df.to_csv("Data/tourists_{:d}.csv".format(n_tourists), index=True, sep=';')
guide_df.to_csv("Data/guides_{:d}.csv".format(n_guides), index=True, sep=';')
rating_df.to_csv('Data/ratings_{:d}_{:d}.csv'.format(n_tourists,n_guides), index=False, sep=';')