# Fake data generation

In [1]:
from faker import Faker
import numpy as np
import pandas as pd
import random

In [2]:
# Set random seed
seed = 666

random.seed(seed)
faker = Faker(seed=seed)
np.random.seed(seed)

In [3]:
languages = ['english', 'italian', 'french', 'spanish', 'deutsche', 'dutch', 'bulgarian', 'chinese']

In [4]:
keywords = [
    'archeology',
    'museums',
    'music',
    'art',
    'cinema',
    'countryside',
    'tracking',
    'rafting',
    'history',
    'literature',
    'sport',
    'food',
    'wine',
    'beer'
]

In [5]:
education = ['middle-school', 'high-school', 'bachelor', 'master', 'phd']

In [6]:
# guide profile generation
def generate_guide():
    guide = {}
    guide['gender'] = np.random.choice(['male', 'female'], size=1).item()
    guide['name'] = (faker.first_name_male() if guide['gender'] == 'male' else faker.first_name_female()) + ' ' + faker.last_name()
    guide['birth_date'] = faker.date_of_birth(maximum_age=65, minimum_age=20)
    guide['now_available'] = True
    guide['languages_spoken'] = list(np.random.choice(languages, size=1+np.random.poisson(lam=1, size=1).item(), replace=False))
    guide['price'] = int(max(10, np.abs(np.random.normal(30, 5))))
    guide['education'] = np.random.choice(education, size=1).item()
    guide['biography'] = faker.profile()['job']
    guide['keywords'] = list(np.random.choice(keywords, size=np.random.poisson(2, size=1).item(), replace=False))
    guide['current_location'] = {
        "lat": np.random.normal(40.3524, 0.01),
        "lon": np.random.normal(18.1732, 0.01)
    }
    guide['experience'] = random.randint(1,2024-guide['birth_date'].year-18)
    
    return guide

In [7]:
# user profile generation
def generate_tourist():

    tourist = {}

    tourist['languages'] = list(np.random.choice(languages, size=1, replace=False))
    tourist['keywords'] = list(np.random.choice(keywords, size=np.random.poisson(2, size=1).item(), replace=False))
    
    return tourist

In [8]:
import random 

# rating generation
def generate_rating (tourists, guides):
    ratings = []
    for i in range(len(tourists)):
        
        # random choose guides from intersection of languages
        intersect_guides = [index for index in range(len(guides)) if np.all(np.in1d(tourists[i]['languages'], guides[index]['languages_spoken']))]
        other_guides = [index for index in range(len(guides)) if index not in intersect_guides]
        
        # random choose 3 guides without duplicates
        random_guides = np.random.choice(intersect_guides,2, replace=False)
        random_guide = np.random.choice(other_guides,1)
        
        # random choose 1 or 2 guides to rate
        for j in range(np.random.randint(1,3)):

            rating = round(np.random.normal(loc=4, scale=0.5, size=1).item(),0)
            ratings.append((i,random_guides[j],rating))

#         # random choose if rate badly
# #        if(np.random.binomial(1, 0.2)):
#             bad_rating = round(np.random.normal(loc=1.5, scale=0.3, size=1).item(),0)
#             # rate the last guide with bad rating
#             ratings.append((i,random_guide[0],bad_rating))
#
    return ratings

#### Data generation

In [9]:
n_tourists = 200
n_guides = 40

In [10]:
guides = [generate_guide() for _ in range(n_guides)]
tourists = [generate_tourist() for _ in range(n_tourists)]

In [11]:
ratings = generate_rating(tourists,guides)

Dataframes

In [12]:
guide_df = pd.DataFrame(guides)
guide_df

Unnamed: 0,gender,name,birth_date,now_available,languages_spoken,price,education,biography,keywords,current_location,experience
0,male,Jeffrey Carroll,1960-10-18,True,[english],25,high-school,Sports coach,[museums],"{'lat': 40.342693584880706, 'lon': 18.16438078...",30
1,female,Regina Thomas,1989-11-21,True,"[italian, dutch]",36,middle-school,Stage manager,"[cinema, rafting, history, wine]","{'lat': 40.3367551413547, 'lon': 18.1569120995...",13
2,female,Brianna Nicholson,1962-05-03,True,"[chinese, french, english]",34,phd,"Psychotherapist, dance movement","[food, archeology, art]","{'lat': 40.362447049660204, 'lon': 18.14225129...",28
3,male,Jordan Ali,1992-10-15,True,[bulgarian],46,bachelor,"Scientist, clinical (histocompatibility and im...","[countryside, rafting, art]","{'lat': 40.36584086550253, 'lon': 18.183002910...",5
4,male,Steven Taylor,1985-01-25,True,"[deutsche, french]",30,master,Trading standards officer,"[countryside, tracking, beer]","{'lat': 40.354724889812935, 'lon': 18.20308322...",17
5,male,Ronald Mitchell,1959-06-15,True,"[deutsche, dutch, bulgarian]",27,middle-school,"Teacher, secondary school",[],"{'lat': 40.35854623058413, 'lon': 18.183459879...",1
6,male,John Lloyd,1983-04-08,True,"[english, chinese, french]",34,middle-school,"Engineer, electrical","[rafting, art, sport]","{'lat': 40.35774331848761, 'lon': 18.163273185...",18
7,male,Jason Hartman,1971-12-09,True,"[spanish, dutch, french, italian]",33,bachelor,Fish farm manager,"[museums, art, countryside, wine]","{'lat': 40.365193043282794, 'lon': 18.18548169...",22
8,male,Cody Stevens,1992-11-27,True,"[bulgarian, english, chinese]",31,middle-school,"Therapist, art","[literature, cinema, music]","{'lat': 40.35265552781134, 'lon': 18.151723396...",2
9,male,Daniel Carey,1988-09-18,True,"[chinese, bulgarian, dutch, english]",37,phd,Fish farm manager,[museums],"{'lat': 40.33774657696426, 'lon': 18.182064664...",2


In [13]:
guide_df.dtypes

gender              object
name                object
birth_date          object
now_available         bool
languages_spoken    object
price                int64
education           object
biography           object
keywords            object
current_location    object
experience           int64
dtype: object

In [14]:
tourist_df = pd.DataFrame(tourists)
tourist_df

Unnamed: 0,languages,keywords
0,[bulgarian],[cinema]
1,[deutsche],[]
2,[spanish],"[rafting, museums]"
3,[deutsche],"[cinema, history, food]"
4,[chinese],"[wine, rafting]"
...,...,...
195,[french],[cinema]
196,[dutch],"[food, tracking, history]"
197,[dutch],"[museums, art, history]"
198,[italian],"[rafting, literature]"


In [15]:
rating_df = pd.DataFrame(ratings)

In [16]:
rating_df

Unnamed: 0,0,1,2
0,0,32,5.0
1,0,10,4.0
2,1,20,5.0
3,1,15,4.0
4,2,34,4.0
...,...,...,...
300,195,14,4.0
301,196,18,3.0
302,197,5,4.0
303,198,18,4.0


In [17]:
tourist_df.to_csv("Data/tourists_{:d}.csv".format(n_tourists), index=True, sep=';')
guide_df.to_csv("Data/guides_{:d}.csv".format(n_guides), index=True, sep=';')
rating_df.to_csv('Data/ratings_{:d}_{:d}.csv'.format(n_tourists,n_guides), index=False, sep=';')