# Fake data generation

In this section, we generate synthetic data to train the recommender system. This data must be replaced with real data collected from the AsTourX application and should maintain the same or a similar structure.

In particular, we have three different entities:
- Tourist 
- Guide 
- Tour

In [1]:
import numpy as np
import pandas as pd
import random

from faker import Faker

In [2]:
# Set random seed
seed = 666

random.seed(seed)
# faker library for fake information generation
faker = Faker(seed=seed)
np.random.seed(seed)

## Attributes definition

In [3]:
# list of languages
languages = ['english', 'italian', 'french', 'spanish', 'deutsche', 'dutch', 'bulgarian', 'chinese']

In [4]:
# list of keywords 
keywords = [
    'archeology',
    'museums',
    'music',
    'art',
    'cinema',
    'countryside',
    'tracking',
    'rafting',
    'history',
    'literature',
    'sport',
    'food',
    'wine',
    'beer'
]

In [1]:
# list of educational level
education = ['middle-school', 'high-school', 'bachelor', 'master', 'phd']

In [6]:
# list of attractions in Lecce, associated with its type of tour  
lecce_attractions = [
    ("Basilica di Santa Croce", "Historical Tour"),
    ("Piazza del Duomo", "City Tour"),
    ("Roman Amphitheatre", "Archaeological Tour"),
    ("Porta Napoli", "City Tour"),
    ("Castello di Carlo V", "Historical Tour"),
    ("Church of San Matteo", "Religious Tour"),
    ("Piazza Sant'Oronzo", "City Tour"),
    ("Church of Santa Chiara", "Religious Tour"),
    ("Roman Theatre", "Archaeological Tour"),
    ("Church of San Niccolo' e Cataldo", "Religious Tour"),
    ("Palazzo dei Celestini", "Historical Tour"),
    ("Colonna di Sant'Oronzo", "City Tour"),
    ("Palazzo Carafa", "Historical Tour"),
    ("Museo Faggiano", "Museum Tour"),
    ("Torre del Parco", "Historical Tour"),
    ("Lecce Cathedral", "Religious Tour"),
    ("Celestine Convent", "Religious Tour"),
    ("Villa Comunale di Lecce", "Park Tour"),
    ("San Giovanni Battista Church", "Religious Tour"),
    ("Church of San Francesco della Scarpa", "Religious Tour")
]

In [7]:
# mapping of keywords to type of tour
keywords_mapping = {
    "Historical Tour": "history",
    "City Tour": "art",
    "Archaeological Tour": "archeology",
    "Religious Tour": "literature",
    "Museum Tour": "museums",
    "Park Tour": "countryside"
}

#lecce_attractions_with_keywords = [(attraction, keywords_mapping[tour_type]) for attraction, tour_type in lecce_attractions]
#lecce_attractions_with_keywords

In [8]:
lecce_attractions_dict = [{'name' : name, 'keywords' : keywords_mapping[tour_type]} for name, tour_type in lecce_attractions]
lecce_attractions_dict

[{'name': 'Basilica di Santa Croce', 'keywords': 'history'},
 {'name': 'Piazza del Duomo', 'keywords': 'art'},
 {'name': 'Roman Amphitheatre', 'keywords': 'archeology'},
 {'name': 'Porta Napoli', 'keywords': 'art'},
 {'name': 'Castello di Carlo V', 'keywords': 'history'},
 {'name': 'Church of San Matteo', 'keywords': 'literature'},
 {'name': "Piazza Sant'Oronzo", 'keywords': 'art'},
 {'name': 'Church of Santa Chiara', 'keywords': 'literature'},
 {'name': 'Roman Theatre', 'keywords': 'archeology'},
 {'name': "Church of San Niccolo' e Cataldo", 'keywords': 'literature'},
 {'name': 'Palazzo dei Celestini', 'keywords': 'history'},
 {'name': "Colonna di Sant'Oronzo", 'keywords': 'art'},
 {'name': 'Palazzo Carafa', 'keywords': 'history'},
 {'name': 'Museo Faggiano', 'keywords': 'museums'},
 {'name': 'Torre del Parco', 'keywords': 'history'},
 {'name': 'Lecce Cathedral', 'keywords': 'literature'},
 {'name': 'Celestine Convent', 'keywords': 'literature'},
 {'name': 'Villa Comunale di Lecce', '

## Data generation functions

In [9]:
import random
from datetime import datetime, timedelta

# function for generating a random date between the start_date and end_date
def generate_random_date(start_date, end_date):
    
    # Convert start and end dates to datetime objects
    start_dt = datetime.strptime(start_date, '%Y-%m-%d')
    end_dt = datetime.strptime(end_date, '%Y-%m-%d')

    # Calculate the range of days between start and end dates
    delta = end_dt - start_dt

    # Generate a random number of days within the range
    random_days = random.randint(0, delta.days)

    # Add the random number of days to the start date
    random_date = start_dt + timedelta(days=random_days)

    return random_date.strftime('%Y-%m-%d')

# Example usage
start_date = '2024-06-01'
end_date = '2024-07-01'
random_date = generate_random_date(start_date, end_date)
print("Random date between", start_date, "and", end_date, ":", random_date)

Random date between 2024-06-01 and 2024-07-01 : 2024-06-15


In [10]:
# guide profile generation
def generate_guide():
    
    guide = {}
    
    guide['gender'] = np.random.choice(['male', 'female'], size=1).item()
    guide['name'] = (faker.first_name_male() if guide['gender'] == 'male' else faker.first_name_female()) + ' ' + faker.last_name()
    guide['birth_date'] = faker.date_of_birth(maximum_age=65, minimum_age=20)
    
    # availability of the guide, meaning that it is 'online', for default it's set to true
    guide['now_available'] = True
    guide['languages_spoken'] = list(np.random.choice(languages, size=1+np.random.poisson(lam=1, size=1).item(), replace=False))
    
    # guide is associated to a price
    guide['price'] = int(max(10, np.abs(np.random.normal(30, 5))))
    guide['education'] = np.random.choice(education, size=1).item()
    guide['biography'] = faker.profile()['job']
    guide['keywords'] = list(np.random.choice(keywords, size=np.random.poisson(2, size=1).item(), replace=False))
    guide['current_location'] = {
        "lat": np.random.normal(40.3524, 0.01),
        "lon": np.random.normal(18.1732, 0.01)
    }
    guide['experience'] = random.randint(1,2024-guide['birth_date'].year-18)
    
    return guide

In [11]:
# user profile generation
# these information are retrieved from user's profile
def generate_tourist():

    tourist = {}

    tourist['languages'] = list(np.random.choice(languages, size=1, replace=False))
    # list of keywords that describe the tourist (user)
    tourist['keywords'] = list(np.random.choice(keywords, size=np.random.poisson(5, size=1).item(), replace=False))
    
    return tourist

In [12]:
random.sample(lecce_attractions_dict, random.randint(1,3))

[{'name': 'Museo Faggiano', 'keywords': 'museums'},
 {'name': "Church of San Niccolo' e Cataldo", 'keywords': 'literature'}]

In [13]:
def generate_tours(guide, index):
    
    tours = {}

    tours['guide'] = index
    tours['languages'] = guide['languages_spoken']
    tours['city'] = 'Lecce'
    # randomly choose 1-3 Lecce's attractions
    attractions = random.sample(lecce_attractions_dict, random.randint(1,3))
    tours['attractions'] = list(attraction['name'] for attraction in attractions)
    tours['keywords'] = list(np.unique([attraction['keywords'] for attraction in attractions]))
    tours['price'] = int(max(10, np.abs(np.random.normal(30, 5))))
    tours['date'] = generate_random_date(start_date, end_date)
    tours['duration'] = random.randint(5,12)

    return tours
    

In [14]:
import random 

# we assume that ther will be a rating system offered by AsTourX
# for the lack of data, we need to generate fake ratings 
# here the ratings are given by tourists to guides
# function for rating generation
def generate_rating_guides (tourists, guides):
    ratings = []
    for i in range(len(tourists)):
        
        # random choose guides from intersection of languages
        intersect_guides = [index for index in range(len(guides)) if np.all(np.in1d(tourists[i]['languages'], guides[index]['languages_spoken']))]
        # guides that did not speak the same language as that tourist
        other_guides = [index for index in range(len(guides)) if index not in intersect_guides]
        
        # random choose 2 guides without duplicates to give a high rate
        random_guides = np.random.choice(intersect_guides,2, replace=False)

        # random choose 1 guide to give a bad rate (because they do not speak the same language as the tourist)
        random_guide = np.random.choice(other_guides,1)

        
        # random choose 1 or 2 guides to rate
        for j in range(np.random.randint(1,3)):

            rating = round(np.random.normal(loc=4, scale=0.5, size=1).item(),0)
            ratings.append((i,random_guides[j],rating))

#         # random choose if rate badly
# #        if(np.random.binomial(1, 0.2)):
#             bad_rating = round(np.random.normal(loc=1.5, scale=0.3, size=1).item(),0)
#             # rate the last guide with bad rating
#             ratings.append((i,random_guide[0],bad_rating))
#
    return ratings

In [15]:
import random 

# generate fake ratings given by tourists to organized tours
# ratings generation
def generate_rating_tours (tourists, tours):
    ratings = []
    for i in range(len(tourists)):
        
        # in order to make ratings more reasonable
        # we give a high rate (3-5) to tours that have same keywords as the tourist has
        # random choose tours from intersection of keywords
        intersect_tours = [index for index in range(len(tours)) if np.any(np.in1d(tourists[i]['keywords'], tours[index]['keywords']))]

        # if not empty
        if(len(intersect_tours)!=0):
            # random choose 2 guides without duplicates
            random_tours = np.random.choice(intersect_tours, 2, replace=False)
            
            # random choose 1 or 2 guides to rate
            for j in range(np.random.randint(1,3)):
    
                rating = round(np.random.normal(loc=4, scale=0.5, size=1).item(),0)
                ratings.append((i,random_tours[j],rating))
        
    return ratings

## Dataset generation

In [16]:
# number of users (tourists and guides) 
n_tourists = 500
n_guides = 50

In [17]:
# generating user profiles
guides = [generate_guide() for _ in range(n_guides)]
tourists = [generate_tourist() for _ in range(n_tourists)]

In [19]:
tours = []
# generate an organized tour for each guide
for i in range(len(guides)):
    tours.append(generate_tours(guides[i], i))

n_tours = len(tours)
tours

[{'guide': 0,
  'languages': ['english'],
  'city': 'Lecce',
  'attractions': ['Celestine Convent', 'Roman Theatre'],
  'keywords': ['archeology', 'literature'],
  'price': 22,
  'date': '2024-06-30',
  'duration': 12},
 {'guide': 1,
  'languages': ['italian', 'dutch'],
  'city': 'Lecce',
  'attractions': ['Museo Faggiano'],
  'keywords': ['museums'],
  'price': 28,
  'date': '2024-06-26',
  'duration': 9},
 {'guide': 2,
  'languages': ['chinese', 'french', 'english'],
  'city': 'Lecce',
  'attractions': ['Palazzo Carafa', 'Church of Santa Chiara'],
  'keywords': ['history', 'literature'],
  'price': 27,
  'date': '2024-06-03',
  'duration': 11},
 {'guide': 3,
  'languages': ['bulgarian'],
  'city': 'Lecce',
  'attractions': ['Villa Comunale di Lecce',
   'Church of San Matteo',
   'Museo Faggiano'],
  'keywords': ['countryside', 'literature', 'museums'],
  'price': 31,
  'date': '2024-06-29',
  'duration': 11},
 {'guide': 4,
  'languages': ['deutsche', 'french'],
  'city': 'Lecce',
  

In [18]:
# randomly generate ratings given by tourists to guides
guides_ratings = generate_rating_guides(tourists,guides)
guides_ratings

[(0, 36, 4.0),
 (0, 41, 3.0),
 (1, 45, 5.0),
 (2, 48, 4.0),
 (2, 47, 4.0),
 (3, 22, 5.0),
 (3, 16, 3.0),
 (4, 20, 3.0),
 (5, 19, 5.0),
 (6, 34, 5.0),
 (7, 34, 4.0),
 (8, 31, 4.0),
 (8, 30, 4.0),
 (9, 11, 4.0),
 (9, 19, 3.0),
 (10, 40, 4.0),
 (10, 38, 4.0),
 (11, 9, 3.0),
 (11, 32, 3.0),
 (12, 35, 4.0),
 (13, 30, 4.0),
 (14, 20, 4.0),
 (14, 10, 3.0),
 (15, 20, 4.0),
 (15, 16, 4.0),
 (16, 25, 4.0),
 (17, 2, 4.0),
 (18, 14, 4.0),
 (19, 25, 4.0),
 (19, 9, 3.0),
 (20, 24, 4.0),
 (20, 49, 3.0),
 (21, 28, 4.0),
 (21, 7, 4.0),
 (22, 13, 4.0),
 (22, 44, 4.0),
 (23, 31, 4.0),
 (23, 13, 4.0),
 (24, 39, 4.0),
 (24, 7, 4.0),
 (25, 44, 4.0),
 (25, 34, 4.0),
 (26, 38, 4.0),
 (27, 30, 4.0),
 (27, 4, 4.0),
 (28, 13, 4.0),
 (28, 34, 4.0),
 (29, 49, 4.0),
 (29, 20, 5.0),
 (30, 7, 4.0),
 (31, 10, 4.0),
 (32, 22, 4.0),
 (32, 7, 4.0),
 (33, 23, 4.0),
 (33, 16, 4.0),
 (34, 1, 4.0),
 (34, 37, 4.0),
 (35, 24, 4.0),
 (36, 35, 4.0),
 (37, 5, 4.0),
 (38, 19, 4.0),
 (38, 31, 4.0),
 (39, 30, 4.0),
 (39, 5, 4.0),
 (

In [24]:
tours_ratings= generate_rating_tours(tourists, tours)
tours_ratings

[(0, 17, 4.0),
 (0, 33, 4.0),
 (1, 2, 5.0),
 (1, 43, 3.0),
 (2, 44, 4.0),
 (4, 17, 4.0),
 (5, 6, 4.0),
 (5, 3, 4.0),
 (6, 25, 5.0),
 (8, 0, 4.0),
 (9, 40, 4.0),
 (10, 34, 3.0),
 (11, 34, 4.0),
 (11, 10, 4.0),
 (12, 46, 3.0),
 (12, 37, 3.0),
 (13, 2, 4.0),
 (15, 8, 4.0),
 (15, 34, 3.0),
 (16, 17, 5.0),
 (16, 47, 4.0),
 (17, 33, 4.0),
 (17, 41, 5.0),
 (18, 16, 4.0),
 (20, 42, 4.0),
 (21, 23, 4.0),
 (22, 26, 3.0),
 (22, 33, 4.0),
 (23, 4, 4.0),
 (23, 28, 4.0),
 (25, 32, 3.0),
 (26, 47, 4.0),
 (27, 37, 3.0),
 (28, 31, 4.0),
 (30, 45, 4.0),
 (30, 3, 4.0),
 (32, 45, 4.0),
 (33, 1, 3.0),
 (34, 13, 3.0),
 (35, 41, 4.0),
 (35, 21, 4.0),
 (36, 34, 4.0),
 (37, 23, 4.0),
 (37, 47, 4.0),
 (38, 47, 4.0),
 (38, 25, 4.0),
 (39, 21, 5.0),
 (40, 37, 4.0),
 (40, 4, 4.0),
 (41, 37, 4.0),
 (42, 32, 4.0),
 (42, 36, 4.0),
 (43, 15, 4.0),
 (45, 21, 3.0),
 (46, 44, 3.0),
 (47, 12, 4.0),
 (47, 34, 4.0),
 (48, 37, 4.0),
 (48, 13, 5.0),
 (49, 12, 4.0),
 (49, 30, 4.0),
 (50, 37, 3.0),
 (50, 20, 4.0),
 (51, 0, 4.0)

In [21]:
# info of the first tourist
tourists[0]

{'languages': ['spanish'],
 'keywords': ['rafting', 'countryside', 'literature']}

In [22]:
# info of the organized tour of the first guide
tours[0]

{'guide': 0,
 'languages': ['english'],
 'city': 'Lecce',
 'attractions': ['Celestine Convent', 'Roman Theatre'],
 'keywords': ['archeology', 'literature'],
 'price': 22,
 'date': '2024-06-30',
 'duration': 12}

### Dataframes creation

In [25]:
guide_df = pd.DataFrame(guides)
# viewing first 10 guides
guide_df[:10]

Unnamed: 0,gender,name,birth_date,now_available,languages_spoken,price,education,biography,keywords,current_location,experience
0,male,Henry Robles,1996-10-16,True,[english],25,high-school,Community development worker,[museums],"{'lat': 40.342693584880706, 'lon': 18.16438078...",9
1,female,Raven Burke,2004-05-14,True,"[italian, dutch]",36,middle-school,Quantity surveyor,"[cinema, rafting, history, wine]","{'lat': 40.3367551413547, 'lon': 18.1569120995...",1
2,female,Nancy Keith,1984-08-04,True,"[chinese, french, english]",34,phd,Secretary/administrator,"[food, archeology, art]","{'lat': 40.362447049660204, 'lon': 18.14225129...",18
3,male,Michael Mitchell,1981-08-22,True,[bulgarian],46,bachelor,"Lighting technician, broadcasting/film/video","[countryside, rafting, art]","{'lat': 40.36584086550253, 'lon': 18.183002910...",18
4,male,John Bailey,1971-10-09,True,"[deutsche, french]",30,master,Estate manager/land agent,"[countryside, tracking, beer]","{'lat': 40.354724889812935, 'lon': 18.20308322...",22
5,male,Justin Allen,1987-10-04,True,"[deutsche, dutch, bulgarian]",27,middle-school,Microbiologist,[],"{'lat': 40.35854623058413, 'lon': 18.183459879...",4
6,male,Jose Russell,1995-01-09,True,"[english, chinese, french]",34,middle-school,"Pharmacist, community","[rafting, art, sport]","{'lat': 40.35774331848761, 'lon': 18.163273185...",1
7,male,Jack Smith,1981-07-28,True,"[spanish, dutch, french, italian]",33,bachelor,Theatre manager,"[museums, art, countryside, wine]","{'lat': 40.365193043282794, 'lon': 18.18548169...",5
8,male,Victor Gonzalez,2002-09-04,True,"[bulgarian, english, chinese]",31,middle-school,Geochemist,"[literature, cinema, music]","{'lat': 40.35265552781134, 'lon': 18.151723396...",4
9,male,John Brown,1990-02-18,True,"[chinese, bulgarian, dutch, english]",37,phd,"Development worker, international aid",[museums],"{'lat': 40.33774657696426, 'lon': 18.182064664...",15


In [26]:
tourist_df = pd.DataFrame(tourists)
# viewing first 10 tourists
tourist_df[:10]

Unnamed: 0,languages,keywords
0,[spanish],"[rafting, countryside, literature]"
1,[chinese],"[art, wine, literature, sport]"
2,[italian],"[sport, rafting, cinema, museums, music, art]"
3,[spanish],[beer]
4,[french],"[tracking, food, art, countryside, rafting, hi..."
5,[bulgarian],[countryside]
6,[italian],"[literature, cinema, music, archeology, art, m..."
7,[spanish],"[wine, music]"
8,[english],"[archeology, art, wine, cinema, music, rafting..."
9,[bulgarian],"[literature, sport, tracking]"


In [27]:
# visualize guides' rating
# 'tourist_id' gives 'rating' to 'guide_id'
guides_rating_df = pd.DataFrame(guides_ratings)
guides_rating_df.rename(columns={guides_rating_df.columns[0]: 'tourist_id',guides_rating_df.columns[1]: 'guide_id',guides_rating_df.columns[2]: 'rating'}, inplace=False)

Unnamed: 0,tourist_id,guide_id,rating
0,0,36,4.0
1,0,41,3.0
2,1,45,5.0
3,2,48,4.0
4,2,47,4.0
...,...,...,...
737,496,48,4.0
738,497,7,5.0
739,498,48,4.0
740,498,23,5.0


In [28]:
# dataframe of organized tours
tours_df = pd.DataFrame(tours)
tours_df

Unnamed: 0,guide,languages,city,attractions,keywords,price,date,duration
0,0,[english],Lecce,"[Celestine Convent, Roman Theatre]","[archeology, literature]",22,2024-06-30,12
1,1,"[italian, dutch]",Lecce,[Museo Faggiano],[museums],28,2024-06-26,9
2,2,"[chinese, french, english]",Lecce,"[Palazzo Carafa, Church of Santa Chiara]","[history, literature]",27,2024-06-03,11
3,3,[bulgarian],Lecce,"[Villa Comunale di Lecce, Church of San Matteo...","[countryside, literature, museums]",31,2024-06-29,11
4,4,"[deutsche, french]",Lecce,"[Torre del Parco, Palazzo dei Celestini]",[history],31,2024-06-17,12
5,5,"[deutsche, dutch, bulgarian]",Lecce,"[Museo Faggiano, Palazzo dei Celestini]","[history, museums]",26,2024-06-29,7
6,6,"[english, chinese, french]",Lecce,"[Piazza Sant'Oronzo, Villa Comunale di Lecce]","[art, countryside]",38,2024-06-28,10
7,7,"[spanish, dutch, french, italian]",Lecce,[Celestine Convent],[literature],34,2024-07-01,7
8,8,"[bulgarian, english, chinese]",Lecce,"[San Giovanni Battista Church, Roman Theatre, ...","[archeology, history, literature]",25,2024-06-10,11
9,9,"[chinese, bulgarian, dutch, english]",Lecce,[Palazzo dei Celestini],[history],29,2024-06-16,8


In [29]:
# rating to organized tours
tours_rating_df = pd.DataFrame(tours_ratings)
tours_rating_df.rename(columns={tours_rating_df.columns[0]: 'tourist_id',tours_rating_df.columns[1]: 'tour_id',guides_rating_df.columns[2]: 'rating'}, inplace=False)

Unnamed: 0,tourist_id,tour_id,rating
0,0,17,4.0
1,0,33,4.0
2,1,2,5.0
3,1,43,3.0
4,2,44,4.0
...,...,...,...
666,495,28,4.0
667,496,29,4.0
668,497,19,4.0
669,497,26,4.0


## Generate files in format .csv

In [30]:
# saving dataframes to files for later use
tourist_df.to_csv("Data/tourists_{:d}.csv".format(n_tourists), index=True, sep=';')
guide_df.to_csv("Data/guides_{:d}.csv".format(n_guides), index=True, sep=';')
tours_df.to_csv('Data/tours_{:d}.csv'.format(n_tours), index=True, sep=';')

In [31]:
# saving ratings
guides_rating_df.to_csv('Data/guides_ratings_{:d}_{:d}.csv'.format(n_tourists,n_guides), index=False, sep=';')
tours_rating_df.to_csv('Data/tours_ratings_{:d}_{:d}.csv'.format(n_tourists,n_tours), index=False, sep=';')