# Fake data generation

In [105]:
import numpy as np
import pandas as pd
import random

from faker import Faker

In [106]:
# Set random seed
seed = 666

random.seed(seed)
faker = Faker(seed=seed)
np.random.seed(seed)

## Attributes definition

In [107]:
languages = ['english', 'italian', 'french', 'spanish', 'deutsche', 'dutch', 'bulgarian', 'chinese']

In [108]:
keywords = [
    'archeology',
    'museums',
    'music',
    'art',
    'cinema',
    'countryside',
    'tracking',
    'rafting',
    'history',
    'literature',
    'sport',
    'food',
    'wine',
    'beer'
]

In [109]:
education = ['middle-school', 'high-school', 'bachelor', 'master', 'phd']

In [110]:
lecce_attractions = [
    ("Basilica di Santa Croce", "Historical Tour"),
    ("Piazza del Duomo", "City Tour"),
    ("Roman Amphitheatre", "Archaeological Tour"),
    ("Porta Napoli", "City Tour"),
    ("Castello di Carlo V", "Historical Tour"),
    ("Church of San Matteo", "Religious Tour"),
    ("Piazza Sant'Oronzo", "City Tour"),
    ("Church of Santa Chiara", "Religious Tour"),
    ("Roman Theatre", "Archaeological Tour"),
    ("Church of San Niccolò e Cataldo", "Religious Tour"),
    ("Palazzo dei Celestini", "Historical Tour"),
    ("Colonna di Sant'Oronzo", "City Tour"),
    ("Palazzo Carafa", "Historical Tour"),
    ("Museo Faggiano", "Museum Tour"),
    ("Torre del Parco", "Historical Tour"),
    ("Lecce Cathedral", "Religious Tour"),
    ("Celestine Convent", "Religious Tour"),
    ("Villa Comunale di Lecce", "Park Tour"),
    ("San Giovanni Battista Church", "Religious Tour"),
    ("Church of San Francesco della Scarpa", "Religious Tour")
]

In [111]:
keywords_mapping = {
    "Historical Tour": "history",
    "City Tour": "art",
    "Archaeological Tour": "archeology",
    "Religious Tour": "literature",
    "Museum Tour": "museums",
    "Park Tour": "countryside"
}

#lecce_attractions_with_keywords = [(attraction, keywords_mapping[tour_type]) for attraction, tour_type in lecce_attractions]
#lecce_attractions_with_keywords

In [112]:
lecce_attractions_dict = [{'name' : name, 'keywords' : keywords_mapping[tour_type]} for name, tour_type in lecce_attractions]
lecce_attractions_dict

[{'name': 'Basilica di Santa Croce', 'keywords': 'history'},
 {'name': 'Piazza del Duomo', 'keywords': 'art'},
 {'name': 'Roman Amphitheatre', 'keywords': 'archeology'},
 {'name': 'Porta Napoli', 'keywords': 'art'},
 {'name': 'Castello di Carlo V', 'keywords': 'history'},
 {'name': 'Church of San Matteo', 'keywords': 'literature'},
 {'name': "Piazza Sant'Oronzo", 'keywords': 'art'},
 {'name': 'Church of Santa Chiara', 'keywords': 'literature'},
 {'name': 'Roman Theatre', 'keywords': 'archeology'},
 {'name': 'Church of San Niccolò e Cataldo', 'keywords': 'literature'},
 {'name': 'Palazzo dei Celestini', 'keywords': 'history'},
 {'name': "Colonna di Sant'Oronzo", 'keywords': 'art'},
 {'name': 'Palazzo Carafa', 'keywords': 'history'},
 {'name': 'Museo Faggiano', 'keywords': 'museums'},
 {'name': 'Torre del Parco', 'keywords': 'history'},
 {'name': 'Lecce Cathedral', 'keywords': 'literature'},
 {'name': 'Celestine Convent', 'keywords': 'literature'},
 {'name': 'Villa Comunale di Lecce', 'k

## Data generation functions

In [113]:
import random
from datetime import datetime, timedelta

def generate_random_date(start_date, end_date):
    # Convert start and end dates to datetime objects
    start_dt = datetime.strptime(start_date, '%Y-%m-%d')
    end_dt = datetime.strptime(end_date, '%Y-%m-%d')

    # Calculate the range of days between start and end dates
    delta = end_dt - start_dt

    # Generate a random number of days within the range
    random_days = random.randint(0, delta.days)

    # Add the random number of days to the start date
    random_date = start_dt + timedelta(days=random_days)

    return random_date.strftime('%Y-%m-%d')

# Example usage
start_date = '2024-06-01'
end_date = '2024-07-01'
random_date = generate_random_date(start_date, end_date)
print("Random date between", start_date, "and", end_date, ":", random_date)

Random date between 2024-06-01 and 2024-07-01 : 2024-06-15


In [114]:
# guide profile generation
def generate_guide():
    
    guide = {}
    
    guide['gender'] = np.random.choice(['male', 'female'], size=1).item()
    guide['name'] = (faker.first_name_male() if guide['gender'] == 'male' else faker.first_name_female()) + ' ' + faker.last_name()
    guide['birth_date'] = faker.date_of_birth(maximum_age=65, minimum_age=20)
    guide['now_available'] = True
    guide['languages_spoken'] = list(np.random.choice(languages, size=1+np.random.poisson(lam=1, size=1).item(), replace=False))
    guide['price'] = int(max(10, np.abs(np.random.normal(30, 5))))
    guide['education'] = np.random.choice(education, size=1).item()
    guide['biography'] = faker.profile()['job']
    guide['keywords'] = list(np.random.choice(keywords, size=np.random.poisson(2, size=1).item(), replace=False))
    guide['current_location'] = {
        "lat": np.random.normal(40.3524, 0.01),
        "lon": np.random.normal(18.1732, 0.01)
    }
    guide['experience'] = random.randint(1,2024-guide['birth_date'].year-18)
    
    return guide

In [115]:
# user profile generation
def generate_tourist():

    tourist = {}

    tourist['languages'] = list(np.random.choice(languages, size=1, replace=False))
    tourist['keywords'] = list(np.random.choice(keywords, size=np.random.poisson(5, size=1).item(), replace=False))
    
    return tourist

In [82]:
random.sample(lecce_attractions_dict, random.randint(1,3))

[{'name': 'Museo Faggiano', 'keywords': 'museums'},
 {'name': 'Church of San Niccolò e Cataldo', 'keywords': 'literature'}]

In [116]:
def generate_tours(guide, index):
    
    tours = {}

    tours['guide'] = index
    tours['languages'] = guide['languages_spoken']
    tours['city'] = 'Lecce'
    # randomly choose 1-3 Lecce's attractions
    attractions = random.sample(lecce_attractions_dict, random.randint(1,3))
    tours['attractions'] = list(attraction['name'] for attraction in attractions)
    tours['keywords'] = list(np.unique([attraction['keywords'] for attraction in attractions]))
    tours['price'] = int(max(10, np.abs(np.random.normal(30, 5))))
    tours['date'] = generate_random_date(start_date, end_date)
    tours['duration'] = random.randint(5,12)

    return tours
    

In [117]:
import random 

# rating generation
def generate_rating_guides (tourists, guides):
    ratings = []
    for i in range(len(tourists)):
        
        # random choose guides from intersection of languages
        intersect_guides = [index for index in range(len(guides)) if np.all(np.in1d(tourists[i]['languages'], guides[index]['languages_spoken']))]
        other_guides = [index for index in range(len(guides)) if index not in intersect_guides]
        
        # random choose 3 guides without duplicates
        random_guides = np.random.choice(intersect_guides,2, replace=False)
        random_guide = np.random.choice(other_guides,1)
        
        # random choose 1 or 2 guides to rate
        for j in range(np.random.randint(1,3)):

            rating = round(np.random.normal(loc=4, scale=0.5, size=1).item(),0)
            ratings.append((i,random_guides[j],rating))

#         # random choose if rate badly
# #        if(np.random.binomial(1, 0.2)):
#             bad_rating = round(np.random.normal(loc=1.5, scale=0.3, size=1).item(),0)
#             # rate the last guide with bad rating
#             ratings.append((i,random_guide[0],bad_rating))
#
    return ratings

In [118]:
import random 

# rating generation
def generate_rating_tours (tourists, tours):
    ratings = []
    for i in range(len(tourists)):
        # random choose guides from intersection of languages
        intersect_tours = [index for index in range(len(tours)) if np.any(np.in1d(tourists[i]['keywords'], tours[index]['keywords']))]
        other_tours = [index for index in range(len(tours)) if index not in intersect_tours]

        # if not empty
        if(len(intersect_tours)!=0):
            # random choose 2 guides without duplicates
            random_tours = np.random.choice(intersect_tours, 2, replace=False)
            
            # random choose 1 or 2 guides to rate
            for j in range(np.random.randint(1,3)):
    
                rating = round(np.random.normal(loc=4, scale=0.5, size=1).item(),0)
                ratings.append((i,random_tours[j],rating))
        
    return ratings

## Dataset generation

In [119]:
# number of users (tourists and guides) 
n_tourists = 200
n_guides = 40

In [120]:
# generating user profiles
guides = [generate_guide() for _ in range(n_guides)]
tourists = [generate_tourist() for _ in range(n_tourists)]

In [121]:
# randomly generate ratings given by tourists to guides
guides_ratings = generate_rating_guides(tourists,guides)
guides_ratings

[(0, 10, 4.0),
 (0, 5, 4.0),
 (1, 9, 3.0),
 (1, 5, 4.0),
 (2, 28, 4.0),
 (3, 15, 5.0),
 (3, 19, 5.0),
 (4, 34, 5.0),
 (4, 16, 5.0),
 (5, 37, 4.0),
 (6, 5, 3.0),
 (6, 9, 4.0),
 (7, 34, 3.0),
 (7, 8, 4.0),
 (8, 24, 4.0),
 (9, 1, 4.0),
 (9, 5, 3.0),
 (10, 18, 4.0),
 (11, 35, 4.0),
 (12, 34, 4.0),
 (12, 39, 4.0),
 (13, 2, 4.0),
 (13, 29, 4.0),
 (14, 27, 4.0),
 (15, 28, 4.0),
 (15, 16, 3.0),
 (16, 16, 4.0),
 (17, 8, 3.0),
 (18, 34, 3.0),
 (18, 12, 3.0),
 (19, 39, 5.0),
 (20, 6, 2.0),
 (20, 2, 4.0),
 (21, 9, 5.0),
 (21, 16, 4.0),
 (22, 34, 4.0),
 (23, 29, 5.0),
 (23, 23, 4.0),
 (24, 33, 3.0),
 (25, 4, 4.0),
 (26, 10, 5.0),
 (27, 16, 4.0),
 (27, 4, 4.0),
 (28, 5, 4.0),
 (29, 30, 4.0),
 (29, 35, 4.0),
 (30, 20, 5.0),
 (30, 35, 3.0),
 (31, 32, 4.0),
 (31, 3, 4.0),
 (32, 8, 4.0),
 (32, 10, 4.0),
 (33, 22, 3.0),
 (34, 22, 4.0),
 (35, 30, 4.0),
 (35, 31, 4.0),
 (36, 36, 3.0),
 (36, 16, 4.0),
 (37, 13, 4.0),
 (37, 6, 4.0),
 (38, 18, 4.0),
 (39, 10, 4.0),
 (39, 4, 4.0),
 (40, 22, 3.0),
 (41, 32, 4.0

In [122]:
tours = []
for i in range(len(guides)):
    tours.append(generate_tours(guides[i], i))

tours

[{'guide': 0,
  'languages': ['english'],
  'city': 'Lecce',
  'attractions': ['Church of San Matteo', 'Roman Theatre'],
  'keywords': ['archeology', 'literature'],
  'price': 38,
  'date': '2024-06-27',
  'duration': 12},
 {'guide': 1,
  'languages': ['italian', 'dutch'],
  'city': 'Lecce',
  'attractions': ["Piazza Sant'Oronzo", 'Torre del Parco'],
  'keywords': ['art', 'history'],
  'price': 27,
  'date': '2024-06-02',
  'duration': 11},
 {'guide': 2,
  'languages': ['chinese', 'french', 'english'],
  'city': 'Lecce',
  'attractions': ["Colonna di Sant'Oronzo", 'Celestine Convent'],
  'keywords': ['art', 'literature'],
  'price': 29,
  'date': '2024-06-09',
  'duration': 12},
 {'guide': 3,
  'languages': ['bulgarian'],
  'city': 'Lecce',
  'attractions': ['Museo Faggiano'],
  'keywords': ['museums'],
  'price': 30,
  'date': '2024-06-26',
  'duration': 9},
 {'guide': 4,
  'languages': ['deutsche', 'french'],
  'city': 'Lecce',
  'attractions': ['Palazzo Carafa', 'Church of Santa Chi

In [123]:
n_tours = len(tours)

In [124]:
tourists[0]

{'languages': ['bulgarian'],
 'keywords': ['beer',
  'sport',
  'cinema',
  'tracking',
  'rafting',
  'music',
  'art']}

In [125]:
tours[0]

{'guide': 0,
 'languages': ['english'],
 'city': 'Lecce',
 'attractions': ['Church of San Matteo', 'Roman Theatre'],
 'keywords': ['archeology', 'literature'],
 'price': 38,
 'date': '2024-06-27',
 'duration': 12}

In [126]:
np.any(np.in1d(tourists[0]['keywords'], tours[0]['keywords']))

False

In [127]:
tours_ratings= generate_rating_tours(tourists, tours)
tours_ratings

[(0, 1, 4.0),
 (1, 24, 5.0),
 (2, 20, 5.0),
 (2, 24, 3.0),
 (4, 30, 4.0),
 (5, 13, 4.0),
 (5, 8, 4.0),
 (6, 15, 5.0),
 (7, 24, 4.0),
 (7, 29, 4.0),
 (8, 0, 4.0),
 (9, 5, 5.0),
 (9, 17, 4.0),
 (10, 5, 4.0),
 (11, 2, 4.0),
 (11, 38, 4.0),
 (12, 7, 4.0),
 (13, 34, 5.0),
 (13, 30, 4.0),
 (14, 19, 4.0),
 (16, 20, 4.0),
 (16, 18, 4.0),
 (17, 39, 3.0),
 (18, 33, 5.0),
 (20, 36, 4.0),
 (20, 21, 4.0),
 (21, 12, 4.0),
 (22, 22, 5.0),
 (23, 2, 4.0),
 (24, 37, 4.0),
 (25, 18, 4.0),
 (25, 16, 3.0),
 (27, 5, 4.0),
 (28, 38, 5.0),
 (29, 1, 4.0),
 (29, 8, 4.0),
 (30, 27, 4.0),
 (30, 10, 4.0),
 (32, 34, 5.0),
 (33, 6, 4.0),
 (34, 15, 3.0),
 (34, 19, 5.0),
 (35, 22, 4.0),
 (37, 26, 5.0),
 (38, 6, 5.0),
 (38, 11, 4.0),
 (39, 30, 5.0),
 (40, 7, 3.0),
 (40, 9, 4.0),
 (42, 16, 3.0),
 (42, 36, 4.0),
 (44, 2, 5.0),
 (45, 11, 5.0),
 (46, 0, 4.0),
 (46, 16, 4.0),
 (47, 13, 4.0),
 (48, 22, 4.0),
 (48, 0, 4.0),
 (49, 25, 4.0),
 (49, 32, 4.0),
 (50, 32, 4.0),
 (50, 10, 5.0),
 (51, 30, 5.0),
 (52, 1, 3.0),
 (53, 2,

### Dataframes creation

In [128]:
guide_df = pd.DataFrame(guides)
# viewing first 10 guides
guide_df[:10]

Unnamed: 0,gender,name,birth_date,now_available,languages_spoken,price,education,biography,keywords,current_location,experience
0,male,Gary Baker,1984-07-08,True,[english],25,high-school,Astronomer,[museums],"{'lat': 40.342693584880706, 'lon': 18.16438078...",13
1,female,Lisa Swanson,1982-09-27,True,"[italian, dutch]",36,middle-school,"Designer, jewellery","[cinema, rafting, history, wine]","{'lat': 40.3367551413547, 'lon': 18.1569120995...",14
2,female,Carla Dillon,1995-01-04,True,"[chinese, french, english]",34,phd,Travel agency manager,"[food, archeology, art]","{'lat': 40.362447049660204, 'lon': 18.14225129...",5
3,male,Jesse Schaefer,1990-11-12,True,[bulgarian],46,bachelor,Financial planner,"[countryside, rafting, art]","{'lat': 40.36584086550253, 'lon': 18.183002910...",1
4,male,David Davis,1998-03-06,True,"[deutsche, french]",30,master,"Pilot, airline","[countryside, tracking, beer]","{'lat': 40.354724889812935, 'lon': 18.20308322...",6
5,male,Joseph James,1974-05-30,True,"[deutsche, dutch, bulgarian]",27,middle-school,"Designer, exhibition/display",[],"{'lat': 40.35854623058413, 'lon': 18.183459879...",8
6,male,Steven Schneider,1980-08-19,True,"[english, chinese, french]",34,middle-school,"Engineer, land","[rafting, art, sport]","{'lat': 40.35774331848761, 'lon': 18.163273185...",2
7,male,Reginald Berg,1971-01-06,True,"[spanish, dutch, french, italian]",33,bachelor,Financial risk analyst,"[museums, art, countryside, wine]","{'lat': 40.365193043282794, 'lon': 18.18548169...",10
8,male,Gregory Lee,1973-12-11,True,"[bulgarian, english, chinese]",31,middle-school,Audiological scientist,"[literature, cinema, music]","{'lat': 40.35265552781134, 'lon': 18.151723396...",25
9,male,Alex Perkins,1963-07-30,True,"[chinese, bulgarian, dutch, english]",37,phd,"Engineer, materials",[museums],"{'lat': 40.33774657696426, 'lon': 18.182064664...",41


In [129]:
tourist_df = pd.DataFrame(tourists)
# viewing first 10 tourists
tourist_df[:10]

Unnamed: 0,languages,keywords
0,[bulgarian],"[beer, sport, cinema, tracking, rafting, music..."
1,[bulgarian],"[history, food, tracking]"
2,[spanish],"[tracking, cinema, wine, literature, history]"
3,[deutsche],"[sport, tracking]"
4,[spanish],"[beer, tracking, food, literature, sport, muse..."
5,[dutch],"[tracking, food, wine, art, sport, music]"
6,[dutch],"[tracking, countryside, art, literature, food,..."
7,[bulgarian],"[rafting, literature, tracking, food, museums,..."
8,[bulgarian],"[rafting, literature]"
9,[dutch],"[museums, tracking, cinema, beer]"


In [130]:
guides_rating_df = pd.DataFrame(guides_ratings)
guides_rating_df.rename(columns={guides_rating_df.columns[0]: 'tourist_id',guides_rating_df.columns[1]: 'guide_id',guides_rating_df.columns[2]: 'rating'}, inplace=False)

Unnamed: 0,tourist_id,guide_id,rating
0,0,10,4.0
1,0,5,4.0
2,1,9,3.0
3,1,5,4.0
4,2,28,4.0
...,...,...,...
293,197,27,5.0
294,198,34,5.0
295,198,27,3.0
296,199,7,5.0


In [131]:
tours_df = pd.DataFrame(tours)
tours_df

Unnamed: 0,guide,languages,city,attractions,keywords,price,date,duration
0,0,[english],Lecce,"[Church of San Matteo, Roman Theatre]","[archeology, literature]",38,2024-06-27,12
1,1,"[italian, dutch]",Lecce,"[Piazza Sant'Oronzo, Torre del Parco]","[art, history]",27,2024-06-02,11
2,2,"[chinese, french, english]",Lecce,"[Colonna di Sant'Oronzo, Celestine Convent]","[art, literature]",29,2024-06-09,12
3,3,[bulgarian],Lecce,[Museo Faggiano],[museums],30,2024-06-26,9
4,4,"[deutsche, french]",Lecce,"[Palazzo Carafa, Church of Santa Chiara]","[history, literature]",30,2024-06-03,11
5,5,"[deutsche, dutch, bulgarian]",Lecce,"[Villa Comunale di Lecce, Church of San Matteo...","[countryside, literature, museums]",35,2024-06-29,11
6,6,"[english, chinese, french]",Lecce,"[Torre del Parco, Palazzo dei Celestini]",[history],37,2024-06-17,12
7,7,"[spanish, dutch, french, italian]",Lecce,"[Museo Faggiano, Palazzo dei Celestini]","[history, museums]",28,2024-06-29,7
8,8,"[bulgarian, english, chinese]",Lecce,"[Piazza Sant'Oronzo, Villa Comunale di Lecce]","[art, countryside]",38,2024-06-28,10
9,9,"[chinese, bulgarian, dutch, english]",Lecce,[Celestine Convent],[literature],34,2024-07-01,7


In [132]:
tours_rating_df = pd.DataFrame(tours_ratings)
tours_rating_df.rename(columns={tours_rating_df.columns[0]: 'tourist_id',tours_rating_df.columns[1]: 'tour_id',guides_rating_df.columns[2]: 'rating'}, inplace=False)

Unnamed: 0,tourist_id,tour_id,rating
0,0,1,4.0
1,1,24,5.0
2,2,20,5.0
3,2,24,3.0
4,4,30,4.0
...,...,...,...
257,197,35,4.0
258,197,1,4.0
259,198,1,5.0
260,199,29,5.0


Generate files in format .csv

In [133]:
# saving dataframes to files for later use
tourist_df.to_csv("Data/tourists_{:d}.csv".format(n_tourists), index=True, sep=';')
guide_df.to_csv("Data/guides_{:d}.csv".format(n_guides), index=True, sep=';')
guides_rating_df.to_csv('Data/guides_ratings_{:d}_{:d}.csv'.format(n_tourists,n_guides), index=False, sep=';')

In [134]:
tours_df.to_csv('Data/tours_{:d}.csv'.format(n_tours), index=True, sep=';')
tours_rating_df.to_csv('Data/tours_ratings_{:d}_{:d}.csv'.format(n_tourists,n_tours), index=False, sep=';')