# Fake data generation

In [2]:
from faker import Faker
import numpy as np
import pandas as pd
import random

In [3]:
# Set random seed
seed = 666

random.seed(seed)
faker = Faker(seed=seed)
np.random.seed(seed)

## Attributes definition

In [4]:
languages = ['english', 'italian', 'french', 'spanish', 'deutsche', 'dutch', 'bulgarian', 'chinese']

In [5]:
keywords = [
    'archeology',
    'museums',
    'music',
    'art',
    'cinema',
    'countryside',
    'tracking',
    'rafting',
    'history',
    'literature',
    'sport',
    'food',
    'wine',
    'beer'
]

In [6]:
education = ['middle-school', 'high-school', 'bachelor', 'master', 'phd']

In [7]:
lecce_attractions = [
    ("Basilica di Santa Croce", "Historical Tour"),
    ("Piazza del Duomo", "City Tour"),
    ("Roman Amphitheatre", "Archaeological Tour"),
    ("Porta Napoli", "City Tour"),
    ("Castello di Carlo V", "Historical Tour"),
    ("Church of San Matteo", "Religious Tour"),
    ("Piazza Sant'Oronzo", "City Tour"),
    ("Church of Santa Chiara", "Religious Tour"),
    ("Roman Theatre", "Archaeological Tour"),
    ("Church of San Niccolò e Cataldo", "Religious Tour"),
    ("Palazzo dei Celestini", "Historical Tour"),
    ("Colonna di Sant'Oronzo", "City Tour"),
    ("Palazzo Carafa", "Historical Tour"),
    ("Museo Faggiano", "Museum Tour"),
    ("Torre del Parco", "Historical Tour"),
    ("Lecce Cathedral", "Religious Tour"),
    ("Celestine Convent", "Religious Tour"),
    ("Villa Comunale di Lecce", "Park Tour"),
    ("San Giovanni Battista Church", "Religious Tour"),
    ("Church of San Francesco della Scarpa", "Religious Tour")
]

In [19]:
keywords_mapping = {
    "Historical Tour": "history",
    "City Tour": "art",
    "Archaeological Tour": "archeology",
    "Religious Tour": "literature",
    "Museum Tour": "museums",
    "Park Tour": "countryside"
}

#lecce_attractions_with_keywords = [(attraction, keywords_mapping[tour_type]) for attraction, tour_type in lecce_attractions]
#lecce_attractions_with_keywords

In [23]:
lecce_attractions_dict = [{'name' : name, 'keywords' : keywords_mapping[tour_type]} for name, tour_type in lecce_attractions]
lecce_attractions_dict

[{'name': 'Basilica di Santa Croce', 'keywords': 'history'},
 {'name': 'Piazza del Duomo', 'keywords': 'art'},
 {'name': 'Roman Amphitheatre', 'keywords': 'archeology'},
 {'name': 'Porta Napoli', 'keywords': 'art'},
 {'name': 'Castello di Carlo V', 'keywords': 'history'},
 {'name': 'Church of San Matteo', 'keywords': 'literature'},
 {'name': "Piazza Sant'Oronzo", 'keywords': 'art'},
 {'name': 'Church of Santa Chiara', 'keywords': 'literature'},
 {'name': 'Roman Theatre', 'keywords': 'archeology'},
 {'name': 'Church of San Niccolò e Cataldo', 'keywords': 'literature'},
 {'name': 'Palazzo dei Celestini', 'keywords': 'history'},
 {'name': "Colonna di Sant'Oronzo", 'keywords': 'art'},
 {'name': 'Palazzo Carafa', 'keywords': 'history'},
 {'name': 'Museo Faggiano', 'keywords': 'museums'},
 {'name': 'Torre del Parco', 'keywords': 'history'},
 {'name': 'Lecce Cathedral', 'keywords': 'literature'},
 {'name': 'Celestine Convent', 'keywords': 'literature'},
 {'name': 'Villa Comunale di Lecce', 'k

## Data generation functions

In [27]:
import random
from datetime import datetime, timedelta

def generate_random_date(start_date, end_date):
    # Convert start and end dates to datetime objects
    start_dt = datetime.strptime(start_date, '%Y-%m-%d')
    end_dt = datetime.strptime(end_date, '%Y-%m-%d')

    # Calculate the range of days between start and end dates
    delta = end_dt - start_dt

    # Generate a random number of days within the range
    random_days = random.randint(0, delta.days)

    # Add the random number of days to the start date
    random_date = start_dt + timedelta(days=random_days)

    return random_date.strftime('%Y-%m-%d')

# Example usage
start_date = '2024-06-01'
end_date = '2024-07-01'
random_date = generate_random_date(start_date, end_date)
print("Random date between", start_date, "and", end_date, ":", random_date)

Random date between 2024-06-01 and 2024-07-01 : 2024-06-15


In [24]:
# guide profile generation
def generate_guide():
    
    guide = {}
    
    guide['gender'] = np.random.choice(['male', 'female'], size=1).item()
    guide['name'] = (faker.first_name_male() if guide['gender'] == 'male' else faker.first_name_female()) + ' ' + faker.last_name()
    guide['birth_date'] = faker.date_of_birth(maximum_age=65, minimum_age=20)
    guide['now_available'] = True
    guide['languages_spoken'] = list(np.random.choice(languages, size=1+np.random.poisson(lam=1, size=1).item(), replace=False))
    guide['price'] = int(max(10, np.abs(np.random.normal(30, 5))))
    guide['education'] = np.random.choice(education, size=1).item()
    guide['biography'] = faker.profile()['job']
    guide['keywords'] = list(np.random.choice(keywords, size=np.random.poisson(2, size=1).item(), replace=False))
    guide['current_location'] = {
        "lat": np.random.normal(40.3524, 0.01),
        "lon": np.random.normal(18.1732, 0.01)
    }
    guide['experience'] = random.randint(1,2024-guide['birth_date'].year-18)
    
    return guide

In [160]:
# user profile generation
def generate_tourist():

    tourist = {}

    tourist['languages'] = list(np.random.choice(languages, size=1, replace=False))
    tourist['keywords'] = list(np.random.choice(keywords, size=np.random.poisson(5, size=1).item(), replace=False))
    
    return tourist

In [105]:
random.sample(lecce_attractions_dict, random.randint(1,3))

[{'name': 'Church of Santa Chiara', 'keywords': 'literature'}]

In [173]:
def generate_tours(guide):
    
    tours = {}

    tours['guide'] = guide
    tours['languages'] = guide['languages_spoken']
    tours['city'] = 'Lecce'
    # randomly choose 1-3 Lecce's attractions
    attractions = random.sample(lecce_attractions_dict, random.randint(1,3))
    tours['attractions'] = [attraction['name'] for attraction in attractions]
    tours['keywords'] = np.unique([attraction['keywords'] for attraction in attractions])
    tours['price'] = int(max(10, np.abs(np.random.normal(30, 5))))
    tours['date'] = generate_random_date(start_date, end_date)
    tours['duration'] = random.randint(5,12)

    return tours
    

In [120]:
import random 

# rating generation
def generate_rating_guides (tourists, guides):
    ratings = []
    for i in range(len(tourists)):
        
        # random choose guides from intersection of languages
        intersect_guides = [index for index in range(len(guides)) if np.all(np.in1d(tourists[i]['languages'], guides[index]['languages_spoken']))]
        other_guides = [index for index in range(len(guides)) if index not in intersect_guides]
        
        # random choose 3 guides without duplicates
        random_guides = np.random.choice(intersect_guides,2, replace=False)
        random_guide = np.random.choice(other_guides,1)
        
        # random choose 1 or 2 guides to rate
        for j in range(np.random.randint(1,3)):

            rating = round(np.random.normal(loc=4, scale=0.5, size=1).item(),0)
            ratings.append((i,random_guides[j],rating))

#         # random choose if rate badly
# #        if(np.random.binomial(1, 0.2)):
#             bad_rating = round(np.random.normal(loc=1.5, scale=0.3, size=1).item(),0)
#             # rate the last guide with bad rating
#             ratings.append((i,random_guide[0],bad_rating))
#
    return ratings

In [209]:
import random 

# rating generation
def generate_rating_tours (tourists, tours):
    ratings = []
    for i in range(len(tourists)):
        # random choose guides from intersection of languages
        intersect_tours = [index for index in range(len(tours)) if np.any(np.in1d(tourists[i]['keywords'], tours[index]['keywords']))]
        other_tours = [index for index in range(len(tours)) if index not in intersect_tours]

        # if not empty
        if(len(intersect_tours)!=0):
            # random choose 2 guides without duplicates
            random_tours = np.random.choice(intersect_tours, 2, replace=False)
            
            # random choose 1 or 2 guides to rate
            for j in range(np.random.randint(1,3)):
    
                rating = round(np.random.normal(loc=4, scale=0.5, size=1).item(),0)
                ratings.append((i,random_tours[j],rating))
        
    return ratings

## Dataset generation

In [182]:
# number of users (tourists and guides) 
n_tourists = 200
n_guides = 40

In [183]:
# generating user profiles
guides = [generate_guide() for _ in range(n_guides)]
tourists = [generate_tourist() for _ in range(n_tourists)]

In [205]:
# randomly generate ratings given by tourists to guides
guides_ratings = generate_rating_guides(tourists,guides)
guides_ratings

[(0, 10, 4.0),
 (1, 39, 4.0),
 (2, 38, 4.0),
 (3, 6, 4.0),
 (3, 36, 4.0),
 (4, 8, 4.0),
 (5, 13, 4.0),
 (5, 33, 4.0),
 (6, 17, 3.0),
 (6, 16, 4.0),
 (7, 8, 4.0),
 (8, 11, 5.0),
 (9, 26, 4.0),
 (9, 35, 3.0),
 (10, 28, 5.0),
 (11, 37, 3.0),
 (12, 19, 5.0),
 (12, 12, 4.0),
 (13, 15, 3.0),
 (14, 39, 5.0),
 (14, 0, 3.0),
 (15, 38, 4.0),
 (15, 39, 4.0),
 (16, 32, 4.0),
 (16, 3, 4.0),
 (17, 6, 4.0),
 (18, 13, 3.0),
 (18, 22, 4.0),
 (19, 14, 4.0),
 (19, 20, 4.0),
 (20, 19, 3.0),
 (20, 20, 4.0),
 (21, 32, 4.0),
 (22, 32, 4.0),
 (23, 4, 4.0),
 (23, 20, 5.0),
 (24, 20, 3.0),
 (24, 13, 4.0),
 (25, 10, 4.0),
 (25, 23, 4.0),
 (26, 26, 4.0),
 (26, 13, 5.0),
 (27, 29, 4.0),
 (27, 19, 3.0),
 (28, 17, 4.0),
 (28, 14, 4.0),
 (29, 20, 4.0),
 (29, 25, 4.0),
 (30, 23, 4.0),
 (31, 17, 3.0),
 (31, 6, 4.0),
 (32, 14, 4.0),
 (32, 23, 4.0),
 (33, 8, 5.0),
 (33, 30, 3.0),
 (34, 32, 4.0),
 (35, 17, 4.0),
 (36, 3, 3.0),
 (36, 24, 4.0),
 (37, 7, 4.0),
 (37, 30, 3.0),
 (38, 30, 4.0),
 (38, 35, 4.0),
 (39, 24, 4.0),
 

In [192]:
tours = []
for guide in guides:
    tours.append(generate_tours(guide))

tours

[{'guide': {'gender': 'male',
   'name': 'Scott Sanchez',
   'birth_date': datetime.date(1987, 4, 4),
   'now_available': True,
   'languages_spoken': ['spanish', 'french', 'italian'],
   'price': 34,
   'education': 'bachelor',
   'biography': 'Cartographer',
   'keywords': ['tracking'],
   'current_location': {'lat': 40.37088264862189, 'lon': 18.162287973839227},
   'experience': 9},
  'languages': ['spanish', 'french', 'italian'],
  'city': 'Lecce',
  'attractions': ['Church of San Niccolò e Cataldo'],
  'keywords': array(['literature'], dtype='<U10'),
  'price': 36,
  'date': '2024-06-17',
  'duration': 11},
 {'guide': {'gender': 'male',
   'name': 'Thomas Duran',
   'birth_date': datetime.date(1995, 11, 23),
   'now_available': True,
   'languages_spoken': ['italian'],
   'price': 27,
   'education': 'bachelor',
   'biography': 'Patent attorney',
   'keywords': ['food', 'history'],
   'current_location': {'lat': 40.35466861158162, 'lon': 18.18516627544087},
   'experience': 11},
 

In [213]:
n_tours = len(tours)

In [193]:
tourists[0]

{'languages': ['english'],
 'keywords': ['museums',
  'countryside',
  'archeology',
  'history',
  'literature',
  'music',
  'sport']}

In [194]:
tours[0]

{'guide': {'gender': 'male',
  'name': 'Scott Sanchez',
  'birth_date': datetime.date(1987, 4, 4),
  'now_available': True,
  'languages_spoken': ['spanish', 'french', 'italian'],
  'price': 34,
  'education': 'bachelor',
  'biography': 'Cartographer',
  'keywords': ['tracking'],
  'current_location': {'lat': 40.37088264862189, 'lon': 18.162287973839227},
  'experience': 9},
 'languages': ['spanish', 'french', 'italian'],
 'city': 'Lecce',
 'attractions': ['Church of San Niccolò e Cataldo'],
 'keywords': array(['literature'], dtype='<U10'),
 'price': 36,
 'date': '2024-06-17',
 'duration': 11}

In [203]:
np.any(np.in1d(tourists[0]['keywords'], tours[0]['keywords']))

True

In [210]:
tours_ratings= generate_rating_tours(tourists, tours)
tours_ratings

[(0, 30, 4.0),
 (1, 20, 4.0),
 (1, 31, 4.0),
 (3, 29, 4.0),
 (4, 15, 4.0),
 (4, 21, 5.0),
 (5, 25, 4.0),
 (6, 34, 4.0),
 (6, 14, 4.0),
 (7, 21, 4.0),
 (8, 31, 3.0),
 (9, 9, 4.0),
 (9, 25, 3.0),
 (10, 25, 4.0),
 (10, 30, 4.0),
 (11, 6, 4.0),
 (11, 0, 4.0),
 (12, 4, 4.0),
 (13, 35, 4.0),
 (13, 2, 4.0),
 (15, 1, 4.0),
 (15, 30, 3.0),
 (16, 10, 4.0),
 (16, 2, 4.0),
 (17, 37, 4.0),
 (18, 30, 5.0),
 (18, 2, 4.0),
 (19, 24, 4.0),
 (19, 38, 3.0),
 (20, 35, 4.0),
 (20, 38, 4.0),
 (22, 35, 3.0),
 (22, 2, 5.0),
 (23, 15, 3.0),
 (24, 32, 4.0),
 (24, 7, 4.0),
 (25, 35, 4.0),
 (25, 15, 3.0),
 (26, 27, 3.0),
 (26, 13, 3.0),
 (28, 27, 4.0),
 (29, 18, 4.0),
 (30, 33, 3.0),
 (30, 24, 3.0),
 (31, 15, 4.0),
 (31, 6, 5.0),
 (32, 5, 3.0),
 (33, 1, 3.0),
 (33, 25, 4.0),
 (34, 25, 4.0),
 (35, 38, 4.0),
 (35, 34, 4.0),
 (36, 12, 4.0),
 (37, 9, 4.0),
 (38, 11, 4.0),
 (38, 37, 4.0),
 (39, 8, 3.0),
 (40, 5, 3.0),
 (40, 13, 4.0),
 (41, 7, 4.0),
 (41, 20, 4.0),
 (42, 35, 4.0),
 (44, 34, 4.0),
 (45, 25, 4.0),
 (45, 

### Dataframes creation

In [14]:
guide_df = pd.DataFrame(guides)
# viewing first 10 guides
guide_df[:10]

Unnamed: 0,gender,name,birth_date,now_available,languages_spoken,price,education,biography,keywords,current_location,experience
0,male,Kevin Rodriguez,1967-11-01,True,[english],25,high-school,"Engineer, manufacturing",[museums],"{'lat': 40.342693584880706, 'lon': 18.16438078...",30
1,female,Diana Barnes,1965-09-19,True,"[italian, dutch]",36,middle-school,Animal technologist,"[cinema, rafting, history, wine]","{'lat': 40.3367551413547, 'lon': 18.1569120995...",25
2,female,Kristin Rogers,1978-10-12,True,"[chinese, french, english]",34,phd,"Editor, commissioning","[food, archeology, art]","{'lat': 40.362447049660204, 'lon': 18.14225129...",14
3,male,Jeremy Bowman,1992-05-30,True,[bulgarian],46,bachelor,Retail banker,"[countryside, rafting, art]","{'lat': 40.36584086550253, 'lon': 18.183002910...",5
4,male,Justin Lynch,1976-12-24,True,"[deutsche, french]",30,master,Secondary school teacher,"[countryside, tracking, beer]","{'lat': 40.354724889812935, 'lon': 18.20308322...",17
5,male,Charles Dunn,1988-06-19,True,"[deutsche, dutch, bulgarian]",27,middle-school,Oncologist,[],"{'lat': 40.35854623058413, 'lon': 18.183459879...",1
6,male,Mitchell Duncan,1987-12-29,True,"[english, chinese, french]",34,middle-school,Analytical chemist,"[rafting, art, sport]","{'lat': 40.35774331848761, 'lon': 18.163273185...",18
7,male,Charles Clarke,1990-07-17,True,"[spanish, dutch, french, italian]",33,bachelor,Associate Professor,"[museums, art, countryside, wine]","{'lat': 40.365193043282794, 'lon': 18.18548169...",11
8,male,Scott Sawyer,1992-06-10,True,"[bulgarian, english, chinese]",31,middle-school,Theatre manager,"[literature, cinema, music]","{'lat': 40.35265552781134, 'lon': 18.151723396...",2
9,male,Joseph Bradford,2004-04-17,True,"[chinese, bulgarian, dutch, english]",37,phd,"Therapist, occupational",[museums],"{'lat': 40.33774657696426, 'lon': 18.182064664...",1


In [15]:
tourist_df = pd.DataFrame(tourists)
# viewing first 10 tourists
tourist_df[:10]

Unnamed: 0,languages,keywords
0,[bulgarian],[cinema]
1,[deutsche],[]
2,[spanish],"[rafting, museums]"
3,[deutsche],"[cinema, history, food]"
4,[chinese],"[wine, rafting]"
5,[italian],"[literature, museums, countryside]"
6,[bulgarian],[music]
7,[chinese],"[wine, museums, literature]"
8,[chinese],"[food, countryside, archeology]"
9,[spanish],[history]


In [150]:
guides_rating_df = pd.DataFrame(guides_ratings)
guides_rating_df.rename(columns={guides_rating_df.columns[0]: 'tourist_id',guides_rating_df.columns[1]: 'guide_id',guides_rating_df.columns[2]: 'rating'}, inplace=False)

Unnamed: 0,tourist_id,guide_id,rating
0,0,24,4.0
1,1,25,3.0
2,2,15,4.0
3,2,24,4.0
4,3,5,5.0
...,...,...,...
294,197,34,4.0
295,197,12,4.0
296,198,26,4.0
297,198,37,4.0


In [212]:
tours_rating_df = pd.DataFrame(tours_ratings)
tours_rating_df.rename(columns={tours_rating_df.columns[0]: 'tourist_id',tours_rating_df.columns[1]: 'tour_id',guides_rating_df.columns[2]: 'rating'}, inplace=False)

Unnamed: 0,tourist_id,tour_id,rating
0,0,30,4.0
1,1,20,4.0
2,1,31,4.0
3,3,29,4.0
4,4,15,4.0
...,...,...,...
265,195,29,4.0
266,196,33,4.0
267,197,7,4.0
268,197,37,4.0


Generate files in format .csv

In [17]:
# saving dataframes to files for later use
tourist_df.to_csv("Data/tourists_{:d}.csv".format(n_tourists), index=True, sep=';')
guide_df.to_csv("Data/guides_{:d}.csv".format(n_guides), index=True, sep=';')
rating_df.to_csv('Data/ratings_{:d}_{:d}.csv'.format(n_tourists,n_guides), index=False, sep=';')

In [214]:
tours_rating_df.to_csv('Data/tours_ratings_{:d}_{:d}.csv'.format(n_tourists,n_tours), index=False, sep=';')