# In [1]:  (Jupyter notebook cell marker left over from the export)
# Based on the model developed by Elina Leblanc (University of Geneva)

import csv
import json
from collections import defaultdict

# Accumulators that are filled while LOPE_PLACES_ALL.csv is read. Only rows
# with a non-empty normalised place name contribute to them.
places_all = []  # Every normalised place name, duplicates included (one per row)
places_single = []  # Normalised place names without duplicates, in first-seen order
list_places = []  # One feature dict per row: GeoNames id, coordinates, names and links
list_plays = []  # [place name, dict describing the document where it appears] pairs
list_places_nbTimes = []  # [place name, occurrence count] pairs, one per row

def parse_float(value):
    """Convert a numeric string into a float.

    Before conversion the non-ASCII minus sign ``−`` is rewritten as an ASCII
    ``-`` and every comma is removed from the text.

    Args:
        value: The text to convert.

    Returns:
        The parsed number as a ``float``.

    Raises:
        ValueError: If the cleaned text is not a valid float literal.
    """
    # One pass over the string: map the special minus sign, delete commas.
    cleanup_table = str.maketrans({'−': '-', ',': None})
    cleaned = value.translate(cleanup_table)
    return float(cleaned)

# Read the CSV and fill the module-level accumulators (places_all,
# places_single, list_places_nbTimes, list_places, list_plays). Only rows with
# a non-empty normalised Spanish place name (column 13) are used.
with open('LOPE_PLACES_ALL.csv', encoding='utf-8') as f:
    csv_file = csv.reader(f)  # We parse the CSV file
    next(csv_file, None)  # We skip the header line; an empty file is tolerated

    # Set mirror of places_single so the "already seen?" test is a constant-time
    # lookup instead of a linear scan of the list for every row.
    seen_places = set(places_single)

    for line in csv_file:
        # csv.reader yields [] for blank lines (e.g. a trailing newline);
        # indexing such a row would raise IndexError.
        if not line:
            continue

        place_name = line[13]  # Place_names_normalised_spanish
        if place_name == '':
            continue

        # List with only the normalized names, plus its deduplicated twin
        places_all.append(place_name)
        if place_name not in seen_places:
            seen_places.add(place_name)
            places_single.append(place_name)

        id_place = line[14]  # GeoNames_ID

        # Still a ValueError for a bad cell, but now it says where the bad
        # value is instead of only echoing the unparsable text.
        try:
            occurrences = int(line[10])
        except ValueError as err:
            raise ValueError(
                f"LOPE_PLACES_ALL.csv line {csv_file.line_num}: invalid "
                f"occurrence count {line[10]!r} for place {place_name!r}"
            ) from err

        list_places_nbTimes.append([place_name, occurrences])  # Place_names, Occurrences

        # Coordinates stay None unless both cells are filled in and both parse.
        latitude = longitude = None
        if line[15] != '' and line[16] != '':
            try:
                latitude = parse_float(line[15])
                longitude = parse_float(line[16])
            except ValueError:
                # One unparsable cell invalidates the pair.
                latitude = longitude = None

        if latitude is not None and longitude is not None:
            # Longitude comes first in the coordinate pair.
            geometry = {
                'type': 'Point',
                'coordinates': [longitude, latitude]
            }
        else:
            geometry = None

        # Dictionary with information about the place
        place_info = {
            '@id': id_place,
            'uuid': 'https://www.geonames.org/' + id_place,
            'type': 'Feature',
            'properties': {
                'title': place_name,
                'occurrences': [{'value': occurrences}]
            },
            'geometry': geometry,
            'descriptions': [],
            # Column 7 is the subgenre (see the document record below), so it
            # must not be used as the toponym; the normalised Spanish name is.
            # NOTE(review): if the CSV has a column with the place name as
            # spelled in the text, confirm whether that one is preferred here.
            'names': [{'toponym': place_name, 'lang': 'es'}],
            'links': [
                # NOTE(review): this Pleiades identifier is the same for every
                # place; it looks like a template leftover -- confirm or remove.
                {'type': 'closeMatch', 'identifier': 'https://pleiades.stoa.org/places/746826'},
                {'type': 'closeMatch', 'identifier': 'http://www.geonames.org/' + id_place}
            ]
        }
        list_places.append(place_info)

        # List with information about the document where a place appears
        list_plays.append([
            place_name,
            {
                'relationType': 'gvp:aat2208_locus-setting_for',
                'title': line[1],  # Title
                'genre': line[6],  # Genre
                'subgenre': line[7],  # Subgenre
                'Microgenre': line[8],  # Microgenre
                'Book title': line[2],  # Publication title
                'Publication date': line[3],  # Publication date
                'Date': line[4],  # Composition date
                'Period': line[5],  # Period
                'Type': line[11],  # Type of place
                'occurrences': line[10]  # Occurrences (kept as the raw CSV text)
            }
        ])

# Keep a single feature per place title. setdefault only stores a title the
# first time it is met, so the earliest row for each place is the one retained.
list_places_deduplicated_dict = {}
for place in list_places:
    list_places_deduplicated_dict.setdefault(place['properties']['title'], place)

# Features in the order their titles were first seen
list_places_deduplicated = list(list_places_deduplicated_dict.values())

# Group the document records by place name: name -> list of document dicts
data = defaultdict(list)
for place_name, play_info in list_plays:
    data[place_name].append(play_info)

# Group the per-row occurrence counts by place name: name -> list of counts
number = defaultdict(list)
for place_name, count in list_places_nbTimes:
    number[place_name].append(count)

# The same grouping as a list of [name, [counts...]] pairs
list_places_nb = [[place_name, counts] for place_name, counts in number.items()]

# Enrich every deduplicated place with its documents and its total count
for place in list_places_deduplicated:
    place_title = place['properties']['title']

    # Documents in which the place appears (.get never creates a new key)
    documents = data.get(place_title)
    if documents is not None:
        place['relations'] = documents

    # Sum of the occurrence counts over all rows for this place
    counts = number.get(place_title)
    if counts is not None:
        total_occurrences = sum(counts)
        place['properties']['occurrences'] = [{'value': total_occurrences}]
        place['descriptions'] = [{'value': f"{total_occurrences} ocurrencia(s)"}]

# Write the deduplicated places to disk as pretty-printed JSON; non-ASCII
# characters are written as-is rather than escaped.
output_path = 'test.json'
with open(output_path, mode='w', encoding='utf-8') as out_file:
    json.dump(list_places_deduplicated, out_file, indent=4, ensure_ascii=False)

