In [10]:
"""Parser API KudaGo (parser_3, v.2.0, 19_04_2020)."""

"""Changes:"""
"""v.0 - unstructured, page of places/events collected"""
"""v.1.0 - places/events collected individually"""
"""v.2.0:"""
""" - 'get_data(url)' returns 'dict' (not 'str')"""
""" - elements loaded from web iteratively (int "bulk")"""

"""Add information to list of places given by parser_1"""
"""API information at 'https://docs.kudago.com/api'."""

"""Input: .json file with list structured as:"""
"""[{'id': id_1,'name': name_1}, {*place_2},...]"""
"""Output: .json file containing the full information"""
"""on places available from the KudaGo API."""
"""If data on some of places could not be collected their"""
""" 'id'`s will be placed in another (error).json file"""


from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import json
from json.decoder import JSONDecodeError

"""'get_data' input: url of API KudaGo page to parse"""
"""'get_data' output: data from url in 'dict' type"""

def get_data(url):
    """Download *url* and return its body decoded from JSON.

    Args:
        url: Address of the API page to fetch; passed unchanged to
            ``urlopen``.

    Returns:
        The decoded JSON document (a ``dict`` for a JSON object).

    Raises:
        urllib.error.HTTPError: The server answered with an error status.
        urllib.error.URLError: The request could not be performed.
        json.JSONDecodeError: The response body is not valid JSON.
    """
    # Failures are raised, never returned: returning an exception class
    # would let callers store it as if it were data.
    # The context manager closes the response even if decoding fails.
    with urlopen(url) as response:
        return json.loads(response.read())

# City whose data is processed (spb, msk, ...).
location = "spb"
# Number of elements loaded into the file in one iteration.
bulk = 10000

# An element id is inserted between these two URL parts.
source_url = "https://kudago.com/public-api/v1.4/events/"
expand_url = "/?lang=en&expand=images,location,dates"

# Input list location and file stem.
in_directory = "D:/Work/Data_files/KudaGo_database/events_1/"
in_file = "events_1_{}".format(location)

# Output location and the stems of the result / error files.
out_directory = "D:/Work/Data_files/KudaGo_database/events_2/"
res_file = "events_2_expand_{}".format(location)
error_file = "events_2_error_{}".format(location)

# Create both output files if they do not exist yet; append mode leaves
# any existing content untouched.
for _stem in (res_file, error_file):
    with open(out_directory + _stem + ".json", 'a'):
        pass

# Load the list of element stubs ({'id': ..., 'name': ...}) to expand.
with open(in_directory + in_file + ".json",
          'r', encoding="utf-8") as inf:
    init_list = json.load(inf)

with open(out_directory + res_file + ".json",
          'r+', encoding="utf-8") as ouf, \
        open(out_directory + error_file + ".json",
             'r+', encoding="utf-8") as errf:
    # Previously collected data; a freshly created file is empty and
    # therefore not valid JSON.
    try:
        elements_list = json.load(ouf)
    except JSONDecodeError:
        elements_list = []
    try:
        err_list = json.load(errf)
    except JSONDecodeError:
        err_list = []

    # Resume right after the last element that is already stored.
    start_elem = 0
    if elements_list:
        last_loaded_id = elements_list[-1]['id']
        for i, known in enumerate(init_list):
            if known['id'] == last_loaded_id:
                start_elem = i + 1
    # A real upper bound is required here: using -1 as the slice end
    # would silently drop the final element of the input list.
    end_elem = min(start_elem + bulk, len(init_list))

    to_process = init_list[start_elem:end_elem]
    to_execute = len(to_process)
    current = {}
    progress = 0
    persent_done = 0
    for elem in to_process:
        elem_url = (source_url + str(elem['id'])
                    + expand_url)
        if progress == 0:
            print("Source:\n", elem_url)
        try:
            current = get_data(elem_url)
            # Guard against get_data handing back an exception class
            # instead of raising it; such a value is not data and is
            # not JSON-serialisable.
            failed = isinstance(current, type)
        except Exception:
            # Best effort: one failing element must not abort the run.
            # KeyboardInterrupt is deliberately not caught.
            failed = True
        if failed:
            # Failed ids go to the error list, not to the results.
            if elem not in err_list:
                err_list.append(elem)
        elif current not in elements_list:
            elements_list.append(current)
        done_now = int((progress / to_execute) * 100)
        if persent_done < done_now:
            persent_done = done_now
            print("Progress, %: ", persent_done)
        progress += 1

    # Serialise first so that a failure cannot leave a half-written
    # file, then overwrite from the start and cut off stale bytes.
    results_text = json.dumps(elements_list)
    errors_text = json.dumps(err_list)
    ouf.seek(0)
    ouf.truncate()
    ouf.write(results_text)
    errf.seek(0)
    errf.truncate()
    errf.write(errors_text)
print("Saved in file:\n",
      out_directory + res_file + ".json")
print("Error file:\n",
      out_directory + error_file + ".json")

# Re-read both output files and print a short summary of each: the
# number of elements plus the first and last one when non-empty.
result_path = out_directory + res_file + ".json"
error_path = out_directory + error_file + ".json"
with open(result_path, 'r', encoding="utf-8") as inf, \
        open(error_path, 'r', encoding="utf-8") as errf:
    data = json.load(inf)
    print("Number of elements in output file:", len(data))
    if len(data) > 0:
        print("First element:\n", data[0])
        print("Last element:\n", data[-1])

    err_list = json.load(errf)
    print("Number of elements in error file:", len(err_list))
    if len(err_list) > 0:
        print("First element:\n", err_list[0])
        print("Last element:\n", err_list[-1])

Saved in file:
 D:/Work/Data_files/KudaGo_database/events_2/events_2_expand_spb.json
Error file:
 D:/Work/Data_files/KudaGo_database/events_2/events_2_error_spb.json
Number of elements in output file: 54126
First element:
 {'id': 85, 'publication_date': 1351509149, 'dates': [{'start_date': '2012-03-28', 'start_time': '10:00:00', 'start': 1332914400, 'end_date': '2014-04-09', 'end_time': '22:00:00', 'end': 1397066400, 'is_continuous': False, 'is_endless': False, 'is_startless': False, 'schedules': [], 'use_place_schedule': False}], 'title': 'выставка Наташи Ван Будман «Look Out / Человейник»', 'slug': 'natasha-van-budman-chelovejnik-new-look', 'place': {'id': 157}, 'description': '<p>А знаете ли вы, что из всех насекомых наиболее похожими на людей признаны муравьи? Как и у нас, у муравьев существует разделение на социальные слои и роли. С другой стороны, часто мы сами копошимся и спешим куда-то, словно муравьи...</p>\n', 'body_text': '<p>По всей видимости, именно такие мысли легли в осн