In [2]:
"""Parser API KudaGo (parser_3, v.2.0, 19_04_2020)."""

"""Changes:"""
"""v.0 - unstructured, page of places/events collectedd"""
"""v.1.0 - places/events collected individually"""
"""v.2.0 - function get_data returns 'dict' (not 'str')"""

"""Add information to list of places given by parser_1"""
"""API information at 'https://docs.kudago.com/api'."""

"""Input: .json file with list structured as:"""
"""[{'id': id_1,'name': name_1}, {*place_2},...]"""
"""Output: .json file consisting full information"""
"""on places available from API KudaGom."""
"""If data on some of places could not be collected their"""
""" 'id'`s will be placed in another (error).json file"""


from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import json


"""'get_data' input: API KudaGo page url to parce."""
"""'get_data' output: data from url in 'dict' type."""

def get_data(url):
    """Download one KudaGo API page and return its decoded JSON content.

    Args:
        url: Address of the API page to request.

    Returns:
        The object produced by decoding the response body as JSON.

    Raises:
        urllib.error.HTTPError: The server answered with an error status.
        urllib.error.URLError: The request could not be completed.
        ValueError: The response body is not valid JSON
            (``json.JSONDecodeError`` is a subclass of ``ValueError``).
    """
    # Failures are raised rather than returned as exception classes: a
    # returned class looks like ordinary data to the caller and cannot be
    # serialised back to JSON.
    # The context manager closes the connection even if decoding fails.
    with urlopen(url) as response:
        return json.loads(response.read())

# City whose places are processed (spb, msk, ...).
location = "spb"

# A place page address is built as: source_url + <place id> + expand_url.
source_url = "https://kudago.com/public-api/v1.4/places/"
expand_url = "/?expand=images"

# Working folder and the base names (without ".json") of the files used.
directory = "D:/Work/Data_files/working_dir/"
in_file = f"places_1_{location}"
res_file = f"places_2_expand_{location}"
error_file = f"places_2_error_{location}"

# Load the list of places ({'id': ..., 'name': ...} entries) to be expanded.
with open(directory + in_file + ".json",
          'r', encoding="utf-8") as inf:
    places_base = json.load(inf)

places_list = []    # full records fetched from the API
err_list = []       # input entries whose data could not be collected
for place in places_base:
    place_url = source_url + str(place['id']) + expand_url
    try:
        place_data = get_data(place_url)
    except Exception:
        # Best effort: one failed request must not abort the whole run.
        # Unlike a bare "except:", Ctrl+C still stops the script.
        place_data = None
    # Only a decoded JSON object is accepted as place data. Anything else
    # (e.g. an exception class handed back as an error marker) counts as a
    # failure; otherwise it would break json.dump below.
    if isinstance(place_data, dict):
        places_list.append(place_data)
    else:
        err_list.append(place)
        print("Invalid place data, id: ", place['id'])

# The output files are opened only after all requests are done, so an
# interrupted run does not leave previously saved results truncated.
with open(directory + res_file + ".json",
          'w', encoding="utf-8") as ouf:
    json.dump(places_list, ouf)
with open(directory + error_file + ".json",
          'w', encoding="utf-8") as errf:
    json.dump(err_list, errf)
print("Saved in file:\n",
      directory + res_file + ".json")
print("Error file:\n",
      directory + error_file + ".json")

# Re-read both files written above and print a short summary of each:
# the number of entries plus the first and last one when not empty.
with open(directory + res_file + ".json", 'r', encoding="utf-8") as inf:
    with open(directory + error_file + ".json", 'r', encoding="utf-8") as errf:
        data = json.load(inf)
        print("Number of elements in output file:", len(data))
        if len(data) > 0:
            print("First element:\n", data[0])
            print("Last element:\n", data[-1])

        err_list = json.load(errf)
        print("Number of elements in error file:", len(err_list))
        if len(err_list) > 0:
            print("First element:\n", err_list[0])
            print("Last element:\n", err_list[-1])

Saved in file:
 D:/Work/Data_files/working_dir/places_2_expand_spb.json
Error file:
 D:/Work/Data_files/working_dir/places_2_error_spb.json
Number of elements in output file: 3732
First element:
 {'id': 1, 'title': 'Эрмитаж (Зимний дворец, Главный музейный комплекс)', 'slug': 'ermitazh', 'address': 'пл. Дворцовая, д. 2', 'timetable': 'вт–вс 12:00–21:00', 'phone': '+7 812 571-34-65', 'is_stub': False, 'body_text': '<p>В 1754 году архитектор Растрелли начал строительство Зимнего дворца — новой царской резиденции. При Екатерине II возвели Малый и Большой Эрмитаж, Эрмитажный театр.</p>\n<p>Годом основания музея считается 1764-й: именно тогда императрица приобрела коллекцию, собранную купцом Иоганном Эрнстом Гоцковским. В неё входили полотна Рембрандта, Питера Пауля Рубенса, Антониса ван Дейка, Якоба Йорданса, Яна Стена и других мэтров живописи. Шли годы, менялись правители, росло эрмитажное собрание. В 1814 году оно пополнилось полотнами из коллекции императрицы Жозефины. В 1826 году откры