In [1]:
"""Transforms 'timetable' in KudaGo dataset (str):"""
""" 'timetable': 'вт, чт 10:30–18:00, ср 10:30–21:00' """
"""to the format of FIFA (ITMO) dataset:"""
"""dict of lists[int_1(opening time), int_2(closing time)]"""
""" 'open_hours': {"1":[1030,1800],"2":[1030,1800], ...}"""


# 1 - Get original KudaGo timetable from dataset
import json
from json import JSONDecodeError

# 2 - Digitalisation of KudaGo timetable
import re
import string
from itertools import tee
from collections import Counter    # for testing


# In / out files and directorires:
direct = "D:/Work/Data_files/working_dir/"
in_file_1 = "Saint+Petersburg-finalized"
in_file_2 = "Moscow-finalized"
out_file = "all_timetables"

In [2]:
# 1

"""Get original tametable from KudaGo datasets

with open(direct + in_file_1 + ".json",
           'r', encoding = "utf-8") as inf_1, \
     open(direct + in_file_2 + ".json",
           'r', encoding = "utf-8") as inf_2, \
     open(direct + out_file + ".json",
           'w', encoding = "utf-8") as ouf:
    timetables = []
    try:
        places = json.load(inf_1)
        for place in places:
            timetable = place["open_hours"]
            if timetable not in timetables:
                timetables.append(timetable)
        places = json.load(inf_2)
        for place in places:
            timetable = place["open_hours"]
            if timetable not in timetables:
                timetables.append(timetable)
    except JSONDecodeError:
        print("Input file cannot be read")
    try:
        json.dump(timetables, ouf)
    except JSONDecodeError:
        print("Out file writing error")"""

'Get original tametable from KudaGo datasets\n\nwith open(direct + in_file_1 + ".json",\n           \'r\', encoding = "utf-8") as inf_1,      open(direct + in_file_2 + ".json",\n           \'r\', encoding = "utf-8") as inf_2,      open(direct + out_file + ".json",\n           \'w\', encoding = "utf-8") as ouf:\n    timetables = []\n    try:\n        places = json.load(inf_1)\n        for place in places:\n            timetable = place["open_hours"]\n            if timetable not in timetables:\n                timetables.append(timetable)\n        places = json.load(inf_2)\n        for place in places:\n            timetable = place["open_hours"]\n            if timetable not in timetables:\n                timetables.append(timetable)\n    except JSONDecodeError:\n        print("Input file cannot be read")\n    try:\n        json.dump(timetables, ouf)\n    except JSONDecodeError:\n        print("Out file writing error")'

In [3]:
# 2
# Finds all (considered) time-defining notations in text
def find_patterns(text):
    
    # Considered regular expressions:
    
    # Time range, example: "10:00-0:10"
    time_reg = r'\d{1,2}:\d\d[^,]\d{1,2}:\d\d'    #! [^,] \W

    # Week days, example: "пн"
    mon = r'\bпн\b'
    tue = r'\bвт\b'
    wed = r'\bср\b'
    thu = r'\bчт\b'
    fri = r'\bпт\b'
    sat = r'\bсб\b'
    sun = r'\bвс\b'
    week_day_reg = [mon, tue, wed, thu, fri, sat, sun]

    # Other notations
    notation_1 = "IMPOSSIBLE"
    
    timetable.replace(' ', '')
    time_ranges = re.finditer(time_reg, timetable)
    days = [re.finditer(day, timetable) for day in week_day_reg]
    notations = re.finditer(notation_1, timetable)
    text_patterns = {'time_ranges': time_ranges, 'days': days,
                     'notations': notations}
    return text_patterns


In [16]:
# 3
# Digitalise time-patterns in text:

# Check for empty iterator
def next_element(my_iter):
    try:
        element = next(my_iter)
    except StopIteration:
        return None
    return element

# Convert time from 'str' to 'list[int_1, int_2]'
# Example: ('9:00-18:00') to [900, 1800]
def convert_time(time_str):
    time_range = re.split(r'[^\d, :]', time_str[0])
    open_time = re.split(r'[^\d]', time_range[0])
    close_time = re.split(r'[^\d]', time_range[1])
    return [int(open_time[0] + open_time[1]),\
            int(close_time[0] + close_time[1])]

# Convert days in the form: 'пн' - '1', 'вт' - '2', ...
def convert_day(day):
    before = ['пн', 'вт', 'ср', 'чт', 'пт', 'сб', 'вс']
    after = ['1', '2', '3', '4', '5', '6', '7']
    converted = None
    for i in range(len(before)):
        if day[0] == before[i]:
            converted = after[i]
    return converted

# Defines working hours if there is time_range in text
def time_case(text_patterns):
    days = []
    for day in text_patterns['days']:
        day, isDay = tee(day)
        if next_element(isDay):
            days.append(next_element(day))
    for day in days:
        convert_day(day)
    for time_patt in text_patterns['time_ranges']:
        time_range = convert_time(time_patt)
    open_hours = {day[0]: time_range for day in days}
            #if day.end() < time_patt.start():
                
    return open_hours


def notation_case(text_patterns):   
    pass

# Returns timetable in final format
def find_open_hours(text_patterns):
    
    # Final format of timetable:
    # keys = days of week, values = open hours
    open_hours = {'1', '2', '3', '4', '5', '6', '7'}
    
    text_patterns['time_ranges'], \
                time_range = tee(text_patterns['time_ranges'])
    text_patterns['notations'], \
                notation = tee(text_patterns['notations'])
    if next_element(time_range):
        open_hours = time_case(text_patterns)
    if next_element(notation):
        open_hours = notation_case(text_patterns)
    
    return open_hours

def digitalise(text):
    text_patterns = find_patterns(text)
    open_hours = find_open_hours(text_patterns)
    return open_hours


In [17]:
# 4
# "main", writes new file with digitalised time
counter = Counter()
with open(direct + out_file + ".json",
          'r', encoding = "utf-8") as inf:
    timetables = json.load(inf)
    for timetable in timetables:
        open_hours = digitalise(timetable)
        print(open_hours)
        #print(timetable)
        
        
        #counter.update(open_hours, timetable)
    #for entity in counter.most_common():
        #print (entity)

{'вт': [1030, 2100], 'ср': [1030, 2100], 'чт': [1030, 2100], 'пт': [1030, 2100], 'сб': [1030, 2100], 'вс': [1030, 2100]}
{'вт': [1000, 1630], 'вс': [1000, 1630]}
{'3', '7', '5', '2', '1', '6', '4'}
{}
{'3', '7', '5', '2', '1', '6', '4'}
{'пн': [1000, 2230], 'вт': [1000, 2230], 'вс': [1000, 2230]}
{}
{}
{}
{'ср': [1100, 1700], 'вс': [1100, 1700]}
{'пн': [1000, 2200], 'ср': [1000, 2200], 'вс': [1000, 2200]}
{}
{'пн': [1100, 1800], 'сб': [1100, 1800], 'вс': [1100, 1800]}
{}
{'вт': [1100, 1630], 'сб': [1100, 1630]}
{'пн': [1100, 1900], 'пт': [1100, 1900], 'сб': [1100, 1900], 'вс': [1100, 1900]}
{}
{'3', '7', '5', '2', '1', '6', '4'}
{}
{'пн': [2200, 600], 'сб': [2200, 600]}
{}
{'пн': [1000, 1800], 'пт': [1000, 1800], 'сб': [1000, 1800]}
{}
{}
{'пн': [1100, 2130], 'сб': [1100, 2130]}
{}
{}
{}
{'3', '7', '5', '2', '1', '6', '4'}
{'пн': [1000, 2000], 'вт': [1000, 2000], 'чт': [1000, 2000], 'вс': [1000, 2000]}
{}
{'ср': [1200, 1900], 'вс': [1200, 1900]}
{'пн': [1100, 1900], 'пт': [1100, 1900],

IndexError: list index out of range

In [2]:
# Time range, example: "10:00-0:10"
time_patt = r'\d{1,2}:\d\d[^,]\d{1,2}:\d\d'    #! [^,] \W

# Week days, example: "пн"
mon = r'\bпн\b'
tue = r'\bвт\b'
wed = r'\bср\b'
thu = r'\bчт\b'
fri = r'\bпт\b'
sat = r'\bсб\b'
sun = r'\bвс\b'
week_days = [mon, tue, wed, thu, fri, sat, sun]

# Convert time from 'str' to 'list[int_1, int_2]'
# Example: ('9:00-18:00') to [900, 1800]
def convert_time_1(time_str):
    time_range = re.split(r'[^\d, :]', time_str[0])
    open_time = re.split(r'[^\d]', time_range[0])
    close_time = re.split(r'[^\d]', time_range[1])
    if open_time and close_time:
        time = [int(open_time[0] + open_time[1]),\
            int(close_time[0] + close_time[1])]
    else:
        time = None
    return time

def define_days():
    pass

def digitalise_1(text):
    timetable = {}
    shift = 0
    time = re.search(time_patt, text[shift:-1])
    while time:
        hours = convert_time_1(time)
        for day in range(7):    # represents week days
            if re.search(week_days[day], text[0:shift]):
                # monday == 1 when day == 0
                timetable[str(day+1)] = hours    
        days = define_days()
        shift += time.end()
        time = re.search(time_patt, text[shift:-1])
    return timetable

with open(direct + out_file + ".json",
          'r', encoding = "utf-8") as inf:
    timetables = json.load(inf)
    for text in timetables:
        open_hours = digitalise_1(text)
        print(text)
        print(open_hours)

вт, чт, сб, вс 10:30–18:00, ср, пт 10:30–21:00
{}
вт–вс 10:00–18:00 (касса: вт–вс 10:00–16:30)
{'2': [1000, 1630], '7': [1000, 1630]}

{}
ежедневно 11:00–22:00
{}
Расписание событий текущего месяца – на официальном сайте.
{}
пн 15:00–22:30, вт–вс 10:00–22:30
{}
ежедневно 9:00–21:00
{}
касса: ежедневно 11:00–19:00
{}
касса: ежедневно 12:00–21:00
{}
ср–вс 11:00–18:00 (касса: ср–вс 11:00–17:00)
{'3': [1100, 1700], '7': [1100, 1700]}
пн, ср–вс 10:00–22:00
{}
ежедневно 11:00–21:00 (касса: ежедневно 11:00–20:30)
{}
пн–сб 11:00–20:00, вс 11:00–18:00
{}
ежедневно 8:00–0:00
{}
вт–сб 11:00–17:00 (касса: вт–сб 11:00–16:30)
{'2': [1100, 1630], '6': [1100, 1630]}
касса: пн–пт 11:00–20:00, сб, вс 11:00–19:00
{}
касса: ежедневно 11:00–20:00
{}
ежедневно весь день
{}
ежедневно 16:00–21:00 (касса: ежедневно 11:00–15:00)
{}
пн–сб 22:00–6:00
{}
ежедневно 10:00–22:00
{}
пн–пт 9:00–20:00, сб 10:00–18:00
{}
касса: ежедневно 10:00–21:00
{}
ежедневно 11:30–23:30
{}
пн–сб 11:00–21:30
{}
ежедневно 18:00–6:00
{}

{}
пн–пт 9:00–22:30, сб, вс 9:00–20:00
{}
ср–вс 11:00–19:00
{}
ежедневно 11:00–16:30
{}
вт–пт 12:00–19:00, сб, вс 10:00–19:00
{}
пн–ср, пт–вс 11:00–19:00, чт 11:00–21:00
{}
кассы: пн–вс 11:00–14:00, 15:00–19:00
{}
вт–сб 13:00–22:00
{}
пн–вс 09:00–18:00
{}
ежедневно 8:00–21:00 (касса: вт–вс 10:00–18:00)
{}
ср–вс 13:00–20:00
{}
ежедневно 10:00–22:00 (касса: ежедневно 11:00–19:30)
{}
ср 11:00–19:00, чт, пт 12:00–21:00, сб, вс 11:00–18:00
{'3': [1200, 2100]}
пн–сб 14:00–20:00 (по предварительной договоренности)
{}
пт 21:00–6:00, сб 21:00–6:00, вс 23:00–8:00
{'5': [2100, 600]}
чт-вс - с 18:00 до 1:00
{}
сб 11:00–18:00, пн–пт 11:00–19:00, вс 12:00–18:00
{'6': [1100, 1900]}
касса: ежедневно 11:00–15:00 15:30–20:00
{}
пн–пт 8:00–20:00, сб, вс 9:00–19:00
{}
пн–вс 08:00–21:00, сб 11:00–19:00
{}
касса: ежедневно 10:00–19:00
{}
касса: пн 12:00–15:00 16:00–18:00, вт–вс 12:00–15:00 16:00–20:00
{'1': [1200, 1500]}
касса: ежедневно 12:30–15:00 16:00–19:00
{}
вт, ср, сб 11:00–18:00, чт, пт 12:00–21:00
