### Parse the expected outputs from RTF files and store them in CSV files

In [42]:
import polars as pl

### Read RTF files and consolidate them as a single CSV

In [43]:
from os import listdir, path

# Root folder of the raw export. Judging by the string stripping below, it
# presumably holds "Promet <year>" folders containing "<month> <year>"
# folders -- TODO confirm against the actual data layout.
year_directories = "../data/raw/Podatki - rtvslo.si"

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of everything built from file_list reproducible between runs and machines.
years = sorted(listdir(year_directories))

# One dict per discovered file: year, month, file_name (".rtf" removed), path.
file_list = []

for year in years:
    year_path = path.join(year_directories, year)

    # Ignore stray files sitting next to the year folders.
    if not path.isdir(year_path):
        continue

    year_stripped = year.replace("Promet ", "")

    for month in sorted(listdir(year_path)):
        month_path = path.join(year_path, month)

        # listdir() on a regular file raises NotADirectoryError, so skip
        # anything inside a year folder that is not itself a folder.
        if not path.isdir(month_path):
            continue

        month_stripped = month.replace(" " + year_stripped, "")

        for single_file in sorted(listdir(month_path)):
            file_path = path.join(month_path, single_file)

            # Nested folders cannot be opened as documents later on.
            if not path.isfile(file_path):
                continue

            file_list.append({
                "year": year_stripped,
                "month": month_stripped,
                "file_name": single_file.replace(".rtf", ""),
                "path": file_path,
            })

In [44]:
from tqdm import tqdm
from striprtf import striprtf

# Column-oriented accumulator: one list per future DataFrame column,
# filled in lock-step so index i always refers to the same file.
outputs = {column: [] for column in ("year", "month", "file_name", "content")}

for entry in tqdm(file_list):
    with open(entry['path'], 'r') as handle:
        rtf_source = handle.read()

    # Convert the RTF markup to plain text, pad paragraph breaks with spaces
    # and drop the "\n\x00" sequences before trimming the ends.
    plain = striprtf.rtf_to_text(rtf_source)
    plain = plain.replace("\n\n", " \n\n ").replace("\n\x00", "").strip()

    for column in ("year", "month", "file_name"):
        outputs[column].append(entry[column])
    outputs["content"].append(plain)

100%|██████████| 28037/28037 [00:04<00:00, 5745.76it/s]


In [45]:
# Turn the column-wise `outputs` dict into a polars DataFrame (one row per file read above).
dfOutputs = pl.DataFrame(outputs)

In [46]:
# Sanity check: row count and null counts for every column of the freshly loaded frame.
dfOutputs.describe()

statistic,year,month,file_name,content
str,str,str,str,str
"""count""","""28037""","""28037""","""28037""","""28037"""
"""null_count""","""0""","""0""","""0""","""0"""
"""mean""",,,,
"""std""",,,,
"""min""","""2022""","""April""","""Promet""",""""""
"""25%""",,,,
"""50%""",,,,
"""75%""",,,,
"""max""","""2024""","""September""","""TMP9-2024-99""","""Štajerska avtocesta je med pri…"


### Extract some extra data from content

In [47]:
import re

def isNujna(text):
    """Flag texts that contain a "Nujna prometna informacija" announcement.

    Four spelling variants are accepted ("Nujna"/"Nujne" combined with
    "prometna informacija"/"prometne informacije"), matched anywhere in
    ``text`` and ignoring letter case.

    Returns:
        1 if any variant occurs, otherwise 0.
    """
    variants = (
        'Nujna prometna informacija',
        'Nujna prometne informacije',
        'Nujne prometna informacija',
        'Nujne prometne informacije',
    )
    found = any(re.search(variant, text, re.IGNORECASE) for variant in variants)
    return int(found)

def isNova(text):
    """Flag texts that contain a "Nova prometna informacija" announcement.

    Four spelling variants are accepted ("Nova"/"Nove" combined with
    "prometna informacija"/"prometne informacije"), matched anywhere in
    ``text`` and ignoring letter case.

    Returns:
        1 if any variant occurs, otherwise 0.
    """
    variants = (
        'Nova prometna informacija',
        'Nova prometne informacije',
        'Nove prometna informacija',
        'Nove prometne informacije',
    )
    found = any(re.search(variant, text, re.IGNORECASE) for variant in variants)
    return int(found)

In [48]:
# extract the program numbers from phrases like "1., 2. in 3. program" ("in" is Slovenian for "and")

# Alex
def clean_programs(programs):
    """Drop a leading two-digit entry from a comma-separated programme string.

    When ``programs`` is a string that starts with exactly two digits and a
    comma (e.g. "01,1,2"), those three characters are removed. Any other
    value, including non-strings, is returned unchanged.
    """
    if not isinstance(programs, str):
        return programs

    starts_with_two_digits = re.match(r"^\d{2},", programs) is not None
    return programs[3:] if starts_with_two_digits else programs

def extractPrograms(text):
    """Collect the programme numbers mentioned in ``text``.

    A candidate is a run of digits followed by a dot that is directly
    followed (optionally via "in", "," or "&") by another "<digits>." or by
    the word "program". Candidates whose numeric value is greater than 3 are
    discarded; the rest are joined with commas in order of appearance and
    passed through ``clean_programs``.

    Returns:
        A comma-separated string such as "1,2", or "" when nothing matched.
    """
    candidates = re.findall(r'(\d+\.)\s*(?:in|,|&)?\s*(?=\d+\.|program)', text)

    # Remove the trailing dot so only the digits remain.
    numbers = (candidate.strip().replace(".", "") for candidate in candidates)
    kept = [number for number in numbers if int(number) <= 3]

    return clean_programs(",".join(kept))

In [49]:
# Derive the flag and programme columns from the raw text; each helper is
# called once per row through map_elements.
dfOutputs = dfOutputs.with_columns([
    pl.col("content").map_elements(isNujna, return_dtype=pl.Int8).alias("nujna"),
    pl.col("content").map_elements(isNova, return_dtype=pl.Int8).alias("nova"),
    pl.col("content").map_elements(extractPrograms, return_dtype=pl.String).alias("programs"),
])

### Extract Date and Time from the content and generate IDs

In [50]:
import re

def matchDatetime(text: str):
    """Find the first date/time stamp in ``text`` and normalise it.

    The stamp is expected as day, month, year and time in that order, with
    loosely formatted separators (dots, commas, spaces, an optional dash
    before the time).

    Args:
        text: Plain text to search.

    Returns:
        A ``(dateString, timeString)`` tuple. ``dateString`` has the form
        "DD.MM.YYYY" (day and month zero-padded, two-digit years prefixed
        with "20"); ``timeString`` has the form "H:MM" or "HH:MM". Both are
        ``None`` when no stamp is found.
    """
    pattern = (
        r'(\d{1,2}(?:\s*\.\s*00)?)'     # Day (e.g., 15 or 11.00)
        r'(?:\s?\.*?\,?\s?)'                # Separator (dot, comma or space)
        r'(\d{1,2})'                    # Month
        r'(?:\s?\.*?\,?\s?)'                # Separator (dot, comma or space)
        r'(\d{2,4})'                    # Year
        r'\s*(?:-\s*)?'                 # spaces and Optional dash separator
        r'(\d{1,2}\s?[.:,]?\s?\d{2})'    # Time (with optional separator)
    )

    matches = re.search(pattern, text)

    # Explicit guard instead of letting a None dereference be swallowed by a
    # blanket exception handler, which would also hide genuine bugs.
    if matches is None:
        return None, None

    day = matches.group(1).strip().replace(" ", "")
    month = matches.group(2).strip()
    year = matches.group(3).strip()

    # fix some dates that are like 13.00
    day = day.replace(".00", "")

    # fix two digit years
    if len(year) == 2:
        year = "20" + year

    # add leading zeroes where needed
    dateString = day.zfill(2) + '.' + month.zfill(2) + '.' + year

    # Unify the separator to ":" and drop spaces inside the time.
    timeString = matches.group(4).strip()
    timeString = timeString.replace(".", ":").replace(",", ":").replace(" ", "")

    # check if separator exists, if not add, to change time like 1300 to 13:00
    if ":" not in timeString:
        timeString = timeString[:-2] + ":" + timeString[-2:]

    return dateString, timeString

In [51]:
from datetime import datetime

def parseDiffFormats(dateString: str, timeString: str):
    """Convert a "DD.MM.YYYY" date and "HH:MM" time into ISO-style strings.

    Args:
        dateString: Date text; spaces inside it are ignored. May be ``None``.
        timeString: Time text. May be ``None``.

    Returns:
        A ``(date, time)`` tuple formatted as "YYYY-MM-DD" and "HH:MM:SS".
        When either input is missing or the combination matches none of the
        known formats, the sentinel pair ("0000-00-00", "00:00:00") is
        returned instead of raising.
    """
    # https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes
    possible_formats = [
        '%d.%m.%Y - %H:%M',
    ]

    date = "0000-00-00"
    time = "00:00:00"

    # Check both inputs explicitly: relying on AttributeError only covered a
    # missing date, while a missing time surfaced as an uncaught TypeError.
    if not isinstance(dateString, str) or not isinstance(timeString, str):
        return date, time

    dt_string = dateString.replace(" ", "") + " - " + timeString

    for fmt in possible_formats:
        try:
            dt = datetime.strptime(dt_string, fmt)
        except ValueError:
            # Not this format (or an impossible date/time); try the next one.
            continue
        date = dt.strftime('%Y-%m-%d')
        time = dt.strftime('%H:%M:%S')
        break

    return date, time

In [52]:
import uuid

def generate_id(content):
    """Return a deterministic UUID (version 5, URL namespace) for ``content``.

    The same input string always produces the same identifier.
    """
    derived = uuid.uuid5(uuid.NAMESPACE_URL, content)
    return str(derived)

In [53]:
def extractDatetime(text):
    """Build the normalised "YYYY-MM-DD HH:MM:SS" stamp for one document.

    The raw date and time are located with ``matchDatetime`` and converted
    with ``parseDiffFormats``; whatever that conversion returns is joined
    with a single space.

    NOTE(review): the original comment mentioned splitting the text on
    "\\n\\n" and matching only the first item; the code searches the whole
    text instead -- confirm which is intended.

    Returns:
        A dict with the single key "datetime".
    """
    found_date, found_time = matchDatetime(text)
    iso_date, iso_time = parseDiffFormats(found_date, found_time)

    return {"datetime": f"{iso_date} {iso_time}"}

In [54]:
# Normalised timestamp for every row, extracted from the document text.
dfOutputs = dfOutputs.with_columns([
    pl.col("content").map_elements(
        lambda body: extractDatetime(body)["datetime"],
        return_dtype=pl.String,
    ).alias("datetime"),
])

# NOTE(review): the id is derived from the datetime string, not from the
# content, so rows sharing a timestamp (including every unparsed row) share
# an id -- confirm this is the intended key.
dfOutputs = dfOutputs.with_columns([
    pl.col("datetime").map_elements(generate_id, return_dtype=pl.String).alias("id"),
])

In [55]:
# Keep only the columns needed downstream, in export order; "year" and "month" are dropped here.
dfOutputs = dfOutputs.select(["id", "datetime", "nujna", "nova", "programs", "content", "file_name"]) # "year", "month"

In [56]:
# "datetime" is a "YYYY-MM-DD HH:MM:SS" string, so this string sort is chronological;
# rows carrying the "0000-00-00" placeholder end up first.
dfOutputs = dfOutputs.sort("datetime")

In [57]:
# Rows whose date could not be parsed (they carry the "0000-00-00" placeholder).
dfOutputsBad = dfOutputs.filter(pl.col("datetime").str.contains("0000-00-00"))

# Written to a separate file, apart from the clean export.
dfOutputsBad.write_csv("../data/outputs-bad-dates.csv")

In [58]:
# Complement of the bad-date filter: every row with a successfully parsed timestamp.
dfOutputsClean = dfOutputs.filter(pl.col("datetime").str.contains("0000-00-00").not_())

dfOutputsClean.write_csv("../data/outputs-clean.csv")

In [59]:
# Spot-check one record as a dict; after the sort, the first rows are those with the placeholder date.
dfOutputs.row(1, named=True)

{'id': 'a1af76d5-5e74-53c4-bca1-d8e8a05ce0b7',
 'datetime': '0000-00-00 00:00:00',
 'nujna': 0,
 'nova': 0,
 'programs': '',
 'content': 'ZA ODDAJO:  \n\n Še vedno je 6-kilometrski zastoj pred predorom Karavanke proti Avstriji. Vozniki namenjenim proti Kranjski Gori in Jesenicam uporabite izvoz Lesce, saj je izvoz Jesenice-vzhod proti Avstriji je zaprt.  \n\n Gost promet z zastoji, ki je bil cel dan na primorski avtocesti proti Ljubljani, se je umiril. Nekaj gneče je še od Vrhnike proti Ljubljani.  \n\n Na mejnih prehodih Gruškovje in Starod vozniki osebnih vozil na izstop iz države čakajo do 1 uro, na Obrežju, Dragonji, v Jelšanah in Sečovljah pa pol ure. \nPri vstopu v Slovenijo v Gruškovju vozniki čakajo 2 uri, v Jelšanah pa pol ure.',
 'file_name': 'TMP-29'}

In [60]:
# Summary of the rows without a parsable date (count should match the bad-dates CSV).
dfOutputsBad.describe()

statistic,id,datetime,nujna,nova,programs,content,file_name
str,str,str,f64,f64,str,str,str
"""count""","""430""","""430""",430.0,430.0,"""430""","""430""","""430"""
"""null_count""","""0""","""0""",0.0,0.0,"""0""","""0""","""0"""
"""mean""",,,0.025581,0.004651,,,
"""std""",,,0.158067,0.06812,,,
"""min""","""a1af76d5-5e74-53c4-bca1-d8e8a0…","""0000-00-00 00:00:00""",0.0,0.0,"""""","""""","""TMP-135"""
"""25%""",,,0.0,0.0,,,
"""50%""",,,0.0,0.0,,,
"""75%""",,,0.0,0.0,,,
"""max""","""a1af76d5-5e74-53c4-bca1-d8e8a0…","""0000-00-00 00:00:00""",1.0,1.0,"""3,1""","""Štajerska avtocesta je med pri…","""TMP9-2024-721"""


In [61]:
# Summary of the exported clean rows; min/max of "datetime" show the covered time span.
dfOutputsClean.describe()

statistic,id,datetime,nujna,nova,programs,content,file_name
str,str,str,f64,f64,str,str,str
"""count""","""27607""","""27607""",27607.0,27607.0,"""27607""","""27607""","""27607"""
"""null_count""","""0""","""0""",0.0,0.0,"""0""","""0""","""0"""
"""mean""",,,0.008295,0.065237,,,
"""std""",,,0.0907,0.246948,,,
"""min""","""00019211-9609-5556-9ce3-0d2daa…","""2022-01-01 06:00:00""",0.0,0.0,"""""","""*NE UPORABLJAJ ZA VAJO Prom…","""Promet"""
"""25%""",,,0.0,0.0,,,
"""50%""",,,0.0,0.0,,,
"""75%""",,,0.0,0.0,,,
"""max""","""fffbbef5-2d94-54ea-944d-a6313c…","""2024-12-31 21:00:00""",1.0,1.0,"""3,2""","""ujna prometna informacija. …","""TMP9-2024-99"""


In [62]:
# 28 037 = 27 607 + 430  (all files = clean rows + rows with unparsed dates)