In [84]:
import polars as pl

In [85]:
# Load the cleaned input and output tables from the shared data directory.
df_inputs = pl.read_csv(source="../data/inputs-clean.csv")
df_outputs = pl.read_csv(source="../data/outputs-clean.csv")

In [86]:
# "id" and "datetime" exist in both tables; give each a table-specific name so
# the two stay distinguishable once inputs and outputs are joined.
df_inputs = df_inputs.rename(
    {
        "id": "id_input",
        "datetime": "datetime_original_input",
    }
)
df_outputs = df_outputs.rename(
    {
        "id": "id_output",
        "datetime": "datetime_original_output",
    }
)

In [87]:
from datetime import datetime, timedelta

# window of inputs matched to an output: from (prev output time - 3 min) to (output time - 3 min)
# - timedelta(minutes=1)

# Timestamp layout used by the datetime strings in both CSV files.
fmt = '%Y-%m-%d %H:%M:%S'


def str_to_datetime(dt_str: str) -> datetime:
    """Parse a timestamp string laid out as `fmt` into a `datetime`.

    Raises:
        ValueError: if `dt_str` does not match `fmt`.
    """
    parsed = datetime.strptime(dt_str, fmt)
    return parsed

def adjust_time(dt, minutes=3):
    """Return `dt` moved earlier by `minutes` minutes.

    Args:
        dt: The datetime to shift.
        minutes: How many minutes to subtract. Defaults to 3, the offset used
            for the adjusted output time in this notebook.

    Returns:
        A new datetime equal to `dt - minutes`.
    """
    return dt - timedelta(minutes=minutes)

In [88]:
# Add parsed datetime columns next to the original strings; they are needed for
# matching inputs and outputs. Polars' native string parser is used instead of
# a per-row Python callback (map_elements), which is slow on large frames.

df_inputs = df_inputs.with_columns(
    pl.col("datetime_original_input").str.to_datetime(format=fmt).alias("datetime_input"),
)

df_outputs = df_outputs.with_columns(
    pl.col("datetime_original_output").str.to_datetime(format=fmt).alias("datetime_output"),
)

In [89]:
# Make sure both tables are in chronological order before they are matched.
df_inputs = df_inputs.sort(by="datetime_input")
df_outputs = df_outputs.sort(by="datetime_output")

In [90]:
# Use an earlier "effective" output time for matching: an input published only
# seconds before an output cannot have been used when humans wrote that output.
# Every output is therefore shifted 3 minutes earlier.
# Native datetime arithmetic replaces the per-row Python callback (map_elements).

df_outputs = df_outputs.with_columns(
    (pl.col("datetime_output") - pl.duration(minutes=3)).alias("datetime_output_adjusted"),
)

In [91]:
# As-of join: attach to every input the first output whose adjusted timestamp
# is at or after the input's timestamp, i.e. the input comes before the output.
# Both frames were sorted by datetime_input / datetime_output in an earlier cell.
result = df_inputs.join_asof(
    df_outputs,
    left_on="datetime_input",
    right_on="datetime_output_adjusted",
    strategy="forward"  # first output with adjusted datetime >= input datetime
)

# For every output, store the adjusted datetime of the row before it
# (shift(1) leaves the first row null).
df_outputs = df_outputs.with_columns([
    pl.col("datetime_output_adjusted").shift(1).alias("prev_output_datetime")
])

# Look up the previous output datetime for each matched row via the adjusted
# output timestamp.
# NOTE(review): no `how` is given, so this relies on the join default, presumably
# an inner join that also drops inputs without a matched output; confirm.
# NOTE(review): assumes adjusted output timestamps are unique; duplicates would
# multiply rows here. TODO confirm.
result = result.join(
    df_outputs.select(["datetime_output_adjusted", "prev_output_datetime"]),
    on="datetime_output_adjusted"
)

In [92]:
# Keep an input only if it is newer than the previous output's adjusted time,
# or if there is no previous output at all.
newer_than_previous_output = pl.col("datetime_input") > pl.col("prev_output_datetime")
has_no_previous_output = pl.col("prev_output_datetime").is_null()

final = result.filter(newer_than_previous_output | has_no_previous_output)

In [93]:
# sometimes, there are inputs at night after 8pm, but the next output is in the morning. We may need to ignore inputs that are older than 3 hours, or that are from the previous day

# e.g. output 3fb25683-bf09-59ea-b2b4-b9b4a393dce6 is matched with too many inputs from the previous night, and they are old

def check_if_input_too_old(input_dt, output_dt, threshold=timedelta(hours=3)):
    """Tell whether an input precedes its output by more than `threshold`.

    Args:
        input_dt: Datetime of the input.
        output_dt: Datetime of the output the input was matched to.
        threshold: Maximum allowed age of the input relative to the output.
            Defaults to 3 hours.

    Returns:
        True if `output_dt - input_dt` is strictly greater than `threshold`,
        otherwise False (an input exactly `threshold` old is still accepted).
    """
    return (output_dt - input_dt) > threshold


# Flag inputs published more than 3 hours before their matched output.
# The comparison is a native Polars expression rather than a per-row Python
# callback over a struct (map_elements), which is slow on large frames.
final = final.with_columns(
    ((pl.col("datetime_output") - pl.col("datetime_input")) > pl.duration(hours=3))
    .alias("input_too_old")
)

# Keep only the inputs that are recent enough.
final = final.filter(~pl.col("input_too_old"))

In [94]:
# Inspect which columns the matched and filtered frame carries.
final.columns

['id_input',
 'datetime_original_input',
 'A1',
 'B1',
 'C1',
 'TitlePomembnoSLO',
 'ContentPomembnoSLO',
 'TitleNesreceSLO',
 'ContentNesreceSLO',
 'TitleZastojiSLO',
 'ContentZastojiSLO',
 'TitleVremeSLO',
 'ContentVremeSLO',
 'TitleOvireSLO',
 'ContentOvireSLO',
 'TitleDeloNaCestiSLO',
 'ContentDeloNaCestiSLO',
 'TitleOpozorilaSLO',
 'ContentOpozorilaSLO',
 'TitleMednarodneInformacijeSLO',
 'ContentMednarodneInformacijeSLO',
 'TitleSplosnoSLO',
 'ContentSplosnoSLO',
 'datetime_input',
 'id_output',
 'datetime_original_output',
 'nujna',
 'nova',
 'programs',
 'content',
 'file_name',
 'datetime_output',
 'datetime_output_adjusted',
 'prev_output_datetime',
 'input_too_old']

In [95]:
# Debug export: written before the column selection below, so it still carries
# every column of `final`, including the helper columns.
final.write_csv("../data/dataset-full-debug.csv")

In [96]:
#final.filter(final['id_output'] == '80beefa5-6ea9-568c-88ec-92b024447401')

In [97]:
# Columns of the published dataset, in their final order: identifiers and
# original timestamps, then the output fields, then the input fields.
final_ordered_columns = [
    "id_output",
    "id_input",
    "datetime_original_input",
    "datetime_original_output",
    "nujna",
    "nova",
    "programs",
    "content",
    "A1",
    "B1",
    "C1",
    "TitlePomembnoSLO",
    "ContentPomembnoSLO",
    "TitleNesreceSLO",
    "ContentNesreceSLO",
    "TitleZastojiSLO",
    "ContentZastojiSLO",
    "TitleVremeSLO",
    "ContentVremeSLO",
    "TitleOvireSLO",
    "ContentOvireSLO",
    "TitleDeloNaCestiSLO",
    "ContentDeloNaCestiSLO",
    "TitleOpozorilaSLO",
    "ContentOpozorilaSLO",
    "TitleMednarodneInformacijeSLO",
    "ContentMednarodneInformacijeSLO",
    "TitleSplosnoSLO",
    "ContentSplosnoSLO",
]

# Keep only the listed columns, then expose the original timestamp strings
# under the plain datetime_input / datetime_output names.
final_stripped = final.select(final_ordered_columns).rename(
    {
        "datetime_original_input": "datetime_input",
        "datetime_original_output": "datetime_output",
    }
)

In [98]:
# Persist the full matched dataset, restricted to the selected columns.
final_stripped.write_csv("../data/dataset-full.csv")

### Split train and test

In [99]:
from sklearn.model_selection import train_test_split

In [100]:
# One row per output id, keeping the existing row order. The split below is
# made on outputs so that all inputs of one output land in the same set.
unique_outputs = (
    final_stripped
    .select("id_output", "datetime_output")
    .unique(subset=["id_output"], maintain_order=True)
)

In [101]:
# Split without shuffling: the last 20% of the rows of `unique_outputs` form
# the test set.
train, test = train_test_split(
    unique_outputs,
    test_size=0.20,
    shuffle=False,
)

In [102]:
# Assign every input/output row to the split that its output id belongs to.
train_output_ids = train["id_output"]
test_output_ids = test["id_output"]

df_train = final_stripped.filter(pl.col("id_output").is_in(train_output_ids))
df_test = final_stripped.filter(pl.col("id_output").is_in(test_output_ids))

In [103]:
# Persist the two splits next to the full dataset.
df_train.write_csv("../data/dataset-train.csv")
df_test.write_csv("../data/dataset-test.csv")

In [104]:
# Only the last expression of a cell is rendered automatically, so the
# per-output summary has to be displayed explicitly; otherwise its result is
# silently discarded.
display(df_test.unique(subset="id_output", maintain_order=True).describe())

# Summary over all test rows (one row per input/output pair).
df_test.describe()

statistic,id_output,id_input,datetime_input,datetime_output,nujna,nova,programs,content,A1,B1,C1,TitlePomembnoSLO,ContentPomembnoSLO,TitleNesreceSLO,ContentNesreceSLO,TitleZastojiSLO,ContentZastojiSLO,TitleVremeSLO,ContentVremeSLO,TitleOvireSLO,ContentOvireSLO,TitleDeloNaCestiSLO,ContentDeloNaCestiSLO,TitleOpozorilaSLO,ContentOpozorilaSLO,TitleMednarodneInformacijeSLO,ContentMednarodneInformacijeSLO,TitleSplosnoSLO,ContentSplosnoSLO
str,str,str,str,str,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""count""","""35009""","""35009""","""35009""","""35009""",35009.0,35009.0,"""35009""","""35009""","""35009""","""35009""","""35009""","""0""","""2561""","""0""","""16978""","""122""","""26775""","""837""","""9005""","""74""","""22725""","""10470""","""35009""","""6399""","""7930""","""23970""","""6725""","""31351""","""10254"""
"""null_count""","""0""","""0""","""0""","""0""",0.0,0.0,"""0""","""0""","""0""","""0""","""0""","""35009""","""32448""","""35009""","""18031""","""34887""","""8234""","""34172""","""26004""","""34935""","""12284""","""24539""","""0""","""28610""","""27079""","""11039""","""28284""","""3658""","""24755"""
"""mean""",,,,,0.004884,0.048502,,,,,,,,,,,,,,,,,,,,,,,
"""std""",,,,,0.069719,0.214827,,,,,,,,,,,,,,,,,,,,,,,
"""min""","""00019211-9609-5556-9ce3-0d2daa…","""00013a62-33b6-5afb-aad0-034f36…","""2024-05-01 04:57:16""","""2024-05-01 07:00:00""",0.0,0.0,"""""","""NOVA Prometna informacija …","""NULL""","""Burja Na vipavski hitri ces…","""NULL""",,"""Avtocesta med razcepom Gabrk i…",,""" Na cesti Krvavi Potok - Ko…","""Zastoji, povečan promet""",""" Na glavnih in regionalnih …","""Burja""",""" Cesta Brestanica - Gaj - S…","""Italija""",""" Na cesti Kočevje - Lju…","""Delovne zapore""",""" Cesta Litija - Zagorje, pr…","""Prireditve""",""" Več.""","""Prireditve""",""" Prireditve na povezavi.""","""Italija""",""" Na cesti Ormož - Središče …"
"""25%""",,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,
"""50%""",,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,
"""75%""",,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,
"""max""","""ffe6165f-d231-59e6-99c8-3e8cdd…","""fffcc5ca-6395-521e-9d3d-b9391b…","""2024-12-31 20:24:03""","""2024-12-31 21:00:00""",1.0,1.0,"""3,2""","""rometne informacije 16. …","""Pozor! Štajersko avtocesto …","""ovire Na primorski avtocest…","""NULL""",,"""Štajersko avtocesto med Rogozo…",,"""Železniki - Petrovo Brdo, pri …","""Zastoji, povečan promet""","""Štajerska:- na avtocesti med Ž…","""Burja""","""Zimske razmere:- Prepoved za p…","""Okvari vozil""","""Živali, oviran promet:- na vip…","""Popolne zapore""","""Štajerske avtoceste med Blagov…","""ovire""","""Štajerska avtocesta bo zaprta …","""Vreme""","""• Zastoj tovornih vozil je na …","""Tovorni promet""","""Zaradi tehnične okvare ne delu…"


In [105]:
# Summary of the test outputs (one row per output id).
test.describe()

statistic,id_output,datetime_output
str,str,str
"""count""","""4625""","""4625"""
"""null_count""","""0""","""0"""
"""mean""",,
"""std""",,
"""min""","""00019211-9609-5556-9ce3-0d2daa…","""2024-05-01 07:00:00"""
"""25%""",,
"""50%""",,
"""75%""",,
"""max""","""ffe6165f-d231-59e6-99c8-3e8cdd…","""2024-12-31 21:00:00"""
