In [44]:
import re
from jinja2 import Environment, FileSystemLoader, select_autoescape
import os.path
import pandas as pd
import glob

In [45]:
# Jinja2 environment that resolves template names relative to the current
# working directory (the notebook has to be run from the directory that
# contains "duplicates.html.j2").
# NOTE(review): select_autoescape() with default arguments decides per template
# file extension; "duplicates.html.j2" ends in ".j2", so autoescaping is
# presumably NOT active for this template -- confirm against the Jinja2 docs
# and pass enabled_extensions explicitly if escaping is wanted.
env = Environment(
    loader=FileSystemLoader("."),
    autoescape=select_autoescape(),
    # Whitespace control for block tags ({% ... %}) in the rendered HTML.
    trim_blocks=True,
    lstrip_blocks=True,
)
# Template used by the output cell at the end of the notebook.
template = env.get_template("duplicates.html.j2")

# Project directory to process. load_images() reads images from its
# "duplicates" subdirectory and the generated HTML pages are written into it.
# The commented-out assignments are alternative project directories.
# NOTE(review): absolute, user-specific paths -- adjust before running elsewhere.
#project_path = "/home/guest/sischr001/LOKI-Pipeline/data/output/LOKI_PS122-2-18-73_01"
# project_path = "/home/guest/sischr001/LOKI-Pipeline/experiments"
project_path = "/home/guest/sischr001/LOKI-Pipeline/data/output/22-10-27-LOKI_46-24hours_01"

In [46]:
# Layout of an object ID (note the double spaces around the 3-digit field):
#   "<8 digits> <6 digits>  <3 digits>  <6 digits> <4 digits> <4 digits>"
# Group 1 spans the first three digit fields, group 2 the 6-digit field and
# group 3 the trailing pair of 4-digit fields.
# Raw string: "\d" inside a normal string literal is an invalid escape sequence
# that newer Python versions warn about.
object_id_pat = re.compile(r"(\d{8} \d{6}  \d{3})  (\d{6}) (\d{4} \d{4})")

def parse_object_id(object_id):
    """Split an object ID into the three parts captured by ``object_id_pat``.

    Parameters
    ----------
    object_id : str
        Object ID; the whole string has to match ``object_id_pat``.

    Returns
    -------
    pandas.Series
        Three strings at positions 0, 1 and 2: the captured groups 1, 2 and 3
        (``load_images`` stores them as ``frame_id``, ``seq`` and
        ``object_loc``).

    Raises
    ------
    ValueError
        If ``object_id`` does not match the expected layout.
    """
    match = object_id_pat.fullmatch(object_id)
    if match is None:
        raise ValueError(f"No match: {object_id}")

    return pd.Series([match[1], match[2], match[3]])

parse_object_id("20200104 104722  087  000002 2355 0084")

0    20200104 104722  087
1                  000002
2               2355 0084
dtype: object

In [47]:
def load_images(project_path):
    """Collect the images below ``<project_path>/duplicates`` into a DataFrame.

    All ``*.jpg`` and ``*.bmp`` files in the ``duplicates`` directory and its
    subdirectories are listed, and their file names are parsed with
    ``parse_object_id``.

    Parameters
    ----------
    project_path : str
        Project directory that contains the ``duplicates`` subdirectory.

    Returns
    -------
    pandas.DataFrame
        One row per image, sorted by ``frame_id``, with the columns

        * ``img_fn``: image path relative to ``project_path``,
        * ``object_id``: file name without directory and extension,
        * ``frame_id``, ``seq``, ``object_loc``: the parts returned by
          ``parse_object_id``,
        * ``dupset_id``: directory of the image relative to ``duplicates``;
          for images lying directly in ``duplicates`` the ``object_id``,
        * ``color``: color name, assigned per ``dupset_id`` by cycling through
          a fixed palette of ten colors.

    Raises
    ------
    ValueError
        If no image is found, or (raised by ``parse_object_id``) if a file
        name does not have the expected layout.
    """
    print("Loading images...")

    duplicates_name = "duplicates"
    dup_path = os.path.join(project_path, duplicates_name)

    # Escape glob metacharacters in the directory part: without this, a project
    # path containing "[", "]", "*" or "?" would be interpreted as a pattern
    # and the search would silently find nothing.
    escaped_dup_path = glob.escape(dup_path)
    patterns = [
        os.path.join(escaped_dup_path, "**/*.jpg"),
        os.path.join(escaped_dup_path, "**/*.bmp"),
    ]

    df = pd.DataFrame(
        {
            "img_fn": [
                os.path.relpath(x, dup_path)
                for pat in patterns
                for x in glob.iglob(pat, recursive=True)
            ]
        }
    )

    if df.empty:
        raise ValueError("No images found")

    print("Parsing filenames...")
    df["object_id"] = df["img_fn"].map(
        lambda x: os.path.splitext(os.path.basename(x))[0]
    )

    df[["frame_id", "seq", "object_loc"]] = df["object_id"].apply(parse_object_id)

    # The duplicate set is the subdirectory an image lives in ...
    dupset_id = df["img_fn"].map(lambda x: os.path.dirname(x))

    # ... and an image directly in "duplicates" forms a set of its own.
    df["dupset_id"] = dupset_id.where(dupset_id.str.len() > 0, df["object_id"])

    # From here on, img_fn is relative to project_path instead of dup_path.
    df["img_fn"] = df["img_fn"].map(lambda x: os.path.join(duplicates_name, x))

    df = df.sort_values("frame_id")

    # Make colors: number the duplicate sets in order of first appearance and
    # cycle through the palette.
    colors = pd.Series(["red", "cyan", "blue", "purple", "yellow", "lime", "magenta", "orange", "brown", "green"])
    codes, _ = pd.factorize(df["dupset_id"])
    df["color"] = colors.take(codes % len(colors)).values

    return df

# Build the image table for the configured project; the bare `df` on the last
# line displays it as the cell output.
df = load_images(project_path)
df

Loading images...
Parsing filenames...


Unnamed: 0,img_fn,object_id,frame_id,seq,object_loc,dupset_id,color
2716,duplicates/20200710 112358 091 000000 2478 1...,20200710 112358 091 000000 2478 1401,20200710 112358 091,000000,2478 1401,20200710 112358 091 000000 2478 1401,red
16,duplicates/20200710 112358 091 000000 2478 1...,20200710 112358 091 000000 2478 1401,20200710 112358 091,000000,2478 1401,20200710 112358 091 000000 2478 1401,red
1145,duplicates/20200710 112358 091 000000 2478 1...,20200710 112358 200 000000 2478 1398,20200710 112358 200,000000,2478 1398,20200710 112358 091 000000 2478 1401,red
1714,duplicates/20200710 112358 091 000000 2478 1...,20200710 112358 309 000000 2478 1398,20200710 112358 309,000000,2478 1398,20200710 112358 091 000000 2478 1401,red
2922,duplicates/20200710 112358 091 000000 2478 1...,20200710 112358 418 000000 2478 1398,20200710 112358 418,000000,2478 1398,20200710 112358 091 000000 2478 1401,red
...,...,...,...,...,...,...,...
3513,duplicates/20200710 112813 050 000001 2544 0...,20200710 112832 181 000001 2544 0471,20200710 112832 181,000001,2544 0471,20200710 112813 050 000001 2544 0468,purple
1009,duplicates/20200710 112358 091 000000 2478 1...,20200710 112832 181 000000 2478 1398,20200710 112832 181,000000,2478 1398,20200710 112358 091 000000 2478 1401,red
3522,duplicates/20200710 112813 050 000001 2544 0...,20200710 112832 398 000001 2541 0468,20200710 112832 398,000001,2541 0468,20200710 112813 050 000001 2544 0468,purple
3528,duplicates/20200710 112813 050 000001 2544 0...,20200710 112832 561 000001 2541 0471,20200710 112832 561,000001,2541 0471,20200710 112813 050 000001 2544 0468,purple


In [48]:
# Manual switch: the first branch keeps only the images whose object ID equals
# the ID of their duplicate set and writes "single_NNN.html" pages; the second
# branch keeps every image and writes "index_NNN.html" pages.
if True:
    index_name = "single"
    df = df.loc[df["dupset_id"] == df["object_id"]]
else:
    index_name = "index"

In [49]:
def gen_rows(df):
    """Yield one table row per ``frame_id``, keeping duplicate sets in stable columns.

    The frame is grouped by ``frame_id``. Every duplicate set (``dupset_id``)
    occupies a column slot: it keeps the slot it already has, otherwise takes
    the first free slot, otherwise a new slot is appended. A slot is released
    once its duplicate set has been absent from ``max_age`` (2) consecutive
    frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``frame_id``, ``dupset_id``, ``object_id``,
        ``img_fn``, ``color`` and ``object_loc``.

    Yields
    ------
    dict
        ``{"frame_id": ..., "objects": [...]}`` where ``objects[i]`` is a dict
        describing the object in column slot ``i`` (keys ``object_id``,
        ``img_fn``, ``color``, ``dupset_id``, ``object_loc``) or ``None`` if
        that slot holds no object in this frame. The list ends at the highest
        slot used in the frame.
    """
    max_age = 2
    age = {}  # dupset_id -> number of frames since it was last seen
    columns = []  # slot index -> dupset_id occupying it (None = free)

    for frame_id, group in df.groupby("frame_id"):
        cells = {}

        for item in group.itertuples():
            dupset = item.dupset_id

            if dupset in columns:
                # Already has a slot.
                idx = columns.index(dupset)
            elif None in columns:
                # Re-use the first free slot.
                idx = columns.index(None)
                columns[idx] = dupset
            else:
                # No free slot: open a new one at the end.
                idx = len(columns)
                columns.append(dupset)

            cells[idx] = {
                "object_id": item.object_id,
                "img_fn": item.img_fn,
                "color": item.color,
                "dupset_id": dupset,
                "object_loc": item.object_loc,
            }

            age[dupset] = 0

        # Release the slots of duplicate sets that reached max_age ...
        columns = [
            None if c is not None and age[c] >= max_age else c
            for c in columns
        ]

        # ... and forget those sets while ageing all remaining ones by a frame.
        age = {key: value + 1 for key, value in age.items() if value < max_age}

        # Dense list up to the highest slot used in this frame.
        yield {
            "frame_id": frame_id,
            "objects": [cells.get(i) for i in range(max(cells) + 1)],
        }


In [50]:
import itertools

def split_dataframe(df, chunk_size = 10000):
    """Split ``df`` into consecutive row chunks of at most ``chunk_size`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; rows are taken by position.
    chunk_size : int, default 10000
        Maximum number of rows per chunk; has to be at least 1.

    Returns
    -------
    list of pandas.DataFrame
        The chunks in row order. Only the last chunk may be shorter than
        ``chunk_size``. An empty ``df`` yields a single empty chunk.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size!r}")

    if len(df) == 0:
        # Callers always get at least one frame carrying the original columns.
        return [df.iloc[0:chunk_size]]

    # Stepping through the start offsets avoids the spurious trailing empty
    # chunk that `len(df) // chunk_size + 1` produces when the length is an
    # exact multiple of chunk_size.
    return [
        df.iloc[start:start + chunk_size]
        for start in range(0, len(df), chunk_size)
    ]

print("Writing output...")
rows = gen_rows(df)

# Label the pages with the directory that is actually being processed. The
# previously hard-coded "LOKI_PS122-2-18-73_01" belonged to a different
# (commented-out) project path and mislabelled the output.
project_id = os.path.basename(os.path.normpath(project_path))

# Render the rows in pages of at most 5000 rows each; `rows` is a generator, so
# every islice() call continues where the previous page stopped. The loop ends
# with the first empty page.
for i in itertools.count():
    row_chunk = list(itertools.islice(rows, 5000))
    if not row_chunk:
        break

    # Pages are written into the project directory as <index_name>_NNN.html.
    index_fn = f"{index_name}_{i:03d}.html"
    print(index_fn)
    template.stream(project_id=project_id, rows=row_chunk).dump(
        os.path.join(project_path, index_fn)
    )

Writing output...
single_000.html
