In [None]:
# Autosave the notebook every 60 seconds.
%autosave 60
# Load the autoreload extension and use mode 2, so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

import sys
from pathlib import Path


In [None]:
import json
import os
import pickle
from collections import Counter
from io import BytesIO
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union, cast

import cv2
import matplotlib as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn as skl
from IPython.display import Image, display
from matplotlib.patches import Rectangle
from tqdm import tqdm

import PIL.Image as pil_img
from matplotlib_inline.backend_inline import set_matplotlib_formats


In [None]:
# Show full cell contents, but cap how many columns/rows are printed.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", 15)
pd.set_option("display.max_rows", 50)
# Suitable default display for floats: thousands separator, two decimals.
pd.options.display.float_format = "{:,.2f}".format
# Default figure size. `plt` is the top-level `matplotlib` package here (the imports
# cell binds it with `import matplotlib as plt`), not `matplotlib.pyplot`.
plt.rcParams["figure.figsize"] = (12, 10)

# Inline figure output format. This one is optional: SVG is only worth using if you
# don't have a lot of points/lines in your graphs; ["retina"] is the alternative if
# you don't want SVG.
# NOTE(review): the set_matplotlib_formats("pdf", "png") call below runs after the
# %config line and presumably replaces the "retina" setting -- confirm which is intended.
%config InlineBackend.figure_formats = ["retina"]
set_matplotlib_formats("pdf", "png")

In [None]:
# Input/output locations. The defaults are the original shared-drive paths; set the
# GEO_VIDEO_PATH / GEO_OUT_PATH environment variables to run the notebook elsewhere.
VIDEO_PATH = Path(os.environ.get("GEO_VIDEO_PATH", "/shared/g-luo/geoguessr/videos")).resolve()
OUT_PATH = Path(os.environ.get("GEO_OUT_PATH", "/shared/gbiamby/geo")).resolve()
# Fail fast, and say which directory is missing, before any expensive work starts.
assert VIDEO_PATH.exists(), f"video directory not found: {VIDEO_PATH}"
assert OUT_PATH.exists(), f"output directory not found: {OUT_PATH}"

In [None]:
# Recursively collect every .mp4 below VIDEO_PATH, in sorted order.
files = sorted(VIDEO_PATH.rglob("*.mp4"))
print("total video files found: ", len(files))

In [None]:
def load_metadata(path: Path):
    """
    Load metadata for a single .mp4 from its sibling ``.info.json`` file.

    Args:
        path: Path to the video, with or without its extension. A trailing suffix, if
            present, is stripped before ``.info.json`` is appended.

    Returns:
        The parsed JSON as a dict, minus the really verbose keys ("formats",
        "thumbnails", "automatic_captions", "http_headers"), plus a "path" entry
        holding the suffix-stripped video path.

    Raises:
        FileNotFoundError: if the ``.info.json`` file does not exist.
    """
    if path.suffix:
        path = path.with_suffix("")
    # Append instead of `with_suffix(".info.json")`: with_suffix would replace part of
    # a stem that itself contains a dot (e.g. "a.b" -> "a.info.json").
    info_path = path.with_name(path.name + ".info.json")
    # Context manager so the file handle is closed as soon as parsing is done.
    with open(info_path, "r") as fp:
        data = json.load(fp)
    for key in ("formats", "thumbnails", "automatic_captions", "http_headers"):
        data.pop(key, None)
    data["path"] = path
    return data


# Load the trimmed metadata for every video file found above (tqdm shows progress).
# Kept as a plain loop: the loop variable `f` stays bound afterwards, and a later
# cell evaluates `f.stem`.
all_metadata = []
for f in tqdm(files):
    all_metadata.append(load_metadata(f))

In [None]:
def _youtube_url(x):
    """Build the watch URL for a YouTube video id."""
    return f"https://www.youtube.com/watch?v={x}"


# One row per video; the derived watch URL is inserted as the third column (position 2).
df_all = pd.DataFrame(all_metadata)
yt_urls = df_all["id"].apply(_youtube_url)
df_all.insert(loc=2, column="yt_url", value=yt_urls)
df_all

In [None]:
def _mentions(column: str, needle: str) -> pd.Series:
    """
    Boolean mask over `df_all`: True where `column` contains `needle`, ignoring case.

    Missing values are treated as empty strings and other values are stringified, so a
    video without e.g. a description counts as "no mention" instead of raising.
    """
    text = df_all[column].fillna("").astype(str).str.casefold()
    return text.str.contains(needle, regex=False)


# A video is "suspect" when none of these keywords appear in its title, description
# or uploader.
looks_like_geo = (
    _mentions("title", "geoguessr")
    | _mentions("description", "geoguessr")
    | _mentions("title", "battle royale")
    | _mentions("description", "battle royale")
    | _mentions("uploader", "geowizard")
    | _mentions("uploader", "geoguessr")
)
df_suspect = df_all[~looks_like_geo]
# A bare `df_suspect.shape` in the middle of a cell displays nothing, so print it.
print("suspect videos:", df_suspect.shape)
# Keep everything whose id is not in the suspect set, as an independent copy.
df_geo = df_all[~df_all.id.isin(df_suspect.id)].copy(deep=True)

In [None]:
# with pd.option_context("display.max_rows", None, "display.max_columns", None):
#     display(df_suspect)

In [None]:
# Persist the filtered frame. `with` closes (and therefore flushes) the file handle
# instead of leaving that to garbage collection.
with open(OUT_PATH / "df_geo.pkl", "wb") as fp:
    pickle.dump(df_geo, fp)

In [None]:
# List the columns available on the filtered frame.
df_geo.columns

## Sample Frames from Videos, Save as JPGs

In [None]:
import cv2


def sample_frames(video_path: Path, out_path: Path, sample_every_seconds=20.0):
    """
    Sample one frame every `sample_every_seconds` seconds from a video and save each as a JPG.

    Frames are written to
    `out_path / "screen_samples_auto" / <video stem> / frame_<index>.jpg`, where
    `<index>` is the zero-based sample number, zero-padded to 8 digits.

    Args:
        video_path: Path to the video; its suffix is replaced with ".mp4" before opening.
        out_path: Root directory under which the "screen_samples_auto" tree is created.
        sample_every_seconds: Time between sampled frames, in seconds. Must be positive.

    Returns:
        The number of frames written (0 if the video could not be opened).

    Raises:
        AssertionError: if the .mp4 file does not exist.
        ValueError: if `sample_every_seconds` is not positive (the seek position would
            never advance).
    """
    if sample_every_seconds <= 0:
        raise ValueError(f"sample_every_seconds must be positive, got {sample_every_seconds}")
    path = video_path.with_suffix(".mp4")
    assert path.exists(), str(path)
    cap = cv2.VideoCapture(str(path))
    count = 0
    seconds = 0.0
    try:
        if not cap.isOpened():
            print("could not open :", video_path)
            return count
        num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        print(f"num_frames: {num_frames:,}")
        print(path)
        frame_dir = out_path / "screen_samples_auto" / path.stem
        while True:
            # Seek by timestamp, then grab the frame at that position.
            cap.set(cv2.CAP_PROP_POS_MSEC, seconds * 1000)
            success, image = cap.read()
            if not success:
                # A failed read ends the sampling.
                break
            if count == 0:
                # Create the output directory lazily, only once there is a frame to save.
                frame_dir.mkdir(exist_ok=True, parents=True)
            cv2.imwrite(str(frame_dir / f"frame_{count:08}.jpg"), image)
            # Count only frames that were actually written.
            count += 1
            seconds = round(seconds + sample_every_seconds, 2)
    finally:
        # Release the capture even if reading or writing raised.
        cap.release()
    print("total frames captured: ", count, ", seconds: ", seconds)
    return count


for i, (idx, row) in tqdm(enumerate(df_sampled.iterrows()), total=len(df_sampled)):
    # if i > 10:
    #     break
    sample_frames(row.path, OUT_PATH, 20.0)

In [None]:
df_sampled = df_geo.sample(100)
df_sampled

In [None]:
# Debug check: `i` is presumably the counter left over from the frame-sampling loop,
# i.e. the index of the last video it reached.
print(i)

In [None]:
# Debug check: `f` is presumably the variable left over from the metadata-loading
# loop, i.e. the last video file it visited.
f.stem

In [None]:
# Load the annotation export; `with` closes the file handle once the JSON is parsed.
with open(Path("/shared/gbiamby/geo/geoscreens_000.json")) as fp:
    coco = json.load(fp)

In [None]:
# Top-level sections of the export.
coco.keys()

In [None]:
# Number of entries in the images, categories and annotations sections.
print(
    f"images: {len(coco['images'])}, categories: {len(coco['categories'])}, annotations: {len(coco['annotations'])}"
)

In [None]:
# Ids of images that have at least one annotation with a 4-element bbox, next to the
# ids of every image in the export.
imgs_with_anns = set(
    ann["image_id"] for ann in coco["annotations"] if len(ann["bbox"]) == 4
)
img_ids = set(img["id"] for img in coco["images"])
print(len(imgs_with_anns), len(img_ids))

---

## Reformat the label-studio Task List to include "video_id"

This step is no longer needed here; it is now part of the predict.ipynb notebook.

In [None]:
# tasks = json.load(open(Path("/shared/gbiamby/geo/annotations.json")))
# print(len(tasks), " total tasks")
# print(tasks[0])

In [None]:
# for i, t in enumerate(tasks):
#     # if i > 200:
#     #     break
#     img = t["data"]["image"]
#     img = Path(img)
#     # print(img.parent.name)
#     t["data"]["video_id"] = str(img.parent.name)

In [None]:
# json.dump(tasks, open(Path("/shared/gbiamby/geo/tasks_001.json"), "w"), indent=4, sort_keys=True)

## Convert exported image paths for use as COCO

In [None]:
# `with` closes the file handle once the JSON has been parsed.
with open(Path("/home/gbiamby/proj/geo/datasets/geoscreens_001/geoscreens_001.json")) as fp:
    tasks = json.load(fp)
# NOTE(review): later cells index `tasks` with string keys, so len() here counts its
# top-level keys rather than individual tasks -- confirm the label.
print(len(tasks), " total tasks")

In [None]:
# Show the category definitions stored in the export.
tasks["categories"]

## Look at COCO-formatted annotations

In [None]:
# `with` closes the file handle once the JSON has been parsed.
with open(Path("/home/gbiamby/proj/geo/datasets/geoscreens_001/geoscreens_001.json")) as fp:
    tasks = json.load(fp)
# NOTE(review): `tasks` is indexed with string keys just below, so len() here counts its
# top-level keys rather than individual tasks -- confirm the label.
print(len(tasks), " total tasks")
anns = tasks["annotations"]
print("total anns: ", len(anns))

### Fix the image path

In [None]:
# Rewrite each image's file_name: the "/data/local-files/?d=" prefix is replaced with
# the screenshots directory on the shared drive. Names without that prefix are unchanged.
for img in tasks["images"]:
    img["file_name"] = img["file_name"].replace(
        "/data/local-files/?d=", "/shared/gbiamby/geo/screenshots/"
    )

# Write the patched annotations back over the file they were loaded from. `with`
# guarantees the handle is flushed and closed, so the JSON on disk is complete.
with open(Path("/home/gbiamby/proj/geo/datasets/geoscreens_001/geoscreens_001.json"), "w") as fp:
    json.dump(tasks, fp, indent=4, sort_keys=True)

In [None]:
# How often each image width / height occurs in the dataset.
width_counts = Counter(img["width"] for img in tasks["images"])
height_counts = Counter(img["height"] for img in tasks["images"])
width_counts, height_counts

In [None]:
# Scale factors applied to the raw bbox sizes.
# NOTE(review): heights are scaled by 640/720 in "height_scaled" but by 640/1280 in the
# area and ratio columns -- confirm which factor matches the resize used for training.
_SCALE_A = 640 / 1280
_SCALE_B = 640 / 720


def _bbox_stats(bbox):
    """Per-annotation size statistics for one bbox, read as [x, y, width, height]."""
    box_w = bbox[2]
    box_h = bbox[3]
    return {
        "x": bbox[0],
        "y": bbox[1],
        "width": box_w,
        "height": box_h,
        "width_scaled": box_w * _SCALE_A,
        "height_scaled": box_h * _SCALE_B,
        "area_scaled": (box_w * _SCALE_A * box_h * _SCALE_A),
        "ratio_wh": float(box_w * _SCALE_A) / (box_h * _SCALE_A),
        "ratio_hw": float(box_h * _SCALE_A) / (box_w * _SCALE_A),
    }


# One record per annotation, then a frame for the plots; preview the first ten records.
dims = [_bbox_stats(ann["bbox"]) for ann in anns]
df = pd.DataFrame(dims)
dims[:10]

In [None]:
def _area_bin(x):
    """Bucket a scaled box area: "small" up to 32**2, "medium" up to 96**2, else "large"."""
    if x <= 32 ** 2:
        return "small"
    if x <= 96 ** 2:
        return "medium"
    return "large"


df["area_bin"] = df.area_scaled.apply(_area_bin)

In [None]:
# Display the per-annotation frame, now including the "area_bin" column.
df

In [None]:
# Three anchor sizes per base size: the base itself, then the base scaled by 2^(1/3)
# and 2^(2/3), each truncated to an int.
anchor_sizes = tuple(
    tuple(base if step == 0 else int(base * 2 ** (step / 3)) for step in range(3))
    for base in (32, 64, 128, 256, 512)
)
# The same set of aspect ratios is repeated once per group of anchor sizes.
_ratios = (0.08, 0.16, 0.25, 0.36, 0.5, 0.7, 1.0, 2.0)
aspect_ratios = tuple(_ratios for _ in anchor_sizes)
anchor_sizes, aspect_ratios

In [None]:
# Bar chart of how many boxes fall into each area bin.
df.area_bin.value_counts().to_frame().plot.bar()

In [None]:
# Histogram of scaled box widths, one panel per area bin.
df.hist(column="width_scaled", by="area_bin", bins=100, grid=True, figsize=(48, 32))

In [None]:
# Histogram of scaled box heights, one panel per area bin.
df.hist(column="height_scaled", by="area_bin", bins=100, grid=True, figsize=(48, 32))

In [None]:
# Histogram of width/height ratios, one panel per area bin.
df.hist(column="ratio_wh", by="area_bin", bins=100, grid=True, figsize=(48, 32))

In [None]:
# Histogram of height/width ratios, one panel per area bin.
df.hist(column="ratio_hw", by="area_bin", bins=100, grid=True, figsize=(48, 32))

In [None]:
# Rewrite each image's file_name: the "/data/local-files/?d=" prefix is replaced with
# the screenshots directory on the shared drive. Names without that prefix are unchanged.
for img in tasks["images"]:
    img["file_name"] = img["file_name"].replace(
        "/data/local-files/?d=", "/shared/gbiamby/geo/screenshots/"
    )

# Write the patched annotations back over the file they were loaded from. `with`
# guarantees the handle is flushed and closed, so the JSON on disk is complete.
with open(Path("/home/gbiamby/proj/geo/datasets/geoscreens_001/geoscreens_001.json"), "w") as fp:
    json.dump(tasks, fp, indent=4, sort_keys=True)

In [None]:
# Display only: three sizes per base (the base, then the base scaled by 2^(1/3) and
# 2^(2/3), truncated to int) for the base sizes 16 through 256.
tuple(
    tuple(base if step == 0 else int(base * 2 ** (step / 3)) for step in range(3))
    for base in (16, 32, 64, 128, 256)
)

In [None]:
# Three anchor sizes per base size: the base itself, then the base scaled by 2^(1/3)
# and 2^(2/3), each truncated to an int.
anchor_sizes = tuple(
    tuple(base if step == 0 else int(base * 2 ** (step / 3)) for step in range(3))
    for base in (32, 64, 128, 256, 512)
)
# The same three aspect ratios are repeated once per group of anchor sizes.
aspect_ratios = tuple((0.5, 1.0, 2.0) for _ in anchor_sizes)
print(anchor_sizes, aspect_ratios)

### Train/val split

In [None]:
from collections import Counter

# Number of images per parent directory of `file_name`.
Counter(Path(img["file_name"]).parent.name for img in coco["images"])