In [None]:
%autosave 60
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import json
import os
import pickle
from collections import Counter, OrderedDict, defaultdict
from copy import deepcopy
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PIL.Image as pil_img
import seaborn as sns
import sklearn as skl
from IPython.display import Image, display
from matplotlib.patches import Rectangle
from matplotlib_inline.backend_inline import set_matplotlib_formats
from tqdm.contrib import tenumerate, tmap, tzip
from tqdm.contrib.bells import tqdm, trange

from geoscreens.consts import (
    EXTRACTED_FRAMES_PATH,
    FRAMES_METADATA_PATH,
    LATEST_DETECTION_MODEL_NAME,
    VIDEO_PATH,
)
from geoscreens.data import get_all_geoguessr_split_metadata
from geoscreens.data.metadata import GOOGLE_SHEET_IDS, FramesList
from geoscreens.utils import batchify, load_json, save_json, timeit_context

In [None]:
# Notebook-wide display defaults for pandas and matplotlib.
# max_colwidth=None: never truncate the contents of a cell when displaying frames.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", 15)
pd.set_option("display.max_rows", 50)
# Suitable default display for floats
pd.options.display.float_format = "{:,.2f}".format
# Default figure size as (width, height).
plt.rcParams["figure.figsize"] = (12, 10)

# This one is optional -- it changes graphs to SVG. Only use SVG if you don't
# have a lot of points/lines in your graphs. You can also just use ["retina"]
# if you don't want SVG.
%config InlineBackend.figure_formats = ["retina"]
set_matplotlib_formats("pdf", "png")

* [x] Load in_game frames
* [x] load detections for all videos
* [x] filter to in_game frames
* [ ] crop images
* [ ] ocr cropped images
* [ ] save results

## Functions

## Load in_game Frames

In [None]:
# Load the pickled DataFrame of in-game frames. The context manager guarantees
# that the file handle is closed as soon as the data has been read.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted location.
with open("/shared/gbiamby/geo/segment/in_game_frames_000.pkl", "rb") as pkl_file:
    df_ingame = pickle.load(pkl_file)

In [None]:
# Keep only the frames whose `labels` entry contains "url", as an independent
# copy so that columns added later do not touch df_ingame.
has_url_label = df_ingame["labels"].map(lambda frame_labels: "url" in frame_labels)
df_url_frames = df_ingame.loc[has_url_label].copy()

In [None]:
# Shape of all in-game frames vs. the subset whose labels contain "url".
print(df_ingame.shape, df_url_frames.shape)

In [None]:
# Number of distinct video_ids overall vs. among the frames whose labels contain "url".
df_ingame.video_id.nunique(), df_url_frames.video_id.nunique()

In [None]:
# Show the first row transposed, so that every column is listed vertically.
df_url_frames.head(1).T

In [None]:
import operator

import easyocr


def last_index(lst, value):
    """Return the index of the last occurrence of ``value`` in ``lst``.

    Args:
        lst: A reversible sequence that supports ``len()``.
        value: The element to look for.

    Returns:
        The zero-based position of the right-most element equal to ``value``.

    Raises:
        ValueError: If ``value`` does not occur in ``lst``.
    """
    # How many steps from the end the first match is found when scanning backwards.
    steps_from_end = operator.indexOf(reversed(lst), value)
    final_position = len(lst) - 1
    return final_position - steps_from_end


# EasyOCR reader used by the cells below; "en" is the language code passed to it.
reader = easyocr.Reader(["en"])

In [None]:
# Run OCR on the URL area of every frame in df_url_frames and collect the
# results per video: urls[video_id] is a list of dicts holding the frame's row
# fields plus the raw OCR output under the "ocr" key.
urls = defaultdict(list)

for _, (_, row) in tenumerate(df_url_frames.iterrows(), total=len(df_url_frames)):
    video_id = row.video_id
    # If "url" occurs more than once in the labels, the last occurrence is used.
    url_idx = last_index(row.labels, "url")
    url_area = row.bboxes[url_idx]
    # Crop box in the order (xmin, ymin, xmax, ymax).
    url_area = (url_area["xmin"], url_area["ymin"], url_area["xmax"], url_area["ymax"])
    # Open the frame in a context manager so its file handle is released on
    # every iteration instead of leaking one open file per frame.
    with pil_img.open(row.file_path) as img:
        img_cropped = img.crop(url_area)
        # Make sure the pixel data is read while the file is still open;
        # img_cropped is also used by later cells.
        img_cropped.load()
    result = reader.recognize(np.array(img_cropped))
    urls[video_id].append({**row.to_dict(), "ocr": result})

### Show the cropped URL bar

In [None]:
# img_cropped is the crop left over from the last iteration of the OCR loop above.
display(img_cropped)

### OCR on the cropped URL bar

In [None]:
# Re-run OCR on the crop left over from the loop above and show the raw result.
result = reader.recognize(np.array(img_cropped))
result

In [None]:
# Number the rows 0..n-1 and spread them round-robin over three buckets
# (gpu_id = row number modulo 3; presumably one bucket per GPU -- TODO confirm).
df_url_frames["row_num"] = pd.RangeIndex(len(df_url_frames))
df_url_frames["gpu_id"] = df_url_frames["row_num"].map(lambda row_number: row_number % 3)

In [None]:
# NOTE(review): identical to the gpu_id assignment in the previous cell; harmless but redundant.
df_url_frames["gpu_id"] = df_url_frames.row_num.apply(lambda x: x % 3)

In [None]:
# Load the raw OCR results that were saved to disk. The context manager closes
# the file handle once the pickle has been read.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted location.
with open("/shared/gbiamby/geo/data/urls/url_ocr_raw.pkl", "rb") as pkl_file:
    results = pickle.load(pkl_file)

In [None]:
# NOTE(review): this displays the whole loaded object; consider showing only a small sample.
results

---

## Load Raw OCR Results, Clean up the URLs and Group Them by video_id + game_num

In [None]:
# Load the raw OCR results, a mapping of video_id -> list of frame records.
# The context manager closes the file handle once the pickle has been read.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted location.
with open("/shared/gbiamby/geo/data/urls/url_ocr_raw.pkl", "rb") as pkl_file:
    ocr = pickle.load(pkl_file)

In [None]:
# Peek at the first record stored for one example video_id.
ocr["--0Kbpo9DtE"][0]

In [None]:
# How many ocr outputs have more than one result?

# rawr = []
# for i, (video_id, frames) in tenumerate(ocr.items()):
#     for f in frames:
#         if len(f["ocr"]) > 1:
#             rawr.append({"video_id": video_id, "ocr": f["ocr"]})
# print(len(rawr))

In [None]:
# Flatten the {video_id: [frame, ...]} mapping into one flat record per OCR
# result; element 1 of each OCR result is stored under the "ocr" key.
ocr_clean = []
for _, (video_id, frames) in tenumerate(ocr.items()):
    ocr_clean.extend(
        dict(
            video_id=video_id,
            ocr=ocr_result[1],
            file_path=frame["file_path"],
            round_num=frame["round_num"],
        )
        for frame in frames
        for ocr_result in frame["ocr"]
    )

In [None]:
# Total number of flattened OCR records.
len(ocr_clean)

In [None]:
# Normalize the noisy OCR text into canonical "geoguessr.com/..." URLs.
#
# Every replacement states its `regex=` mode explicitly: the default of
# `Series.str.replace(regex=...)` changed from True to False in pandas 2.0, so
# omitting it would silently turn the patterns below into literal strings.
# Patterns that contain backslashes are raw strings (same runtime value, but no
# invalid-escape-sequence warnings).
df_ocr = pd.DataFrame(ocr_clean)
df_ocr.ocr = (
    df_ocr.ocr.astype("string")
    # Collapse whitespace runs and drop leading digits / "l" / "|" noise.
    .str.replace(r"\s\s+", " ", regex=True)
    .str.replace(r"^[0-9]*\s*[l|]*", "", regex=True)
    # Remove the "https" / "Secure" prefix and "|" separators.
    .str.replace("| |", "||", regex=False)
    .str.replace("https||", "", regex=False)
    .str.replace("https|", "", regex=False)
    .str.replace("https[|l]{0,2}", "", regex=True)
    .str.replace("Secure |", "", regex=False)
    .str.replace("Secure", "", regex=True)
    .str.replace("||", "", regex=False)
    .str.replace("Il", "", regex=False)
    # Repair common misreadings of ".com".
    .str.replace("ssrcon", "ssr.com", regex=False)
    .str.replace("con/", "com/", regex=False)
    .str.replace("cor/", "com/", regex=False)
    .str.replace(r"[.\s]*c[cao0][mnr]\s*/", ".com/", regex=True)
    .str.replace(r"[.\s]*c[cao0][mnr]\s*", ".com", regex=True)
    # Repair misreadings of "geoguessr", one wildcard position at a time.
    .str.replace(".*?eoguessr", "geoguessr", regex=True)
    .str.replace("g.*?oguessr", "geoguessr", regex=True)
    .str.replace("ge.*?guessr", "geoguessr", regex=True)
    .str.replace("geo.*?uessr", "geoguessr", regex=True)
    .str.replace("geog.*?essr", "geoguessr", regex=True)
    .str.replace("geogues.*?r", "geoguessr", regex=True)
    # The lazy ".*?" matches nothing, so this appends an "r" after "geoguess";
    # the resulting double "r" is collapsed by the next rule.
    .str.replace("geoguess.*?", "geoguessr", regex=True)
    .str.replace("geoguessrr", "geoguessr", regex=True)
    # Two
    .str.replace("..oguessr", "geoguessr", regex=True)
    .str.replace(".e.guessr", "geoguessr", regex=True)
    .str.replace(".eo.uessr", "geoguessr", regex=True)
    .str.replace(".eoguess.", "geoguessr", regex=True)
    .str.replace("g.*?oguess.*?", "geoguessr", regex=True)
    .str.replace("g.*?o.?uess.*?", "geoguessr", regex=True)
    # two
    .str.replace("g.*?.*?guessr", "geoguessr", regex=True)
    .str.replace("g.{0,3}?uessr", "geoguessr", regex=True)
    .str.replace("g.*?o.uessr", "geoguessr", regex=True)
    .str.replace("g.*?oguess.", "geoguessr", regex=True)
    .str.replace(".+?eo.+?uess[a-zA-Z]{1}", "geoguessr", regex=True)
    .str.replace(r"ld\s*/?\s*play", ".com/play", regex=True)
    .str.replace("^eog", "geog", regex=True)
    .str.replace("geoguessr[^.]+?co", "geoguessr.co", regex=True)
    .str.replace("geoguessr.*com", "geoguessr.com", regex=True)
    .str.replace(r"geoguessr\s*o[mnr]", "geoguessr.com", regex=True)
    # Strip text before "geoguess"
    .str.replace("^.+(?=geoguess)", "", regex=True)
    .str.replace(r"geoguessr.*\.com", "geoguessr.com", regex=True)
    # Normalize the path part ("/challenge/", "/play/", "/uk/").
    .str.replace(r"geoguessr\.com.*challenge", "geoguessr.com/challenge", regex=True)
    .str.replace(r"geoguessr\.com/challenge[^/]{1}", "geoguessr.com/challenge/", regex=True)
    .str.replace(r"geoguessr\.com.*play", "geoguessr.com/play", regex=True)
    .str.replace(r"geoguessr\.com/play[^/]{1}", "geoguessr.com/play/", regex=True)
    .str.replace("(?<=.)?comuk[/]?", "com/uk/", regex=True)
    .str.replace("/.{1,2}lay", "/play", regex=True)
    .str.strip()
)
# Drop rows whose text contains one of these phrases (case-insensitive).
exclude = set(["did you enjoy", "channel v"])
for e in exclude:
    df_ocr = df_ocr[~(df_ocr.ocr.str.lower().str.contains(e))].copy(deep=True)
# How often each cleaned string occurs. `transform` returns a result aligned to
# df_ocr's own index; a merge result has a fresh 0..n-1 index, which no longer
# lines up with df_ocr after the row filtering above.
df_ocr["url_count"] = df_ocr.groupby("ocr")["video_id"].transform("count")
# Rows that still do not look like a geoguessr.com URL, rarest strings first.
df_ocr[~df_ocr.ocr.str.contains("geoguessr.com")].sort_values("url_count")

In [None]:
# Most frequent cleaned strings that still do not match "geoguessr.com".
non_geoguessr_text = df_ocr.loc[~df_ocr.ocr.str.contains("geoguessr.com"), "ocr"]
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(non_geoguessr_text.value_counts().head().to_frame())

In [None]:
# Keep only the rows that match "geoguessr.com" (excluding "retro" and the bare
# "geoguessr.com/play" string), then derive the URL slug, the game number and
# the slug length.
ROUNDS_PER_GAME = 5  # presumably five rounds per game -- TODO confirm
MIN_SLUG_LEN, MAX_SLUG_LEN = 10, 80  # exclusive bounds on the slug length that is kept

exclude = set(["geoguessr.com/play"])
df_clean = df_ocr[
    (df_ocr.ocr.str.contains("geoguessr.com"))
    & ~(df_ocr.ocr.str.contains("retro"))
    & ~(df_ocr.ocr.isin(exclude))
].copy(deep=True)
# The slug is what remains after removing the fixed URL parts. These are plain
# substrings, so `regex=False` is stated explicitly: the default of
# `Series.str.replace(regex=...)` differs between pandas versions, and the "."
# in "geoguessr.com" must not act as a wildcard.
df_clean["slug"] = (
    df_clean.ocr.str.replace("geoguessr.com", "", regex=False)
    .str.replace("challenge", "", regex=False)
    .str.replace("play", "", regex=False)
    .str.replace("/", "", regex=False)
)
df_clean["game_num"] = df_clean.round_num.apply(lambda rn: rn // ROUNDS_PER_GAME)
df_clean["slug_len"] = df_clean.slug.apply(len)
df_clean = df_clean[
    (df_clean.slug_len > MIN_SLUG_LEN) & (df_clean.slug_len < MAX_SLUG_LEN)
].copy(deep=True)
df_clean.sort_values("slug_len")

In [None]:
# Count how often each cleaned URL was read per (video_id, game_num), keeping
# the maximum file_path of each group.
url_counts = (
    df_clean.groupby(["video_id", "game_num", "ocr"])
    .agg(url_count=("ocr", "count"), file_path=("file_path", "max"))
    .reset_index()
    .sort_values(["video_id", "game_num", "url_count"], ascending=[True, True, False])
)
# Rank the URLs within each game by count: rank 1 is the most frequent one,
# ties are broken by row order.
url_counts["ocr_rank"] = (
    url_counts.groupby(["video_id", "game_num"])["url_count"]
    .rank(method="first", ascending=False)
    .astype("int")
)
df_clean2 = url_counts[["video_id", "game_num", "ocr", "ocr_rank", "url_count", "file_path"]]

In [None]:
# Summary: distinct videos and distinct (video_id, game_num) pairs.
n_video_ids = df_clean2.video_id.nunique()
n_games = len(df_clean2.groupby(["video_id", "game_num"]).size())
print("Total video_ids: ", n_video_ids)
print("Total games: ", n_games)

In [None]:
# Display the ranked URL candidates per (video_id, game_num).
df_clean2

In [None]:
# video_ids that have OCR'd URLs but do not appear in GOOGLE_SHEET_IDS.
ocr_video_ids = set(df_clean2.video_id.values.tolist())
missing_from_sheet = ocr_video_ids.difference(set(GOOGLE_SHEET_IDS))
print(
    "Number of video_id's with URL detections from OCR, that are not in the google sheet: ",
    len(missing_from_sheet),
)

In [None]:
# Persist the cleaned, ranked URLs. The `if True:` is a manual switch: change it
# to False to skip writing when re-running the notebook.
if True:
    # pickle.dump(df_clean2, open("/shared/gbiamby/geo/data/urls/url_ocrs_cleaned.pkl", "wb"))
    # Written twice as a pickle (protocol 5 and protocol 4) and once as CSV without the index.
    df_clean2.to_pickle("/shared/gbiamby/geo/data/urls/url_ocrs_cleaned-protocol_5.pkl", protocol=5)
    df_clean2.to_pickle("/shared/gbiamby/geo/data/urls/url_ocrs_cleaned-protocol_4.pkl", protocol=4)
    df_clean2.to_csv("/shared/gbiamby/geo/data/urls/url_ocrs_cleaned.csv", index=False, header=True)

In [None]:
# with pd.option_context("display.max_rows", None, "display.max_columns", None):
#     # display(pd.DataFrame(df_clean[(df_clean.slug_len < 8)].ocr.value_counts()))
#     display(pd.DataFrame(df_clean[(df_clean.slug_len > 0)].ocr.value_counts()))

In [None]:
# Histogram of the slug lengths that survived the length filter. Drawn on an
# explicit Axes so the figure carries a title and axis labels; plt.show()
# avoids the bare Axes repr in the cell output.
fig, ax = plt.subplots()
df_clean.slug_len.plot.hist(bins=50, ax=ax)
ax.set(
    title="Distribution of URL slug lengths",
    xlabel="slug length (characters)",
    ylabel="number of rows",
)
plt.show()

---

Junk