# Scrape Shot Chart PoC

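NOTE: install the Tesseract OCR engine (the `tesseract` executable) first. The `pytesseract` package installed below is only a wrapper around that executable and does not install it.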

In [None]:
!pip install numpy pillow pytesseract pymupdf

In [None]:
from PIL import Image, ImageDraw, ImageFilter
import pytesseract
from pytesseract import Output

import numpy as np
import pymupdf
from typing import List, Dict
from pathlib import Path

# pytesseract runs the external tesseract executable, so it must be told
# where that executable lives.
# replace with the path to tesseract on your machine
# from a terminal run: 'which tesseract'
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

In [None]:
def pdf_to_pngs(pdf_fname: str, dpi: int = 600) -> List[Path]:
    """Render every page of a PDF to its own PNG file.

    Each page is written to the current working directory as
    ``<pdf stem>_<page index>.png``, with page indices starting at 0.

    Args:
        pdf_fname: Path of the PDF file to convert.
        dpi: Resolution used to rasterize each page. NOTE(review):
            split_team_imgs crops with fixed pixel offsets, so a different
            dpi will presumably need those offsets adjusted - confirm.

    Returns:
        The paths of the PNG files that were written, in page order.
    """
    pdf_path = Path(pdf_fname)
    img_fnames = []

    # Open through the context manager so the document is closed even when
    # rendering or saving one of the pages raises.
    with pymupdf.open(pdf_path) as doc:
        for page_index in range(doc.page_count):
            page = doc.load_page(page_index)
            pix = page.get_pixmap(dpi=dpi)
            output_fname = Path(f"{pdf_path.stem}_{page_index}.png")
            pix.save(output_fname)
            img_fnames.append(output_fname)

    return img_fnames

In [None]:
def split_team_imgs(shot_chart_img_fname: str) -> tuple[Image.Image, Image.Image]:
    """Crop a shot chart page and derive one black/white image per team.

    The page is cropped by fixed pixel margins, split into colour channels,
    and the red and blue channels are each thresholded so that every
    non-zero value becomes 255.

    Args:
        shot_chart_img_fname: Path of the rendered shot chart page image.

    Returns:
        ``(team_a_img, team_b_img)``: the thresholded red channel and the
        thresholded blue channel of the cropped page.
    """
    # Image.open is lazy; convert() forces the pixel data to be read, after
    # which the with-block closes the underlying file.
    with Image.open(shot_chart_img_fname) as page_img:
        img = page_img.convert('RGB')

    # Fixed crop margins in pixels.
    # NOTE(review): presumably tuned for pages rendered at 600 dpi - confirm.
    left = 1200
    top = 1400
    right = img.size[0] - 1200
    bottom = img.size[1] - 1000
    img = img.crop((left, top, right, bottom))

    # The green channel is not used.
    red_channel, _, blue_channel = img.split()

    team_a_img = _binarize_channel(red_channel)
    # NOTE(review): team A stays single-channel while team B is converted to
    # 'RGB'; kept exactly as before - confirm whether the difference matters.
    team_b_img = _binarize_channel(blue_channel).convert('RGB')

    return (team_a_img, team_b_img)


def _binarize_channel(channel):
    """Return a copy of *channel* in which every non-zero pixel is set to 255."""
    pixels = np.asarray(channel).copy()
    # Only pixels whose value is exactly 0 stay black.
    pixels[pixels != 0] = 255
    return Image.fromarray(pixels)

In [None]:
def has_numbers(in_str: str) -> bool:
    """Return True when *in_str* contains at least one digit character."""
    for ch in in_str.strip():
        if ch.isdigit():
            return True
    return False

def has_shot_status_code(in_str: str) -> bool:
    """Return True when *in_str* contains any of the characters ``< > + - o``."""
    status_codes = set("<>+-o")
    stripped = in_str.strip()
    # The two are disjoint exactly when no character of the string is a code.
    return not status_codes.isdisjoint(stripped)

In [None]:
def extract_team_shots(team_shot_chart_img: Image.Image) -> tuple[List[Dict], Image.Image]:
    """OCR one team's shot chart image and collect the shot labels found.

    The image is edge-enhanced and passed to Tesseract. A recognized text
    box is kept as a shot only when its text contains a digit, contains a
    shot status character (``< > + - o``) and contains no space.

    Args:
        team_shot_chart_img: Image of a single team's shots.

    Returns:
        ``(data, processed_img)`` where ``data`` is a list of
        ``{"label": <recognized text>, "chart_xy": (left, top)}`` dicts and
        ``processed_img`` is the edge-enhanced image with a red rectangle
        drawn around every kept box.
    """
    processed_img = team_shot_chart_img.filter(ImageFilter.EDGE_ENHANCE_MORE)

    # Limit recognition to digits and the shot status symbols; --psm 11 is
    # Tesseract's "sparse text" page segmentation mode.
    custom_config = r'-c tessedit_char_whitelist=1234567890><+-o --psm 11'

    # np.array copies the pixels, so the rectangles drawn below never reach
    # the data that Tesseract reads.
    ocr = pytesseract.image_to_data(
        np.array(processed_img), output_type=Output.DICT, config=custom_config
    )

    data = []
    draw = ImageDraw.Draw(processed_img)

    boxes = zip(ocr['text'], ocr['left'], ocr['top'], ocr['width'], ocr['height'])
    for text, x, y, w, h in boxes:
        is_shot_label = (
            has_numbers(text)
            and has_shot_status_code(text)
            and " " not in text
        )
        if not is_shot_label:
            continue

        draw.rectangle([x, y, x + w, y + h], outline="red", width=2)  # in-place operation
        data.append({"label": text, "chart_xy": (x, y)})

    return (data, processed_img)

In [None]:
def get_shots_summary(team_data: List[Dict]) -> Dict[str, int]:
    """Count one team's shots per status code.

    The status code of a shot is the first character of its label when that
    character is one of ``o < > + -``, otherwise the last character. Labels
    that are empty, or whose chosen character is not a status code, are
    not counted.

    Args:
        team_data: Shot dicts, each with a ``"label"`` string.

    Returns:
        A dict with the counters ``"goals"`` (code ``o``), ``"ssp"`` (codes
        ``<`` and ``>``), ``"ssg"`` (code ``+``) and ``"spg"`` (code ``-``).
    """
    # Status code character -> name of the counter it increments.
    counter_for_code = {
        'o': "goals",
        '<': "ssp",
        '>': "ssp",
        '+': "ssg",
        '-': "spg",
    }
    results = {"goals": 0, "ssp": 0, "ssg": 0, "spg": 0}

    for shot in team_data:
        label = shot['label']
        if not label:
            # Indexing an empty label would raise IndexError.
            continue

        code = label[0] if label[0] in counter_for_code else label[-1]
        counter = counter_for_code.get(code)
        if counter is not None:
            results[counter] += 1

    return results

In [None]:
def scrape_shot_chart(shot_chart_img_fname: str, include_imgs=False) -> Dict:
    """Extract both teams' shots and shot summaries from a shot chart image.

    Args:
        shot_chart_img_fname: Path of the rendered shot chart page image.
        include_imgs: When true, each team entry also gets an ``"imgs"``
            list holding the team's split image and its annotated OCR image.

    Returns:
        A dict with the keys ``"team_a"`` and ``"team_b"``; each value holds
        ``"data"`` (the extracted shots) and ``"shots_summary"``.
    """
    img_a, img_b = split_team_imgs(shot_chart_img_fname)
    split_imgs = {"team_a": img_a, "team_b": img_b}

    # Run OCR for both teams before any summary is computed.
    extracted = {
        team: extract_team_shots(team_img)
        for team, team_img in split_imgs.items()
    }

    results = {
        team: {"data": shots, "shots_summary": get_shots_summary(shots)}
        for team, (shots, _) in extracted.items()
    }

    if not include_imgs:
        return results

    for team, (_, annotated_img) in extracted.items():
        results[team]["imgs"] = [split_imgs[team], annotated_img]
    return results

In [None]:
# Download the PDF that the cells below convert and scrape.
# --skip-existing skips the download when the output file is already present.
!curl -v --skip-existing "https://www.iihf.com/pdf/757/ihm757a04_77a_3_0" -o "2024-12-26_FIN-CAN.pdf"

In [None]:
# Render each page of the PDF to "2024-12-26_FIN-CAN_<page index>.png".
pdf_to_pngs("2024-12-26_FIN-CAN.pdf")

In [None]:
# Scrape the image rendered from the first page (index 0).
results = scrape_shot_chart("2024-12-26_FIN-CAN_0.png")

In [None]:
# Display the scraped shots and summaries for both teams.
results

In [None]:
# Display the rendered page itself.
img = Image.open("2024-12-26_FIN-CAN_0.png")
img