* [TinyDB Docs](https://tinydb.readthedocs.io/en/latest/usage.html)
* [Pillow docs](https://pillow.readthedocs.io/en/stable/)

In [57]:
from tinydb import TinyDB, Query
import webbrowser
from collections import Counter
from IPython.core.display import display, HTML

In [30]:
# Open the TinyDB database file.
db = TinyDB('../data/DocumentCloud/db.json')
# IPython shell capture: `find` lists zero-size entries under the text
# directory, the first awk keeps the 5th "/"-separated path field and the
# second awk keeps everything before the first "."; the output lines are
# assigned to `empty_file_ids`.
# NOTE(review): presumably each text file is named "<record id>.<ext>", so the
# captured strings are record ids -- confirm against the download step.
empty_file_ids = !find ../data/DocumentCloud/text -size 0 | awk -F/ '{print $5}' | awk -F. '{print $1}'
# Keep the database records whose `id` is one of the captured names.
recs = db.search(Query().id.one_of(empty_file_ids))

In [33]:
# Tally the selected records by their `source` field, most frequent first.
Counter(rec['source'] for rec in recs).most_common()

[('Detroit City Council', 62),
 ('Detroit Economic Development Corporation', 26),
 ('Detroit Land Bank Authority', 19),
 ('Detroit Downtown Development Authority', 19),
 ('Chicago City Council', 16),
 ('Summit County Land Bank', 11),
 ('Chicago Department of Planning and Development', 8),
 ('Chicago Special Service Area #28 Six Corners', 7),
 ('Chicago Public Schools', 7),
 ('Wayne County Government', 7),
 ('Cook County Health and Hospitals System', 5),
 ('Detroit City Planning Commission', 4),
 ('Detroit General Retirement System', 4),
 ('Detroit Local Development Finance Authority', 4),
 ('Cook County Government', 3),
 ('Detroit Eight Mile Woodward Corridor Improvement Authority', 3),
 ('Chicago Police Department', 3),
 ('Public Building Commission of Chicago', 2),
 ('Detroit Police Department', 2),
 ('Detroit Police and Fire Retirement System', 2),
 ('Chicago Special Service Area #5 Commercial Ave', 2),
 ('City Bureau', 2),
 ('Chicago Committee on Standards and Tests', 1),
 ('Illino

In [53]:
# The ten most frequent `pages` values among the selected records.
Counter(rec['pages'] for rec in recs).most_common(10)

[(1, 54),
 (2, 30),
 (3, 18),
 (7, 13),
 (4, 13),
 (11, 9),
 (10, 8),
 (5, 6),
 (14, 5),
 (8, 4)]

In [96]:
class DocDisplayWidget:
    """Notebook display wrapper around a single document record.

    When an instance is the result of a cell, IPython calls ``_repr_html_``
    and shows a linked thumbnail, the title, and a link to the text version.

    Parameters
    ----------
    data : mapping
        Document record. The keys read here are ``title``, ``id``,
        ``canonical_url`` and ``resources`` (itself a mapping with
        ``thumbnail`` and ``text`` entries).
    """

    def __init__(self, data):
        self.data = data

    def _repr_html_(self):
        """Return the HTML snippet used for rich display.

        Every value taken from the record is HTML-escaped before being
        interpolated, so a title or URL containing ``<``, ``&`` or quotes
        cannot break the markup or inject tags.

        Raises
        ------
        KeyError
            If the record lacks one of the keys listed on the class.
        """
        # Local import: the cell stays runnable without touching the
        # notebook's import cell.
        from html import escape

        resources = self.data['resources']
        # str() first so non-string values (e.g. None) render as before
        # instead of failing inside escape().
        page_url = escape(str(self.data['canonical_url']))
        thumb_url = escape(str(resources['thumbnail']))
        text_url = escape(str(resources['text']))
        title = escape(str(self.data['title']))
        return f"""
        <div style="width: 200px">
        <a href="{page_url}"><img style="border: 1px solid black" src="{thumb_url}"></a>
        <br>{title} [<a href="{text_url}">TXT</a>]
        </div>
        """

    def __repr__(self):
        """Return the plain-text form ``"<title> (<id>)"``."""
        return f"{self.data['title']} ({self.data['id']})"

    
class DocList:
    """Ordered collection of displayable documents for notebook output.

    Rendering joins each member's ``_repr_html_()`` output with ``<hr>``.

    Parameters
    ----------
    docs : iterable, optional
        Initial members; each must provide ``_repr_html_()``. Defaults to
        an empty collection.
    """

    def __init__(self, docs=None):
        # Copy into a fresh list so that add() never mutates the caller's
        # own sequence, and so tuples or generators are accepted too.
        self.docs = [] if docs is None else list(docs)

    def add(self, doc):
        """Append one displayable document to the collection."""
        self.docs.append(doc)

    def _repr_html_(self):
        """Return the members' HTML joined by horizontal rules."""
        return "<hr>".join(doc._repr_html_() for doc in self.docs)
    

In [97]:
# Preview the first three selected records.
DocList(list(map(DocDisplayWidget, recs[:3])))

In [109]:
import requests
from PIL import Image
from io import BytesIO


def getcolors(doc, timeout=30):
    """Return the colors used by the thumbnail of a document record.

    Parameters
    ----------
    doc : mapping
        Document record; ``doc['resources']['thumbnail']`` is fetched
        over HTTP.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would hang the loop that
        calls this once per record.

    Returns
    -------
    list
        Pillow's ``Image.getcolors()`` result: an unsorted list of
        ``(count, pixel)`` pairs. Never ``None``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    thumb_url = doc['resources']['thumbnail']
    response = requests.get(thumb_url, timeout=timeout)
    # Fail with the HTTP status rather than a confusing image-decoding
    # error when the thumbnail is missing.
    response.raise_for_status()
    with Image.open(BytesIO(response.content)) as img:
        # Image.getcolors() returns None when the image has more than
        # `maxcolors` (default 256) distinct colors. An image cannot have
        # more colors than pixels, so this bound always yields a list.
        return img.getcolors(maxcolors=img.width * img.height)

# Split the selected records by thumbnail content.
chk = []    # thumbnail has more than one color: worth a manual look
blank = []  # thumbnail is a single color

for x in recs:
    colors = getcolors(x)
    # Pillow's Image.getcolors() yields None when an image exceeds its color
    # limit (256 by default); such a thumbnail is certainly not blank, and
    # calling len() on None would abort the whole loop.
    if colors is None or len(colors) > 1:
        chk.append(x)
    else:
        blank.append(x)
print(f"Done, {len(chk)} worth checking")


Done, 204 worth checking


In [115]:
# TODO: color counting alone may not be enough -- a couple of thumbnails are
# nearly blank but still have a little schmutz in them, like this one:
DocDisplayWidget(chk[2])

In [111]:
# Show the records at positions 10-19 of the "worth checking" list.
DocList(list(map(DocDisplayWidget, chk[10:20])))