In [None]:
import os
import site
# Put the per-user site-packages directory at the front of PYTHONPATH.
# Changing os.environ does not alter sys.path of the already-running kernel.
# NOTE(review): presumably this is meant for processes started from the
# notebook - confirm.
# site.getuserbase() is used instead of site.USER_BASE because the attribute
# can still be None if the user base has not been initialised yet.
user_site_packages = site.getuserbase() + "/lib/python2.7/site-packages"
current_python_path = os.environ.get("PYTHONPATH")
if current_python_path is None:
    # PYTHONPATH was not set at all: avoid the KeyError and do not leave a
    # trailing ":" (an empty entry) behind.
    os.environ["PYTHONPATH"] = user_site_packages
elif user_site_packages not in current_python_path.split(":"):
    # Only prepend once so that re-running this cell is idempotent.
    os.environ["PYTHONPATH"] = user_site_packages + ":" + current_python_path

In [None]:
# Show the PYTHONPATH value currently stored in this process's environment
# (raises KeyError if the variable is not set).
print(os.environ["PYTHONPATH"])

In [None]:
import os.path
from lxml import etree
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
def extract_words_coords(filename):
    """
    Parse a BL_newspaper XML file and extract words and coordinates.
    Both BL_page with pageWord sub-elements or BL_article with
    articleWord sub-elements are handled.

    ``/BL_newspaper/BL_page`` is looked up first; only if no such element
    exists is ``/BL_newspaper/BL_article`` tried. The chosen query and the
    number of matching word elements are printed.

    :param filename: file name
    :type filename: str or unicode
    :return: list of ([X, Y, W, H], word), where the integers come from
        the comma-separated "coord" attribute of each word element and
        word is the element's text (None if the element has no text)
    :rtype: list(tuple(list(int), str or unicode))
    :raises IndexError: if the file has neither a BL_page nor a
        BL_article element under BL_newspaper
    :raises AssertionError: if no word elements are found
    :raises KeyError: if a word element has no "coord" attribute
    :raises ValueError: if a coordinate is not an integer
    """
    parser = etree.XMLParser(recover=False)
    # Binary mode: let the XML parser detect the encoding from the
    # document itself instead of decoding with the locale's encoding.
    with open(filename, "rb") as f:
        document_tree = etree.parse(f, parser)

    query = "//pageWord"
    elements = document_tree.xpath("/BL_newspaper/BL_page")
    if not elements:
        query = "//articleWord"
        elements = document_tree.xpath("/BL_newspaper/BL_article")
    if not elements:
        # Same exception type that indexing the empty list used to raise,
        # but with a message that says what is actually wrong.
        raise IndexError(
            "No BL_page or BL_article element found in %s" % (filename,))
    element = elements[0]

    print("Word element: " + query)
    # NOTE(review): an XPath starting with "//" is evaluated from the
    # document root, so this matches word elements anywhere in the file,
    # not only below `element`. Kept as-is to preserve behaviour; use
    # ".//..." if only the first page/article is wanted - confirm intent.
    word_elts = element.xpath(query)
    assert word_elts, ("No " + query + " elements")
    print("Number of words: " + str(len(word_elts)))

    return [
        ([int(c) for c in word_elt.attrib["coord"].split(",")], word_elt.text)
        for word_elt in word_elts
    ]

In [None]:
def plot_words(word_coords, subplot):
    """
    Draw every word as a text label on the given subplot.

    Each word is placed at its (X, Y) position with font size 6; the
    W and H values are unpacked but not used for drawing.

    :param word_coords: list of ([X, Y, W, H], word)
    :type word_coords: list(tuple(list(int), str or unicode))
    :param subplot: axes on which the words are drawn
    :type subplot: matplotlib.axes._subplots.AxesSubplot
    """
    for entry in word_coords:
        box, label = entry
        left, top, _width, _height = box
        subplot.text(left, top, label, fontsize=6)

In [None]:
def create_text_subplot(title, word_coords):
    """
    Build a new figure with a single sub-plot and draw the words on it.

    The sub-plot spans X from 0 to 3000 and Y from 6000 down to 0 (so
    the Y axis is inverted), autoscaling is disabled on both axes and
    both axes are hidden.

    :param title: figure title
    :type title: str or unicode
    :param word_coords: list of ([X, Y, W, H], word)
    :type word_coords: list(tuple(list(int), str or unicode))
    :return: figure
    :rtype: matplotlib.figure.Figure
    """
    fig = plt.figure(figsize=(10, 20), dpi=100)
    fig.suptitle(title, fontsize=14, fontweight='bold')

    # 1x1 grid, 1st subplot.
    axes = fig.add_subplot(111)
    # Passing ymin > ymax inverts the Y axis; more intuitive than
    # plt.gca().invert_yaxis().
    axes.axis([0, 3000, 6000, 0])
    axes.autoscale_view(scalex=False, scaley=False)

    # Hide the X axis first, then the Y axis.
    for get_axis in (axes.axes.get_xaxis, axes.axes.get_yaxis):
        get_axis().set_visible(False)

    plot_words(word_coords, axes)
    return fig

In [None]:
def create_text_plot(directory, filename, title, limit=-1):
    """
    Read the words of an XML file and plot them on a new figure.

    The figure title is ``title + " : " + filename``.

    :param directory: directory name
    :type directory: str or unicode
    :param filename: file name, relative to directory
    :type filename: str or unicode
    :param title: figure title
    :type title: str or unicode
    :param limit: number of words to plot; -1 (the default) plots
        all words, any other value is used as the end of a slice
    :type limit: int
    :return: figure
    :rtype: matplotlib.figure.Figure
    """
    words = extract_words_coords(os.path.join(directory, filename))
    # -1 is the "no limit" sentinel; anything else truncates the list.
    selected = words if limit == -1 else words[:limit]
    return create_text_subplot(title + " : " + filename, selected)

In [None]:
directory = "/mnt/lustre/at003/at003/shared/findmypast/BNA/0000083/1847/1002/"

filename = "WO1_MRTM_1847_10_02-0001.xml"
page_plot = create_text_plot(directory, filename, "BLN page")

In [None]:
filename = "WO1_MRTM_1847_10_02-0002.xml"
page_plot = create_text_plot(directory, filename, "BLN page")

In [None]:
filename = "WO1_MRTM_1847_10_02-0003.xml"
page_plot = create_text_plot(directory, filename, "BLN page")

In [None]:
filename = "WO1_MRTM_1847_10_02-0004.xml"
page_plot = create_text_plot(directory, filename, "BLN page")

In [None]:
filename = "WO1_MRTM_1847_10_02-0005.xml"
page_plot = create_text_plot(directory, filename, "BLN page")

In [None]:
filename = "WO1_MRTM_1847_10_02-0006.xml"
page_plot = create_text_plot(directory, filename, "BLN page")

In [None]:
filename = "WO1_MRTM_1847_10_02-0007.xml"
page_plot = create_text_plot(directory, filename, "BLN page")

In [None]:
filename = "WO1_MRTM_1847_10_02-0008.xml"
page_plot = create_text_plot(directory, filename, "BLN page")

In [None]:
filename = "WO1_MRTM_1847_10_02-0002-002.xml"
article_plot = create_text_plot(directory, filename, "BLN article")