# Scraping and Parsing Matplotlib rcParams Options

This is a simple notebook that downloads the Matplotlib "Customizing" tutorial page and parses the rcParams options (the sample `matplotlibrc` file) listed there.

URL: https://matplotlib.org/3.1.3/tutorials/introductory/customizing.html

**Note**: The Matplotlib version to query is configurable through the `mpl_version` variable in the first code cell.

In [1]:
# Standard imports
import html
import re
import urllib.request as ureq

# Third-party import
import matplotlib.pyplot as plt


# Version of matplotlib to query (current is 3.1.3 as of 2020-03-14)
mpl_version = "3.1.3"


# Create full url to the ~/customizing.html page for that version
url = f"https://matplotlib.org/{mpl_version}/tutorials/introductory/customizing.html"

# Read and decode data.
# - The timeout keeps this cell from blocking forever if the server never answers.
# - The charset declared in the Content-Type header is used when present;
#   UTF-8 is the fallback when the header does not declare one.
with ureq.urlopen(url, timeout=30) as resp:
    charset = resp.headers.get_content_charset() or "utf-8"
    data = resp.read().decode(charset)

#### Search function
Add a search helper for checking the scraped lines when a parsed result looks doubtful.

In [2]:
def find_keyword(keyword="whiskerprops", source_lines=None):
    """Print every line that contains ``keyword``, with its index.

    Parameters
    ----------
    keyword : str
        Substring to look for (plain ``in`` test, case-sensitive).
    source_lines : sequence of str, optional
        Lines to search. When omitted, the notebook-level ``lines`` variable
        is used, so ``lines`` must exist by the time the function is called.

    Returns
    -------
    None
        The report is printed. "Keyword not found!" is printed when no line
        matches.
    """
    if source_lines is None:
        # Looked up at call time, so the function can be defined before the
        # cell that creates `lines` has run.
        source_lines = lines

    matches = [(idx, line) for idx, line in enumerate(source_lines) if keyword in line]

    if not matches:
        print("Keyword not found!")
        return

    # f-string: the header must show the actual keyword, not the literal
    # text "{keyword}".
    msg = f"\nKeyword: '{keyword}' found!\n\n"
    msg += "\n".join(f"Line no.: {idx}\nText: {line}\n" for idx, line in matches)
    print(msg)

#### Tokenizer function
Add a simple function that splits text on a separator and returns the stripped pieces.

In [3]:
def tokenizer(x, split_by="\n"):
    """Split ``x`` on ``split_by`` and return the stripped, non-empty pieces.

    Parameters
    ----------
    x : str
        Text to split.
    split_by : str, optional
        Separator passed to ``str.split``. Defaults to a newline.

    Returns
    -------
    list of str
        Pieces with surrounding whitespace removed. Pieces that are empty
        after stripping (including whitespace-only ones) are dropped.
    """
    # Strip first, then filter: checking the length before stripping would
    # let whitespace-only pieces through as empty strings.
    stripped = (piece.strip() for piece in x.split(split_by))
    return [piece for piece in stripped if piece]

In [4]:
# Split the downloaded page text on newlines into a list of stripped lines.
# `find_keyword` reads this notebook-level `lines` list when it is called.
lines = tokenizer(data)

# Uncomment and run the search function here to validate results.
# find_keyword("whiskerprops")

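Create a regular expression that captures the text inside every `<span class="c1">` element of the page; in the sample below these are the lines of the sample `matplotlibrc` file.

Print the first ten matches to confirm.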

In [5]:
# Capture the text inside each <span class="c1">...</span> element.
# NOTE(review): "c1" is presumably the syntax highlighter's single-line-comment
# class; the sample output shows these spans hold the matplotlibrc lines.
item_ptrn = r'<span\s+?class\="c1">(.+?)<\/span>'
p = re.compile(item_ptrn, flags=re.M)

# The captured text is raw HTML, so entities such as &quot; or &amp; must be
# decoded before the values are parsed or shown. After decoding, trim
# whitespace and swap backticks for double quotes.
all_items = [
    html.unescape(item).strip().replace("`", '"')
    for item in p.findall(data)
]

print("\n".join(all_items[:10]))

#### MATPLOTLIBRC FORMAT
## This is a sample matplotlib configuration file - you can find a copy
## of it on your system in
## site-packages/matplotlib/mpl-data/matplotlibrc.  If you edit it
## there, please note that it will be overwritten in your next install.
## If you want to keep a permanent local copy that will not be
## overwritten, place it in the following location:
## unix/linux:
##      $HOME/.config/matplotlib/matplotlibrc or
##      $XDG_CONFIG_HOME/matplotlib/matplotlibrc (if $XDG_CONFIG_HOME is set)


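Here, we're fine-tuning some of the data for the final output.

1. Keep only results that start with a single octothorpe (`#`); lines starting with `##` are dropped
2. Reformat areas around the colon (:) to create a single space on each side (this happens when the entries are split and printed in the cells further below)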

In [6]:
# Entries must start with "#" that is not immediately followed by a second
# "#", and must have at least one more character after it.
attrs_ptrn = r"^\#(?!#).+$"
pattr = re.compile(attrs_ptrn, flags = re.I)

# A match object is always truthy, so `filter` keeps exactly the matching items.
cmds = list(filter(pattr.search, all_items))

#### Final: Create dictionary to store the three components.

In [7]:
def create_dictionary(iterable, delims=(":", "##")):
    """Split entries into parallel command / option / extra lists.

    Each string that contains ``delims[0]`` is split at its first occurrence
    into a command (left part) and a remainder (right part). When
    ``delims[1]`` is a non-empty string, the remainder is split at the first
    occurrence of ``delims[1]`` into an option and an extra; any later
    occurrences stay inside the extra. Strings without ``delims[0]`` are
    skipped.

    Parameters
    ----------
    iterable : iterable of str
        Entries to parse, e.g. ``"#timezone : UTC ## a pytz timezone string"``.
    delims : sequence of two str, optional
        ``delims[0]`` separates command from option, ``delims[1]`` separates
        option from extra. An empty ``delims[1]`` disables the second split.

    Returns
    -------
    dict
        ``{"command": [...], "option": [...], "extra": [...]}``. The three
        lists have the same length and every value is whitespace-stripped.
        ``extra`` is ``""`` for entries without a second delimiter.
    """
    ddict = dict(command=[], option=[], extra=[])

    # Traverse our iterable
    for cmd in iterable:
        if delims[0] not in cmd:
            continue

        command, option = cmd.split(delims[0], 1)

        # partition() splits at the FIRST second-delimiter only, so an extra
        # that itself contains the delimiter is kept whole. When the
        # delimiter is absent, `extra` comes back as "".
        extra = ""
        if delims[1]:
            option, _, extra = option.partition(delims[1])

        # Sub out our initial octothorpe
        command = re.sub(r"\#(.+)", r"\1", command)

        # Append the parts to the proper portion of the dictionary
        ddict["command"].append(command.strip())
        ddict["option"].append(option.strip())
        ddict["extra"].append(extra.strip())
    return ddict

In [8]:
# Run our function on the filtered entries in `cmds` to produce our result.
# `output` is a dict with the keys "command", "option" and "extra".
output = create_dictionary(cmds)

#### Format and print a sample output.

In [9]:
n_values = len(output["option"])
# NOTE: `template` is defined here but not used by the formatting below,
# which separates the extra text with " | ".
template = "{a} : {b} // {c}"

# One display string per entry; the extra part is appended only when present.
res = [
    "{a} : {b} | {c}".format(a=command, b=option, c=extra)
    if extra
    else "{a} : {b}".format(a=command, b=option)
    for command, option, extra in zip(
        output["command"], output["option"], output["extra"]
    )
]

print("\n".join(res[:10]))

backend : Agg
webagg.port : 8988
webagg.address : 127.0.0.1
webagg.port_retries : 50
webagg.open_in_browser : True
backend_fallback : True
interactive : False
toolbar : toolbar2 | None | toolbar2  (&quot;classic&quot; is deprecated)
timezone : UTC | a pytz timezone string, e.g., US/Central or Europe/Paris
datapath : /home/jdhunter/mpldata
