# Scraping and Parsing Matplotlib rcParams Options

This is a simple notebook that reads and parses the Matplotlib rcParams options listed in the sample `matplotlibrc` file on the package's website.

URL: https://matplotlib.org/3.1.3/tutorials/introductory/customizing.html

**Note**: The Matplotlib version to query is configurable through the `mpl_version` variable in the first code cell.

In [2]:
# Standard imports
import html
import re
import urllib.request as ureq

# Third-party import
import matplotlib.pyplot as plt


# Version of matplotlib to query (current is 3.1.3 as of 2020-03-14)
mpl_version = "3.1.3"

# Seconds to wait for the server; without a timeout a stalled connection
# would block the kernel indefinitely.
request_timeout = 30


# Create full url to the ~/customizing.html page
url = f"https://matplotlib.org/{mpl_version}/tutorials/introductory/customizing.html"

# Read and decode data, honouring the charset announced in the response
# headers and falling back to UTF-8 when none is given.
with ureq.urlopen(url, timeout=request_timeout) as resp:
    charset = resp.headers.get_content_charset() or "utf-8"
    data = resp.read().decode(charset)

Autosaving every 60 seconds


'Tokenize' lines and create a simple function to print enumerated lines if a keyword is found

In [6]:
# Split the page into lines, dropping lines that are completely empty
# (the emptiness test runs before stripping) and trimming the rest.
lines = [raw.strip() for raw in data.split("\n") if raw]

def find_keyword(keyword="whiskerprops", source_lines=None):
    """Print every line that contains ``keyword`` together with its index.

    Parameters
    ----------
    keyword : str
        Substring to look for (plain, case-sensitive ``in`` test).
    source_lines : sequence of str, optional
        Lines to search. When omitted, the notebook-level ``lines`` list
        is used.

    Returns
    -------
    None
        The report (or "Keyword not found!") is printed, not returned.
    """
    if source_lines is None:
        # Resolved at call time so re-running the cell that builds `lines`
        # is picked up without redefining this function.
        source_lines = lines

    hits = [(idx, line) for idx, line in enumerate(source_lines) if keyword in line]

    if not hits:
        print("Keyword not found!")
        return

    # Must be an f-string so the searched keyword is interpolated into the header.
    msg = f"\nKeyword: '{keyword}' found!\n\n"
    msg += "\n".join(f"Line no.: {idx}\nText: {line}\n" for idx, line in hits)
    print(msg)
            
# Sanity check: locate the boxplot whisker options in the scraped page.
find_keyword(keyword="whiskerprops")


Keyword: '{keyword}' found!

Line no.: 398
Text: <span class="c1">#boxplot.whiskerprops.color     : black</span>

Line no.: 399
Text: <span class="c1">#boxplot.whiskerprops.linewidth : 1.0</span>

Line no.: 400
Text: <span class="c1">#boxplot.whiskerprops.linestyle : -</span>



Create a regular expression to get the desired content.

Print a sample to confirm.

In [8]:
# The keyword search above shows the rc option lines wrapped in
# <span class="c1">…</span>; capture the text inside every such span.
item_ptrn = r'<span\s+?class\="c1">(.+?)<\/span>'
p = re.compile(item_ptrn, flags=re.M)

# The captured text is still HTML: decode entities (&quot;, &#39;, &amp;, …)
# so option values hold the real characters. Backticks are then replaced
# with double quotes, after decoding so that entity-encoded backticks are
# covered as well.
all_items = [
    html.unescape(raw).strip().replace("`", '"')
    for raw in p.findall(data)
]

print("\n".join(all_items[:10]))

#### MATPLOTLIBRC FORMAT
## This is a sample matplotlib configuration file - you can find a copy
## of it on your system in
## site-packages/matplotlib/mpl-data/matplotlibrc.  If you edit it
## there, please note that it will be overwritten in your next install.
## If you want to keep a permanent local copy that will not be
## overwritten, place it in the following location:
## unix/linux:
##      $HOME/.config/matplotlib/matplotlibrc or
##      $XDG_CONFIG_HOME/matplotlib/matplotlibrc (if $XDG_CONFIG_HOME is set)


Here, we're fine-tuning some of the data for the final output.

1. Get only results that start with a single octothorpe
2. Reformat areas around colon (:) to create a single space on each side

In [9]:
# 1. Keep only option lines: a single "#" immediately followed by a letter
#    or digit (explanatory comments start with "##" and are dropped).
attrs_ptrn = r"^\#[a-z0-9].+$"
pattr = re.compile(attrs_ptrn, flags=re.I)
cmds = [i for i in all_items if pattr.search(i)]

# 2. Normalise the whitespace around the first colon to exactly one space on
#    each side ("#name    :   value" -> "#name : value"). The rest of the
#    line is left untouched; lines without a colon pass through unchanged.
colon_ptrn = re.compile(r"\s*:\s*")
cmds1 = [colon_ptrn.sub(" : ", i, count=1) for i in cmds]

# 3. Backtick-delimited form "name`value`comment". Backticks were removed from
#    the scraped text earlier, which makes them a safe field delimiter.
#    The pattern is anchored at both ends: without the trailing "$" the lazy
#    value group would stop after a single character because everything that
#    follows it is optional. A missing "## comment" yields an empty third
#    field; lines that do not match are passed through unchanged.
reformat_space_ptrn = r"^#([\w.-]+)\s*:\s*(.*?)\s*(?:#{2,}\s*(.*))?$"
sub_pat_1 = r"\1`\2`\3"
rsp = re.compile(reformat_space_ptrn)
cmds2 = [rsp.sub(sub_pat_1, i) for i in cmds]

Print a sample of our reformatted data to confirm that results meet expectations.

In [10]:
# Preview a slice of the reformatted lines as "option : first value token".
for entry in cmds1[20:50]:
    # Left of " : " is the option name, right of it the value (and anything after).
    parts = re.split(r"\s:\s", entry)
    name = re.sub(r"\#(.+)\s?", r"\1", parts[0])  # drop the leading "#"
    first_token = re.split(r"\s+", parts[1])[0]   # text up to the first whitespace
    print(f"{name} : {first_token}")

lines.solid_joinstyle : round
lines.solid_capstyle : projecting
lines.antialiased : True
lines.dashed_pattern : 3.7,
lines.dashdot_pattern : 6.4,
lines.dotted_pattern : 1,
lines.scale_dashes : True
markers.fillstyle : full
patch.linewidth : 1
patch.facecolor : C0
patch.edgecolor : black
patch.force_edgecolor : False
patch.antialiased : True
hatch.color : black
hatch.linewidth : 1.0
boxplot.notch : False
boxplot.vertical : True
boxplot.whiskers : 1.5
boxplot.bootstrap : None
boxplot.patchartist : False
boxplot.showmeans : False
boxplot.showcaps : True
boxplot.showbox : True
boxplot.showfliers : True
boxplot.meanline : False
boxplot.flierprops.color : black
boxplot.flierprops.marker : o
boxplot.flierprops.markerfacecolor : none
boxplot.flierprops.markeredgecolor : black
boxplot.flierprops.markeredgewidth : 1.0


In [None]:
# Split every backtick-delimited entry of `cmds2` into [option, value, extra].
# Entries that do not split into exactly three fields are collected in `errs`.
# NOTE(review): `cmds2` is not assigned in this cell - it must already exist.
output = []
errs = []
for cmd in cmds2:
    fields = re.split(r"`", cmd)
    if len(fields) != 3:
        errs.append(cmd)
        continue
    opt, val, extra = fields
    opt = re.sub(r"\#(.+)", r"\1", opt)  # strip a leading "#" if present
    output.append([opt, val, extra])