In [1]:
import json
import pickle
import traceback

from bs4 import BeautifulSoup
import requests



In [2]:
def get_all_chem_records():
    """Fetch every page record that embeds ``Template:Chembox`` on en.wikipedia.

    Pages through the MediaWiki ``list=embeddedin`` query (500 entries per
    request) until the response no longer carries a ``continue`` object.

    Returns:
        list: the concatenated ``query.embeddedin`` entries of all responses,
        in the order the API returned them.

    Raises:
        requests.RequestException: on a network failure, timeout, or non-2xx
            HTTP status.
        KeyError: if a response lacks ``query.embeddedin``.
    """
    print("GET ALL RECORDS")
    api_url = 'https://en.wikipedia.org/w/api.php'
    base_params = {
        'action': 'query',
        'list': 'embeddedin',
        'eititle': 'Template:Chembox',
        'format': 'json',
        'eilimit': 500,
    }
    full_data = []
    continuation = {}
    while True:
        # Let requests build and escape the query string; the continuation
        # token contains "|" and must not be pasted into the URL by hand.
        resp = requests.get(api_url, params=dict(base_params, **continuation), timeout=30)
        resp.raise_for_status()
        data = json.loads(resp.text)
        full_data.extend(data['query']['embeddedin'])
        if 'continue' not in data:
            break
        # Send back every key of the ``continue`` object, not just eicontinue.
        continuation = data['continue']
        # Progress indicator: the part of the token after the last "|".
        print(continuation['eicontinue'].rpartition("|")[2], end="\r")
    return full_data

def _find_chembox_title(soup, template_names=("chembox", "infobox chemical")):
    """Return the first ``<title>`` node naming one of ``template_names``, or None.

    For each name in order, an exact ``string=`` search is tried first; if that
    finds nothing, every ``<title>`` node is scanned and compared by ``.text``.
    """
    for template_name in template_names:
        node = soup.find("title", string=template_name)
        if node is not None:
            return node
        for candidate in soup.findAll("title"):
            if candidate.text == template_name:
                return candidate
    return None


def _dispatch_template(tl):
    """Convert a parse-tree ``<template>`` node into plain Python data.

    A template without direct ``<part>`` children yields the text of its
    ``<title>``. Otherwise each part contributes one entry keyed by its
    ``<name>`` text (or the name's ``index`` attribute when the text is empty).
    Values with no child elements are stored as text (empty ones are skipped);
    values with child elements become a list, recursing into nested templates.
    If the resulting keys are exactly "1".."n" in order, a list is returned
    instead of a dict.

    Raises:
        AssertionError: if a part does not have exactly one name and one value.
        NotImplementedError: if a value contains a child element other than
            ``template`` or ``ext``.
    """
    parts = tl.findAll("part", recursive=False)
    if len(parts) < 1:
        return tl.find("title").text

    data = {}
    for part in parts:
        names = part.findAll("name", recursive=False)
        assert len(names) == 1
        values = part.findAll("value", recursive=False)
        assert len(values) == 1
        name = names[0]
        value = values[0]

        if name.text == "":
            key = name.attrs["index"]
        else:
            key = name.text
        if key.startswith("section"):
            # "sectionN" parts are renamed after the template they contain;
            # when there is no nested template/title the original key is kept.
            try:
                key = value.find("template").find("title").text
            except AttributeError:
                pass

        valchildren = value.findChildren(recursive=False)
        if len(valchildren) == 0:
            if value.text == "":
                continue
            data[key] = value.text
        else:
            vc_data = []
            for vc in valchildren:
                if vc.name == "template":
                    vc_data.append(_dispatch_template(vc))
                elif vc.name == "ext":
                    # Drop the extension tag and keep whatever text remains
                    # in the enclosing value.
                    vc.decompose()
                    vc_data.append(value.text)
                else:
                    raise NotImplementedError(vc)
            data[key] = vc_data

    positional_keys = [str(i) for i in range(1, len(data) + 1)]
    if list(data.keys()) == positional_keys:
        data = [data[key] for key in positional_keys]
    return data


def get_raw_wiki_data(full_data,book=None,max=None):
    """Download and parse the chembox of each page listed in ``full_data``.

    Args:
        full_data: iterable of page records (dicts with ``'ns'`` and
            ``'pageid'``), as returned by the ``embeddedin`` query.
        book: optional dict of already-parsed pages keyed by page id. It is
            updated in place and returned; pages already present are skipped.
        max: maximum number of new pages to add in this call. Defaults to
            ``2 * len(full_data)``, i.e. effectively unlimited.

    Returns:
        dict: ``{pageid: {'data': <parsed chembox>, 'title': <page title>}}``.

    The first page that fails (network error, missing infobox, unexpected
    parse-tree shape) is reported on stdout/stderr and stops the crawl; the
    pages collected so far are still returned. KeyboardInterrupt is not
    intercepted, so the crawl can be aborted by the user.
    """
    all_data = {} if book is None else book
    ini_len = len(all_data)
    total = len(full_data)
    title = " "*10
    if max is None:
        max = 2*len(full_data)
    for i, page in enumerate(full_data):
        if page['ns'] in [1,2,3,4,5,6,10,11]:
            continue  # skip namespaces https://en.wikipedia.org/wiki/Wikipedia:Namespace

        if len(all_data)-ini_len >= max:
            break
        pid = page['pageid']
        if pid in all_data:
            continue

        # Reset per page so the error handler never reports a previous page's
        # tree (or touches an unbound name when the request itself failed).
        soup = None
        try:
            url = 'https://en.wikipedia.org/w/api.php?action=parse&format=json&pageid={}&prop=parsetree'.format(pid)
            resp = requests.get(url, timeout=30)
            resp.raise_for_status()
            payload = json.loads(resp.text)
            tree_xml = payload['parse']['parsetree']['*']
            ntitle = payload['parse']['title']
            # Blank out the previous title before overwriting the progress line.
            print(i,"/",total," "*len(title),end="\r")
            print(i,"/",total,ntitle,end="\r")
            title = ntitle

            # NOTE(review): no parser is named, so bs4 picks the best one
            # installed; pin it once it is confirmed which one built the cache.
            soup = BeautifulSoup(tree_xml,)
            for comment in soup.findAll("comment"):
                comment.decompose()
            # Normalise all text nodes: strip whitespace and lower-case.
            for t in soup.findAll(text=True):
                text = str(t).strip()
                t.replaceWith(text.lower())

            box = _find_chembox_title(soup)
            if box is None:
                raise LookupError(
                    "no chembox / infobox chemical template found on page {}".format(pid))

            parsed = _dispatch_template(box.parent)
            all_data[pid] = {'data': parsed, 'title': title}
        except Exception:
            traceback.print_exc()
            if soup is not None:
                print(soup.findAll("title"))
            print(page)
            break
    return all_data

In [3]:
# Both pickle files are caches written by this notebook; pickle.load must only
# ever be pointed at files you created yourself (it can execute arbitrary code).

# Page list: reuse the cached copy, otherwise fetch it and cache the result.
# The fetch runs before the file is opened for writing so that a failed
# download does not leave a truncated cache behind. A missing or empty file
# triggers a refetch; any other load error is allowed to surface.
try:
    with open("all_chem_records.pickle","rb") as f:
        all_chem_records = pickle.load(f)
except (FileNotFoundError, EOFError):
    all_chem_records = get_all_chem_records()
    with open("all_chem_records.pickle","wb") as f:
        pickle.dump(all_chem_records,f)

# Parsed pages: resume from the cached dict when there is one.
try:
    with open("all_chem_raw_wiki_data.pickle","rb") as f:
        all_chem_raw_wiki_data = pickle.load(f)
except (FileNotFoundError, EOFError):
    all_chem_raw_wiki_data={}

# Crawl in batches of at most 10 new pages and checkpoint after every batch
# that added something; stop as soon as a batch adds nothing.
while True:
    l=len(all_chem_raw_wiki_data)
    all_chem_raw_wiki_data=get_raw_wiki_data(all_chem_records,book=all_chem_raw_wiki_data,max=10)
    if len(all_chem_raw_wiki_data)<=l:
        break
    with open("all_chem_raw_wiki_data.pickle","wb") as f:
        pickle.dump(all_chem_raw_wiki_data,f)

In [78]:
# Every key encountered by expand(), at any nesting level.
all_keys = set()

# Keys that expand() skips entirely.
ignore_keys = {
    'imagefile', 'imagefile1', 'imagefile1_ref', 'imagename',
    'imagefilel1', 'imagefilel3', 'imagefilel3_ref',
    'imagefiler2', 'imagefiler3_ref',
    'imagenamer1', 'imagenamer2', 'imagenamer3',
    'imagealt1', 'imagealt2', 'imagealtr1',
    'imagesize3', 'imagesizel2',
    'imagecaption2',
    'style',
    'pronounce',
}

# Keys whose value is itself expanded recursively.
to_expand = {
    'data',
    #'chembox hazards'
}

# Keys whose (unwrapped) values expand() copies into extracted_data[pid].
to_save = dict(
    title='title',
    iupacname='iupacname',
    othernames='othernames',
    casno='casno',
)

# Keys whose value is always wrapped in a list instead of being unwrapped.
as_list = {'systematicname'}


# Anything outside this set is reported as unknown after extraction.
known_keys = ignore_keys | to_expand | set(to_save)

# Per-page extraction results, keyed by page id.
extracted_data = {}

# Page ids whose extraction raised.
error_extract = set()

def unlist(data,):
    """Unwrap a single-element list; pass any non-list value through unchanged.

    Raises:
        AssertionError: if ``data`` is a list whose length is not exactly 1.
    """
    if isinstance(data,list):
        assert len(data)==1,"len not 1({}) {}".format(len(data),data)
        return data[0]
    return data
def tolist(data):
    """Return ``data`` itself when it is a list, otherwise wrap it in a new list."""
    return data if isinstance(data,list) else [data]

def expand(data,pid):
    """Walk one level of a parsed record and collect the configured fields.

    Relies on module-level state: records every key in ``all_keys``, skips
    keys in ``ignore_keys``, wraps values of ``as_list`` keys in a list and
    unwraps single-element lists for all others, stores ``to_save`` keys in
    ``extracted_data[pid]`` under the field name given by the ``to_save``
    mapping, and recurses into the values of ``to_expand`` keys.

    Args:
        data: dict of key -> value for the current nesting level.
        pid: page id; ``extracted_data[pid]`` must already exist.

    Raises:
        AssertionError: if a value to unwrap is a list of length != 1, if a
            value to save is a dict, or if the target field was already set.
    """
    all_keys.update(data.keys())
    for k,v in data.items():
        if k in ignore_keys:
            continue
        v = tolist(v) if k in as_list else unlist(v)
        if k in to_save:
            # to_save maps the source key to the name of the output field.
            field = to_save[k]
            assert not isinstance(v,dict), "isdict"
            assert field not in extracted_data[pid],"already there"
            extracted_data[pid][field]=v
        if k in to_expand:
            expand(v,pid)



# Extract the configured fields from every cached page. The first page that
# raises is removed from the results, remembered in error_extract, printed
# together with the error, and ends the run.
for k, v in all_chem_raw_wiki_data.items():
    extracted_data[k] = {}
    try:
        expand(v, k)
    except Exception as e:
        extracted_data.pop(k)
        error_extract.add(k)
        print(v)
        print(e)
        break

# Keys that were seen but are neither ignored, expanded nor saved.
unknown_keys = all_keys.difference(known_keys)
print(error_extract)
print(unknown_keys)

extracted_data

{'data': {'verifiedrevid': '477004453', 'imagefilel1': 'carbon-monoxide-3d-balls.png', 'imagenamel1': 'ball-and-stick model of carbon monoxide', 'imagefiler1': 'carbon-monoxide-3d-vdw.png', 'imagenamer1': 'spacefill model of carbon monoxide', 'imagefile2': 'carbon monoxide 2d.svg', 'imagesize2': '170px', 'imagename2': 'model of carbon monoxide', 'pin': 'carbon monoxide', 'othernames': 'carbon monooxide<br />carbonous oxide<br />carbon(ii) oxide<br />carbonyl<br/>flue gas<br/>monoxide', 'chembox identifiers': [{'inchi': '1/co/c1-2', 'inchikey': 'ugfairiumavxcw-uhfffaoyat', 'casno': '630-08-0', 'casno_ref': [['correct', 'cas']], 'pubchem': '281', 'chemspiderid': '275', 'chemspiderid_ref': [['correct', 'chemspider']], 'unii_ref': [['correct', 'fda']], 'unii': '7u1ee4v452', 'einecs': '211-128-3', 'unnumber': '1016', 'kegg_ref': [['correct', 'kegg']], 'kegg': 'd09706', 'meshname': 'carbon+monoxide', 'chebi_ref': [['correct', 'ebi']], 'chebi': '17245', 'chembl': '1231840', 'rtecs': 'fg350000

{897: {'title': 'Arsenic'},
 1365: {'othernames': 'azane (only intended for use in naming derivatives of ammonia)<br /> hydrogen nitride<br />\nr-717 (refrigerant)<br />\nr717 (refrigerant alternative spelling)',
  'title': 'Ammonia'},
 1778: {'title': 'Acetylene'},
 1800: {'title': 'Adenosine triphosphate'},
 2268: {'iupacname': "(5''r'')-[(1''s'')-1,2-dihydroxyethyl]-3,4-dihydroxyfuran-2(5''h'')-one",
  'othernames': 'vitamin c',
  'title': 'Chemistry of ascorbic acid'},
 2752: {'iupacname': 'methyl <small>l</small>-α-aspartyl-<small>l</small>-phenylalaninate',
  'othernames': "''n''-(<small>l</small>-α-aspartyl)-<small>l</small>-phenylalanine,<br />1-methyl ester",
  'title': 'Aspartame'},
 3252: {'iupacname': "[(6-''o''-β-<small>d</small>-glucopyranosyl-β-<small>d</small>-glucopyranosyl)oxy](phenyl)acetonitrile",
  'title': 'Amygdalin'},
 3370: {'iupacname': 'boron nitride', 'title': 'Boron nitride'},
 3378: {'title': 'Beryllium'},
 3380: {'iupacname': 'lithium',
  'title': 'Wikipe