In [1]:
%matplotlib inline
import json
import pandas as pd

In [2]:
# Load the scraped pages. JSON text is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's locale encoding (which is
# not UTF-8 on every system and can silently garble non-ASCII text).
with open("mike/output2.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [3]:
# Count how often each whitespace-stripped header fragment occurs across all pages.
# Each table row is a (header fragments, values) pair; only the fragments are counted here.
header_counts = {}
for page in data:
    # A page may have no 'table_data' entry (or a null one); treat that as "no rows"
    # rather than failing with KeyError/TypeError.
    for ks, _values in page.get('table_data') or ():
        for key_ in ks:
            key = key_.strip()
            header_counts[key] = header_counts.get(key, 0) + 1
# Keep the plain dict and the Series under different names so `frequencies`
# is always a pandas Series for the cells that follow.
frequencies = pd.Series(data=header_counts)
frequencies.head()

Type            59
Industry        37
Founded         49
Headquarters    52
Key people      24
dtype: int64

In [4]:
# One index entry per distinct header string, so the length is the distinct count.
n_distinct_headers = len(frequencies)
print("Number of distinct table headers:", n_distinct_headers)

Number of distinct table headers: 1072


In [5]:
# Show only headers that occur more than five times, most frequent first.
mask = frequencies.gt(5)
frequencies.loc[mask].sort_values(ascending=False)

                         318
Website                  112
Type                      59
Headquarters              52
Founded                   49
Industry                  37
Type of site              32
Operating system          31
Alexa                     28
rank                      28
Owner                     28
Registration              27
Launched                  27
Current status            27
Available in              25
Developer(s)              24
Key people                24
License                   24
Stable release            23
Revenue                   21
Year                      21
Number of employees       20
Initial release           18
Founder(s)                16
Employees                 16
Traded as                 15
Products                  15
Country                   14
Commercial                14
Area served               13
Pop.                      13
ISIN                      13
Net income                11
Historical population     10
Created by    

# Analysis

Listed above are the most frequent table row headers for the scraped pages. By inspecting each one, I am going to assign it to one of three categories:

* Relationship - The key represents a relationship between the parent node and another node. The value(s) corresponding to the key represent nodes that are in a relationship with the parent node.
* Properties - The key represents a property of the parent node.
* Unknown - What the key represents is unknown without further analysis or context.

The data structures below realize these categories. The "relationships" dictionary defines key-value pairs, where the keys are table headers and each value gives the type of relationship the header defines and indicates whether the relationship edge is "incoming" or "outgoing" relative to the parent. For example, "Founder" and "Founder(s)" are distinct table keys that might define the same relationship "Founded", with the edge type "incoming".

For example, if "Amazon" had the pair "Founded: (Jeff Bezos, incoming)", it would mean that there are two nodes, "Amazon" and "Jeff Bezos", with a Founded-type edge starting at the node "Jeff Bezos" and terminating at the node "Amazon".

In [6]:
# Maps a table row header to a (relationship type, edge direction) pair.
# Direction is relative to the page ("parent") node: 'incoming' edges start at the
# value node and end at the parent, 'outgoing' edges start at the parent.
# Several headers may map to the same relationship type (e.g. 'Founder' and 'Founder(s)').
# NOTE(review): 'Developed by' and 'Created by' are marked 'incoming', which reads as
# "<developer> Developed by <page>" under the convention above - confirm whether the
# label or the direction is the intended one before building edges from these.
relationships = {
    'Founder(s)': ('Founded', 'incoming'),
    'Owner': ('Owns', 'incoming'),
    'Developer(s)': ('Developed by', 'incoming'),
    'Key people': ('Is key to', 'incoming'),
    'Products': ('Produces', 'outgoing'),
    'Created by': ('Created by', 'incoming'),
    'Founder': ('Founded', 'incoming'),
    'Parent': ('Owns', 'incoming'),  # the parent company owns this page's entity
    'Subsidiaries': ('Owns', 'outgoing'),  # this page's entity owns each subsidiary
    'Original author(s)': ('Authored', 'incoming'),
}

# Table row headers treated as plain properties of the page's node (no edge is created).
# Entries are canonical header strings, i.e. header fragments joined by single spaces
# (so the fragments 'Alexa' and 'rank' appear here as 'Alexa rank').
# Each header is listed exactly once; 'Available in' was previously duplicated.
properties = [
    'Founded',
    'Website',
    'Type',
    'Headquarters',
    'Industry',
    'Operating system',
    'Alexa rank',
    'Launched',
    'Current status',
    'Available in',
    'License',
    'Revenue',
    'Number of employees',
    'Initial release',
    'Employees',
    'Type of business',
    'Name',
    'Total assets',
    'Operating income',
    'Advertising',
    'Version',
    'Area served',
    'Release date',
    'Title',
    'Location',
    'Written in',
]

In [7]:
import re

# Titles of the form "<something>:<something>" are treated as internal wiki pages.
internal_wiki_pattern = re.compile(r'.+:.+')

# Collect (title, table_data) for every scraped entry that has a non-empty title
# and does not look like an internal wiki page. table_data is None when absent.
table_datas = []
for entry in data:
    page_title = entry.get('title')
    if not page_title or internal_wiki_pattern.match(page_title):
        continue
    table_datas.append((page_title, entry.get('table_data')))
print("Number of entities:", len(table_datas))

Number of entities: 793


In [14]:
%%time
import re
import itertools
import string

# Cleaned result, filled in by the loop at the end of this cell.
output_json = {}

# Matches a run of whitespace and/or ASCII punctuation characters.
# string.punctuation contains characters that are significant inside a regex
# character class (backslash, ']', '^', '-'), so it must be escaped before being
# interpolated; unescaped, its backslash is consumed as an escape for ']' and the
# backslash character itself is silently left out of the set.
invalid_value_pattern = re.compile(r'[\s{}]+'.format(re.escape(string.punctuation)))

def canonical_key(keys_):
    '''
    Gets the "canonical key" from the unstructured list of keys.

    The key fragments are joined with single spaces, every run of whitespace is
    collapsed to one space, and leading/trailing whitespace is removed.

    Args:
        keys_: iterable of header fragment strings for one table row.

    Returns:
        The canonical header string; '' when there are no fragments or all of
        them are empty/whitespace.
    '''
    # str.split() with no argument splits on any whitespace run and discards empty
    # pieces, so empty or whitespace-only fragments (e.g. ['', 'Type']) cannot leave
    # stray leading/trailing spaces that would stop the key matching a known header.
    return ' '.join(' '.join(keys_).split())

def canonical_values(values_):
    '''
    Clean up list of values by:
        * Flattening lists (one level)
        * Removing spurious values, i.e. empty values and those that consist of
          whitespace and punctuation alone
        * Normalizing whitespace (every whitespace run becomes a single space)
        * Stripping leading and trailing whitespace

    Args:
        values_: iterable whose items are strings or lists of strings.

    Returns:
        A new list of cleaned strings, in their original order.
    '''
    flattened = []
    for v in values_:
        if isinstance(v, list):
            flattened.extend(v)
        else:
            flattened.append(v)

    cleaned = []
    for v in flattened:
        # fullmatch, not match: match() is only anchored at the start, so it would
        # also discard real values that merely *begin* with whitespace or
        # punctuation (e.g. ' Seattle' or '$1 billion').
        if not v or invalid_value_pattern.fullmatch(v):
            continue
        # After collapsing whitespace runs, at most one space remains at each end.
        cleaned.append(re.sub(r'\s+', ' ', v).strip())
    return cleaned

# Build {title: {canonical header: [distinct cleaned values]}} for every entity,
# keeping only headers that are known relationships or properties.
for title, table_data in table_datas:
    entity = output_json.setdefault(title, dict())
    # table_data is None for entries scraped without a table; treat that as no rows
    # instead of failing with "TypeError: 'NoneType' object is not iterable".
    for keys_, values_ in table_data or ():
        key = canonical_key(keys_)
        if not key:
            continue
        if key not in relationships and key not in properties:
            continue
        merged = entity.setdefault(key, [])
        # De-duplicate while keeping first-seen order. Round-tripping through set()
        # yields an arbitrary order that can differ between runs, which makes the
        # exported JSON needlessly unstable.
        for value in canonical_values(values_):
            if value not in merged:
                merged.append(value)

Wall time: 46 ms


In [15]:
import json

# Persist the cleaned mapping as pretty-printed JSON with keys in sorted order.
with open("cleaned_scraped_data.json", "w") as out_file:
    json.dump(output_json, out_file, sort_keys=True, indent=4)