In [1]:
import spacy
import en_core_web_lg
from spacy import displacy
import collections
import pandas as pd

# Load the large English spaCy pipeline once; it is reused by the NER pass below.
nlp = en_core_web_lg.load()
# Sanity check: show which pipeline class was loaded.
print(type(nlp))

<class 'spacy.lang.en.English'>


In [2]:
import json

# Read the scraper output; the whole file is parsed as a single JSON document.
with open('../scraping/mike/output2.json', 'r') as json_file:
    docs = json.load(json_file)

In [3]:
from bs4 import BeautifulSoup
from bs4.element import Comment

def tag_visible(element):
    """Return True when a text node should count as visible page text.

    A node is rejected when its parent tag is one of the non-rendered
    containers listed below, or when the node itself is an HTML comment.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    return (element.parent.name not in hidden_parents
            and not isinstance(element, Comment))

def text_from_html(body):
    """Extract the visible text of the element with id 'bodyContent'.

    The HTML in `body` is parsed with the built-in 'html.parser'. Every text
    node under the 'bodyContent' element that passes `tag_visible` has its
    non-breaking spaces replaced by plain spaces and is stripped; the pieces
    are then joined with single spaces.
    """
    content = BeautifulSoup(body, 'html.parser').find(id='bodyContent')
    pieces = []
    for node in content.findAll(text=True):
        if tag_visible(node):
            pieces.append(node.replace('\xa0', ' ').strip())
    return u" ".join(pieces)

In [4]:
%%time
# Count how often each (entity text, entity label) pair occurs across all pages.
# The visible text of every scraped page is extracted up front, then streamed
# through the spaCy pipeline in batches of 150.
page_texts = [text_from_html(page['html']) for page in docs]
entities = collections.Counter()
for doc in nlp.pipe(page_texts, batch_size=150):
    entities.update((ent.text, ent.label_) for ent in doc.ents)

Wall time: 6h 34min 6s


In [5]:
# Number of distinct (entity text, label) keys in the counter.
len(entities)

190056

In [6]:
# Flatten the counter into a table with one row per (term, label) pair.
elist = list(entities.items())
entity_rows = [(term, label, count) for (term, label), count in elist]
entity_df = pd.DataFrame(entity_rows, columns=['term', 'label', 'count'])
entity_df.head()

Unnamed: 0,term,label,count
0,LinkedIn Corporation Initial,ORG,3
1,May 2010,DATE,104
2,9 years ago,DATE,13
3,2010-05,DATE,3
4,Platform Web Browser,ORG,3


In [7]:
# entity_df.to_csv('entity_list_with_labels.tsv', sep='\t', index=False)

In [8]:
# Reload the cached entity table instead of re-running the NER pass.
# keep_default_na=False stops pandas from converting entity strings such as
# "NA", "null" or "nan" into missing values while parsing the TSV.
entity_df = pd.read_csv('entity_list_with_labels.tsv', sep='\t',
                        keep_default_na=False)

In [9]:
# Make sure the occurrence counts are stored as integers after the reload.
entity_df['count'] = entity_df['count'].astype(dtype='int')

In [10]:
# Character length of every term; terms are cast to str first so that
# non-string values are measured by their string form.
term_text = entity_df['term'].astype('str')
entity_df['len'] = term_text.str.len()
entity_df.head()

Unnamed: 0,term,label,count,len
0,LinkedIn Corporation Initial,ORG,3,28
1,May 2010,DATE,104,8
2,9 years ago,DATE,13,11
3,2010-05,DATE,3,7
4,Platform Web Browser,ORG,3,20


In [12]:
# Number of table rows (distinct terms) per entity label, most frequent first.
entity_df.label.value_counts()

ORG            50846
PERSON         42851
DATE           26202
CARDINAL       15170
GPE            11735
WORK_OF_ART    10940
PRODUCT         5487
FAC             4709
MONEY           4256
NORP            3512
LOC             3262
PERCENT         3065
QUANTITY        2767
EVENT           2039
LAW             1861
TIME             762
LANGUAGE         323
ORDINAL          269
Name: label, dtype: int64

In [13]:
# Upper quantiles of the per-entity occurrence count.
entity_df['count'].quantile([0.8, 0.9, 0.95, 0.99])

0.80     2.0
0.90     4.0
0.95     8.0
0.99    39.0
Name: count, dtype: float64

In [17]:
# Number of entity rows that occur more than twice.
len(entity_df[entity_df['count'] > 2])

33856

In [18]:
# Restrict the length analysis to entities seen more than twice.
count_mask = entity_df['count'] > 2
N = 10
# Split points 1/N .. (N-1)/N of the term-length distribution.
split_points = [(1/N)*i for i in range(1, N)]
quantiles = entity_df.loc[count_mask, 'len'].quantile(split_points)
quantiles

0.1     4.0
0.2     5.0
0.3     7.0
0.4     9.0
0.5    10.0
0.6    12.0
0.7    14.0
0.8    16.0
0.9    18.0
Name: len, dtype: float64

In [19]:
# Number of frequent entities (count_mask) no longer than the first length quantile.
len(entity_df[(entity_df.len <= quantiles.iloc[0]) & count_mask])

5230

In [20]:
def jaccard(s1, s2):
    """Jaccard similarity of the element sets of two iterables.

    Both arguments are converted with ``set()``, so for strings this compares
    the sets of distinct characters, ignoring order and repetition.

    Returns:
        ``len(intersection) / len(union)`` as a float between 0.0 and 1.0.
        Two empty inputs are treated as identical and give 1.0; previously
        this case raised ZeroDivisionError.
    """
    s1, s2 = set(s1), set(s2)
    union = s1 | s2
    if not union:
        # Both inputs are empty: define the similarity as 1.0.
        return 1.0
    return len(s1 & s2) / len(union)
# Example: Jaccard similarity between the first-column values of the first two rows.
jaccard(entity_df.iloc[0, 0], entity_df.iloc[1, 0])

0.1

In [65]:
%%time
# Build the candidate pairs for fuzzy matching. Two terms are paired only when
# they share an entity label, both pass count_mask, and both have a length
# within +/-20% of the same length quantile. This "blocking" keeps the cross
# product far smaller than pairing every term with every other term.
#
# The per-window frames are collected in a list and concatenated once at the
# end: DataFrame.append re-copied the accumulated frame on every call and is
# no longer available in pandas 2.x.
excluded_labels = ['DATE', 'CARDINAL']
pair_frames = []
for label in entity_df.label.value_counts().index:
    print(label)
    if label in excluded_labels:
        continue
    for length in quantiles:
        mask = ((entity_df.len >= length * (1 - 0.2)) &
                (entity_df.len <= length * (1 + 0.2)) &
                count_mask &
                (entity_df.label == label))
        # Select the matching terms once and reuse them for both sides.
        terms = entity_df.loc[mask, 'term']
        # Size of the cross product about to be generated.
        print(len(terms)**2)
        index = pd.MultiIndex.from_product([terms, terms], names=["term1", "term2"])
        pairs = pd.DataFrame(index=index).reset_index()
        pairs['label'] = label
        pair_frames.append(pairs)

# NOTE: windows around neighbouring quantiles overlap, so the same pair can
# appear more than once in the result.
if pair_frames:
    product_df = pd.concat(pair_frames, ignore_index=True)
else:
    # Nothing to pair (e.g. every label excluded): keep the expected columns.
    product_df = pd.DataFrame(columns=['term1', 'term2', 'label'])

ORG
217156
1535121
1507984
1340964
3013696
2134521
1612900
2617924
1926544
PERSON
249001
3880900
3794704
1790244
4305625
3147076
1825201
1428025
391876
DATE
CARDINAL
GPE
16641
518400
1115136
592900
994009
286225
76729
66049
26896
WORK_OF_ART
529
4624
4624
7569
20736
13225
8281
13689
11236
PRODUCT
5329
74529
70756
51984
87616
32761
9409
14641
8100
FAC
16
441
1225
2401
7569
11025
13225
20736
13456
MONEY
361
5041
1681
2401
33124
40401
13456
1936
361
NORP
900
49729
148996
108900
198025
74529
32041
23409
6889
LOC
400
5929
7744
5929
18225
23409
23409
30276
16129
PERCENT
11025
145924
961
6241
7921
3600
289
361
169
QUANTITY
144
5041
5041
1089
1600
289
64
36
25
EVENT
9
121
289
1156
3136
2704
2304
2916
3025
LAW
4
36
625
1600
2704
2209
1089
2025
1296
TIME
16
121
625
1296
2116
1024
289
256
81
LANGUAGE
1
1089
4225
1369
1681
225
9
4
0
ORDINAL
3600
6889
196
49
49
1
0
0
0
Wall time: 37.6 s


In [66]:
# Size and first rows of the candidate-pair table.
print(product_df.shape)
product_df.head()

(42038088, 3)


Unnamed: 0,term1,term2,label
0,iPod,iPod,ORG
1,iPod,CNBC,ORG
2,iPod,GFDL,ORG
3,iPod,CNET,ORG
4,iPod,HTML,ORG


In [67]:
%%time
# Cast both term columns to str so they can be compared and scored as strings.
for term_column in ('term1', 'term2'):
    product_df[term_column] = product_df[term_column].astype(str)

Wall time: 7.89 s


In [68]:
%%time
# Keep only rows whose first term sorts strictly before the second. This drops
# self-pairs and keeps one orientation of every (a, b) / (b, a) pair.
is_ordered_pair = product_df['term1'].lt(product_df['term2'])
product_df = product_df[is_ordered_pair]

Wall time: 7.74 s


In [125]:
%%time
import jellyfish
# Score only a sample of the candidate pairs: small enough for fast iteration.
# The sample is seeded so the scores and the blocks built from them are
# reproducible, and capped at the table size so a small table does not raise.
SAMPLE_SIZE = int(1e5)
SAMPLE_SEED = 0
sampled_pdf = product_df.sample(min(SAMPLE_SIZE, len(product_df)),
                                random_state=SAMPLE_SEED)

# Newer jellyfish releases expose the metric as `jaro_winkler_similarity`;
# fall back to the older `jaro_winkler` name when that is all there is.
if hasattr(jellyfish, 'jaro_winkler_similarity'):
    _jaro_winkler = jellyfish.jaro_winkler_similarity
else:
    _jaro_winkler = jellyfish.jaro_winkler

# Iterate over the two term columns by name, not row-wise apply with
# positional row[0]/row[1] lookups (deprecated on label-indexed rows, and slower).
sampled_pdf['jaro_winkler'] = [
    _jaro_winkler(term1, term2)
    for term1, term2 in zip(sampled_pdf['term1'], sampled_pdf['term2'])
]

Wall time: 5.12 s


In [126]:
# Size and first rows of the scored sample.
print(sampled_pdf.shape)
sampled_pdf.head()

(100000, 4)


Unnamed: 0,term1,term2,label,jaro_winkler
38052550,Pashto,Terpstra,GPE,0.527778
2527786,CNN.com,Nature,ORG,0.436508
40101343,San Antonio,Thessaloniki,GPE,0.482323
30807573,Whitney Wolfe,lifecasting,PERSON,0.473776
36538703,Bill Clinton 's,Margaret Thatcher,PERSON,0.250327


In [80]:
# sampled_pdf.to_csv('jaro_winkler_blocked_by_labels.tsv', sep='\t', index=False)

In [133]:
%%time
import numpy as np
# Group high-similarity pairs into blocks keyed by a seed term.
#
# Each iteration draws one not-yet-excluded row, marks it excluded, and collects
# every other not-yet-excluded row that has the same label, contains the seed
# row's term1 in either column, and scores above the threshold. Those rows
# become the block for that term and are then excluded as well.
blocks = {}
jaro_winkler_threshold = 0.9
# Reset the index BEFORE deriving any boolean mask from the frame. Building the
# mask first leaves it carrying the pre-reset index labels, so it no longer
# aligns with the frame and boolean indexing fails on a fresh run.
sampled_pdf = sampled_pdf.reset_index(drop=True)
# Rows at or below the threshold are excluded from the start; only the
# high-similarity rows are ever drawn or added to a block.
excluded_mask = sampled_pdf.jaro_winkler <= jaro_winkler_threshold
N = len(excluded_mask)
# Seeded generator so the order in which rows are drawn is reproducible.
block_rng = np.random.RandomState(0)
cnt = 0
while np.sum(excluded_mask) < N:
    row = sampled_pdf[~excluded_mask].sample(1, random_state=block_rng)
    excluded_mask[row.index] = True
    term = row.iloc[0, 0]
    label = row.iloc[0, 2]
    mask = ((~excluded_mask)
            & (sampled_pdf.label == label)
            & ((sampled_pdf.term1 == term) | (sampled_pdf.term2 == term))
            & (sampled_pdf.jaro_winkler > jaro_winkler_threshold))
    # NOTE(review): `mask` only holds rows that are not excluded yet, while every
    # stored block mask is OR-ed into excluded_mask as soon as it is created, so
    # this overlap test cannot succeed as written and the `else` branch always
    # runs. Merging would need an overlap test on terms, not on row masks.
    for canonical_term, block_mask in list(blocks.items()):
        if np.sum(block_mask & mask) > 0:
            block_mask |= mask
            blocks[canonical_term] = block_mask
            excluded_mask |= block_mask
            current_block = block_mask
            break
    else:
        blocks[term] = mask
        excluded_mask |= mask
        current_block = mask
    # Progress report every 10th iteration.
    if cnt % 10 == 0:
        cnt = 0
        print("Remaining:", N - np.sum(excluded_mask))
        print(term)
        # Use the block touched in this iteration; blocks[term] would raise
        # KeyError if the rows had been merged under a different key.
        if sampled_pdf[current_block].shape[0] > 0:
            print(sampled_pdf[current_block].sample(1, random_state=block_rng))
    cnt += 1

Remaining: 80
14 percent
Remaining: 70
Andreas
Remaining: 58
about 30%
Remaining: 47
51 percent
Remaining: 34
$280 million
              term1         term2  label  jaro_winkler
39289  $280 million  $850 million  MONEY          0.95
Remaining: 24
Brandt
Remaining: 14
Michelle
Remaining: 4
$5 Million
Wall time: 7.75 s


In [140]:
# Print every block's seed term, followed by up to 10 randomly chosen member
# rows when the block is not empty.
for term, mask in blocks.items():
    print(term)
    members = sampled_pdf[mask]
    N = min(len(members), 10)
    if N > 0:
        print(members.sample(N))

14 percent
$2.7 billion
The United Kingdom
$4.6 billion
$150 million
13.0%
up to 85%
Charles Dow
Chabert
Lyle
Andreas
$100 million
$2.4 billion
Dignan
$103 million
Tieto
$5 million
             term1       term2  label  jaro_winkler
78850  $14 million  $5 million  MONEY      0.915455
Thomas F.
           term1      term2   label  jaro_winkler
88590  Thomas F.  Thomas X.  PERSON      0.955556
Kurt
Peter Gorne
about 30%
$140 million
0.03%
Google Blog
$5 billion
            term1        term2  label  jaro_winkler
29246  $5 billion  $53 million  MONEY      0.915455
$1.9 billion
$7 Million
Chapter 12
25 per cent
about 1%
51 percent
David B.
$1.3 billion
$26.2 billion
               term1          term2  label  jaro_winkler
51958  $20.7 billion  $26.2 billion  MONEY      0.907692
Encyclopaedia Iranica
0.3%
6 percent
$20 million
             term1         term2  label  jaro_winkler
10606  $20 million  $300 million  MONEY      0.922727
1.3%
1 MB RAM
$280 million
              term1         ter