In [1]:
import pandas as pd
from jellyfish import jaro_winkler

In [23]:
# Load the tab-separated entity list and cast the `count` column to integers
# in the same chain, then preview the first rows.
entity_df = (
    pd.read_csv('entity_list_with_labels.tsv', sep='\t')
    .astype({'count': 'int'})
)
entity_df.head()

Unnamed: 0,term,label,count
0,LinkedIn Corporation Initial,ORG,3
1,May 2010,DATE,104
2,9 years ago,DATE,13
3,2010-05,DATE,3
4,Platform Web Browser,ORG,3


In [26]:
# Inspect the upper-tail quantiles of the `count` column.
entity_df['count'].quantile(q=[0.8, 0.9, 0.95, 0.98, 0.99])

0.80     2.0
0.90     4.0
0.95     8.0
0.98    19.0
0.99    39.0
Name: count, dtype: float64

In [27]:
# Boolean mask selecting rows whose count is at least 39; the second line
# reports how many rows pass the cutoff.
count_mask = entity_df['count'].ge(39)
entity_df.loc[count_mask].shape[0]

1927

In [28]:
# Show a random sample of the rows selected by `count_mask`. The seed is fixed
# so that re-running the notebook displays the same rows every time.
entity_df.loc[count_mask].sample(n=10, random_state=42)

Unnamed: 0,term,label,count
1732,156,CARDINAL,111
4742,"March 13, 2017",DATE,68
5318,0362-4331,DATE,103
47,7,CARDINAL,1314
2815,February 2010,DATE,66
4310,292,CARDINAL,39
2376,Visio Corporation,ORG,147
982,March 2013,DATE,71
4537,8.0,CARDINAL,58
2021,The Wall Street Journal,ORG,418


In [29]:
%%time
product_df = None
excluded_labels = ['DATE']
for label in entity_df.label.value_counts().index:
    print(label)
    if label in excluded_labels:
        continue
    mask = (entity_df.label == label) & count_mask
    print(len(entity_df[mask])**2)
    index = pd.MultiIndex.from_product([entity_df[mask].term, entity_df[mask].term], names=["term1", "term2"])
    tmp_df = pd.DataFrame(index=index).reset_index()
    tmp_df['label'] = label
    if product_df is not None:
        product_df = product_df.append(tmp_df, ignore_index=True)
    else:
        product_df = tmp_df
product_df = product_df[product_df.term1 < product_df.term2]

ORG
148996
PERSON
47961
DATE
CARDINAL
166464
GPE
63001
WORK_OF_ART
25
PRODUCT
1936
FAC
25
MONEY
1
NORP
13456
LOC
441
PERCENT
484
QUANTITY
0
EVENT
25
LAW
0
TIME
0
LANGUAGE
441
ORDINAL
729
Wall time: 912 ms


In [30]:
%%time
product_df['jaro_winkler'] = product_df.apply(lambda row: jaro_winkler(row[0], row[1]), axis=1)

Wall time: 6.81 s


In [31]:
# Preview the first five scored pairs.
product_df.head(n=5)

Unnamed: 0,term1,term2,label,jaro_winkler
8,iOS,iPhone,ORG,0.5
9,iOS,iPod,ORG,0.527778
12,iOS,the App Store,ORG,0.0
16,iOS,the Wayback Machine,ORG,0.0
45,iOS,the World Wide Web,ORG,0.0


In [32]:
# Keep only the pairs whose jaro_winkler score is strictly above 0.95.
product_df.loc[product_df['jaro_winkler'].gt(0.95)]

Unnamed: 0,term1,term2,label,jaro_winkler
14902,BusinessWeek,Businessweek,ORG,0.966667
24934,HTTP,HTTPS,ORG,0.96
30066,The White House,the White House,ORG,0.955556
97901,Time,Times,ORG,0.96
119688,Wikimedia,Wikipedia,ORG,0.955556
122209,Microsoft Corp,Microsoft Corp.,ORG,0.986667
144418,Apple Inc .,Apple Inc.,ORG,0.981818
165346,William,Williams,PERSON,0.975
171998,Steve,Steven,PERSON,0.966667
191442,Julia,Julian,PERSON,0.966667


In [41]:
# Sort the entities by descending count (casting `count` to int first), then
# keep the rows that pass `count_mask` and whose label is not one of the
# excluded labels below.
entity_df = (
    entity_df
    .astype({'count': 'int'})
    .sort_values(by='count', ascending=False)
)
mask = ~entity_df.label.isin(['ORDINAL', 'CARDINAL', 'DATE', 'NORP'])
# `count_mask` was built before the sort, so its index order differs from the
# sorted frame. Reindex it explicitly rather than relying on pandas to realign
# a mismatched boolean key while indexing.
high_quality_entities = entity_df.loc[mask & count_mask.reindex(entity_df.index)]
print(high_quality_entities.shape)
high_quality_entities.head(20)

(980, 3)


  after removing the cwd from sys.path.


Unnamed: 0,term,label,count
63,Microsoft,ORG,9323
961,ISBN,ORG,3407
1427,China,GPE,2761
219,Facebook,ORG,2523
19,Apple,ORG,2521
1447,English,LANGUAGE,2363
220,Google,ORG,1775
1513,Twitter,ORG,1710
303,London,GPE,1584
238,California,GPE,1562
