In [2]:
import spacy
import en_core_web_lg
from spacy import displacy
import collections
import pandas as pd

# Load the en_core_web_lg spaCy pipeline once and print its type as a quick check.
nlp = en_core_web_lg.load()
print(type(nlp))

<class 'spacy.lang.en.English'>


In [2]:
import json

# Read the scraper output; each record is later accessed through its 'html' key.
with open('../scraping/mike/output2.json', 'r') as f:
    docs = json.load(f)

In [3]:
from bs4 import BeautifulSoup
from bs4.element import Comment

def tag_visible(element):
    """Return True when a text node should count as visible page text.

    A node is rejected if its parent tag is one of the container names
    listed below, or if the node itself is an HTML comment.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    parent_is_hidden = element.parent.name in hidden_parents
    # `or` short-circuits, so the Comment check only runs for nodes whose
    # parent is not in the hidden list (same order as the two original ifs).
    return not (parent_is_hidden or isinstance(element, Comment))

def text_from_html(body):
    """Extract the visible text inside a page's ``id='bodyContent'`` element.

    Parameters
    ----------
    body
        HTML markup of one page, passed straight to ``BeautifulSoup``.

    Returns
    -------
    str
        The text nodes accepted by ``tag_visible``, each with non-breaking
        spaces replaced by plain spaces and surrounding whitespace stripped,
        joined by single spaces. An empty string is returned when the page
        has no element with ``id='bodyContent'``.
    """
    soup = BeautifulSoup(body, 'html.parser')
    content = soup.find(id='bodyContent')
    if content is None:
        # Without this guard a page lacking the container raises
        # AttributeError on the next line and aborts the whole run.
        return u""
    # find_all(string=True) is the current spelling of findAll(text=True).
    texts = content.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.replace('\xa0', ' ').strip() for t in visible_texts)

In [4]:
%%time
# Count how often each entity surface string (ent.text) occurs across all pages.
entities = collections.Counter()
# Generator: pages are converted to text lazily as nlp.pipe consumes them,
# instead of materialising every page's text in a list up front.
page_texts = (text_from_html(page['html']) for page in docs)
for parsed in nlp.pipe(page_texts, batch_size=50):
    entities.update(ent.text for ent in parsed.ents)

Wall time: 19min 39s


In [5]:
# Number of distinct entity strings collected in the Counter.
len(entities)

180482

In [6]:
# Tabulate the counter as one row per (term, count) pair.
elist = list(entities.items())
entity_df = pd.DataFrame(elist, columns=['term', 'count'])
entity_df.head()

Unnamed: 0,term,count
0,LinkedIn Corporation Initial,3
1,May 2010,104
2,9 years ago,13
3,2010-05,3
4,Platform Web Browser,4


In [8]:
# Uncomment to (re)write 'entity_list.tsv'; the read_csv cell below loads that file.
# entity_df.to_csv('entity_list.tsv', sep='\t', index=False)

In [3]:
# Reload the cached entity table. keep_default_na=False stops pandas from
# converting literal entity strings such as "NA", "null" or "nan" into
# missing values, so every term comes back as the string that was written.
entity_df = pd.read_csv('entity_list.tsv', sep='\t', keep_default_na=False)

In [4]:
# Make sure the count column has an integer dtype before thresholding on it.
entity_df['count'] = entity_df['count'].astype(dtype='int')

In [5]:
# Character length of each term; terms are cast to str first so len() applies to every row.
term_lengths = entity_df['term'].astype('str').map(len)
entity_df['len'] = term_lengths
entity_df.head()

Unnamed: 0,term,count,len
0,LinkedIn Corporation Initial,3,28
1,May 2010,104,8
2,9 years ago,13,11
3,2010-05,3,7
4,Platform Web Browser,4,20


In [6]:
# Upper quantiles of the per-entity occurrence counts.
entity_df['count'].quantile([0.8, 0.9, 0.95, 0.99])

0.80     2.0
0.90     5.0
0.95     9.0
0.99    42.0
Name: count, dtype: float64

In [7]:
# Number of entities that occur more than 9 times.
len(entity_df[entity_df['count'] > 9])

8379

In [8]:
# Restrict to entities seen more than 9 times, then take the N-1 evenly spaced
# quantiles (deciles for N = 10) of their term lengths.
count_mask = entity_df['count'] > 9
N = 10
decile_points = [(1/N)*i for i in range(1, N)]
quantiles = entity_df.loc[count_mask, 'len'].quantile(decile_points)
quantiles

0.1     3.0
0.2     4.0
0.3     6.0
0.4     7.0
0.5     9.0
0.6    11.0
0.7    13.0
0.8    14.0
0.9    16.0
Name: len, dtype: float64

In [9]:
# Number of frequent entities (count_mask) whose length is at most the first length quantile.
len(entity_df[(entity_df.len <= quantiles.iloc[0]) & count_mask])

1186

In [10]:
def jaccard(s1, s2):
    """Jaccard similarity between the sets of items in ``s1`` and ``s2``.

    Both arguments are converted with ``set()``, so for strings the
    comparison is over distinct characters; order and repetition are ignored.

    Returns
    -------
    float
        ``len(intersection) / len(union)``. When both inputs are empty the
        result is ``1.0`` (two empty sets are identical); the plain formula
        would divide by zero in that case.
    """
    s1, s2 = set(s1), set(s2)
    union = s1 | s2
    if not union:
        return 1.0
    return len(s1 & s2) / len(union)
# Example: similarity of the column-0 values in the first two rows.
jaccard(entity_df.iloc[0, 0], entity_df.iloc[1, 0])

0.1

In [11]:
%%time
# Build candidate term pairs. For each length quantile, take the frequent
# terms whose length lies within +/-20% of that quantile and form their full
# Cartesian product (term1, term2).
# NOTE(review): neighbouring length windows can overlap, in which case the
# same pair is emitted by more than one window -- confirm whether those
# duplicates are intended before de-duplicating.
LENGTH_TOLERANCE = 0.2
pair_frames = []
for length in quantiles:
    mask = ((entity_df.len >= length * (1 - LENGTH_TOLERANCE)) &
            (entity_df.len <= length * (1 + LENGTH_TOLERANCE)) &
            count_mask)
    terms = entity_df.loc[mask, 'term']  # slice once, reuse for both sides
    print(len(terms)**2)
    index = pd.MultiIndex.from_product([terms, terms], names=["term1", "term2"])
    pair_frames.append(pd.DataFrame(index=index).reset_index())
# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the accumulated frame on every iteration.
product_df = pd.concat(pair_frames, ignore_index=True) if pair_frames else None

902500
611524
2920681
2660161
1742400
4734976
4435236
5031049
5148361
Wall time: 13.7 s


In [12]:
# Show the size of the pair table and its first rows.
print(product_df.shape)
product_df.head()

(28186888, 2)


Unnamed: 0,term1,term2
0,iOS,iOS
1,iOS,two
2,iOS,one
3,iOS,RSS
4,iOS,2.0


In [13]:
# Cast both term columns to str so the comparison and set() calls that follow
# always operate on strings.
for col in ('term1', 'term2'):
    product_df[col] = product_df[col].astype(str)

In [17]:
%%time
# Keep only rows with term1 strictly before term2 in string order: this drops
# self-pairs and one of the two orderings of every remaining pair.
product_df = product_df.loc[product_df['term1'] < product_df['term2']]

Wall time: 7.69 s


In [18]:
# Number of pairs left after the term1 < term2 filter.
len(product_df)

14085851

In [19]:
%%time
# Score every pair. Iterating the two columns in lockstep avoids building a
# Series per row (DataFrame.apply with axis=1) and the positional row[0] /
# row[1] lookups, while calling jaccard on exactly the same values.
product_df['jaccard'] = [
    jaccard(term1, term2)
    for term1, term2 in zip(product_df['term1'], product_df['term2'])
]

Wall time: 19min 23s


In [20]:
# Preview the first pairs together with their 'jaccard' score.
product_df.head()

Unnamed: 0,term1,term2,jaccard
1,iOS,two,0.0
2,iOS,one,0.0
125,iOS,six,0.2
559,iOS,ten,0.0
605,iOS,,0.0


In [21]:
# Write the scored pairs to a tab-separated file, omitting the index column.
product_df.to_csv('jaccards.tsv', sep='\t', index=False)