In [3]:
import spacy 

In [4]:
# Install the medium English model used by the vector-similarity cells below.
# Going through spaCy's own download helper installs the package into the
# interpreter that is running this kernel; a bare `!python` may resolve to a
# different Python on PATH and leave the model invisible to `spacy.load`.
spacy.cli.download("en_core_web_md")

Collecting en-core-web-md==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.2.0/en_core_web_md-3.2.0-py3-none-any.whl (45.7 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_md')


In [5]:
# Load the medium English pipeline; `nlp` is the callable used below to turn
# raw text into processed documents.
nlp = spacy.load("en_core_web_md")

In [6]:
# Read the whole article into memory as a single string.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
# NOTE(review): assumes wiki1.txt is stored as UTF-8 — confirm.
with open("wiki1.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [7]:
# Run the loaded pipeline over the article (tokenization plus every component
# the model ships with).
doc = nlp(text)
# Only the first sentence is needed, so pull one item from the sentence
# iterator instead of building a list of every sentence in the article.
sen1 = next(iter(doc.sents))
print(sen1)

The United States of America(U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a country primarily located in North America.


In [12]:
import numpy as np

# Nearest neighbours by word-vector similarity; this is closeness in the
# embedding space, not a list of synonyms.
your_country = "country"
query_key = nlp.vocab.strings[your_country]  # vocabulary key for the query word
query_matrix = np.asarray([nlp.vocab.vectors[query_key]])  # single-row batch of query vectors
ms = nlp.vocab.vectors.most_similar(query_matrix, n=10)
neighbour_keys = ms[0][0]  # keys returned for the first (and only) query row
words = [nlp.vocab.strings[key] for key in neighbour_keys]
# NOTE(review): spaCy documents the third tuple element as "scores"; confirm
# the intended meaning before treating these values as distances.
distances = ms[2]
print(words)

['country', 'COUNTRY', 'NATION', 'nation', 'COUNTIRES', 'nations', 'member-states', 'worLd', 'World', 'world']


In [41]:
# Two short texts on related subjects, processed so that their degree of
# similarity can be measured in the next cell.
doc1, doc2 = (
    nlp("i like salty fries and hamburgers."),
    nlp("Fast food tastes very good."),
)

In [42]:
# Show both texts followed by their similarity score.
score_1_2 = doc1.similarity(doc2)
print(doc1, "<->", doc2, score_1_2)

i like salty fries and hamburgers. <-> Fast food tastes very good. 0.7799485285662074


In [43]:
# A sentence about an unrelated subject, scored against the fast-food sentence
# for contrast: less overlap in meaning should give a lower score.
doc3 = nlp("Victoria is in Kolkata.")
score_1_3 = doc1.similarity(doc3)
print(doc1, "<->", doc3, score_1_3)

i like salty fries and hamburgers. <-> Victoria is in Kolkata. 0.5086395749623951


In [44]:
# Two sentences that differ only in which fruit is named.
doc4, doc5 = (
    nlp("I enjoy oranges."),
    nlp("I enjoy apples."),
)

In [45]:
# Similar sentences: same structure, and the differing words belong to the
# same class with the same usage (both are fruits and edible), so the two
# texts are expected to score as highly similar.
# The separator is "<->", matching the other comparison cells (it was
# mistyped as ",->").
print(doc4, "<->", doc5, doc4.similarity(doc5))

I enjoy oranges. ,-> I enjoy apples. 0.9607558420297302


In [46]:
# Same sentence frame as doc4/doc5, but naming a different kind of food.
doc6 = nlp("i enjoy burgers.")

In [49]:
# Fruit sentence versus burger sentence.
score_4_6 = doc4.similarity(doc6)
print(doc4, "<->", doc6, score_4_6)

I enjoy oranges. <-> i enjoy burgers. 0.8755329425467893


In [54]:
# Compare two parts of the same sentence. Both name foods, but of dissimilar
# categories, so the overlap is only moderate.
food1 = doc1[2:4]  # slice covering "salty fries"
food2 = doc1[5]    # single token "hamburgers"
span_vs_token = food1.similarity(food2)
print(food1, "<->", food2, "\n", span_vs_token)

salty fries <-> hamburgers 
 0.730462372303009


In [55]:
# Pipelines in spaCy: a pipeline is an ordered sequence of pipes (components),
# each of which alters the data or extracts information from it.
# Start from a blank English pipeline.
# NOTE: this rebinds `nlp`, so the en_core_web_md pipeline loaded earlier is
# no longer reachable through this name.
nlp = spacy.blank("en")

In [56]:
# Add the sentence splitter to the blank pipeline.
# The guard makes the cell safe to re-run: adding a component under a name
# that is already in the pipeline raises an error.
if not nlp.has_pipe("sentencizer"):
    nlp.add_pipe("sentencizer")
# Last expression of the cell: displays the component, as the return value of
# add_pipe did.
nlp.get_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x28d1d4cc6c0>

In [60]:
import requests
from bs4 import BeautifulSoup
# Download the complete works of Shakespeare to use as one very large document.
# The timeout keeps the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() stops an HTTP error page from being
# analysed as if it were the text.
s = requests.get(
    "https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt",
    timeout=60,
)
s.raise_for_status()
# Extract the text, re-join words hyphenated across line breaks, and flatten
# the remaining newlines to spaces.
# NOTE: despite its name, `soup` ends up holding a plain string, not a
# BeautifulSoup object.
soup = BeautifulSoup(s.content).text.replace("-\n", "").replace("\n", " ")
# Raise the pipeline's input-size limit so the whole text can be processed in
# one call. 5278439 is the limit this notebook originally used; it is widened
# automatically if the downloaded text turns out to be longer.
nlp.max_length = max(5278439, len(soup))

In [61]:
%%time
doc = nlp(soup)
print (len(list(doc.sents)))
# 94k sentences found, in 7.76 seconds, using sentencizer

94133
Wall time: 7.76 s


In [62]:
# Load the small pretrained English pipeline for comparison, and lift its
# input-size limit to the same value used for the blank pipeline.
nlp2 = spacy.load("en_core_web_sm")
nlp2.max_length = 5_278_439

In [64]:
%%time
doc = nlp2(soup)
print (len(list(doc.sents)))
# 112k sentences found, in 47 minutes, using the smallest model
# the sentencizer doesn't check all the bounderies, 
# but the 'small model', checks all the bounderies

In [69]:
# Inspect the sentencizer-only pipeline: what each component assigns,
# requires, and is scored on.
nlp.analyze_pipes()

{'summary': {'sentencizer': {'assigns': ['token.is_sent_start', 'doc.sents'],
   'requires': [],
   'scores': ['sents_f', 'sents_p', 'sents_r'],
   'retokenizes': False}},
 'problems': {'sentencizer': []},
 'attrs': {'doc.sents': {'assigns': ['sentencizer'], 'requires': []},
  'token.is_sent_start': {'assigns': ['sentencizer'], 'requires': []}}}

In [71]:
# Load en_core_web_sm into `nlp2` again.
# NOTE(review): this replaces the object configured earlier, so the max_length
# set on it presumably no longer applies — confirm whether the reload is needed.
nlp2 = spacy.load("en_core_web_sm")

In [78]:
# Author's observation: the small, medium and large models all report the same
# analysis.
# Compared with the sentencizer-only pipeline, these models carry many more
# components (tagger, parser, lemmatizer, ner, ...) and so assign far more
# attributes. That makes them more capable than the sentencizer alone, at the
# cost of the much longer run time recorded earlier.
nlp2.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
  'tagger': [],
  'parser': [],
  'attribute_ruler': [],
  'lemmatizer': [],
  'ner': []},
 'att