# Language Processing Pipeline in Spacy: NLP Tutorial For Beginners - 9

https://www.youtube.com/watch?v=hKK59rfpXL0&list=PLeo1K3hjS3uuvuAXhYjV2lMEShq2UYSwX&index=10

In [1]:
import spacy
from spacy import displacy

In [2]:
# Pipeline initialization: start from a blank English pipeline
# (no components are registered yet).
nlp = spacy.blank("en")

doc = nlp("Captain America ate 100$ of samosa. Then he said I can do it all day")

# Materialize the Doc as a list of its tokens.
list(doc)

[Captain,
 America,
 ate,
 100,
 $,
 of,
 samosa,
 .,
 Then,
 he,
 said,
 I,
 can,
 do,
 it,
 all,
 day]

In [3]:
# Names of the components in the current pipeline; the blank pipeline has none.
nlp.pipe_names

[]

In [4]:
# Pipeline initialization: load the small trained English pipeline.
nlp = spacy.load("en_core_web_sm")  # 'en' means English and 'sm' means small

In [5]:
# The loaded pipeline now reports the names of its components.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [6]:
# (name, component object) pairs for every component in the pipeline.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x1b7a353ffa0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x1b7a353fe80>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x1b7a3231b30>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x1b7a35cbb40>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x1b7a3591880>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x1b7a3231c80>)]

Available trained pipelines for English: https://spacy.io/models/en

In [7]:
# Medium model

# Uncomment the next line to download the model if it is not installed yet:
# !python -m spacy download en_core_web_md
nlp = spacy.load("en_core_web_md")  # pipeline initialization

In [8]:
doc = nlp("Captain America ate 100$ of samosa. Then he said I can do it all day")

# For every token: the token itself, its part-of-speech tag and its lemma.
[(token, token.pos_, token.lemma_) for token in doc]

[(Captain, 'PROPN', 'Captain'),
 (America, 'PROPN', 'America'),
 (ate, 'VERB', 'eat'),
 (100, 'NUM', '100'),
 ($, 'NOUN', '$'),
 (of, 'ADP', 'of'),
 (samosa, 'PROPN', 'samosa'),
 (., 'PUNCT', '.'),
 (Then, 'ADV', 'then'),
 (he, 'PRON', 'he'),
 (said, 'VERB', 'say'),
 (I, 'PRON', 'I'),
 (can, 'AUX', 'can'),
 (do, 'VERB', 'do'),
 (it, 'PRON', 'it'),
 (all, 'DET', 'all'),
 (day, 'NOUN', 'day')]

In [9]:
doc = nlp("Elon Musk bought Twitter for $45 billion")

# For every named entity: its text, its label and spaCy's explanation of that label.
[(ent.text, ent.label_, spacy.explain(ent.label_)) for ent in doc.ents]

[('Elon Musk', 'PERSON', 'People, including fictional'),
 ('Twitter', 'ORG', 'Companies, agencies, institutions, etc.'),
 ('$45 billion', 'MONEY', 'Monetary values, including unit')]

#### Note: with en_core_web_sm, Twitter was not recognized as ORG; the medium model (en_core_web_md) used here does recognize it.

### visualization

In [10]:
# Render the named entities of `doc` with displaCy's entity visualizer.
# `displacy` is imported in the notebook's top import cell so that all
# dependencies are declared in one place.
displacy.render(doc, style='ent')

## French language

https://spacy.io/models/fr

In [11]:
# Medium model (optional alternative to the large model loaded in the next cell).
# Everything here is intentionally commented out; uncomment both lines to use it.

# !python -m spacy download fr_core_news_md

# nlp = spacy.load("fr_core_news_md")  # pipeline initialization

In [12]:
# Large model

# Uncomment the next line to download the model if it is not installed yet:
# !python -m spacy download fr_core_news_lg

nlp = spacy.load("fr_core_news_lg")  # pipeline initialization

In [13]:
# Raw French news text, kept under its own name so that `doc` only ever
# refers to the processed spaCy Doc.
text = """Kinshasa et Kigali ont repris le dialogue lors de la Tripartite des ministres des Affaires étrangères d’Angola, 
de la RDC et Rwanda tenue, samedi 5 novembre, à Luanda, en Angola.
A l’initiative du président angolais et président en exercice de la CIRGL, 
Joâo Lourenço, cette rencontre a voulu rétablir le climat de confiance entre la RDC et le Rwanda, 
en froid depuis le regain de violences dans l'Est du pays et l'appui avéré du Rwanda aux rebelles du M23. 
"""

doc = nlp(text)

# For every named entity: its text, its label and spaCy's explanation of that label.
[(ent.text, ent.label_, spacy.explain(ent.label_)) for ent in doc.ents]

[('Kinshasa', 'LOC', 'Non-GPE locations, mountain ranges, bodies of water'),
 ('Kigali', 'LOC', 'Non-GPE locations, mountain ranges, bodies of water'),
 ('Tripartite', 'ORG', 'Companies, agencies, institutions, etc.'),
 ('Affaires étrangères d’Angola',
  'ORG',
  'Companies, agencies, institutions, etc.'),
 ('RDC', 'LOC', 'Non-GPE locations, mountain ranges, bodies of water'),
 ('Rwanda', 'LOC', 'Non-GPE locations, mountain ranges, bodies of water'),
 ('Luanda', 'LOC', 'Non-GPE locations, mountain ranges, bodies of water'),
 ('Angola', 'LOC', 'Non-GPE locations, mountain ranges, bodies of water'),
 ('A', 'LOC', 'Non-GPE locations, mountain ranges, bodies of water'),
 ('Joâo Lourenço', 'PER', 'Named person or family.'),
 ('RDC', 'LOC', 'Non-GPE locations, mountain ranges, bodies of water'),
 ('Rwanda', 'LOC', 'Non-GPE locations, mountain ranges, bodies of water'),
 ('Rwanda', 'LOC', 'Non-GPE locations, mountain ranges, bodies of water'),
 ('M23', 'ORG', 'Companies, agencies, institutions, etc.')]

### adding a component to a blank pipeline

In [14]:
# Start from a blank French pipeline.
nlp = spacy.blank('fr')

# Load the small French pipeline and reuse only its 'ner' component.
source_nlp = spacy.load("fr_core_news_sm")
nlp.add_pipe('ner', source=source_nlp)

# Confirm which components the pipeline now contains.
nlp.pipe_names

['ner']

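NB: this does not work when the `ner` component is sourced from the medium or large models (presumably because those pipelines rely on word vectors that the blank pipeline does not have — to be confirmed).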

In [15]:
# Raw French news text, kept under its own name so that `doc` only ever
# refers to the processed spaCy Doc.
text = """Kinshasa et Kigali ont repris le dialogue lors de la Tripartite des ministres des Affaires étrangères d’Angola, 
de la RDC et Rwanda tenue, samedi 5 novembre, à Luanda, en Angola.
A l’initiative du président angolais et président en exercice de la CIRGL, 
Joâo Lourenço, cette rencontre a voulu rétablir le climat de confiance entre la RDC et le Rwanda, 
en froid depuis le regain de violences dans l'Est du pays et l'appui avéré du Rwanda aux rebelles du M23. 
"""

doc = nlp(text)

# For every named entity: its text, its label and spaCy's explanation of that label.
[(ent.text, ent.label_, spacy.explain(ent.label_)) for ent in doc.ents]

[('Kinshasa', 'LOC', 'Non-GPE locations, mountain ranges, bodies of water'),
 ('Kigali', 'LOC', 'Non-GPE locations, mountain ranges, bodies of water'),
 ('Tripartite des ministres des Affaires étrangères',
  'MISC',
  'Miscellaneous entities, e.g. events, nationalities, products or works of art'),
 ('RDC', 'LOC', 'Non-GPE locations, mountain ranges, bodies of water'),
 ('Rwanda', 'LOC', 'Non-GPE locations, mountain ranges, bodies of water'),
 ('Luanda', 'LOC', 'Non-GPE locations, mountain ranges, bodies of water'),
 ('Angola', 'LOC', 'Non-GPE locations, mountain ranges, bodies of water'),
 ('CIRGL', 'ORG', 'Companies, agencies, institutions, etc.'),
 ('Joâo Lourenço', 'ORG', 'Companies, agencies, institutions, etc.'),
 ('RDC', 'LOC', 'Non-GPE locations, mountain ranges, bodies of water'),
 ('Rwanda', 'LOC', 'Non-GPE locations, mountain ranges, bodies of water'),
 ('Rwanda', 'LOC', 'Non-GPE locations, mountain ranges, bodies of water'),
 ('M23', 'LOC', 'Non-GPE locations, mountain range