In [None]:
import spacy

# Load the small English pipeline; later cells call `sp(...)` to parse text.
sp = spacy.load("en_core_web_sm")

In [None]:
# Parse a two-sentence example; the following cells inspect its tokens.
sen = sp("She doesn’t study German on Monday. She swims every morning.")

In [None]:
# Show the text stored on the parsed document.
print(sen.text)

She doesn’t study German on Monday. She swims every morning.


In [None]:
# `pos_` label of the first token ("She"); the recorded run prints PRON.
print(sen[0].pos_)

PRON


In [None]:
# `tag_` label of the first token ("She"); the recorded run prints PRP.
print(sen[0].tag_)

PRP


In [None]:
# `pos_` label of the fourth token ("study"); the recorded run prints VERB.
print(sen[3].pos_)

VERB


In [None]:
# `tag_` label of the fourth token ("study"); the recorded run prints VB.
print(sen[3].tag_)

VB


In [None]:
# Translate the first token's tag into a readable description
# (recorded output: "pronoun, personal").
print(spacy.explain(sen[0].tag_))

pronoun, personal


In [None]:
# Translate the fourth token's tag into a readable description
# (recorded output: "verb, base form").
print(spacy.explain(sen[3].tag_))

verb, base form


In [None]:
# One row per token: text, `pos_`, `tag_`, and the tag's description,
# padded into fixed-width columns.
for word in sen:
    tag_description = spacy.explain(word.tag_)
    print(f'{word.text:10} {word.pos_:8} {word.tag_:6} {tag_description}')

She        PRON     PRP    pronoun, personal
does       AUX      VBZ    verb, 3rd person singular present
n’t        PART     RB     adverb
study      VERB     VB     verb, base form
German     ADJ      JJ     adjective (English), other noun-modifier (Chinese)
on         ADP      IN     conjunction, subordinating or preposition
Monday     PROPN    NNP    noun, proper singular
.          PUNCT    .      punctuation mark, sentence closer
She        PRON     PRP    pronoun, personal
swims      VERB     VBZ    verb, 3rd person singular present
every      DET      DT     determiner
morning    NOUN     NN     noun, singular or mass
.          PUNCT    .      punctuation mark, sentence closer


In [None]:
# Parse a short question and print the same tag columns for its second token.
sen = sp('Can you help me please?')
word = sen[1]

tag_description = spacy.explain(word.tag_)
print(f'{word.text:12} {word.pos_:10} {word.tag_:8} {tag_description}')

you          PRON       PRP      pronoun, personal


In [None]:
# Re-parse the two-sentence example (`sen` was overwritten by the question above).
sen = sp("She doesn’t study German on Monday. She swims every morning.")

# Count tokens per POS attribute; the recorded output is a dict keyed by integer IDs.
num_pos = sen.count_by(spacy.attrs.POS)
num_pos

{95: 2, 87: 1, 94: 1, 100: 2, 84: 1, 85: 1, 96: 1, 97: 2, 90: 1, 92: 1}

In [None]:
# Walk the POS IDs in ascending order and resolve each ID to its label via the vocab.
for pos_id, count in sorted(num_pos.items()):
    pos_name = sen.vocab[pos_id].text
    print(f'{pos_id}. {pos_name:12}: {count}')

84. ADJ         : 1
85. ADP         : 1
87. AUX         : 1
90. DET         : 1
92. NOUN        : 1
94. PART        : 1
95. PRON        : 2
96. PROPN       : 1
97. PUNCT       : 2
100. VERB        : 2


In [None]:
from spacy import displacy

# Parse the example again and draw its dependency parse inline in the notebook.
sen = sp("She doesn’t study German on Monday. She swims every morning.")
displacy.render(
    sen,
    style='dep',
    jupyter=True,
    options={'distance': 85},
)

In [None]:
# Serve the dependency visualisation over HTTP instead of rendering it inline.
# NOTE(review): the recorded output shows the server running on port 5000 until
# it was shut down, so this cell presumably blocks until interrupted — confirm
# before relying on "Run All".
displacy.serve(sen, style='dep', options={'distance': 120})


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [None]:
import spacy

# Load the English pipeline and parse a short paragraph about Tunis for the
# named-entity examples that follow.
sp = spacy.load("en_core_web_sm")

sen = sp('Tunis is the capital and largest city of Tunisia. The greater metropolitan area of Tunis, often referred to as "Grand Tunis", has about 2,700,000 inhabitants. As of 2020, it is the third-largest city in the Maghreb region (after Casablanca and Algiers) and the eleventh-largest in the Arab world.')

In [None]:
# `sen.ents` holds the named entities found in the text; the recorded output
# shows it printed as a tuple of spans.
print(sen.ents)

(Tunisia, Tunis, Grand Tunis, about 2,700,000, 2020, third, Casablanca, Arab)


In [None]:
# One line per entity: its text, its label, and the label's description.
for entity in sen.ents:
    print(f'{entity.text} - {entity.label_} - {spacy.explain(entity.label_)}')

Tunisia - GPE - Countries, cities, states
Tunis - NORP - Nationalities or religious or political groups
Grand Tunis - WORK_OF_ART - Titles of books, songs, etc.
about 2,700,000 - CARDINAL - Numerals that do not fall under another type
2020 - DATE - Absolute or relative dates or periods
third - ORDINAL - "first", "second", etc.
Casablanca - GPE - Countries, cities, states
Arab - NORP - Nationalities or religious or political groups


In [None]:
# Parse a paragraph about the Tunisian dialect and list its entities.
sen = sp('The Tunisian dialect features Arabic vocabulary spiced with Berber and French words and phrases. It is also highly influenced by Latin languages such as Italian and Spanish, in addition to some Turkish loanwords. If you study Tunisian Arabic, you will definitely notice the heavy influence from foreign languages.')
for entity in sen.ents:
    print(f'{entity.text} - {entity.label_} - {spacy.explain(entity.label_)}')

Tunisian - NORP - Nationalities or religious or political groups
Arabic - LANGUAGE - Any named language
Berber - PERSON - People, including fictional
French - NORP - Nationalities or religious or political groups
Latin - LANGUAGE - Any named language
Italian - NORP - Nationalities or religious or political groups
Spanish - NORP - Nationalities or religious or political groups
Turkish - NORP - Nationalities or religious or political groups
Tunisian - NORP - Nationalities or religious or political groups


In [None]:
from spacy.tokens import Span

# Manually label the first token of `sen` as an ORG entity.
ORG = sen.vocab.strings[u'ORG']
new_entity = Span(sen, 0, 1, label=ORG)

# Keep only the existing entities that do not overlap the new span. On a first
# run nothing is dropped; on a re-run the previously added span is replaced
# instead of being appended a second time (spaCy rejects overlapping entities),
# so the cell can be executed repeatedly.
non_overlapping = [
    ent for ent in sen.ents
    if ent.end <= new_entity.start or ent.start >= new_entity.end
]
sen.ents = non_overlapping + [new_entity]

In [None]:
# List the entities again so the manually added span shows up.
for entity in sen.ents:
    print(f'{entity.text} - {entity.label_} - {spacy.explain(entity.label_)}')

The - ORG - Companies, agencies, institutions, etc.
Tunisian - NORP - Nationalities or religious or political groups
Arabic - LANGUAGE - Any named language
Berber - PERSON - People, including fictional
French - NORP - Nationalities or religious or political groups
Latin - LANGUAGE - Any named language
Italian - NORP - Nationalities or religious or political groups
Spanish - NORP - Nationalities or religious or political groups
Turkish - NORP - Nationalities or religious or political groups
Tunisian - NORP - Nationalities or religious or political groups


In [None]:
# Parse the revised paragraph and list its entities.
# The result must be bound to `sen` (it was previously assigned to `en`), because
# the loop below reads `sen.ents`; otherwise the loop prints the entities of the
# previous document instead of this one.
sen = sp(u'The Tunisian dialect features Standard Arabic vocabulary spiced with Berber and French words and phrases. It is also highly influenced by Latin languages such as Italian and Spanish, in addition to some Turkish loanwords. Thus, Tunisian Arabic is definitely influenced by foreign languages.')
for entity in sen.ents:
    print(entity.text + ' - ' + entity.label_ + ' - ' + str(spacy.explain(entity.label_)))

The - ORG - Companies, agencies, institutions, etc.
Tunisian - NORP - Nationalities or religious or political groups
Arabic - LANGUAGE - Any named language
Berber - PERSON - People, including fictional
French - NORP - Nationalities or religious or political groups
Latin - LANGUAGE - Any named language
Italian - NORP - Nationalities or religious or political groups
Spanish - NORP - Nationalities or religious or political groups
Turkish - NORP - Nationalities or religious or political groups
Tunisian - NORP - Nationalities or religious or political groups


In [None]:
# Number of entities in `sen` labelled PERSON.
sum(1 for ent in sen.ents if ent.label_ == 'PERSON')

1

In [None]:
from spacy import displacy

# Parse the revised paragraph and highlight its entities inline in the notebook.
sen = sp('The Tunisian dialect features Standard Arabic vocabulary spiced with Berber and French words and phrases. It is also highly influenced by Latin languages such as Italian and Spanish, in addition to some Turkish loanwords. Thus, Tunisian Arabic is definitely influenced by foreign languages.')
displacy.render(
    sen,
    style='ent',
    jupyter=True,
)

In [None]:
from spacy import displacy

# Parse a paragraph about Ons Jabeur into `sen1` and highlight its entities inline.
sen1 = sp("Ons Jabeur is a Tunisian professional tennis player. She has been ranked as high as world No. 2 by the Women's Tennis Association (WTA), achieved on 27 June 2022. Jabeur is the current No. 1 Tunisian player, and the highest-ranked African and Arab tennis player in WTA and ATP rankings history. She has won three singles titles on the WTA Tour, as well as 11 singles titles and one doubles title on the ITF Women's Circuit.")
displacy.render(
    sen1,
    style='ent',
    jupyter=True,
)

In [None]:
# Restrict the entity visualisation of `sen1` to PERSON entities.
# The options dict is named `person_only` rather than `filter` so that the
# built-in `filter()` is not shadowed for every later cell in the kernel.
person_only = {'ents': ['PERSON']}
displacy.render(sen1, style='ent', jupyter=True, options=person_only)

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.3-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 30.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 85.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 58.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.3


In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

# The tokenizer and the token-classification model are loaded from the same checkpoint.
checkpoint = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint)

# Build the NER pipeline and run it on a single example sentence.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "My name is Nesrine Azaiez, I'm from Tunisia and I live in Sfax"
ner_results = nlp(example)

print(ner_results)

[{'entity': 'B-PER', 'score': 0.99949133, 'index': 4, 'word': 'N', 'start': 11, 'end': 12}, {'entity': 'B-PER', 'score': 0.9896903, 'index': 5, 'word': '##es', 'start': 12, 'end': 14}, {'entity': 'I-PER', 'score': 0.54800695, 'index': 6, 'word': '##rine', 'start': 14, 'end': 18}, {'entity': 'I-PER', 'score': 0.9993211, 'index': 7, 'word': 'A', 'start': 19, 'end': 20}, {'entity': 'I-PER', 'score': 0.99401855, 'index': 8, 'word': '##zai', 'start': 20, 'end': 23}, {'entity': 'I-PER', 'score': 0.9916877, 'index': 9, 'word': '##ez', 'start': 23, 'end': 25}, {'entity': 'B-LOC', 'score': 0.9998332, 'index': 15, 'word': 'Tunisia', 'start': 36, 'end': 43}, {'entity': 'B-LOC', 'score': 0.99842775, 'index': 20, 'word': 'S', 'start': 58, 'end': 59}, {'entity': 'I-LOC', 'score': 0.9383965, 'index': 21, 'word': '##fa', 'start': 59, 'end': 61}, {'entity': 'I-LOC', 'score': 0.8901361, 'index': 22, 'word': '##x', 'start': 61, 'end': 62}]


In [None]:
# Display the pipeline output again as the cell's value; the recorded output is
# one dict per word piece with entity, score, index, word, start and end keys.
ner_results


[{'entity': 'B-PER',
  'score': 0.99949133,
  'index': 4,
  'word': 'N',
  'start': 11,
  'end': 12},
 {'entity': 'B-PER',
  'score': 0.9896903,
  'index': 5,
  'word': '##es',
  'start': 12,
  'end': 14},
 {'entity': 'I-PER',
  'score': 0.54800695,
  'index': 6,
  'word': '##rine',
  'start': 14,
  'end': 18},
 {'entity': 'I-PER',
  'score': 0.9993211,
  'index': 7,
  'word': 'A',
  'start': 19,
  'end': 20},
 {'entity': 'I-PER',
  'score': 0.99401855,
  'index': 8,
  'word': '##zai',
  'start': 20,
  'end': 23},
 {'entity': 'I-PER',
  'score': 0.9916877,
  'index': 9,
  'word': '##ez',
  'start': 23,
  'end': 25},
 {'entity': 'B-LOC',
  'score': 0.9998332,
  'index': 15,
  'word': 'Tunisia',
  'start': 36,
  'end': 43},
 {'entity': 'B-LOC',
  'score': 0.99842775,
  'index': 20,
  'word': 'S',
  'start': 58,
  'end': 59},
 {'entity': 'I-LOC',
  'score': 0.9383965,
  'index': 21,
  'word': '##fa',
  'start': 59,
  'end': 61},
 {'entity': 'I-LOC',
  'score': 0.8901361,
  'index': 22,
  