In [1]:
!pip install --upgrade nltk
import nltk
from nltk.corpus import brown
from nltk import FreqDist, ConditionalFreqDist
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger, brill, hmm
from nltk.tag.brill import fntbl37
from nltk.tag.brill_trainer import BrillTaggerTrainer
from sklearn.model_selection import train_test_split

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nltk
  Attempting uninstall: nltk
    Found existing installation: nltk 3.8.1
    Uninstalling nltk-3.8.1:
      Successfully uninstalled nltk-3.8.1
Successfully installed nltk-3.9.1


In [2]:
nltk.download('brown')
nltk.download('universal_tagset')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [3]:
tagged_sentences = brown.tagged_sents(tagset='universal')
train_data, test_data = train_test_split(tagged_sentences, test_size=0.1, random_state=42)


In [4]:
# Étape 1 : Extraction des verbes
verbs = [word for sent in train_data for word, tag in sent if tag == 'VERB']

In [5]:
# Étape 2 : Chercher les 10 noms les plus fréquents
nouns = [word for sent in train_data for word, tag in sent if tag == 'NOUN']
freq_dist_nouns = FreqDist(nouns)
top_10_nouns = freq_dist_nouns.most_common(10)

In [6]:
# Étape 3 : Créer un dictionnaire (mot : étiquette)
word_tag_dict = {}
for sent in train_data:
    for word, tag in sent:
        word_tag_dict.setdefault(word.lower(), set()).add(tag)

In [7]:
# 4.1 Unigram Tagger
unigram_tagger = UnigramTagger(train_data)
unigram_accuracy = unigram_tagger.evaluate(test_data)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  unigram_accuracy = unigram_tagger.evaluate(test_data)


In [8]:
# 4.2 Bigram Tagger
bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
bigram_accuracy = bigram_tagger.evaluate(test_data)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  bigram_accuracy = bigram_tagger.evaluate(test_data)


In [9]:
# 4.3 Trigram Tagger
trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
trigram_accuracy = trigram_tagger.evaluate(test_data)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  trigram_accuracy = trigram_tagger.evaluate(test_data)


In [10]:
# 4.4 Brill Tagger
unigram_tagger = UnigramTagger(train_data)
bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)

templates = fntbl37()
brill_trainer = BrillTaggerTrainer(initial_tagger=trigram_tagger, templates=templates)
brill_tagger = brill_trainer.train(train_data)
brill_accuracy = brill_tagger.evaluate(test_data)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  brill_accuracy = brill_tagger.evaluate(test_data)


In [11]:
# 4.5 Viterbi HMM Tagger
hmm_tagger = hmm.HiddenMarkovModelTrainer().train(train_data)
hmm_accuracy = hmm_tagger.evaluate(test_data)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  hmm_accuracy = hmm_tagger.evaluate(test_data)
  O[i, k] = self._output_logprob(si, self._symbols[k])
  O[i, k] = self._output_logprob(si, self._symbols[k])


In [12]:
# Résultats
print("Les 10 noms les plus fréquents : ", top_10_nouns)
print("Unigram Tagger Accuracy: ", unigram_accuracy)
print("Bigram Tagger Accuracy: ", bigram_accuracy)
print("Trigram Tagger Accuracy: ", trigram_accuracy)
print("Brill Tagger Accuracy: ", brill_accuracy)
print("HMM Viterbi Tagger Accuracy: ", hmm_accuracy)

Les 10 noms les plus fréquents :  [('time', 1399), ('man', 1037), ('Af', 910), ('years', 858), ('way', 801), ('Mr.', 760), ('people', 726), ('men', 653), ('world', 629), ('life', 621)]
Unigram Tagger Accuracy:  0.9323251120488559
Bigram Tagger Accuracy:  0.9402493469277209
Trigram Tagger Accuracy:  0.9405114495146734
Brill Tagger Accuracy:  0.955958028639076
HMM Viterbi Tagger Accuracy:  0.7636708340977992
