## Import libraries and include the Brown data.

In [None]:
!pip install sklearn-crfsuite



In [None]:
import nltk
from nltk.corpus import brown
# Fetch the Brown corpus and the universal tagset resource; the corpus is later
# read with tagset="universal".
nltk.download('brown')
nltk.download('universal_tagset')

import spacy
from spacy.tokens import Doc
# Shared spaCy pipeline: its vocab backs the Doc objects built in token2features,
# and the pipeline itself tokenizes raw strings in pos_tagger.
nlp = spacy.load("en_core_web_sm")

import sklearn_crfsuite
from sklearn_crfsuite import metrics

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


## The function (token2features) for the feature extraction.
* The input is a list of tokens.
* The output is a list of dictionaries, one per word; each dictionary contains the features of the current, the previous, and the next word.

In [None]:
def token2features(ListOfWords):
    """Build one feature dictionary per token of a pre-tokenized sentence.

    Each dictionary holds the lowercase form, spaCy suffix/prefix strings and
    the upper/title/digit flags of the current token, the same six features
    for the previous token (keys prefixed with '-1_') and for the next token
    (keys prefixed with '+1_'). The first token gets 'BOS': True instead of
    previous-token features; the last token gets 'EOS': True instead of
    next-token features.

    Args:
        ListOfWords: list of token strings making up one sentence.

    Returns:
        A list of feature dictionaries, one per input token, in order.
    """
    tokens = Doc(nlp.vocab, words=ListOfWords)
    last = len(tokens) - 1

    def describe(token, tag=''):
        # The same six attributes are read for the current token and for each
        # neighbour; only the key prefix differs.
        return {
            tag + 'lower': token.lower_,
            tag + 'suffix': token.suffix_,
            tag + 'prefix': token.prefix_,
            tag + 'isupper': token.is_upper,
            tag + 'istitle': token.is_title,
            tag + 'isdigit': token.is_digit,
        }

    result = []
    for position, token in enumerate(tokens):
        features = describe(token)

        # Left context, or a beginning-of-sentence marker.
        if position == 0:
            features['BOS'] = True
        else:
            features.update(describe(tokens[position - 1], '-1_'))

        # Right context, or an end-of-sentence marker.
        if position == last:
            features['EOS'] = True
        else:
            features.update(describe(tokens[position + 1], '+1_'))

        result.append(features)

    return result

## I assign the Brown tagged sentences (with the universal tagset) to the variable "sentences".
## I assign the Brown sentences to the variable "sentences1".

In [None]:
# Plain token lists, used as the input for feature extraction.
sentences1 = brown.sents()
# The same corpus as (word, tag) pairs, requested with the universal tagset.
sentences = brown.tagged_sents(tagset="universal")

## I use a list comprehension with the token2features function to get a list of lists of dictionaries.
## I use a list comprehension to get a list of lists of strings, where the strings are the POS tags for each word in the Brown data.

In [None]:
# One list of feature dictionaries per sentence.
sents = [token2features(words) for words in sentences1]
# One list of POS tags per sentence, taken from the (word, tag) pairs.
labels = [[tag for _word, tag in tagged_sent] for tagged_sent in sentences]

## Now we have the input data in the "sents" variable and the output data in the "labels" variable; I split them into train and test data.

In [None]:
from sklearn.model_selection import train_test_split

# shuffle=False keeps corpus order: the last 10% of sentences form the test set.
X_train, X_test, y_train, y_test = train_test_split(
    sents,
    labels,
    test_size=0.1,
    shuffle=False,
)

In [None]:
# Sizes of the four splits, in the order X_train, y_train, X_test, y_test.
tuple(len(split) for split in (X_train, y_train, X_test, y_test))

(51606, 51606, 5734, 5734)

## I create a CRF model using sklearn_crfsuite.
## I fit the model with train data.
* Conditional random fields (CRFs) are a class of statistical modeling methods often applied in pattern recognition and machine learning and used for structured prediction. Whereas a classifier predicts a label for a single sample without considering "neighboring" samples, a CRF can take context into account. (From Wikipedia)

In [None]:
# Build the CRF tagger. The keyword arguments are passed straight to
# sklearn_crfsuite.CRF; c1/c2 are presumably the L1/L2 regularization
# strengths — confirm against the library documentation.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
# Train on the per-sentence feature dictionaries and their gold tag sequences.
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

## I show the unique labels.

In [None]:
# NOTE(review): this rebinds `labels`, which earlier held the per-sentence gold
# tag sequences passed to train_test_split. After this cell runs, re-running
# the split cell would no longer see the gold tags.
labels = list(crf.classes_)
labels

['DET',
 'NOUN',
 'ADJ',
 'VERB',
 'ADP',
 '.',
 'ADV',
 'CONJ',
 'PRT',
 'PRON',
 'NUM',
 'X']

## I use the model trained above to make predictions on the test data; we get a good score.

In [None]:
# Tag every held-out sentence, then report the weighted F1 over all tags.
y_pred = crf.predict(X_test)
metrics.flat_f1_score(
    y_test,
    y_pred,
    average='weighted',
    labels=labels,
)

0.9744252902116789

In [None]:
# POS tags are flat names (no B-/I- prefixes), so plain alphabetical order is
# what gives a readable report. The previous key, (name[1:], name[0]), is meant
# for BIO-style labels and ordered the tags by their second character onward.
sorted_labels = sorted(labels)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

           .      1.000     1.000     1.000     15505
           X      0.829     0.234     0.365       124
         ADJ      0.916     0.907     0.912      5492
         ADP      0.971     0.977     0.974      9630
         ADV      0.941     0.937     0.939      5357
        VERB      0.982     0.981     0.982     16917
         DET      0.991     0.989     0.990     10113
        CONJ      0.991     0.999     0.995      3326
        NOUN      0.965     0.973     0.969     17692
        PRON      0.992     0.990     0.991      7353
         PRT      0.954     0.924     0.939      3446
         NUM      0.862     0.986     0.920       487

    accuracy                          0.975     95442
   macro avg      0.949     0.908     0.914     95442
weighted avg      0.975     0.975     0.974     95442



In [None]:
# Compare the first two predicted tag sequences with the gold ones.
y_pred[:2], y_test[:2]

([['PRON', 'VERB', 'ADV', 'NUM', 'NOUN', 'ADJ', '.'],
  ['.', 'DET', 'ADJ', 'NOUN', '.', 'DET', 'NOUN', '.', '.']],
 [['PRON', 'VERB', 'ADV', 'NUM', 'NOUN', 'ADJ', '.'],
  ['.', 'DET', 'ADJ', 'NOUN', '.', 'DET', 'NOUN', '.', '.']])

## The function (pos_tagger):
* The input is either an untokenized sentence string or a tokenized list of words.
* The output is a list containing one list of POS tags (one tag per token); index it with `[0]` to get the tags of the sentence.
* If the input is a string, I use spaCy for tokenization; then I call the token2features function to create the features, and then I use the trained model to predict the POS tags.

In [None]:
def pos_tagger(sentence):
    """Predict POS tags for one sentence with the trained CRF model.

    Args:
        sentence: Either a raw string, which is tokenized with the spaCy
            pipeline `nlp`, or an already tokenized list of words.

    Returns:
        The result of `crf.predict` for a single-sentence batch; callers
        index it with `[0]` to get the tag sequence of the sentence.
    """
    # isinstance also accepts str subclasses, which `type(...) == str` would
    # have skipped and passed on as if they were a token list.
    if isinstance(sentence, str):
        sentence = [token.text for token in nlp(sentence)]

    features = token2features(sentence)
    # crf.predict expects a batch of sentences, so wrap the single sentence.
    return crf.predict([features])

## Here you can input a text; the POS tags will be displayed using the pos_tagger function.

In [None]:
# Read a sentence from the user and show the tags predicted for it.
text = input('Input your text: ')
predicted_tags = pos_tagger(text)[0]
print("\n", predicted_tags)

Input your text: This is Mohammad

 ['DET', 'VERB', 'NOUN']
