In [3]:
# Ensure reproducibility

import numpy as np

CUSTOM_SEED = 42
np.random.seed(CUSTOM_SEED)

In [18]:
import nltk
nltk.download('treebank')
nltk.download('maxent_treebank_pos_tagger')

[nltk_data] Downloading package treebank to /home/linu/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     /home/linu/nltk_data...
[nltk_data]   Unzipping taggers/maxent_treebank_pos_tagger.zip.


True

In [21]:
import random
from nltk.corpus import treebank
sentences = treebank.tagged_sents()
print('a random sentence: \n-> {}'.format(random.choice(sentences)))

a random sentence: 
-> [('INTER-TEL', 'NNP'), ('Inc', 'NNP'), ('.', '.'), ('-LRB-', '-LRB-'), ('Chandler', 'NNP'), (',', ','), ('Ariz.', 'NNP'), ('-RRB-', '-RRB-'), ('--', ':')]


In [22]:
tags = set([tag for sentence in treebank.tagged_sents() for _, tag in sentence])
print('nb_tags: {}\ntags: {}'.format(len(tags), tags))

nb_tags: 46
tags: {',', 'SYM', 'DT', 'VBG', 'WDT', 'UH', '-NONE-', 'JJS', '-RRB-', 'VBN', 'PDT', 'VB', 'EX', 'PRP$', 'NNP', 'CC', '-LRB-', 'LS', 'VBZ', "''", '``', 'FW', 'NN', ':', 'JJR', 'VBD', '.', 'RBS', 'WP', 'NNS', 'PRP', '#', 'TO', 'POS', 'MD', 'WP$', 'JJ', 'RBR', 'RP', 'WRB', 'VBP', 'IN', 'NNPS', '$', 'RB', 'CD'}


In [23]:
train_test_cutoff = int(.80 * len(sentences)) 
training_sentences = sentences[:train_test_cutoff]
testing_sentences = sentences[train_test_cutoff:]
 
train_val_cutoff = int(.25 * len(training_sentences))
validation_sentences = training_sentences[:train_val_cutoff]
training_sentences = training_sentences[train_val_cutoff:]

In [24]:

def add_basic_features(sentence_terms, index):
    """ Compute some very basic word features.

        :param sentence_terms: [w1, w2, ...] 
        :type sentence_terms: list
        :param index: the index of the word 
        :type index: int
        :return: dict containing features
        :rtype: dict
    """
    term = sentence_terms[index]
    return {
        'nb_terms': len(sentence_terms),
        'term': term,
        'is_first': index == 0,
        'is_last': index == len(sentence_terms) - 1,
        'is_capitalized': term[0].upper() == term[0],
        'is_all_caps': term.upper() == term,
        'is_all_lower': term.lower() == term,
        'prefix-1': term[0],
        'prefix-2': term[:2],
        'prefix-3': term[:3],
        'suffix-1': term[-1],
        'suffix-2': term[-2:],
        'suffix-3': term[-3:],
        'prev_word': '' if index == 0 else sentence_terms[index - 1],
        'next_word': '' if index == len(sentence_terms) - 1 else sentence_terms[index + 1]
    }

In [25]:
def untag(tagged_sentence):
    """ 
    Remove the tag for each tagged term. 

    :param tagged_sentence: a POS tagged sentence
    :type tagged_sentence: list
    :return: a list of tags
    :rtype: list of strings
    """
    return [w for w, _ in tagged_sentence]

def transform_to_dataset(tagged_sentences):
    """
    Split tagged sentences to X and y datasets and append some basic features.

    :param tagged_sentences: a list of POS tagged sentences
    :param tagged_sentences: list of list of tuples (term_i, tag_i)
    :return: 
    """
    X, y = [], []

    for pos_tags in tagged_sentences:
        for index, (term, class_) in enumerate(pos_tags):
            # Add basic NLP features for each sentence term
            X.append(add_basic_features(untag(pos_tags), index))
            y.append(class_)
    return X, y

In [26]:

X_train, y_train = transform_to_dataset(training_sentences)
X_test, y_test = transform_to_dataset(testing_sentences)
X_val, y_val = transform_to_dataset(validation_sentences)

In [30]:
from sklearn.feature_extraction import DictVectorizer
 
dict_vectorizer = DictVectorizer(sparse=False)
dict_vectorizer.fit(X_train + X_test + X_val)

DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=False)

In [31]:
X_train = dict_vectorizer.transform(X_train)
X_test = dict_vectorizer.transform(X_test)
X_val = dict_vectorizer.transform(X_val)

MemoryError: 