In [44]:
import logging
import re
from collections import Counter

class WordIndexer:
    """Transform a dataset of texts into lists of word indexes.

    A word is a lower-cased run of two or more ASCII letters. Words that occur
    fewer than ``min_word_occurences`` times in the fitted texts are mapped to
    index 0, which is reserved for ``oov_word``. Not memory optimized for big
    datasets: every tokenized text is held in memory at once.
    """

    def __init__(self, min_word_occurences=5, right_window=1, oov_word="OOV"):
        """Create an indexer whose vocabulary holds only the OOV placeholder.

        Args:
            min_word_occurences: minimum corpus count for a word to receive
                its own index.
            right_window: number of following positions paired with each
                position when building co-occurrences.
            oov_word: placeholder stored at index 0.
        """
        self.oov_word = oov_word
        self.right_window = right_window
        self.min_word_occurences = min_word_occurences
        self.word_to_index = {oov_word: 0}
        self.index_to_word = [oov_word]
        self.word_occurrences = {}
        self.re_words = re.compile(r"\b[a-zA-Z]{2,}\b")

    def _get_or_set_word_to_index(self, word):
        """Return the index of ``word``, assigning the next free one if new."""
        try:
            return self.word_to_index[word]
        except KeyError:
            idx = len(self.word_to_index)
            self.word_to_index[word] = idx
            self.index_to_word.append(word)
            return idx

    @property
    def n_words(self):
        """Size of the vocabulary, including the OOV placeholder."""
        return len(self.word_to_index)

    def fit_transform(self, texts):
        """Count words in ``texts`` and encode every text as word indexes.

        ``word_occurrences`` is replaced on every call, while ``word_to_index``
        and ``index_to_word`` are only ever extended, so indexes assigned by
        an earlier call on the same instance are kept.

        Args:
            texts: iterable of strings.

        Returns:
            One list of integer indexes per input text, in input order.
        """
        # findall already returns a list; no extra copy needed.
        tokenized = [self.re_words.findall(text.lower()) for text in texts]
        counts = Counter(word for words in tokenized for word in words)

        self.word_occurrences = {
            word: n_occurences
            for word, n_occurences in counts.items()
            if n_occurences >= self.min_word_occurences}

        oov_index = 0  # index reserved for oov_word in __init__
        return [[self._get_or_set_word_to_index(word)
                 if word in self.word_occurrences else oov_index
                 for word in words]
                for words in tokenized]

    def _get_ngrams(self, indexes):
        """Yield ``(left, right, distance)`` for each pair within the window.

        ``right`` is taken from the ``right_window`` positions that follow
        ``left``; ``distance`` starts at 1 for the adjacent position.
        """
        for i, left_index in enumerate(indexes):
            window = indexes[i + 1:i + self.right_window + 1]
            for distance, right_index in enumerate(window):
                yield left_index, right_index, distance + 1

    def get_comatrix(self, data):
        """Accumulate distance-weighted co-occurrences in coordinate form.

        Each ``(left, right)`` pair contributes ``1 / distance`` per
        occurrence.

        Args:
            data: iterable of index sequences, e.g. the output of
                ``fit_transform``.

        Returns:
            Three parallel tuples ``(lefts, rights, weights)``. All three are
            empty when ``data`` contains no pair, so the result can always be
            unpacked into three names.
        """
        comatrix = Counter()
        for indexes in data:
            for left_index, right_index, distance in self._get_ngrams(indexes):
                comatrix[(left_index, right_index)] += 1. / distance

        if not comatrix:
            # zip(*[]) would yield nothing and break three-way unpacking.
            return (), (), ()

        lefts, rights, weights = zip(
            *((left, right, weight) for (left, right), weight in comatrix.items()))
        return lefts, rights, weights

In [9]:
import nltk
# The Gutenberg corpus is read by a later cell, so it has to be fetched on a
# fresh environment too; packages that are already present are not re-downloaded.
nltk.download('gutenberg')
# Sentence tokenizer models.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [47]:
import time
from nltk.corpus import gutenberg

# Start from a fresh indexer: fit_transform only ever adds to the vocabulary
# of the instance it is called on.
indexer = WordIndexer(right_window=1, min_word_occurences=5)

# Macbeth as a sequence of sentences, each a list of tokens.
texts2 = gutenberg.sents('shakespeare-macbeth.txt')
# Small toy corpus for quick manual experiments; not used further in this cell.
texts = ["One day I went to lidl", "I went to shop lift in lidl", "now I dont go back to lidl"]
# WordIndexer tokenizes raw strings itself, so join each token list back into one string.
sentences = [" ".join(list_of_words) for list_of_words in texts2]

# perf_counter is the high-resolution clock intended for timing short
# durations; unlike time.time() it is not affected by system clock changes.
start_time = time.perf_counter()
data = indexer.fit_transform(sentences)
end_time = time.perf_counter()
print("Time taken is {}".format(end_time-start_time))
# Spot-check one encoded sentence (0 marks an out-of-vocabulary word).
print(data[1000])

Time taken is 0.014100313186645508
[0, 0, 17, 0]
