In [60]:
# imports
import nltk
import numpy as np

### 1. Clean and Tokenize Text

In [2]:
# TODO
# def clean_and_tokenize(text):
# or simply use nltk function
# function dependent on dataset and task
# put clean and tokenize function in python files

In [9]:
def clean_and_tokenize(text: str):
    '''
    Split a text into tokens.

    Thin wrapper that delegates directly to nltk.word_tokenize; no separate
    cleaning step is applied here yet.

    Parameters:
        - text [string]: text to tokenize
    Returns:
        - the result of nltk.word_tokenize(text)
    '''
    return nltk.word_tokenize(text)

In [10]:
# quick check: tokenize a short example sentence
clean_and_tokenize("explain how vocabulary is built")

['explain', 'how', 'vocabulary', 'is', 'built']

### 2. Define Vocabulary
- explain how vocabulary is built
- determine vocabulary threshold

In [None]:
# TODO
# define vocabulary function
# put function in python file to call

In [11]:
def build_vocabulary(corpus, vocabulary_threshold=None):
    '''
    Count how often each token occurs in the corpus and, optionally,
    discard tokens that occur fewer times than a threshold.

    Parameters:
        - corpus [list(string)]: list of tokens
        - vocabulary_threshold [int]: minimum occurrence of token; a falsy
          value (None, the default, or 0) keeps every token
    Returns:
        - vocabulary [dictionary]:
                - keys: token (in order of first appearance in corpus)
                - values: occurrence of token in corpus
    '''
    counts = {}

    # tally occurrences of every token
    for word in corpus:
        counts[word] = counts.get(word, 0) + 1

    # no threshold requested: return the full tally
    if not vocabulary_threshold:
        return counts

    # keep only the tokens that are not below the threshold
    return {
        word: count
        for word, count in counts.items()
        if not (count < vocabulary_threshold)
    }

In [12]:
# test build_vocabulary

# small toy corpus with repeated tokens
corpus = ['this', 'is', 'a', 'trial', 'run', 'a', 'this', 'fun', 'fun', 'fun']

# without a threshold every distinct token is kept with its count
print(build_vocabulary(corpus))

# with a threshold of 2, tokens that occur only once are removed
print(build_vocabulary(corpus, vocabulary_threshold=2))

{'this': 2, 'is': 1, 'a': 2, 'trial': 1, 'run': 1, 'fun': 3}
{'this': 2, 'a': 2, 'fun': 3}


### 3. Build Co-occurrence Matrix
- document how unknown tokens are managed
- 'Bag of words' concept vs co-occurrence matrix


In [72]:
def build_co_occurrence_matrix(corpus, vocabulary, window_size):
    '''
    Count, for every token position in the corpus, which other tokens fall
    inside a window around it.

    Row/column k (0 <= k < n) corresponds to the k-th token of vocabulary.
    One extra row/column at index n collects every corpus token that is not
    in the vocabulary ('unknown').

    The window spans (window_size - 1) // 2 positions to the left of the
    centre token and the same number to the right; when window_size is even
    the right side gets one additional position. The centre position itself
    is never counted.

    Parameters:
        - corpus [list(string)]: list of tokens
        - vocabulary [list(string)]: list of unique tokens to use as vocabulary
          (any sized iterable of hashable tokens works, e.g. the dictionary
          returned by build_vocabulary; if a token is repeated, its first
          position is used)
        - window_size [int]: size of window within which to consider co-occurrence
    Returns:
        - co_occurrence_matrix [2D array] : (n+1) x (n+1) array of integers
          indicating co-occurrence of tokens, where n = len(vocabulary)
    '''

    n = len(vocabulary)

    # token -> matrix index, built once so lookups inside the loops are O(1);
    # setdefault keeps the first position of a repeated token (like list.index)
    token_to_index = {}
    for position, token in enumerate(vocabulary):
        token_to_index.setdefault(token, position)

    # matrix of vocabulary size + 1 'for unknown'
    co_occurrence_matrix = np.zeros((n + 1, n + 1), dtype=int)

    # window extents do not depend on the position, so compute them once
    left_factor = (window_size - 1) // 2
    right_factor = left_factor + (window_size - 1) % 2

    # translate the corpus to matrix indices up front; unknown tokens map to n
    corpus_indices = [token_to_index.get(token, n) for token in corpus]
    corpus_length = len(corpus_indices)

    for i, index_i in enumerate(corpus_indices):

        # clip the window at both ends of the corpus
        window_start = max(0, i - left_factor)
        window_stop = min(corpus_length, i + right_factor + 1)

        for j in range(window_start, window_stop):

            # a position does not co-occur with itself
            if i != j:
                co_occurrence_matrix[index_i, corpus_indices[j]] += 1

    return co_occurrence_matrix

In [73]:
# build a vocabulary from a small toy corpus
corpus = ['this', 'is', 'a', 'trial', 'run', 'a', 'this', 'fun', 'fun', 'fun']
voc = build_vocabulary(corpus)
print(voc)
print(list(voc.keys()))
# same corpus plus one token that is absent from the vocabulary,
# to exercise the extra 'unknown' row/column of the matrix
corpus2 = ['this', 'is', 'a', 'trial', 'run', 'a', 'this', 'fun', 'fun', 'fun', 'not_in_vocab']

# pass the vocabulary tokens as a list; their order fixes the row/column order
cooc = build_co_occurrence_matrix(corpus2, list(voc.keys()), 3)
print(cooc)

{'this': 2, 'is': 1, 'a': 2, 'trial': 1, 'run': 1, 'fun': 3}
['this', 'is', 'a', 'trial', 'run', 'fun']
[[0 1 1 0 0 1 0]
 [1 0 1 0 0 0 0]
 [1 1 0 1 1 0 0]
 [0 0 1 0 1 0 0]
 [0 0 1 1 0 0 0]
 [1 0 0 0 0 4 1]
 [0 0 0 0 0 1 0]]
