In [2]:
toy_corpus = ["START All that glitters isn't gold END".split(" "), "START All's well that ends well END".split(" ")]
toy_corpus

[['START', 'All', 'that', 'glitters', "isn't", 'gold', 'END'],
 ['START', "All's", 'well', 'that', 'ends', 'well', 'END']]

In [3]:
import numpy as np

In [169]:
corpus = toy_corpus
out = np.concatenate(corpus).ravel()
unique_words = sorted(np.unique(out))
word2Ind = {word:i for i,word in enumerate(unique_words)}

In [177]:
# 1. iterate lists in corpus
# 2. add padding to lists, o.w. indexing won't work
# 3. iterate lists and find center words and words to left and right of center word
# 4. find indices for cooccuring words and add 1 to correct index

def calc_cooccurence_mat(corpus, window_size=2):
    M = np.zeros((len(unique_words),len(unique_words)))
    
    #window_size = 1
    for word_ls in corpus:
        word_ls = [None for i in range(window_size)] + word_ls + [None for i in range(window_size)] #padding 
        for i in range(window_size, len(word_ls)-window_size):


            word_slice = word_ls[i-window_size:i+window_size+1]
            cut = len(word_slice)//2

            center_word = word_slice[window_size]
            
            word_slice_left = word_slice[:cut]
            word_slice_right= word_slice[cut+1:]
            
            #print("For word slice:", word_slice)
            #print("left:",word_slice_left)
            #print("center:",center_word)
            #print("right:",word_slice_right)
            
            
            # list of N indices for word M in MxN matrix
            words_to_add = [word2Ind[tok] for tok in word_slice_left + word_slice_right 
                           if tok is not None]
            #print("Words to add:", words_to_add)
            #print("--------------------------")

            # index of word M
            m_idx = word2Ind[center_word]

            for word_idx in words_to_add: 
                M[m_idx, word_idx] += 1
    return M
        

In [179]:
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA


In [189]:
M = np.random.rand(100, 100)

In [190]:
M

array([[0.38992842, 0.84196181, 0.95594219, ..., 0.03469912, 0.20901681,
        0.15012548],
       [0.29792619, 0.84930652, 0.85178872, ..., 0.90650492, 0.3451828 ,
        0.20531002],
       [0.29410528, 0.80740193, 0.27616625, ..., 0.83704028, 0.83101991,
        0.92353669],
       ...,
       [0.42654196, 0.3259527 , 0.58588012, ..., 0.36682743, 0.77327251,
        0.13895012],
       [0.59457871, 0.63464073, 0.05374651, ..., 0.37519777, 0.65212029,
        0.29694526],
       [0.59020977, 0.17077184, 0.14792485, ..., 0.25983418, 0.83660418,
        0.09225988]])

In [191]:
k = 2
n_iters = 10
svd = TruncatedSVD(n_components = k, n_iter=n_iters)
M_reduced = svd.fit_transform(M)

In [192]:
python grader.py

array([[ 4.74587885, -0.86732649],
       [ 5.14921043, -0.58594638],
       [ 5.2380408 , -0.63715935],
       [ 4.75752977, -0.33255309],
       [ 4.46362255,  0.19851872],
       [ 5.14681111, -0.02802903],
       [ 5.228984  ,  0.15824285],
       [ 4.85158308,  0.93061569],
       [ 4.69736801,  0.01126203],
       [ 4.9576889 ,  0.2921155 ],
       [ 4.93847275,  0.90330225],
       [ 5.48166542, -0.53621125],
       [ 5.44596835,  0.19785556],
       [ 4.81408333, -0.21933978],
       [ 5.00419442,  0.54243103],
       [ 5.28629459, -0.83240893],
       [ 4.9554833 ,  0.35466502],
       [ 5.23657456, -0.26297839],
       [ 5.48459944, -0.27840572],
       [ 5.4410454 , -0.32822266],
       [ 4.7910387 ,  0.92337446],
       [ 5.2762754 ,  0.7915306 ],
       [ 4.72145489, -0.23302665],
       [ 5.14435936,  0.17423258],
       [ 4.59299834, -0.64720197],
       [ 4.84602764,  0.27994903],
       [ 4.93185698,  0.01579453],
       [ 4.7247443 ,  0.11009249],
       [ 5.11204443,