## Imports

In [1]:
#Do imports just once and for all
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix
from scipy.sparse.linalg import svds

import json

#instead of popup, plot images inline
%matplotlib inline

# Code

## Input

In [2]:
#Test Daten (dictionary) 
docs = {    
    'file1': ['ein','test','für','Dokument','eins'],    
    'file2': ['ein','weiteres','Dokument'],
    'file3': ['alle','guten','Dinge','sind','drei']
}

In [None]:
# Testing load and save
# - Numpy Arrays: https://docs.scipy.org/doc/numpy/reference/generated/numpy.save.html
# - General: 
#   - Input/Output: https://docs.python.org/3.3/tutorial/inputoutput.html
#   - Write JSON Dump: http://stackoverflow.com/questions/12309269/how-do-i-write-json-data-to-a-file-in-python

#import json
with open('files/test.dict', 'w+') as outfile:
    json.dump(docs, outfile)

with open('files/test.dict', 'r') as infile:
    docs2 = json.load(infile)
    
docs2

## Sparse DTM

In [4]:
#https://datascience.blog.wzb.eu/2016/06/17/creating-a-sparse-document-term-matrix-for-topic-modeling-via-lda/
# -> adapted to fit TermxDoc Matrix without additional transpose at the end

#Init Vocabulary (Set)
vocab = set()

#Init Count of Non-Zero Values
n_nonzero = 0

#Iterate over documents an create vocabulary
for docterms in docs.values():
    unique_terms = set(docterms)    # all unique terms of this doc (set of terms)
    vocab |= unique_terms           # set union: add unique terms of this doc
    n_nonzero += len(unique_terms)  # increase Non-Zero count by adding count of unique terms in this doc
    

#convert to numpy for processing
docpaths = np.array(list(docs.keys())) # keys of dictionary (order of dictionary is used, but this doesn't correspond to order of inserts!)
vocab = np.array(list(vocab)) 

#Array containing sorted indices
vocab_sorter = np.argsort(vocab)   

#print(vocab_sorter) #sorted indizes 
#print(vocab[vocab_sorter]) #outputs sorted vocabulary

ndocs = len(docpaths)
nvocab = len(vocab)

#Initialize components of COO-Matrix (values, row indizes, col indizes) with emtpy values (for all non-zero elements)
data = np.empty(n_nonzero, dtype=np.intc)     # all non-zero term frequencies at data[i,k]
rows = np.empty(n_nonzero, dtype=np.intc)     # row index for [i,k]th data item (ith term freq.)
cols = np.empty(n_nonzero, dtype=np.intc)     # column index for [i,k]th data item (kth document)

#Init index (w.r.t. position in arrays of sparse COO matrix)
ind = 0     
# go through all documents with their terms
for docpath, terms in docs.items():
    # find indices into  such that, if the corresponding elements in  were
    # inserted before the indices, the order of  would be preserved
    # -> array of indices of  in 
    term_indices = vocab_sorter[np.searchsorted(vocab, terms, sorter=vocab_sorter)]

    # count the unique terms of the document and get their vocabulary indices
    uniq_indices, counts = np.unique(term_indices, return_counts=True)
    n_vals = len(uniq_indices)  # = number of unique terms
    ind_end = ind + n_vals  #  to  is the slice that we will fill with data

    data[ind:ind_end] = counts                  # save the counts (term frequencies)
    rows[ind:ind_end] = uniq_indices            # save the row index: index in 
    doc_idx = np.where(docpaths == docpath)     # get the document index for the document name
    cols[ind:ind_end] = np.repeat(doc_idx, n_vals)  # save it as repeated value

    ind = ind_end  # resume with next document -> add data to the end
    
dtm = coo_matrix((data, (rows, cols)), shape=(nvocab, ndocs), dtype=np.intc)




In [5]:
print(dtm.toarray())
print(vocab)
print(docpaths)

[[0 1 0]
 [0 0 1]
 [1 1 0]
 [0 0 1]
 [0 0 1]
 [0 1 0]
 [1 0 0]
 [0 0 1]
 [0 0 1]
 [1 1 0]
 [0 1 0]]
['für' 'sind' 'Dokument' 'Dinge' 'alle' 'eins' 'weiteres' 'drei' 'guten'
 'ein' 'test']
['file2' 'file1' 'file3']
