In [8]:
import numpy as np
import pandas as pd
import logging
import nltk
import scipy
import gensim
import sklearn

from pprint import pprint
import scipy.sparse
from scipy.sparse import csr_matrix
import sklearn.metrics.pairwise

from gensim import corpora
from collections import defaultdict
from gensim.models import Doc2Vec
from sklearn.feature_extraction.text import CountVectorizer


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [9]:
xl = pd.ExcelFile("./data/survey_data.xlsx")
df = xl.parse()
df = df.dropna()

# df.ix[:,1]
# df.head()
# df.shape

# Columns to summarize: 1,2,3,4,5,6

In [5]:
def make_sentences(df):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    textBlobs = []
    for i in range(1,7):
        if str(df.ix[:,i].dtype) == 'object':
            textBlob = df.ix[:,i].str.cat(sep='. ')
            tokenized = tokenizer.tokenize(textBlob)
            textBlobs += tokenized
    return np.array(textBlobs)


def vectorize(sentences):
    return CountVectorizer().fit_transform(sentences)


def sem_vol_max(sentences, vectors, L):
    S = set()
    B = set()
    
    c = (1 / vectors.shape[0]) * np.sum(vectors, axis=0)
    c = csr_matrix(c)
    
    dists = sklearn.metrics.pairwise.pairwise_distances(vectors, c)
    p = np.argmax(dists)
    S.add(sentences[p])
    
    dists = sklearn.metrics.pairwise.pairwise_distances(vectors, vectors[p])
    q = np.argmax(dists)
    S.add(sentences[q])
    
    b_0 = vectors[q] / scipy.sparse.linalg.norm(vectors[q])
    B.add(b_0)
    
    len_p = len(vectors[p].nonzero()[0])
    len_q = len(vectors[q].nonzero()[0])
    total_length = len_p + len_q
    
    for i in range(0, vectors.shape[0]):
        r = ortho_proj_vec(vectors, B)
        
        print("Argmin new sentence: ", r)
        print("Total words: ", total_length)
        print("Next sentence length: ", len(sentences[r].split()))
        if total_length > L:
            break
        
        if total_length + len(sentences[r].split()) <= L:
            S.add(sentences[r])
            b_r = np.divide(vectors[r], scipy.sparse.linalg.norm(vectors[r]))
            B.add(b_r)
            total_length = total_length + len(sentences[r])
     
    return S


def ortho_proj_vec(vectors, B):
    print("Calculating vector with largest distance to subspace with {} basis vectors".format(len(B)))
    projs = csr_matrix(vectors.shape, dtype=np.int8) # coo_matrix
    
    iteration = 0
    for b in B:
        iteration += 1
        print("Starting with basis vector {} of {}".format(iteration, len(B)))
        
        p = vectors.dot(b.T)
        p_new = csr_matrix(vectors.shape, dtype=np.int8) # dtype=object)
        
        for i in range(0, p.shape[0]):
            toadd = csr_matrix(p[i] * b)
            p_new[i,:] = p_new[i,:] + toadd
            
        projs += p_new
    
    dists = scipy.sparse.linalg.norm( (vectors - projs), axis=0 )
    return np.argmax(dists)


sentences = make_sentences(df)
vectorized = vectorize(sentences)
summary = sem_vol_max(sentences, vectorized, 400)

print(summary)

Calculating vector of largest distance with 1 basis vectors
Starting with basis vector 1 of 1
('Argmin new sentence: ', 62)
('total_length: ', 57)
('next sentence length: ', 7)
Calculating vector of largest distance with 2 basis vectors
Starting with basis vector 1 of 2
Starting with basis vector 2 of 2
('Argmin new sentence: ', 62)
('total_length: ', 102)
('next sentence length: ', 7)
Calculating vector of largest distance with 3 basis vectors
Starting with basis vector 1 of 3
Starting with basis vector 2 of 3
Starting with basis vector 3 of 3
('Argmin new sentence: ', 62)
('total_length: ', 147)
('next sentence length: ', 7)
Calculating vector of largest distance with 4 basis vectors
Starting with basis vector 1 of 4
Starting with basis vector 2 of 4
Starting with basis vector 3 of 4
Starting with basis vector 4 of 4
('Argmin new sentence: ', 62)
('total_length: ', 192)
('next sentence length: ', 7)
Calculating vector of largest distance with 5 basis vectors
Starting with basis vecto

{u'Eating right, drinking lots of water and using products with the right ingredients that will achieve the look and feel of healthy skin.. By eating properly, drinking plenty of water and getting adequate sleep.. By eating properly, drinking lots of water, and using products than contain nutrients and things you skin needs - things that are healthy for it, and things it loses as you age, like collagen.',
 u'Skin that is cleaned, washed, and exfoliated.',
 u'Skin that is not to dry, to oily, has a radiance about it.. not oily, not dry, not itchy, clean of cuts, bruises, bright, not blotchy.'}