In [1]:
import sys
import os
import json
import itertools

import numpy as np
import pandas as pd

# from tqdm import tqdm_notebook as tqdm
from tqdm import tqdm_notebook as tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors

from nltk.tokenize import sent_tokenize

import gensim
from gensim.models import Word2Vec



In [2]:
import logging

# Show INFO-level log records with a timestamp, so progress reported through
# the logging module (e.g. gensim loading the model) is visible in the cell
# output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [3]:
# Number of worker threads; passed as max_workers to every ThreadPoolExecutor
# created in this notebook.
n_groups = 12
    
def parse_json(filepath, min_len=50, max_len=400):
    """Load one review JSON file and return its length-filtered reviews.

    The file must contain a list of hotel records, each holding a 'comments'
    list of review strings.

    Args:
        filepath: Path of the JSON file to read.
        min_len: Exclusive lower bound on the review length, in characters.
        max_len: Exclusive upper bound on the review length, in characters.

    Returns:
        A flat list with the lower-cased reviews whose length lies strictly
        between min_len and max_len, in file order.

    Raises:
        KeyError: if a hotel record has no 'comments' entry.
    """
    with open(filepath) as fp:
        hotels = json.load(fp)

    selected = []
    for hotel in hotels:
        # Only 'comments' is consumed, so records lacking any other key
        # (such as 'name') are still accepted.
        selected.extend(
            review.lower()
            for review in hotel['comments']
            if min_len < len(review) < max_len
        )

    return selected

def parse_all_jsons(min_len=50, max_len=400):
    """Parse every file in the 'jsons' directory and pool the kept reviews.

    Each directory entry is handed to parse_json on a pool of n_groups
    threads. Results are appended as the workers finish, so the order of the
    returned list follows completion order, not directory order.

    Args:
        min_len: Exclusive lower bound on review length, forwarded to parse_json.
        max_len: Exclusive upper bound on review length, forwarded to parse_json.

    Returns:
        A single flat list with the reviews returned for all files.
    """
    print("Parsing jsons")

    collected = []
    with ThreadPoolExecutor(max_workers=n_groups) as pool:
        pending = []
        for entry in os.listdir('jsons'):
            path = os.path.join('jsons', entry)
            pending.append(pool.submit(parse_json, path, min_len, max_len))

        for finished in tqdm(as_completed(pending), total=len(pending)):
            collected.extend(finished.result())

    return collected
        
def train_w2v(min_len, max_len):
    """Train a Word2Vec model on the parsed reviews and save it to 'w2v.model'.

    Args:
        min_len: Exclusive lower bound on review length used to pick the corpus.
        max_len: Exclusive upper bound on review length used to pick the corpus.

    Returns:
        None. The trained model is only written to disk.
    """
    corpus = parse_all_jsons(min_len=min_len, max_len=max_len)

    print("Training model")
    # NOTE(review): every corpus item is a plain review string, so iterating a
    # "sentence" yields single characters rather than words -- confirm that
    # character-level vectors are what is wanted here.
    Word2Vec(corpus).save('w2v.model')
    
def grouper(n, iterable, fillvalue=None):
    """Split iterable into consecutive tuples of exactly n items.

    Args:
        n: Size of every emitted tuple.
        iterable: Source of the items; it is consumed lazily.
        fillvalue: Value used to pad the last tuple when the number of items
            is not a multiple of n.

    Returns:
        An iterator over n-tuples covering the items in their original order.
    """
    # The function is called izip_longest on Python 2 and zip_longest on
    # Python 3; use whichever this interpreter provides.
    zip_longest = getattr(itertools, 'zip_longest', None) or itertools.izip_longest
    # n references to the SAME iterator: each output tuple pulls the next n
    # items from it.
    shared = [iter(iterable)] * n
    return zip_longest(*shared, fillvalue=fillvalue)

def find_closer(text_min_len, text_max_len, sum_min_len, sum_max_len):
    """Pair short reviews ("summaries") with similar longer reviews ("texts").

    Both review sets are read with parse_all_jsons and every review is
    embedded as the mean of the Word2Vec vectors of its known tokens. The text
    vectors are split into chunks of 100; within each chunk, every summary is
    matched against its nearest texts and up to two of them (chosen at random
    among neighbours at a distance > 0) are emitted as (text, summary) pairs.

    NOTE(review): pairs are produced per (summary, chunk) combination, so the
    size of the result grows with the number of text chunks -- confirm that
    this is intended rather than one global nearest-neighbour search.

    Args:
        text_min_len: Exclusive lower length bound for the texts.
        text_max_len: Exclusive upper length bound for the texts.
        sum_min_len: Exclusive lower length bound for the summaries.
        sum_max_len: Exclusive upper length bound for the summaries.

    Returns:
        A pandas DataFrame with one row per pair and the columns 'text' and
        'summary', with every cell passed through normalize.

    Raises:
        AssertionError: if 'w2v.model' does not exist.
    """
    chunk_size = 100      # texts indexed together by one NearestNeighbors
    n_candidates = 5      # neighbours requested per summary in each chunk
    pairs_per_query = 2   # neighbours kept per summary in each chunk

    assert os.path.isfile('w2v.model'), "w2v.model not found; run train_w2v first"

    print("Loading model")
    model = Word2Vec.load('w2v.model')

    all_texts = parse_all_jsons(min_len=text_min_len, max_len=text_max_len)
    all_absts = parse_all_jsons(min_len=sum_min_len, max_len=sum_max_len)

    print(len(all_texts))
    print(len(all_absts))

    print("Vectorizing reviews")

    def embed(review):
        """Return the mean vector of the review's known tokens, or None."""
        # NOTE(review): reviews are plain strings, so `w` iterates over
        # characters -- confirm this matches how the model was trained.
        known = [model[w] for w in review if w in model]
        # np.mean of an empty list would be NaN; use None as "no vector".
        return np.mean(known, axis=0) if known else None

    def vec_task(reviews):
        # grouper pads the last chunk with None; that padding is dropped.
        return [embed(review) for review in reviews if review is not None]

    def vectorize(all_reviews):
        """Embed all_reviews; the i-th result belongs to all_reviews[i]."""
        vects = []
        with ThreadPoolExecutor(max_workers=n_groups) as executor:
            futures = [executor.submit(vec_task, group)
                       for group in grouper(chunk_size, all_reviews)]
            # Consume the futures in submission order (not completion order)
            # so that the vectors stay aligned with the input reviews.
            for future in tqdm(futures, total=len(futures)):
                vects += future.result()
        return vects

    texts_vects = vectorize(all_texts)
    absts_vects = vectorize(all_absts)

    # Positions (in all_absts) of the summaries that could be embedded, and
    # their vectors stacked into one matrix for batched neighbour queries.
    abst_ids = [i for i, vect in enumerate(absts_vects) if vect is not None]
    abst_matrix = np.array([absts_vects[i] for i in abst_ids])

    print("Finding data pairs")

    def nn_task(offset, group):
        """Build the pairs between all summaries and one chunk of texts."""
        # Positions inside the chunk that hold a real vector (this skips the
        # grouper padding and reviews without any known token).
        local_ids = [j for j, vect in enumerate(group) if vect is not None]
        if not local_ids or not abst_ids:
            return []

        # Never ask for more neighbours than the chunk has samples.
        k = min(n_candidates, len(local_ids))
        nn = NearestNeighbors(n_neighbors=k).fit(np.array([group[j] for j in local_ids]))
        # One batched query for all summaries instead of one call per summary.
        distances, indices = nn.kneighbors(abst_matrix)

        results = []
        for row, abst_id in enumerate(abst_ids):
            # Neighbours at distance 0 are skipped.
            most_similar = [all_texts[offset + local_ids[idx]]
                            for idx, d in zip(indices[row], distances[row]) if d > 0]
            if not most_similar:
                continue

            chosen = np.random.choice(len(most_similar),
                                      size=min(len(most_similar), pairs_per_query),
                                      replace=False)
            for c in chosen:
                results.append({'text': most_similar[c], 'summary': all_absts[abst_id]})

        return results

    all_results = []
    with ThreadPoolExecutor(max_workers=n_groups) as executor:
        futures = [executor.submit(nn_task, k * chunk_size, group)
                   for k, group in enumerate(grouper(chunk_size, texts_vects))]

        # The order of the pairs does not matter, so collect as they finish.
        for future in tqdm(as_completed(futures), total=len(futures)):
            all_results += future.result()

    def normalize(t):
        """Drop non-ASCII characters, flatten tabs/newlines and remove '='."""
        t = t.encode('ascii', 'ignore')
        t = t.replace("\t", " ")
        t = t.replace("\n", " ")
        t = t.replace("=", "")
        return t

    return pd.DataFrame(all_results).applymap(normalize)

In [None]:
def tostring(row):
    """Serialise one (text, summary) row as a single 'article/abstract' line.

    Both fields are split into sentences with sent_tokenize and every sentence
    is wrapped in <s>...</s>; the two fields are separated by a tab.

    Args:
        row: Mapping-like object exposing the keys 'text' and 'summary'.

    Returns:
        The string 'article=<tagged text>\\tabstract=<tagged summary>'.
    """
    def tag_sentences(passage):
        # Closing and re-opening the tag between sentences is the same as
        # wrapping each sentence individually.
        return '<s>' + '</s><s>'.join(sent_tokenize(passage)) + '</s>'

    article = tag_sentences(row['text'])
    abstract = tag_sentences(row['summary'])
    return 'article=' + article + '\tabstract=' + abstract

In [None]:
def main():
    """Build the (text, summary) pairs and export them.

    Writes the pairs to 'data.csv' and, serialised with tostring (one pair per
    line), to 'data.txt'.

    Returns:
        The DataFrame of pairs returned by find_closer.
    """
    # find_closer needs an existing 'w2v.model'; it can be created once with
    # train_w2v(0, float('inf')).
    pairs = find_closer(120, 250, 0, 120)
    pairs.to_csv('data.csv')

    serialised = pairs.apply(tostring, axis=1)
    with open('data.txt', 'w') as out:
        out.write('\n'.join(serialised))

    return pairs


df = main()

Loading model


2017-06-08 04:15:06,594 : INFO : loading Word2Vec object from w2v.model
2017-06-08 04:15:06,678 : INFO : loading wv recursively from w2v.model.wv.* with mmap=None
2017-06-08 04:15:06,681 : INFO : setting ignored attribute syn0norm to None
2017-06-08 04:15:06,684 : INFO : setting ignored attribute cum_table to None
2017-06-08 04:15:06,686 : INFO : loaded w2v.model


Parsing jsons

Parsing jsons

387680
16231
Vectorizing reviews


Finding data pairs
24/|/  1%|| 24/3877 [26:01<69:39:05, 65.08s/it]

In [None]:
# Show the first 30 rows of df as a raw array to eyeball the generated pairs.
df.head(30).values

In [39]:
# Build one document per row for the vocabulary. The columns are joined with a
# space so that the last word of one column and the first word of the next are
# not fused into a single bogus token.
concat = [' '.join(val) for val in df.values]
voc = CountVectorizer().fit(concat)

In [45]:
# Dump the fitted vocabulary as "<term> <value>" lines.
# NOTE(review): vocabulary_ maps each term to its feature index, not to a
# frequency -- confirm that the consumer of vocab.bin expects an index.
with open('vocab.bin', 'wb') as fp:
    vocab_lines = []
    for term, index in voc.vocabulary_.iteritems():
        vocab_lines.append(term + ' ' + str(index) + '\n')
    fp.write(''.join(vocab_lines))

In [48]:
from sklearn.model_selection import train_test_split

# Fixed seed so that the shuffle and the train/test split (and therefore the
# exported files) are reproducible across runs.
SPLIT_SEED = 42


def _dump_examples(frame, path):
    """Write every row of frame, serialised with tostring, to path.

    Rows are written one per line. The written text is returned as well.
    """
    text = '\n'.join(frame.apply(tostring, axis=1))
    with open(path, 'w') as fp:
        fp.write(text)
    return text


df = df.sample(frac=1, random_state=SPLIT_SEED)
train, test = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

data = _dump_examples(df, 'data.txt')
data_train = _dump_examples(train, 'data_train.txt')
data_test = _dump_examples(test, 'data_test.txt')