In [1]:
import sys
import os
import json
import itertools

import numpy as np
import pandas as pd

# from tqdm import tqdm_notebook as tqdm
from tqdm import tqdm_notebook as tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors

from nltk.tokenize import sent_tokenize

import gensim
from gensim.models import Word2Vec



In [2]:
import logging

# Route INFO-and-above records to the root handler with a timestamped format,
# so library progress messages emitted through `logging` appear in cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [3]:
# Passed as `max_workers` to the ThreadPoolExecutor instances created in
# parse_all_jsons and find_closer.
n_groups = 12
    
def parse_json(filepath, min_len=50, max_len=400):
    """Load one JSON file and return its reviews whose length is in range.

    The file must contain a list of hotel records; each record must have a
    'comments' entry holding that hotel's reviews.

    Args:
        filepath: path of the JSON file to read.
        min_len: exclusive lower bound on ``len(review)``.
        max_len: exclusive upper bound on ``len(review)``.

    Returns:
        A flat list with every review (across all hotels in the file, in file
        order) satisfying ``min_len < len(review) < max_len``.

    Raises:
        KeyError: if a hotel record has no 'comments' entry.
    """
    with open(filepath) as fp:
        hotels = json.load(fp)

    selected = []
    for hotel in hotels:
        # Only the comments are needed. The hotel name is deliberately not
        # read, so records lacking a 'name' key no longer abort the parse.
        selected.extend(
            review for review in hotel['comments']
            if min_len < len(review) < max_len
        )

    return selected

def parse_all_jsons(min_len=50, max_len=400):
    """Parse every entry of the 'jsons' directory and pool the kept reviews.

    Each directory entry is handed to ``parse_json`` on a thread pool of
    ``n_groups`` workers. Results are concatenated in the order the tasks
    finish, so the order of the returned list is not tied to the directory
    listing order.

    Args:
        min_len: forwarded to ``parse_json`` (exclusive lower length bound).
        max_len: forwarded to ``parse_json`` (exclusive upper length bound).

    Returns:
        One flat list holding the reviews returned by every ``parse_json`` call.
    """
    collected = []

    print("Parsing jsons")
    with ThreadPoolExecutor(max_workers=n_groups) as pool:
        pending = []
        for entry in os.listdir('jsons'):
            json_path = os.path.join('jsons', entry)
            pending.append(pool.submit(parse_json, json_path, min_len, max_len))

        # tqdm needs an explicit total because as_completed is a generator.
        for finished in tqdm(as_completed(pending), total=len(pending)):
            collected.extend(finished.result())

    return collected
        
def train_w2v(min_len, max_len):
    """Train a Word2Vec model on the parsed reviews and save it as 'w2v.model'.

    Args:
        min_len: exclusive lower length bound forwarded to ``parse_all_jsons``.
        max_len: exclusive upper length bound forwarded to ``parse_all_jsons``.

    Returns:
        None. The trained model is only written to disk.
    """
    corpus = parse_all_jsons(min_len=min_len, max_len=max_len)

    print("Training model")
    # NOTE(review): `corpus` is passed to Word2Vec exactly as parse_all_jsons
    # returns it. If the reviews are plain strings (not token lists), each
    # review is consumed character by character and the vocabulary is made of
    # characters rather than words -- confirm this is intended before
    # retraining.
    Word2Vec(corpus).save('w2v.model')
    
def grouper(n, iterable, fillvalue=None):
    """Yield successive ``n``-tuples drawn from ``iterable``.

    Args:
        n: size of every emitted tuple.
        iterable: source of items; it is consumed lazily.
        fillvalue: value used to pad the last tuple when the number of items
            is not a multiple of ``n`` (defaults to ``None``).

    Returns:
        A lazy iterator of tuples of length ``n``.
    """
    # The function is called izip_longest on Python 2 and zip_longest on
    # Python 3; resolve whichever one this interpreter provides.
    zip_longest = getattr(itertools, 'izip_longest', None) or itertools.zip_longest

    # n references to the SAME iterator: each tuple slot pulls the next item,
    # which is what chops the stream into consecutive chunks.
    args = [iter(iterable)] * n

    # fillvalue was previously accepted but silently ignored; forward it.
    return zip_longest(*args, fillvalue=fillvalue)

def find_closer(min_len, max_len, chunk_size=100, n_neighbors=5):
    """Build (text, summary) pairs out of reviews that are nearest neighbours.

    Steps:
      1. Load the Word2Vec model saved as 'w2v.model'.
      2. Parse the reviews with ``parse_all_jsons(min_len, max_len)``.
      3. Represent each review by the mean of the model vectors of its
         in-vocabulary elements.
      4. For every review, query its nearest neighbours, drop those at
         distance 0, and -- when at least two remain -- pick two of them at
         random: ``text`` is the review plus one neighbour (joined by a
         newline), ``summary`` is the other neighbour.
      5. Normalize every cell (ASCII only, tabs/newlines to spaces, '='
         removed, lower-cased).

    Args:
        min_len: exclusive lower length bound forwarded to ``parse_all_jsons``.
        max_len: exclusive upper length bound forwarded to ``parse_all_jsons``.
        chunk_size: number of items handed to each thread-pool task.
        n_neighbors: neighbours requested per review (clamped to the number of
            vectorized reviews).

    Returns:
        pandas.DataFrame with the columns 'text' and 'summary', one row per
        generated pair. Row order follows task completion and is not stable.

    Raises:
        AssertionError: if 'w2v.model' does not exist.
        ValueError: if no review could be vectorized.

    Notes:
        * The neighbour selection uses ``np.random.choice`` without a seed, so
          the output differs between runs.
        * NOTE(review): reviews are iterated element by element when looked up
          in the model; if they are plain strings those elements are
          characters. This matches how ``train_w2v`` feeds the model, so it is
          left unchanged here -- confirm it is intended.
    """
    assert os.path.isfile('w2v.model'), "w2v.model not found; run train_w2v first"

    print("Loading model")
    model = Word2Vec.load('w2v.model')

    all_reviews = parse_all_jsons(min_len=min_len, max_len=max_len)

    print("Vectorizing reviews")

    def vec_task(start, reviews):
        """Return (original_index, mean_vector) for each usable review of a chunk."""
        tagged = []
        for offset, sent in enumerate(reviews):
            if sent is None:
                # Padding added by grouper() to the last chunk.
                continue
            known = [model[w] for w in sent if w in model]
            if not known:
                # np.mean([]) would yield a NaN scalar and break the
                # NearestNeighbors fit below, so such reviews are skipped.
                continue
            tagged.append((start + offset, np.mean(known, axis=0)))
        return tagged

    tagged_vects = []
    with ThreadPoolExecutor(max_workers=n_groups) as executor:
        futures = [
            executor.submit(vec_task, chunk_no * chunk_size, group)
            for chunk_no, group in enumerate(grouper(chunk_size, all_reviews))
        ]

        for future in tqdm(as_completed(futures), total=len(futures)):
            tagged_vects += future.result()

    # Tasks finish in arbitrary order; restore the original review order and
    # keep a review list that is index-aligned with the vectors, so that the
    # indices returned by kneighbors() address the right review.
    tagged_vects.sort(key=lambda pair: pair[0])
    kept_reviews = [all_reviews[idx] for idx, _ in tagged_vects]
    vects = [vect for _, vect in tagged_vects]

    if not vects:
        raise ValueError("No review could be vectorized with w2v.model")

    nn = NearestNeighbors(n_neighbors=2).fit(vects)
    # kneighbors() rejects a neighbour count larger than the fitted sample.
    k = min(n_neighbors, len(vects))

    print("Finding data pairs")

    def nn_task(start, chunk):
        """Build the pair records for one chunk of vectors starting at `start`."""
        records = []

        for offset, vect in enumerate(chunk):
            if vect is None:
                # Padding added by grouper() to the last chunk.
                continue

            distances, indices = nn.kneighbors([vect], n_neighbors=k)
            # Distance 0 means the review itself or an identical vector.
            most_similar = [
                kept_reviews[idx]
                for idx, dist in zip(indices[0], distances[0])
                if dist > 0
            ]

            if len(most_similar) >= 2:
                first, second = np.random.choice(len(most_similar), size=2, replace=False)
                # start + offset is the global position of this vector; the
                # chunk-local offset alone would always address the first chunk.
                text = kept_reviews[start + offset] + '\n' + most_similar[first]
                summary = most_similar[second]

                records.append({'text': text, 'summary': summary})

        return records

    all_results = []
    with ThreadPoolExecutor(max_workers=n_groups) as executor:
        futures = [
            executor.submit(nn_task, chunk_no * chunk_size, group)
            for chunk_no, group in enumerate(grouper(chunk_size, vects))
        ]

        for future in tqdm(as_completed(futures), total=len(futures)):
            all_results += future.result()

    def normalize(t):
        """Strip non-ASCII characters, tabs, newlines and '=' and lower-case."""
        t = t.encode('ascii', 'ignore')
        t = t.replace("\t", " ")
        t = t.replace("\n", " ")
        t = t.replace("=", "")
        return t.lower()

    return pd.DataFrame(all_results).applymap(normalize)

In [4]:
def tostring(row):
    """Serialize one (text, summary) row as a single tab-separated line.

    Both fields are split into sentences with ``sent_tokenize`` and every
    sentence is wrapped in ``<s>...</s>``.

    Args:
        row: mapping-like object exposing the 'text' and 'summary' entries.

    Returns:
        The string ``'article=<tagged text>\\tabstract=<tagged summary>'``.
    """
    def tag_sentences(passage):
        # '<s>a</s><s>b</s>' for sentences a, b.
        return '<s>' + '</s><s>'.join(sent_tokenize(passage)) + '</s>'

    article = tag_sentences(row['text'])
    abstract = tag_sentences(row['summary'])
    return 'article=' + article + '\tabstract=' + abstract

In [5]:
def main():
    """Generate the review pairs and persist them as 'data.csv' and 'data.txt'.

    Returns:
        The DataFrame produced by ``find_closer(50, 250)``.
    """
    # Training is disabled here; find_closer expects an existing 'w2v.model'.
#     train_w2v(0, float('inf'))
    pairs = find_closer(50, 250)
    pairs.to_csv('data.csv')

    # One serialized line per pair, in DataFrame row order.
    serialized = pairs.apply(tostring, axis=1)
    with open('data.txt', 'w') as fp:
        fp.write('\n'.join(serialized))

    return pairs

df = main()

Parsing jsons

Training model


2017-05-15 21:05:02,625 : INFO : collecting all words and their counts
2017-05-15 21:05:02,628 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-05-15 21:05:02,940 : INFO : PROGRESS: at sentence #10000, processed 3098299 words, keeping 196 word types
2017-05-15 21:05:03,220 : INFO : PROGRESS: at sentence #20000, processed 6148577 words, keeping 214 word types
2017-05-15 21:05:03,505 : INFO : PROGRESS: at sentence #30000, processed 9160803 words, keeping 384 word types
2017-05-15 21:05:03,770 : INFO : PROGRESS: at sentence #40000, processed 12255176 words, keeping 523 word types
2017-05-15 21:05:04,019 : INFO : PROGRESS: at sentence #50000, processed 15308038 words, keeping 675 word types
2017-05-15 21:05:04,273 : INFO : PROGRESS: at sentence #60000, processed 18424883 words, keeping 767 word types
2017-05-15 21:05:04,535 : INFO : PROGRESS: at sentence #70000, processed 21573762 words, keeping 932 word types
2017-05-15 21:05:04,805 : INFO : PROGRESS: at sen

Loading model


2017-05-15 21:21:20,566 : INFO : loading Word2Vec object from w2v.model
2017-05-15 21:21:20,581 : INFO : loading wv recursively from w2v.model.wv.* with mmap=None
2017-05-15 21:21:20,582 : INFO : setting ignored attribute syn0norm to None
2017-05-15 21:21:20,585 : INFO : setting ignored attribute cum_table to None
2017-05-15 21:21:20,588 : INFO : loaded w2v.model


Parsing jsons

Vectorizing reviews

Finding data pairs
2588/|/ 64%|| 2588/4040 [54:13<30:25,  1.26s/it]


In [8]:
from sklearn.model_selection import train_test_split

# 80/20 random split of the generated pairs (no random_state is set, so the
# split differs between runs).
train, test = train_test_split(df, test_size=0.2)

# Serialize each subset with tostring(), one pair per line.
data_train = '\n'.join(train.apply(tostring, axis=1))
data_test = '\n'.join(test.apply(tostring, axis=1))

for out_path, payload in (('data_train.txt', data_train), ('data_test.txt', data_test)):
    with open(out_path, 'w') as fp:
        fp.write(payload)