The hypothesis is that the supplied documents defines the desired corpus, and therefore documents with similar words or words that appear in that corpus will be more relevant than documents with words outside of the corpus. 

In [143]:
import pandas as pd
import numpy as np
import glob
import nltk
import re
import urllib.request
import ujson

from pathlib import Path

from gensim.corpora import Dictionary, HashDictionary, MmCorpus
from gensim import models, utils, similarities

In [26]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/liad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/liad/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
def is_meaningful(word):
    return word not in nltk.corpus.stopwords.words('english')

In [112]:
SPECIAL_CHARS = '[^A-Za-z0-9 ]+'
stopwords = nltk.corpus.stopwords.words('english')

In [113]:
def preprocess(text):
    tokens = [re.sub(SPECIAL_CHARS, '', word.lower()) for word in nltk.word_tokenize(text)]
    tokens = [word for word in tokens if word not in stopwords]
    tokens = list(filter(None, tokens))
    return tokens

In [71]:
class Corpus(object):
    def __iter__(self):
        for file in glob.glob("*.txt"):
            print(file)
            paper = Path(file).read_text(encoding='utf8')
            yield paper

In [73]:
corpus_memory_friendly = Corpus()
papers = list(corpus_memory_friendly)

Aspect-augmented Adversarial Networks for Domain Adaptation.txt
Explaining the Predictions of Any Classifier.txt
Rationalizing Neural Predictions.txt
Representation Learning for Grounded Spatial Reasoning.txt


In [114]:
texts = [list(preprocess(t)) for t in papers]

In [115]:
# define the dictionary:
dictionary = Dictionary(texts)
len(dictionary)

4588

In [116]:
#Let's take a look at our dictionary
print(dictionary.token2id)

{'0': 0, '00': 1, '01': 2, '02': 3, '04': 4, '06': 5, '08': 6, '081': 7, '09': 8, '1': 9, '10': 10, '100': 11, '100k': 12, '106': 13, '107k': 14, '10th': 15, '120128': 16, '127135': 17, '13451359': 18, '138': 19, '13891398': 20, '13th': 21, '150': 22, '160167': 23, '16th': 24, '17th': 25, '1998': 26, '1x': 27, '2': 28, '20': 29, '2005': 30, '2006': 31, '2007': 32, '2008': 33, '200k': 34, '2010': 35, '2011': 36, '2012': 37, '2013': 38, '2014': 39, '2015': 40, '2016': 41, '2017': 42, '202': 43, '204213': 44, '206': 45, '21': 46, '21102118': 47, '22': 48, '229k': 49, '23': 50, '23692372': 51, '238k': 52, '25': 53, '252': 54, '25th': 55, '260267': 56, '26722680': 57, '268': 58, '27': 59, '277281': 60, '280': 61, '28th': 62, '295': 63, '2k': 64, '3': 65, '30': 66, '316': 67, '320': 68, '327': 69, '34': 70, '371378': 71, '372': 72, '390': 73, '392': 74, '4': 75, '41': 76, '410': 77, '42': 78, '43': 79, '43rd': 80, '440447': 81, '45': 82, '450': 83, '458': 84, '464': 85, '480': 86, '492': 87,

In [117]:
corpus = [dictionary.doc2bow(text) for text in texts]

In [118]:
MmCorpus.serialize('reasoning_corpus.mm', corpus)

In [119]:
print(corpus)

[[(0, 9), (1, 3), (2, 2), (3, 3), (4, 2), (5, 2), (6, 2), (7, 1), (8, 3), (9, 22), (10, 5), (11, 1), (12, 4), (13, 1), (14, 2), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 3), (23, 1), (24, 1), (25, 1), (26, 2), (27, 1), (28, 17), (29, 1), (30, 2), (31, 5), (32, 6), (33, 6), (34, 3), (35, 5), (36, 4), (37, 14), (38, 2), (39, 10), (40, 25), (41, 25), (42, 3), (43, 2), (44, 1), (45, 1), (46, 1), (47, 1), (48, 3), (49, 2), (50, 1), (51, 1), (52, 2), (53, 2), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 3), (60, 1), (61, 2), (62, 1), (63, 1), (64, 1), (65, 16), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 11), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 2), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 17), (89, 1), (90, 2), (91, 1), (92, 1), (93, 2), (94, 1), (95, 1), (96, 1), (97, 3), (98, 1), (99, 1), (100, 1), (101, 1), (102, 2), (103, 1), (104, 7), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1),

corpus created. Now let's create a comparisson model: LSI

In [120]:
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=20)

In [121]:
lsi.print_topics()

[(0,
  '0.343*"model" + 0.206*"x" + 0.181*"learning" + 0.152*"explanations" + 0.141*"classifier" + 0.140*"1" + 0.136*"et" + 0.136*"al" + 0.134*"dataset" + 0.123*"set"'),
 (1,
  '0.306*"explanations" + -0.177*"domain" + -0.166*"al" + -0.166*"et" + -0.162*"aspect" + 0.161*"predictions" + 0.151*"features" + 0.150*"lime" + 0.139*"trust" + 0.138*"explanation"'),
 (2,
  '-0.260*"instructions" + -0.214*"goal" + 0.195*"domain" + -0.169*"value" + -0.167*"language" + 0.160*"classifier" + -0.156*"model" + -0.151*"map" + -0.149*"global" + -0.142*"spatial"'),
 (3,
  '0.270*"rationales" + 0.222*"rationale" + 0.194*"generator" + 0.176*"neural" + 0.175*"z" + 0.173*"encoder" + -0.154*"instructions" + 0.148*"x" + 0.132*"words" + 0.131*"recurrent"')]

Corpus created. Let's compare it now to new documents, and see which one match

In [97]:
with urllib.request.urlopen('https://keepcurrent-crawler.herokuapp.com/arxiv') as url:
            response = url.read()
        
parsed_response = ujson.loads(response)

In [98]:
parsed_response[:5]

[{'_rawid': '1806.03583',
  '_version': 2,
  'abstract': 'IntraVascular UltraSound (IVUS) is one of the most effective imaging\nmodalities that provides assistance to experts in order to diagnose and treat\ncardiovascular diseases. We address a central problem in IVUS image analysis\nwith Fully Convolutional Network (FCN): automatically delineate the lumen and\nmedia-adventitia borders in IVUS images, which is crucial to shorten the\ndiagnosis process or benefits a faster and more accurate 3D reconstruction of\nthe artery. Particularly, we propose an FCN architecture, called IVUS-Net,\nfollowed by a post-processing contour extraction step, in order to\nautomatically segments the interior (lumen) and exterior (media-adventitia)\nregions of the human arteries. We evaluated our IVUS-Net on the test set of a\nstandard publicly available dataset containing 326 IVUS B-mode images with two\nmeasurements, namely Jaccard Measure (JM) and Hausdorff Distances (HD). The\nevaluation result shows th

In [99]:
abstracts = [item['abstract'] for item in parsed_response]
print(len(abstracts))

100


In [124]:
abstract_tokens = [preprocess(abstract) for abstract in abstracts]
abstract_tokens_bow = [dictionary.doc2bow(abstract) for abstract in abstract_tokens]

In [127]:
vec_lsi = [lsi[vec_bow] for vec_bow in abstract_tokens_bow]
print(vec_lsi)

[[(0, 2.568359168457238), (1, 0.005621216041390769), (2, 0.20189567537882017), (3, -0.3456811772020574)], [(0, 2.4492759039899203), (1, 0.38334057660318155), (2, 0.14143882776483077), (3, 0.13483868907177152)], [(0, 1.8968591502484247), (1, -0.5782143114714328), (2, 0.17034475734364832), (3, 0.16076123897122455)], [(0, 2.916832565978402), (1, -0.7051795516822286), (2, 0.5208761898734203), (3, -0.5349428525030592)], [(0, 2.4962126862210354), (1, 0.9729868975350674), (2, 0.48203216102493884), (3, -0.4807388067593235)], [(0, 1.4033111264443097), (1, -0.6412903899268699), (2, 0.6986479666482784), (3, -0.13062285956126307)], [(0, 2.0861084829859347), (1, -0.11079399731820574), (2, -0.4374551639974099), (3, -0.36131933715209413)], [(0, 1.696141245237029), (1, -0.3315748354886427), (2, -0.08015391308023713), (3, -0.657119598026168)], [(0, 2.0078924686452244), (1, -0.40440325330540816), (2, -0.25657375715748987), (3, 0.15292717659042582)], [(0, 4.4645724812068455), (1, -0.209641706954137), (2,

In [130]:
index = similarities.MatrixSimilarity(lsi[corpus])
# index.save('rationality_mat_sim.index')
# index = similarities.MatrixSimilarity.load('rationality_mat_sim.index')

In [131]:
sims = index[vec_lsi]

In [163]:
df = pd.DataFrame(sims)
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,100.0,0.730612,0.125994,0.456485,0.631526,0.737993,0.824144,0.981713
1,100.0,0.764718,0.112063,0.510233,0.684496,0.773507,0.849072,0.983175
2,100.0,0.683369,0.132836,0.380723,0.588811,0.677646,0.796451,0.966195
3,100.0,0.640282,0.13869,0.288913,0.56294,0.640483,0.752628,0.959285


In [164]:
df = df.assign(S=df.loc[:,:].sum(1))

In [189]:
THRESHOLD = 0.7
SPECIAL_THRESHOLD = 0.95
TOP_N = 10

results_avg = df[df > THRESHOLD].dropna().sort_values(by='S', axis=0)
results_avg

Unnamed: 0,0,1,2,3,S
75,0.750938,0.770366,0.722543,0.759017,3.002864
64,0.781729,0.70121,0.748933,0.777737,3.009608
9,0.712894,0.743426,0.798512,0.759065,3.013896


In [190]:
results_special = df.iloc[:,0:4]
results_special = results_special[results_special > SPECIAL_THRESHOLD]
results_special = results_special.dropna(how='all')
results_special

Unnamed: 0,0,1,2,3
4,,0.963561,,
5,0.981713,,,
20,,0.969494,,
39,,,0.961445,
82,,,,0.959285
87,,0.983175,,
88,,,0.966195,


In [194]:
for idx in results.index.values.tolist():
    print(abstracts[idx])
    print('\n\n\n')
    
print ('special matches to one of the papers: \n\n')
for idx in results_special.index.values.tolist():
    print(abstracts[idx])
    print('\n\n\n')

Building multi-turn information-seeking conversation systems is an important
and challenging research topic. Although several advanced neural text matching
models have been proposed for this task, they are generally not efficient for
industrial applications. Furthermore, they rely on a large amount of labeled
data, which may not be available in real-world applications. To alleviate these
problems, we study transfer learning for multi-turn information seeking
conversations in this paper. We first propose an efficient and effective
multi-turn conversation model based on convolutional neural networks. After
that, we extend our model to adapt the knowledge learned from a resource-rich
domain to enhance the performance. Finally, we deployed our model in an
industrial chatbot called AliMe Assist
(https://consumerservice.taobao.com/online-help) and observed a significant
improvement over the existing online model.




The score function estimator is widely used for estimating gradients of
sto