In [1]:
import numpy as np
import nltk
import pandas as pd
from sqlite3 import connect
import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline

# Vector space ranking demonstration

### Fetch data

In [2]:
# Open the NIPS papers database read-only. With a plain path, sqlite3 silently
# creates an empty database file when the path is wrong and the mistake only
# surfaces later as "no such table"; with a `mode=ro` URI a missing file fails
# right here instead. The cells shown in this notebook only read from `con`.
con = connect('file:../data/nips-papers/database.sqlite?mode=ro', uri=True)

In [3]:
# Fetch all three columns in ONE query so that texts[i], titles[i] and years[i]
# are guaranteed to describe the same paper. Separate SELECTs without ORDER BY
# carry no guarantee that their rows come back in the same order.
rows = con.execute('select paper_text, title, year from papers;').fetchall()
texts = [row[0] for row in rows]
# Keep titles/years as 1-tuples -- the shape the per-column fetchall() calls
# produced -- so the query cells further down display exactly as before.
titles = [row[1:2] for row in rows]
years = [row[2:3] for row in rows]

### Preprocess and tokenize texts

In [4]:
from preprocessing import Preprocessor
from ranking import BasicVSRanker

In [5]:
# Run every paper's raw text through one shared Preprocessor instance and
# collect the results in paper order; tqdm only adds the progress bar.
prepr = Preprocessor()
tokenized = list(map(prepr.process, tqdm(texts)))

100%|██████████████████████████████████████████████████████████████████████████████| 6560/6560 [08:51<00:00,  9.36it/s]


### Build ranker

In [7]:
# Build the vector-space ranker from the tokenized papers.
ranker = BasicVSRanker.from_tokenized(tokenized)

In [8]:
# Inspect the ranker's vocabulary: a mapping from (stemmed) term to an integer
# id, as the output below shows.
# NOTE(review): this displays the whole mapping; consider showing only its size
# or a small sample to keep the notebook output short.
ranker.vectorizer.vocabulary_

{'associ': 61,
 'databas': 200,
 'applic': 46,
 'univers': 935,
 'effici': 263,
 'method': 534,
 'propos': 689,
 'togeth': 905,
 'robot': 755,
 'system': 878,
 'input': 432,
 'output': 611,
 'first': 338,
 'part': 621,
 'discuss': 242,
 'algorithm': 26,
 'aspect': 59,
 'produc': 681,
 'new': 577,
 'neural': 574,
 'network': 573,
 'latter': 476,
 'recognit': 714,
 'demonstr': 214,
 'introduct': 445,
 'let': 486,
 'map': 517,
 'given': 371,
 'finit': 336,
 'infinit': 428,
 'anoth': 41,
 'learn': 479,
 'machin': 508,
 'observ': 595,
 'pair': 615,
 'sampl': 767,
 'random': 700,
 'mean': 529,
 'product': 682,
 'estim': 290,
 'make': 514,
 'small': 817,
 'error': 287,
 'measur': 530,
 'usual': 944,
 'say': 769,
 'faster': 326,
 'decreas': 208,
 'increas': 419,
 'number': 592,
 'better': 88,
 'howev': 399,
 'express': 314,
 'perform': 634,
 'sinc': 811,
 'consider': 163,
 'candid': 107,
 'assum': 62,
 'find': 335,
 'good': 375,
 'concept': 153,
 'us': 942,
 'type': 926,
 'advanc': 20,
 'under

### Example query processing

In [9]:
# Top 5 papers for the query, displayed as (title, year) pairs.
[
    (titles[idx], years[idx])
    for idx in ranker.best_n_matches('neural networks', 5)
]

[(('Complexity Issues in Neural Computation and Learning',), (1993,)),
 (('Analog Neural Networks of Limited Precision I: Computing with Multilinear Threshold Functions',),
  (1989,)),
 (('Neural Universal Discrete Denoiser',), (2016,)),
 (('Convolutional Networks on Graphs for Learning Molecular Fingerprints',),
  (2015,)),
 (('A New Approach to Hybrid HMM/ANN Speech Recognition using Mutual Information Neural Networks',),
  (1996,))]

In [10]:
# Top 5 papers for the query, displayed as (title, year) pairs.
[
    (titles[idx], years[idx])
    for idx in ranker.best_n_matches('latent models', 5)
]

[(('Exploratory Data Analysis Using Radial Basis Function Latent Variable Models',),
  (1998,)),
 (('Latent Support Measure Machines for Bag-of-Words Data Classification',),
  (2014,)),
 (('Multi-view Anomaly Detection via Robust Probabilistic Latent Variable Models',),
  (2016,)),
 (('Discovering Hidden Variables in Noisy-Or Networks using Quartet Tests',),
  (2013,)),
 (('Learning the Dependency Structure of Latent Factors',), (2012,))]

In [11]:
# Top 5 papers for the query, displayed as (title, year) pairs.
[
    (titles[idx], years[idx])
    for idx in ranker.best_n_matches('pca', 5)
]

[(('Bayesian PCA',), (1998,)),
 (('Faster Ridge Regression via the Subsampled Randomized Hadamard Transform',),
  (2013,)),
 (('Robust Transfer Principal Component Analysis with Rank Constraints',),
  (2013,)),
 (('Nonnegative Sparse PCA',), (2006,)),
 (('Semiparametric Principal Component Analysis',), (2012,))]