In [1]:
import numpy as np
import nltk
import pandas as pd
from sqlite3 import connect
import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline

# Vector space ranking demonstration

### Fetch data

In [2]:
# Open the NIPS papers database read-only. A plain connect(path) silently
# creates a new, empty database file when the path is wrong, and the mistake
# only surfaces later as "no such table"; with mode=ro a bad path fails here.
con = connect('file:../data/nips-papers/database.sqlite?mode=ro', uri=True)

In [3]:
# Fetch all three columns in ONE query. The lists below are indexed in
# parallel later (years[i], titles[i] for a document index i), and separate
# SELECTs without ORDER BY are not guaranteed to return rows in the same order.
rows = con.execute('select paper_text, title, year from papers;').fetchall()

# texts: plain strings, one per paper.
texts = [row[0] for row in rows]
# titles / years: kept as lists of 1-tuples (the shape fetchall() gives for a
# single-column query) so that cells consuming them behave as before.
titles = [(row[1],) for row in rows]
years = [(row[2],) for row in rows]

### Preprocess and tokenize texts

In [20]:
from preprocessing import Preprocessor

In [21]:
# Run every paper text through the project's Preprocessor; tqdm wraps the
# input so a progress bar is shown while the texts are consumed.
prepr = Preprocessor()
tokenized = list(map(prepr.process, tqdm(texts)))

100%|██████████████████████████████████████████████████████████████████████████████| 6560/6560 [09:02<00:00, 10.45it/s]


### Build ranker

In [34]:
from ranking import BasicVSRanker

In [35]:
# Build the ranker from the preprocessed documents; `tokenized` holds one entry
# per paper, in the same order as `texts`.
# NOTE(review): the saved execution counts show this cell (35) was last run
# after the query cells below (24-26) -- re-run the notebook top to bottom to
# confirm the recorded outputs still reproduce.
ranker = BasicVSRanker.from_tokenized(tokenized)

### Example query processing

In [24]:
# Print year and title for the 5 matches that ranker.best_n_matches returns;
# each returned value is used as an index into the parallel years/titles lists.
query = 'neural networks'
for i in ranker.best_n_matches(query, 5):
    # years[i] and titles[i] are raw one-column sqlite rows, so they print as 1-tuples.
    print(years[i], titles[i])

(1993,) ('Complexity Issues in Neural Computation and Learning',)
(1989,) ('Analog Neural Networks of Limited Precision I: Computing with Multilinear Threshold Functions',)
(2016,) ('Neural Universal Discrete Denoiser',)
(2015,) ('Convolutional Networks on Graphs for Learning Molecular Fingerprints',)
(1996,) ('A New Approach to Hybrid HMM/ANN Speech Recognition using Mutual Information Neural Networks',)


In [25]:
# Print year and title for the 5 matches that ranker.best_n_matches returns;
# each returned value is used as an index into the parallel years/titles lists.
query = 'latent models'
for i in ranker.best_n_matches(query, 5):
    # years[i] and titles[i] are raw one-column sqlite rows, so they print as 1-tuples.
    print(years[i], titles[i])

(1998,) ('Exploratory Data Analysis Using Radial Basis Function Latent Variable Models',)
(2014,) ('Latent Support Measure Machines for Bag-of-Words Data Classification',)
(2016,) ('Multi-view Anomaly Detection via Robust Probabilistic Latent Variable Models',)
(2013,) ('Discovering Hidden Variables in Noisy-Or Networks using Quartet Tests',)
(2012,) ('Learning the Dependency Structure of Latent Factors',)


In [26]:
# Print year and title for the 5 matches that ranker.best_n_matches returns;
# each returned value is used as an index into the parallel years/titles lists.
query = 'pca'
for i in ranker.best_n_matches(query, 5):
    # years[i] and titles[i] are raw one-column sqlite rows, so they print as 1-tuples.
    print(years[i], titles[i])

(1998,) ('Bayesian PCA',)
(2013,) ('Faster Ridge Regression via the Subsampled Randomized Hadamard Transform',)
(2013,) ('Robust Transfer Principal Component Analysis with Rank Constraints',)
(2006,) ('Nonnegative Sparse PCA',)
(2012,) ('Semiparametric Principal Component Analysis',)
