In [1]:
# imports
import numpy as np
import pandas as pd
import re
import time
from datasketch import MinHash, MinHashLSHForest

In [37]:
#Data Preprocessing and will split the data by punctuation 
def preprocess_text(text):
    text = re.sub(r'[^\w\s]','',text)
    tokens = text.lower()
    tokens = tokens.split()
    return tokens

In [38]:
text = 'The big brown fox jumped over the fence '
print('Shingles are:', preprocess_text(text))

('Shingles are:', ['the', 'big', 'brown', 'fox', 'jumped', 'over', 'the', 'fence'])


In [39]:
#Standard number of combinations run
combo= 128

#Num recommendations to return from the data
num_rec = 1

In [44]:
def get_forest_info(data, perms):
    start_time = time.time()
    
    minhash = []
    
    for text in data['text']:
        tokens = preprocess(text)
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf8'))
        minhash.append(m)
        
    forest = MinHashLSHForest(num_perm=perms)
    
    for i,m in enumerate(minhash):
        forest.add(i,m)
        
    forest.index()
    
    print('It took %s seconds to build the forest.' %(time.time()-start_time))
    
    return forest

In [45]:
def prediction(text, database, perms, num_results, forest):
    start_time = time.time()
    
    tokens = preprocess(text)
    m = MinHash(num_perm=perms)
    for s in tokens:
        m.update(s.encode('utf8'))
        
    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        return None # if your query is empty, return none
    
    result = database.iloc[idx_array]['title']
    
    print('It took %s seconds to query the forest.' %(time.time()-start_time))
    
    return result

In [17]:
data= pd.read_csv('/Users/matt/Downloads/nips-papers/papers.csv')

In [22]:
data

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."
5,1002,1994,Using a neural net to instantiate a deformable...,,1002-using-a-neural-net-to-instantiate-a-defor...,Abstract Missing,U sing a neural net to instantiate a\ndeformab...
6,1003,1994,Plasticity-Mediated Competitive Learning,,1003-plasticity-mediated-competitive-learning.pdf,Abstract Missing,Plasticity-Mediated Competitive Learning\n\nTe...
7,1004,1994,ICEG Morphology Classification using an Analog...,,1004-iceg-morphology-classification-using-an-a...,Abstract Missing,ICEG Morphology Classification using an\nAnalo...
8,1005,1994,Real-Time Control of a Tokamak Plasma Using Ne...,,1005-real-time-control-of-a-tokamak-plasma-usi...,Abstract Missing,Real-Time Control of a Tokamak Plasma\nUsing N...
9,1006,1994,Pulsestream Synapses with Non-Volatile Analogu...,,1006-pulsestream-synapses-with-non-volatile-an...,Abstract Missing,Real-Time Control of a Tokamak Plasma\nUsing N...


In [23]:
len(data)

7241

In [30]:
len(data.abstract=='Abstract Missing')

7241

In [28]:
len(data.abstract.unique())

3923

In [31]:
data['text'] = data['title'] + ' ' + data['abstract']

In [32]:
data['text']

0       Self-Organization of Associative Database and ...
1       A Mean Field Theory of Layer IV of Visual Cort...
2       Storing Covariance by the Associative Long-Ter...
3       Bayesian Query Construction for Neural Network...
4       Neural Network Ensembles, Cross Validation, an...
5       Using a neural net to instantiate a deformable...
6       Plasticity-Mediated Competitive Learning Abstr...
7       ICEG Morphology Classification using an Analog...
8       Real-Time Control of a Tokamak Plasma Using Ne...
9       Pulsestream Synapses with Non-Volatile Analogu...
10      Learning to Play the Game of Chess Abstract Mi...
11      Multidimensional Scaling and Data Clustering A...
12      An experimental comparison of recurrent neural...
13      Training Multilayer Perceptrons with the Exten...
14      Interference in Learning Internal Models of In...
15      Active Learning with Statistical Models Abstra...
16      A Rapid Graph-based Method for Arbitrary Trans...
17      Ocular

In [81]:
forest = get_forest_info(data, combo)

It took 23.0864419937 seconds to build the forest.


In [69]:
num_rec = 1
title = 'Using a neural net'
result = predict(title, data, combo, num_rec, forest)
print 'Top {} Recommendation(s) from nueral net are:\n{}'.format(num_rec,result)
print 'The number of results returned are {}'.format(len(result))

It took 0.00334405899048 seconds to query forest.
Top 1 Recommendation(s) from nueral net are:
7085    Classification of Electroencephalogram using A...
Name: title, dtype: object
The number of results returned are 1


In [70]:
num_rec = 4
title = 'Using a neural net'
result = predict(title, data, combo, num_rec, forest)
print 'Top {} Recommendation(s) from nueral net are:\n{}'.format(num_rec,result)
print 'The number of results returned are {}'.format(len(result))

It took 0.0029821395874 seconds to query forest.
Top 4 Recommendation(s) from nueral net are:
1051    Speech Recognition with Missing Data using Rec...
6987    Neural Network Exploration Using Optimal Exper...
7085    Classification of Electroencephalogram using A...
5047    Multi-Digit Recognition Using a Space Displace...
Name: title, dtype: object
The number of results returned are 4


In [71]:
num_rec = 10
title = 'Using a neural net'
result = predict(title, data, combo, num_rec, forest)
print 'Top {} Recommendation(s) from nueral net are:\n{}'.format(num_rec,result)
print 'The number of results returned are {}'.format(len(result))

It took 0.00451707839966 seconds to query forest.
Top 10 Recommendation(s) from nueral net are:
5       Using a neural net to instantiate a deformable...
5191    A Self-Organizing Integrated Segmentation and ...
6987    Neural Network Exploration Using Optimal Exper...
2092    Combining Visual and Acoustic Speech Signals w...
7085    Classification of Electroencephalogram using A...
2868    Speech Recognition Using Demi-Syllable Neural ...
2069    Analytic Solutions to the Formation of Feature...
5047    Multi-Digit Recognition Using a Space Displace...
1305    Optoelectronic Implementation of a FitzHugh-Na...
1051    Speech Recognition with Missing Data using Rec...
Name: title, dtype: object
The number of results returned are 10


In [77]:
combo = 512

In [78]:
num_rec = 10
title = 'Using a neural net'
result = predict(title, data, combo, num_rec, forest)
print 'Top {} Recommendation(s) from nueral net are:\n{}'.format(num_rec,result)
print 'The number of results returned are {}'.format(len(result))

It took 0.00538396835327 seconds to query forest.
Top 10 Recommendation(s) from nueral net are:
5       Using a neural net to instantiate a deformable...
5191    A Self-Organizing Integrated Segmentation and ...
6987    Neural Network Exploration Using Optimal Exper...
2092    Combining Visual and Acoustic Speech Signals w...
7085    Classification of Electroencephalogram using A...
2868    Speech Recognition Using Demi-Syllable Neural ...
2069    Analytic Solutions to the Formation of Feature...
5047    Multi-Digit Recognition Using a Space Displace...
1305    Optoelectronic Implementation of a FitzHugh-Na...
1051    Speech Recognition with Missing Data using Rec...
Name: title, dtype: object
The number of results returned are 10
