In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sqlite3 import connect
import nltk
from tqdm import tqdm
%matplotlib inline

In [2]:
con = connect('../data/nips-papers/database.sqlite')

In [56]:
titles = [x[0] for x in con.execute('select title from papers;').fetchall()]
texts = [x[0] for x in con.execute('select paper_text from papers;').fetchall()]

## Vocabulary analysis

In [18]:
from collections import Counter

In [19]:
counter = Counter()
for text in tqdm(texts):
    counter.update(nltk.word_tokenize(text))

100%|██████████████████████████████████████████████████████████████████████████████| 6560/6560 [05:23<00:00, 15.91it/s]


**Top 10 tokens without filtration**

In [35]:
counter.most_common(10)

[(',', 1960823),
 ('.', 1621878),
 ('?', 1582064),
 ('the', 1549429),
 (')', 1047467),
 ('(', 1043377),
 ('of', 884950),
 ('and', 645434),
 ('a', 556237),
 ('to', 499428)]

**Top 10 tokens with length greater than 3**

In [48]:
words = counter.keys()
long_words = [w for w in words if len(w) > 3]
long_counter = Counter({k : counter[k] for k in long_words})

In [49]:
long_counter.most_common(10)

[('that', 262310),
 ('with', 196797),
 ('this', 114787),
 ('from', 107193),
 ('which', 95450),
 ('model', 85740),
 ('data', 76713),
 ('each', 70440),
 ('where', 68034),
 ('learning', 65504)]

**Top 10 tokens with length greater than 3 and without stopwords**

In [50]:
stop_words = set(nltk.corpus.stopwords.words('english'))
good_words = [w for w in words if len(w) > 3 and not w in stop_words]
good_counter = Counter({k : counter[k] for k in good_words})

In [51]:
good_counter.most_common(10)

[('model', 85740),
 ('data', 76713),
 ('learning', 65504),
 ('algorithm', 61377),
 ('function', 59779),
 ('This', 53537),
 ('using', 53371),
 ('number', 45582),
 ('problem', 44104),
 ('Figure', 42905)]

## Redo with better word filtering

In [None]:
stop_words = set(nltk.corpus.stopwords.words('english'))
def good_token(t):
    return len(t) > 3 and not t in stop_words
    

In [65]:
counter = Counter()
for text in tqdm(texts):
    tokens = [t.lower() for t in nltk.word_tokenize(text) if good_token(t)]
    counter.update(tokens)

100%|██████████████████████████████████████████████████████████████████████████████| 6560/6560 [05:36<00:00, 14.38it/s]


In [67]:
counter.most_common(100)

[('model', 91029),
 ('learning', 88581),
 ('data', 81327),
 ('algorithm', 71782),
 ('function', 60905),
 ('using', 59934),
 ('this', 53555),
 ('figure', 47961),
 ('number', 47635),
 ('problem', 46030),
 ('time', 45923),
 ('models', 42095),
 ('given', 40885),
 ('used', 40796),
 ('also', 40685),
 ('results', 40490),
 ('training', 39737),
 ('distribution', 38948),
 ('network', 38502),
 ('matrix', 36978),
 ('neural', 36735),
 ('information', 34217),
 ('first', 33385),
 ('method', 32634),
 ('error', 31947),
 ('different', 31809),
 ('methods', 30139),
 ('probability', 30086),
 ('input', 29678),
 ('case', 28982),
 ('algorithms', 28887),
 ('linear', 28605),
 ('performance', 28062),
 ('approach', 27340),
 ('parameters', 26712),
 ('space', 26327),
 ('networks', 26246),
 ('random', 25643),
 ('section', 25320),
 ('image', 25260),
 ('state', 25024),
 ('based', 24975),
 ('however', 24220),
 ('vector', 24214),
 ('value', 24071),
 ('functions', 24059),
 ('analysis', 23519),
 ('following', 23305),
 ('s