# Neural Information Processing Systems (NIPS) Dataset

# 1)- Importing key modules

In [2]:
import pandas as pd
import numpy as np
# LDA, tSNE
from sklearn.manifold import TSNE
from gensim.models.ldamodel import LdaModel
# NLTK
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import re
# Visualization
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib
%matplotlib inline
import seaborn as sns
# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, CustomJS, ColumnDataSource, Slider
from bokeh.layouts import column
from bokeh.palettes import all_palettes
output_notebook()

In [3]:
df = pd.read_csv("papers.csv")
df.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


# 2) Pre-Processing

In [4]:
# Removing numerals:
df['paper_text_tokens'] = df.paper_text.map(lambda x: re.sub(r'\d+', '', x))
# Lower case:
df['paper_text_tokens'] = df.paper_text_tokens.map(lambda x: x.lower())
print(df['paper_text_tokens'][0][:500])



self-organization of associative database
and its applications
hisashi suzuki and suguru arimoto
osaka university, toyonaka, osaka , japan
abstract
an efficient method of self-organizing associative databases is proposed together with
applications to robot eyesight systems. the proposed databases can associate any input
with some output. in the first half part of discussion, an algorithm of self-organization is
proposed. from an aspect of hardware, it produces a new style of neural network. in


In [5]:
# Tokenize
df['paper_text_tokens'] = df.paper_text_tokens.map(lambda x: RegexpTokenizer(r'\w+').tokenize(x))
print(df['paper_text_tokens'][0][:25])

['self', 'organization', 'of', 'associative', 'database', 'and', 'its', 'applications', 'hisashi', 'suzuki', 'and', 'suguru', 'arimoto', 'osaka', 'university', 'toyonaka', 'osaka', 'japan', 'abstract', 'an', 'efficient', 'method', 'of', 'self', 'organizing']


In [6]:
# stemming
snowball = SnowballStemmer("english")  
df['paper_text_tokens'] = df.paper_text_tokens.map(lambda x: [snowball.stem(token) for token in x])
print(df['paper_text_tokens'][0][:25])

['self', 'organ', 'of', 'associ', 'databas', 'and', 'it', 'applic', 'hisashi', 'suzuki', 'and', 'suguru', 'arimoto', 'osaka', 'univers', 'toyonaka', 'osaka', 'japan', 'abstract', 'an', 'effici', 'method', 'of', 'self', 'organ']


In [7]:
# stop words
stop_en = stopwords.words('english')
df['paper_text_tokens'] = df.paper_text_tokens.map(lambda x: [t for t in x if t not in stop_en]) 
print(df['paper_text_tokens'][0][:25])

['self', 'organ', 'associ', 'databas', 'applic', 'hisashi', 'suzuki', 'suguru', 'arimoto', 'osaka', 'univers', 'toyonaka', 'osaka', 'japan', 'abstract', 'effici', 'method', 'self', 'organ', 'associ', 'databas', 'propos', 'togeth', 'applic', 'robot']


In [8]:
# final
df['paper_text_tokens'] = df.paper_text_tokens.map(lambda x: [t for t in x if len(t) > 1])
print(df['paper_text_tokens'][0][:25])

['self', 'organ', 'associ', 'databas', 'applic', 'hisashi', 'suzuki', 'suguru', 'arimoto', 'osaka', 'univers', 'toyonaka', 'osaka', 'japan', 'abstract', 'effici', 'method', 'self', 'organ', 'associ', 'databas', 'propos', 'togeth', 'applic', 'robot']


In [9]:
df.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text,paper_text_tokens
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...,"[self, organ, associ, databas, applic, hisashi..."
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...,"[mean, field, theori, layer, iv, visual, corte..."
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...,"[store, covari, associ, long, term, potenti, d..."
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...,"[bayesian, queri, construct, neural, network, ..."
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a...","[neural, network, ensembl, cross, valid, activ..."


# 3) LDA Model

In [10]:
from gensim import corpora, models
np.random.seed(2017)
texts = df['paper_text_tokens'].values
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [None]:
"""
ldamodel = models.ldamodel.LdaModel(corpus, id2word=dictionary, 
                                    num_topics=8, passes=5, minimum_probability=0)
    
"""

In [11]:
#ldamodel.save('lda_model_AUTHOR')

In [12]:
from gensim.models import LdaModel
ldamodel = LdaModel.load('lda_model_AUTHOR')

In [13]:
from pprint import pprint
pprint(ldamodel.print_topics())

[(0,
  '0.016*"learn" + 0.013*"label" + 0.013*"use" + 0.011*"data" + 0.010*"set" + '
  '0.009*"train" + 0.009*"model" + 0.008*"featur" + 0.008*"class" + '
  '0.007*"classif"'),
 (1,
  '0.025*"model" + 0.013*"estim" + 0.012*"data" + 0.012*"distribut" + '
  '0.011*"use" + 0.009*"sampl" + 0.007*"paramet" + 0.007*"gaussian" + '
  '0.006*"method" + 0.006*"process"'),
 (2,
  '0.017*"state" + 0.011*"algorithm" + 0.011*"polici" + 0.010*"learn" + '
  '0.010*"action" + 0.009*"use" + 0.009*"time" + 0.008*"function" + '
  '0.008*"valu" + 0.008*"node"'),
 (3,
  '0.013*"use" + 0.012*"network" + 0.009*"train" + 0.009*"data" + '
  '0.008*"layer" + 0.008*"learn" + 0.007*"vector" + 0.007*"error" + '
  '0.006*"method" + 0.006*"function"'),
 (4,
  '0.019*"network" + 0.017*"learn" + 0.013*"model" + 0.012*"train" + '
  '0.012*"use" + 0.009*"unit" + 0.008*"neural" + 0.007*"input" + '
  '0.007*"output" + 0.007*"system"'),
 (5,
  '0.014*"neuron" + 0.012*"network" + 0.009*"model" + 0.009*"input" + '
  '0.009*"n

**Refactoring results of LDA into numpy matrix (number_of_papers x number_of_topics).**

In [14]:
hm = np.array([[y for (x,y) in ldamodel[corpus[i]]] for i in range(len(corpus))])

In [15]:
pprint(ldamodel.show_topics(num_topics=5))

[(7,
  '0.015*"algorithm" + 0.010*"bound" + 0.010*"function" + 0.009*"set" + '
  '0.009*"problem" + 0.009*"optim" + 0.006*"theorem" + 0.006*"use" + '
  '0.006*"result" + 0.006*"learn"'),
 (5,
  '0.014*"neuron" + 0.012*"network" + 0.009*"model" + 0.009*"input" + '
  '0.009*"neural" + 0.008*"activ" + 0.007*"time" + 0.007*"cell" + '
  '0.006*"system" + 0.006*"spike"'),
 (4,
  '0.019*"network" + 0.017*"learn" + 0.013*"model" + 0.012*"train" + '
  '0.012*"use" + 0.009*"unit" + 0.008*"neural" + 0.007*"input" + '
  '0.007*"output" + 0.007*"system"'),
 (0,
  '0.016*"learn" + 0.013*"label" + 0.013*"use" + 0.011*"data" + 0.010*"set" + '
  '0.009*"train" + 0.009*"model" + 0.008*"featur" + 0.008*"class" + '
  '0.007*"classif"'),
 (1,
  '0.025*"model" + 0.013*"estim" + 0.012*"data" + 0.012*"distribut" + '
  '0.011*"use" + 0.009*"sampl" + 0.007*"paramet" + 0.007*"gaussian" + '
  '0.006*"method" + 0.006*"process"')]


# 4)- Topic Analysis

The function get_term_topics returns the odds of a particular word belonging to some particular topic

In [16]:
ldamodel.get_term_topics('network')

[(0, 0.00026670936),
 (1, 0.0015290414),
 (2, 0.003867573),
 (3, 0.012419046),
 (4, 0.018771613),
 (5, 0.011829864),
 (6, 0.0023058944),
 (7, 9.057953e-05)]

The get_document_topics method returns topic distribution of the document along with topic distribution for each word in that document

In [17]:
print(df.paper_text_tokens[0][:25])

['self', 'organ', 'associ', 'databas', 'applic', 'hisashi', 'suzuki', 'suguru', 'arimoto', 'osaka', 'univers', 'toyonaka', 'osaka', 'japan', 'abstract', 'effici', 'method', 'self', 'organ', 'associ', 'databas', 'propos', 'togeth', 'applic', 'robot']


In [18]:
doc_number = 0
doc_topic, word_topic, phi_value = ldamodel.get_document_topics(corpus[doc_number], per_word_topics=True)

In [19]:
doc_topic

[(0, 0.02883484),
 (1, 0.007614151),
 (2, 0.13748665),
 (3, 0.06331377),
 (4, 0.2953236),
 (5, 0.08929087),
 (6, 0.218723),
 (7, 0.15941311)]

The output gives the topic distribution of the document

In [20]:
pprint(word_topic)

[(0, [4, 5, 2, 3, 1, 6, 7, 0]),
 (1, [5, 6, 4, 2, 7, 3, 0, 1]),
 (2, [7, 4, 6, 2, 5, 3, 0, 1]),
 (3, [4, 2, 6, 5, 7, 3, 0, 1]),
 (4, [4, 7, 3, 2, 6, 5, 1, 0]),
 (5, [4, 2, 6, 7, 5, 3, 1, 0]),
 (6, [4, 6, 5, 2, 7, 3, 0, 1]),
 (7, [4, 2, 7, 5, 6, 3, 0, 1]),
 (8, [7, 4, 6, 2, 3, 5, 0, 1]),
 (9, [4, 2, 7, 5, 6, 3, 0, 1]),
 (10, [5, 0, 1, 4, 6, 2, 3, 7]),
 (11, [4, 2, 6, 7, 3, 5, 0, 1]),
 (12, [4, 6, 2, 5, 7, 3, 0, 1]),
 (13, [6, 2, 4, 7, 3, 5, 0, 1]),
 (14, [4, 5, 6, 2, 7, 3, 0, 1]),
 (15, [6, 4, 2, 7, 3, 0, 5, 1]),
 (16, [4, 7, 2, 3, 6, 5, 0, 1]),
 (17, [4, 6, 7, 2, 5, 3, 0, 1]),
 (18, [7, 2, 4, 6, 3, 0, 5, 1]),
 (19, [6, 4, 2, 5, 7, 3, 0, 1]),
 (20, [4, 2, 6, 7, 3, 5, 0, 1]),
 (21, [4, 6, 7, 2, 5, 3, 0, 1]),
 (22, [4, 7, 6, 2, 3, 5, 0, 1]),
 (23, [4, 2, 7, 6, 5, 3, 0, 1]),
 (24, [4, 7, 5, 2, 6, 1, 3, 0]),
 (25, [6, 7, 3, 0, 1, 2, 5, 4]),
 (26, [5, 6, 4, 7, 3, 2, 0, 1]),
 (27, [7, 6, 4, 3, 5, 2, 0, 1]),
 (28, [4, 6, 5, 3, 7, 2, 0, 1]),
 (29, [4, 5, 6, 3, 2, 7, 1, 0]),
 (30, [7, 2, 4, 6, 5

Output gives you word belonging to topics in given documents. In our case, its 1st row

In [21]:
pprint(phi_value)

[(0,
  [(0, 5.498017e-07),
   (1, 0.005092027),
   (2, 0.19867596),
   (3, 0.05053126),
   (4, 0.41299367),
   (5, 0.32035697),
   (6, 2.4572459e-05),
   (7, 4.9428413e-06)]),
 (1,
  [(0, 6.4759165e-06),
   (1, 1.0926254e-06),
   (2, 3.359088e-05),
   (3, 1.292309e-05),
   (4, 5.6709236e-05),
   (5, 0.83341014),
   (6, 5.9312682e-05),
   (7, 1.4939518e-05)]),
 (2,
  [(0, 0.027027566),
   (1, 0.006179946),
   (2, 0.13655181),
   (3, 0.060502768),
   (4, 0.2336722),
   (5, 0.10148593),
   (6, 0.13787174),
   (7, 0.2965575)]),
 (3,
  [(0, 0.018995205),
   (1, 0.0043436736),
   (2, 0.20365109),
   (3, 0.047498524),
   (4, 0.36164206),
   (5, 0.09670621),
   (6, 0.17164406),
   (7, 0.09534053)]),
 (4,
  [(0, 0.0013469215),
   (1, 0.001547014),
   (2, 0.03218457),
   (3, 0.11096109),
   (4, 0.41862),
   (5, 0.015424802),
   (6, 0.026381342),
   (7, 0.39280692)]),
 (5,
  [(0, 0.019175293),
   (1, 0.030904312),
   (2, 0.53580534),
   (3, 0.056536328),
   (4, 0.96979535),
   (5, 0.06518757),
  

   (2, 1.2938488),
   (3, 0.42341208),
   (4, 1.3251),
   (5, 0.49695134),
   (6, 0.746982),
   (7, 2.4796696)]),
 (88,
  [(0, 0.0023865714),
   (1, 0.00036631417),
   (2, 0.06485209),
   (3, 2.3771284e-07),
   (4, 0.8787621),
   (5, 0.018059364),
   (6, 0.019422451),
   (7, 0.014168111)]),
 (89,
  [(0, 0.05713002),
   (1, 0.000380355),
   (2, 0.0019045039),
   (3, 0.0066994783),
   (4, 0.34535888),
   (5, 0.006158238),
   (6, 0.58027834),
   (7, 0.0019052848)]),
 (90,
  [(0, 0.0081808185),
   (1, 0.0024259281),
   (2, 0.027401086),
   (3, 0.14215143),
   (4, 0.12958999),
   (5, 0.17132899),
   (6, 0.41835693),
   (7, 0.10037635)]),
 (91,
  [(0, 0.015078556),
   (1, 0.00772075),
   (2, 0.16402277),
   (3, 0.061216842),
   (4, 0.20737723),
   (5, 0.15374671),
   (6, 0.059956644),
   (7, 0.3297284)]),
 (92,
  [(0, 0.030578945),
   (1, 0.025507182),
   (2, 0.54767394),
   (3, 0.16830462),
   (4, 1.6426629),
   (5, 0.7591106),
   (6, 0.5833971),
   (7, 0.24232605)]),
 (93,
  [(0, 0.0221750

   (6, 0.44161975),
   (7, 0.096539415)]),
 (171,
  [(0, 0.12520619),
   (1, 0.0016939703),
   (2, 0.0012605968),
   (3, 0.20273647),
   (4, 0.11347552),
   (5, 0.04271606),
   (6, 0.45509598),
   (7, 0.054699447)]),
 (172,
  [(0, 7.833402e-05),
   (1, 1.27766825e-05),
   (2, 0.00040287428),
   (3, 0.00013855507),
   (4, 0.00068710593),
   (5, 0.00018964005),
   (6, 0.0007094895),
   (7, 0.00017844373)]),
 (173,
  [(0, 0.10816316),
   (1, 0.0060852636),
   (2, 0.10981516),
   (3, 0.76002926),
   (4, 0.10424244),
   (5, 0.10305564),
   (6, 0.5586964),
   (7, 0.24959238)]),
 (174,
  [(0, 0.054397795),
   (1, 0.0037635088),
   (2, 0.016148781),
   (3, 0.062059265),
   (4, 0.2707825),
   (5, 0.18740295),
   (6, 0.40107948),
   (7, 0.00084277487)]),
 (175,
  [(0, 0.025168719),
   (1, 0.0026384832),
   (2, 0.05792473),
   (3, 0.02726773),
   (4, 0.4131843),
   (5, 0.08205654),
   (6, 0.2980777),
   (7, 0.09303045)]),
 (176,
  [(0, 0.25501022),
   (1, 0.17307875),
   (2, 0.724586),
   (3, 0.1

 (254,
  [(0, 0.021775642),
   (1, 0.006608513),
   (2, 0.12652144),
   (3, 0.07487438),
   (4, 0.33444193),
   (5, 0.11127321),
   (6, 0.2416161),
   (7, 0.08271205)]),
 (255,
  [(0, 0.00022753312),
   (1, 0.0002163563),
   (2, 0.0014051091),
   (3, 0.031287942),
   (4, 0.25028768),
   (5, 0.007358254),
   (6, 0.70833653),
   (7, 0.0007279272)]),
 (256,
  [(0, 0.06931173),
   (1, 0.01271511),
   (2, 0.13290136),
   (3, 0.04020632),
   (4, 0.19208208),
   (5, 0.03150987),
   (6, 0.28338742),
   (7, 0.23767892)]),
 (257,
  [(0, 0.014134591),
   (1, 0.024914525),
   (2, 0.4730384),
   (3, 0.096195824),
   (4, 0.72994286),
   (5, 0.10674322),
   (6, 0.22906886),
   (7, 0.32527065)]),
 (258,
  [(0, 0.031671785),
   (1, 0.010709496),
   (2, 0.11173657),
   (3, 0.27613333),
   (4, 0.5670437),
   (5, 1.4778134),
   (6, 0.50040215),
   (7, 0.023636624)]),
 (259,
  [(0, 0.010297517),
   (1, 0.0010779059),
   (2, 0.01384085),
   (3, 0.012791878),
   (4, 0.40312836),
   (5, 0.113988616),
   (6, 0

   (1, 0.01233514),
   (2, 0.03390618),
   (3, 0.0008726797),
   (4, 0.3099744),
   (5, 0.005239071),
   (6, 0.003033677),
   (7, 0.6210172)]),
 (338,
  [(0, 0.05452027),
   (1, 0.0058261985),
   (2, 0.118196465),
   (3, 0.0810743),
   (4, 1.2023757),
   (5, 0.14943707),
   (6, 0.2842039),
   (7, 0.10300571)]),
 (339,
  [(0, 0.027095243),
   (1, 0.017141836),
   (2, 0.18600936),
   (3, 0.03386266),
   (4, 6.540427e-05),
   (5, 0.006649842),
   (6, 0.04507632),
   (7, 0.6818647)]),
 (340,
  [(0, 0.075876966),
   (1, 0.009483721),
   (2, 0.32227805),
   (3, 0.05840264),
   (4, 1.0425286),
   (5, 0.017477851),
   (6, 0.2688569),
   (7, 0.20468816)]),
 (341,
  [(0, 0.03395136),
   (1, 0.019635022),
   (2, 0.39730543),
   (3, 0.09845428),
   (4, 0.33027846),
   (5, 0.1607022),
   (6, 0.27882904),
   (7, 0.6805111)]),
 (342,
  [(0, 0.052845333),
   (1, 3.683006e-07),
   (2, 9.390537e-06),
   (3, 0.4250032),
   (4, 1.6110262e-05),
   (5, 4.4989456e-06),
   (6, 0.3964493),
   (7, 0.079185456)]

  [(0, 0.16302979),
   (1, 0.02964288),
   (2, 0.22694945),
   (3, 0.22289151),
   (4, 0.279935),
   (5, 0.3754766),
   (6, 1.0061975),
   (7, 0.6948571)]),
 (479,
  [(0, 2.5625206e-05),
   (1, 4.1379517e-06),
   (2, 0.00013259782),
   (3, 4.7271966e-05),
   (4, 0.00022325308),
   (5, 6.739725e-05),
   (6, 0.00022696886),
   (7, 1.3491154)]),
 (480,
  [(0, 0.007691658),
   (1, 0.00418124),
   (2, 0.13476154),
   (3, 0.046175484),
   (4, 0.4379122),
   (5, 0.22847223),
   (6, 0.093346365),
   (7, 0.04485435)]),
 (481,
  [(0, 0.07368622),
   (1, 0.013397417),
   (2, 0.0012991985),
   (3, 0.49658576),
   (4, 0.6618066),
   (5, 0.05325576),
   (6, 0.62443924),
   (7, 0.072928905)]),
 (482,
  [(0, 0.02222336),
   (1, 0.0049644746),
   (2, 0.082153395),
   (3, 0.05198343),
   (4, 0.5742376),
   (5, 0.10277908),
   (6, 0.11233487),
   (7, 0.048331376)]),
 (483,
  [(0, 0.02271845),
   (1, 0.007581058),
   (2, 0.10225342),
   (3, 0.08654345),
   (4, 0.38291842),
   (5, 0.13396245),
   (6, 0.191

   (2, 5.3929725e-05),
   (3, 0.005124377),
   (4, 0.9453065),
   (5, 0.048991594),
   (6, 3.2995167e-06),
   (7, 1.1237705e-07)]),
 (550,
  [(0, 0.015627163),
   (1, 0.0073417313),
   (2, 0.2288293),
   (3, 0.037890982),
   (4, 0.3040458),
   (5, 0.022897217),
   (6, 0.12627535),
   (7, 0.25694412)]),
 (551,
  [(0, 0.014604216),
   (1, 0.0018091052),
   (2, 0.36204192),
   (3, 0.02108775),
   (4, 0.2722319),
   (5, 0.03060552),
   (6, 0.2517934),
   (7, 0.044783894)]),
 (552,
  [(0, 1.3893132),
   (1, 0.7757999),
   (2, 4.1847515),
   (3, 1.1740987),
   (4, 2.3299146),
   (5, 0.24668053),
   (6, 3.3858106),
   (7, 8.512243)]),
 (553,
  [(0, 4.374314e-06),
   (1, 0.0014468184),
   (2, 0.91084856),
   (3, 1.0566031e-06),
   (4, 0.7038109),
   (5, 0.22008054),
   (6, 0.0849252),
   (7, 0.0699967)]),
 (554,
  [(0, 0.017037425),
   (1, 0.0037870961),
   (2, 0.2001451),
   (3, 0.027257733),
   (4, 0.20724829),
   (5, 0.04924252),
   (6, 0.063792475),
   (7, 0.43079698)]),
 (555,
  [(0, 0.00

   (7, 0.089880854)]),
 (629,
  [(0, 0.0018018648),
   (1, 0.0014113344),
   (2, 0.046975214),
   (3, 0.005562216),
   (4, 0.92344344),
   (5, 0.0137017295),
   (6, 0.0036582204),
   (7, 0.0032069294)]),
 (630,
  [(0, 0.005672198),
   (1, 0.005020891),
   (2, 0.055643976),
   (3, 0.113675825),
   (4, 0.001574649),
   (5, 0.4061608),
   (6, 2.1171712e-05),
   (7, 0.39824203)]),
 (631,
  [(0, 1.2694764e-07),
   (1, 2.7528776e-07),
   (2, 1.085757e-06),
   (3, 1.5715594e-05),
   (4, 9.3006827e-07),
   (5, 0.99936676),
   (6, 8.714274e-06),
   (7, 5.675901e-07)]),
 (632,
  [(0, 0.040018946),
   (1, 0.023090338),
   (2, 0.9899058),
   (3, 0.29827696),
   (4, 4.416718),
   (5, 1.204227),
   (6, 0.71805364),
   (7, 0.30944556)]),
 (633,
  [(0, 0.098996766),
   (1, 0.027893754),
   (2, 0.95398134),
   (3, 0.19999012),
   (4, 0.9943107),
   (5, 0.29258984),
   (6, 0.7805163),
   (7, 0.65121514)]),
 (634,
  [(0, 2.4166903e-07),
   (1, 2.9376022e-06),
   (2, 0.09744103),
   (3, 4.054005e-05),
   

 (678,
  [(0, 0.028102696),
   (1, 0.032556344),
   (2, 0.38168836),
   (3, 0.10693934),
   (4, 0.47140822),
   (5, 0.04085492),
   (6, 0.26653203),
   (7, 0.6710582)]),
 (679,
  [(0, 0.013631824),
   (1, 0.0029783943),
   (2, 0.12984452),
   (3, 0.016657393),
   (4, 0.089158736),
   (5, 0.061732337),
   (6, 0.12717296),
   (7, 0.5585366)]),
 (680,
  [(0, 0.11913701),
   (1, 0.037217785),
   (2, 0.7036865),
   (3, 0.18675466),
   (4, 0.84906334),
   (5, 0.28416714),
   (6, 0.695603),
   (7, 1.1235785)]),
 (681,
  [(0, 0.14576656),
   (1, 0.031800393),
   (2, 0.48862067),
   (3, 0.33810452),
   (4, 1.4052472),
   (5, 0.16383332),
   (6, 1.035341),
   (7, 0.39123884)]),
 (682,
  [(0, 0.081631266),
   (1, 0.017916718),
   (2, 0.2452113),
   (3, 0.217127),
   (4, 0.59749115),
   (5, 0.12806804),
   (6, 0.3414327),
   (7, 0.37031603)]),
 (683,
  [(0, 0.110713355),
   (1, 0.039264865),
   (2, 2.1533139),
   (3, 0.3368171),
   (4, 1.3624185),
   (5, 0.4695346),
   (6, 0.6090971),
   (7, 0.918

Phi values are essentially the probability of that word in that document belonging to a particular topic

# 5)- Visualization

In [23]:
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary,sort_topics=True)

  nbits = re.compile('(\d+)bit').search(abits).group(1)
  "\s+stepping\s+(?P<STP>\d+)", re.IGNORECASE)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [24]:
vis

In [25]:
pyLDAvis.save_html(vis, 'Author_vis.html')

# 6)- t-SNE algorithm( 2-D Visuals)

In [26]:
tsne = TSNE(random_state=2017, perplexity=30)
embedding = tsne.fit_transform(hm)
embedding = pd.DataFrame(embedding, columns=['x','y'])
embedding['hue'] = hm.argmax(axis=1)

In [27]:
source = ColumnDataSource(
        data=dict(
            x = embedding.x,
            y = embedding.y,
            colors = [all_palettes['Set1'][8][i] for i in embedding.hue],
            title = df.title,
            year = df.year,
            alpha = [0.9] * embedding.shape[0],
            size = [7] * embedding.shape[0]
        )
    )
hover_tsne = HoverTool(names=["df"], tooltips="""
    <div style="margin: 10">
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Title:</span>
            <span style="font-size: 12px">@title</span>
            <span style="font-size: 12px; font-weight: bold;">Year:</span>
            <span style="font-size: 12px">@year</span>
        </div>
    </div>
    """)
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(plot_width=700, plot_height=700, tools=tools_tsne, title='Papers')
plot_tsne.circle('x', 'y', size='size', fill_color='colors', 
                 alpha='alpha', line_alpha=0, line_width=0.01, source=source, name="df")

callback = CustomJS(args=dict(source=source), code="""
    var data = source.data;
    var f = cb_obj.value
    x = data['x']
    y = data['y']
    colors = data['colors']
    alpha = data['alpha']
    title = data['title']
    year = data['year']
    size = data['size']
    for (i = 0; i < x.length; i++) {
        if (year[i] <= f) {
            alpha[i] = 0.9
            size[i] = 7
        } else {
            alpha[i] = 0.05
            size[i] = 4
        }
    }
    source.trigger('change');
""")

slider = Slider(start=df.year.min(), end=df.year.max(), value=2016, step=1, title="Before year")
slider.js_on_change('value', callback)

layout = column(slider, plot_tsne)

In [28]:
show(layout)