In [68]:
import pandas as pd
import os
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(123)
import pickle
import nltk
nltk.download('wordnet')

redundant = ['abstract', 'purpose', 'paper', 'goal', 'usepackage', 'cod']
stemmer = PorterStemmer()

def lemmatize_stemming(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')
def preprocess_abstract(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3 and token not in redundant:
            result.append(lemmatize_stemming(token))
    return " ".join(result)

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sijieliu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [69]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1816 entries, 0 to 1815
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   year                1816 non-null   int64 
 1   authors             1816 non-null   object
 2   title               1816 non-null   object
 3   abstract            1816 non-null   object
 4   times_cited         1816 non-null   int64 
 5   concepts            1816 non-null   object
 6   journal.title       1513 non-null   object
 7   HDSI_author         1816 non-null   object
 8   title_standardized  1816 non-null   object
 9   abstract_processed  1816 non-null   object
dtypes: int64(2), object(8)
memory usage: 142.0+ KB


In [70]:
type(data.iloc[0, 3])

str

In [71]:
# clean titles and abstracts
data = pd.read_csv('final_hdsi_faculty_updated.csv', index_col=0)
data = data[data['year'] >= 2015]
data['abstract'] = data['abstract'].apply(lambda x: '' if type(x) == float else x)

def standardize_abstract(abstract):
    abstract = abstract.replace('\n', ' ')
    abstract = abstract.replace('  ', ' ')
    abstract = abstract.replace('-', ' ')
    abstract = abstract.replace('.', '')
    abstract = abstract.replace(':', '')
    abstract = abstract.replace(';', '')
    abstract = abstract.replace(',', '')
    abstract = abstract.replace('"', '')
    abstract = abstract.lower()
    return abstract

def standardize_title(title):
    title = title.replace('\n', ' ')
    title = title.replace('  ', ' ')
    title = title.replace('-', ' ')
    title = title.replace('.', '')
    title = title.replace(':', '')
    title = title.replace(';', '')
    title = title.replace(',', '')
    title = title.replace('"', '')
    title = title.lower()
    return title

data['year'] = data['year'].astype(int)
data['abstract'] = [standardize_abstract(text) for text in data['abstract']]
data['title_standardized'] = [standardize_title(text) for text in data['title']]
data['abstract_processed'] = data['abstract'].apply(preprocess_abstract)
data.drop_duplicates(inplace=True, subset=['abstract'])
data.drop_duplicates(inplace=True, subset=['title_standardized'])
data.drop_duplicates(inplace=True, subset=['abstract_processed'])
# data.dropna(axis=0, how='any')
data.reset_index(inplace=True)
data.drop(axis=1, labels=['index'], inplace=True)
data.shape

(1816, 10)

In [72]:
authors = {}
for author in data.HDSI_author.unique():
    authors[author] = {
        2015 : list(),
        2016 : list(),
        2017 : list(),
        2018 : list(),
        2019 : list(),
        2020 : list(),
        2021 : list()
    }
for i, row in data.iterrows():
    authors[row['HDSI_author']][row['year']].append(row['abstract_processed'])

In [73]:
all_docs = []
missing_author_years = {author : list() for author in data.HDSI_author.unique()}
for author, author_dict in authors.items():
    for year, documents in author_dict.items():
        if len(documents) == 0:
            missing_author_years[author].append(year)
            continue
        all_docs.append(" ".join(documents))
len(all_docs)

257

In [74]:
countVec = CountVectorizer()
counts = countVec.fit_transform(all_docs)
names = countVec.get_feature_names()

In [75]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [78]:
import pandas as pd 
import pickle

models = pickle.load(open('models/10_15_20_25_30_models.pkl', 'rb'))
results = pickle.load(open('models/10_15_20_25_30_results.pkl', 'rb'))

In [None]:
results

In [80]:
topicnames = {
    num_topics : ["Topic" + str(i) for i in range(num_topics)] for num_topics in range(10, 35, 5)
}

# index names
docnames = ["Doc" + str(i) for i in range(len(all_docs))]

# Make the pandas dataframe
df_document_topic = {
    num_topics : pd.DataFrame(results[f'{num_topics}'], columns=topicnames[num_topics], index=docnames) for num_topics in range(10, 35, 5)
}

# Get dominant topic for each document
dominant_topic = {
    num_topics : np.argmax(df_document_topic[num_topics].values, axis=1) for num_topics in range(10, 35, 5)
}

for num_topics, df in df_document_topic.items():
    df['dominant_topic'] = dominant_topic[num_topics]


In [81]:
author_list = []
year_list = []
for author in authors.keys():
    for i in range(7):
        if (2015 + i) not in missing_author_years[author]:
            author_list.append(author)
            year_list.append(2015 + i)

for df in df_document_topic.values():
    df['author'] = author_list
    df['year'] = year_list


In [82]:
averaged = {
    num_topics : df_document_topic[num_topics].groupby('author').mean().drop(['dominant_topic', 'year'], axis=1) for num_topics in df_document_topic.keys()
}

filtered = {
    threshold : {num_topics : averaged[num_topics].mask(averaged[num_topics] < threshold, other=0) for num_topics in averaged.keys()} for threshold in [.1]
}

In [84]:
labels = {}
for num_topics in range(10, 35, 5):
    labels[num_topics] = filtered[.1][num_topics].index.to_list()
    labels[num_topics].extend(filtered[.1][num_topics].columns.to_list())


sources = {threshold : {} for threshold in [.1]}
targets = {threshold : {} for threshold in [.1]}
values = {threshold : {} for threshold in [.1]}

for threshold in [.1]:
    for num_topics in range(10, 35, 5):
        curr_sources = []
        curr_targets = []
        curr_values = []
        index_counter = 0
        for index, row in filtered[threshold][num_topics].iterrows():
            for i, value in enumerate(row):
                if value != 0:
                    curr_sources.append(index_counter)
                    curr_targets.append(50 + i)
                    curr_values.append(value)
            index_counter += 1
        sources[threshold][num_topics] = curr_sources
        targets[threshold][num_topics] = curr_targets
        values[threshold][num_topics] = curr_values

positions = {
    num_topics : {label : i for i, label in enumerate(labels[num_topics])} for num_topics in averaged.keys()
}

In [60]:
# sources[0.1][10]

In [85]:
def split_into_ranks(array):
    ranks = []
    for value in array:
        for i, percentage in enumerate(np.arange(.1, 1.1, .1)):
            if value <= np.quantile(array, percentage):
                ranks.append(i + 1)
                break
    return ranks

final_values = {threshold : {} for threshold in [.1]}

for threshold in [.1]:
    for num_topics in range(10, 35, 5):
        curr_values_array = np.array(values[threshold][num_topics])
        final_values[threshold][num_topics] = split_into_ranks(curr_values_array)


In [86]:
def display_topics_list(model, feature_names, no_top_words):
    topic_list = []
    for topic_idx, topic in enumerate(model.components_):
        topic_list.append(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
    return topic_list

In [123]:
link_labels = {}
for num_topics in range(10, 35, 5):
    link_labels[num_topics] = labels[num_topics].copy()
    link_labels[num_topics][50:] = display_topics_list(models[f'{num_topics}'], names, 10)

In [124]:
counts = CountVectorizer().fit_transform(data['abstract_processed'])
transformed_list = []
for model in models.values():
    transformed_list.append(model.transform(counts))

In [125]:
dataframes = {threshold : {} for threshold in [.1]}
for i, matrix in enumerate(transformed_list):
    for threshold in [.1]:
        df = pd.DataFrame(matrix)
        df.mask(df < threshold, other=0, inplace=True)
        df['HDSI_author'] = data['HDSI_author']
        df['year'] = data['year']
        df['citations'] = data['times_cited'] + 1

        # noralization of citations: Scaling to a range [0, 1]
        df['citations_norm'] = df.groupby(by=['HDSI_author', 'year'])['citations'].apply(lambda x: (x-x.min())/(x.max()-x.min()))#normalize_by_group(df=df, by=['author', 'year'])['citations']
        df['abstract'] = data['abstract']
        df['title'] = data['title']
        df.fillna(1, inplace=True)
        
        #alpha weight parameter for weighting importance of citations vs topic relation
        alpha = .75
        for topic_num in range((i+2) * 5):
            df[f'{topic_num}_relevance'] = alpha * df[topic_num] + (1-alpha) * df['citations_norm']
        dataframes[threshold][(i+2) * 5] = df

In [143]:
# pd.set_option('display.max_columns', None)


In [146]:
# dataframes[0.1][10]

In [139]:
# filtered[0.1][10]

In [142]:
def create_top_list(data_frame, num_topics, threshold):
    top_5s = []
    the_filter = filtered[threshold][num_topics]
    for topic in range(num_topics):
        relevant = the_filter[the_filter[f'Topic{topic}'] != 0].index.to_list()
#         print(relevant)
        to_append = data_frame[data_frame[f'{topic}_relevance'] > 0].reset_index()
        print(to_append.columns)
        to_append = to_append[to_append['HDSI_author'].isin(relevant)].reset_index()
        top_5s.append(to_append)
    return top_5s

tops = {
    threshold : {num_topics : create_top_list(dataframes[threshold][num_topics], num_topics, threshold) for num_topics in range(10, 35, 5)} for threshold in [.1]
}

Index([         'index',                0,                1,                2,
                      3,                4,                5,                6,
                      7,                8,                9,    'HDSI_author',
                 'year',      'citations', 'citations_norm',       'abstract',
                'title',    '0_relevance',    '1_relevance',    '2_relevance',
          '3_relevance',    '4_relevance',    '5_relevance',    '6_relevance',
          '7_relevance',    '8_relevance',    '9_relevance'],
      dtype='object')
Index([         'index',                0,                1,                2,
                      3,                4,                5,                6,
                      7,                8,                9,    'HDSI_author',
                 'year',      'citations', 'citations_norm',       'abstract',
                'title',    '0_relevance',    '1_relevance',    '2_relevance',
          '3_relevance',    '4_relevance',    '

Index([         'index',                0,                1,                2,
                      3,                4,                5,                6,
                      7,                8,                9,               10,
                     11,               12,               13,               14,
                     15,               16,               17,               18,
                     19,               20,               21,               22,
                     23,               24,    'HDSI_author',           'year',
            'citations', 'citations_norm',       'abstract',          'title',
          '0_relevance',    '1_relevance',    '2_relevance',    '3_relevance',
          '4_relevance',    '5_relevance',    '6_relevance',    '7_relevance',
          '8_relevance',    '9_relevance',   '10_relevance',   '11_relevance',
         '12_relevance',   '13_relevance',   '14_relevance',   '15_relevance',
         '16_relevance',   '17_relevance',   '18_rel

In [154]:
len(tops[0.1][10])

10

In [128]:
import dash
import dash_core_components as dcc
import dash_html_components as html
import dash_bootstrap_components as dbc
import plotly.graph_objects as go
from dash.dependencies import Input, Output, State

# sankey diagrams for diff numbers of topics

heights = {
  10 : 1000,
  15 : 1500,
  20 : 2000,
  25 : 2500,
  30 : 3000
}

figs = {threshold : {} for threshold in [.1]}
for threshold in [.1]:
    for num_topics in range(10, 35, 5):
        fig = go.Figure(data=[go.Sankey(
            node = dict(
                pad = 15,
                thickness = 20,
                line = dict(color = 'black', width = 0.5),
                label = labels[num_topics],
                color = ['#666699' for i in range(len(labels[num_topics]))],
                customdata = link_labels[num_topics],
                hovertemplate='%{customdata} Total Flow: %{value}<extra></extra>'
            ),
            link = dict(
                color = ['rgba(204, 204, 204, .5)' for i in range(len(sources[threshold][num_topics]))],
                source = sources[threshold][num_topics],
                target = targets[threshold][num_topics],
                value = final_values[threshold][num_topics]
            )
        )])
        fig.update_layout(title_text="Author Topic Connections", font=dict(size = 10, color = 'white'), height=heights[num_topics], paper_bgcolor="black", plot_bgcolor='black')
        figs[threshold][num_topics] = fig

In [172]:
# link_labels

In [129]:
top_words = {
    10 : display_topics_list(models['10'], names, 10),
    15 : display_topics_list(models['15'], names, 10),
    20 : display_topics_list(models['20'], names, 10),
    25 : display_topics_list(models['25'], names, 10),
    30 : display_topics_list(models['30'], names, 10)
}

combined = pd.read_csv('final_hdsi_faculty_updated.csv')
# combined[combined.title == 'Elder-Rule-Staircodes for Augmented Metric Spaces'].abstract

In [130]:
locations = {}
for i, word in enumerate(names):
    locations[word] = i

In [184]:
# tops[0.1][25]

In [185]:
# tops[0.1][25][0]

In [169]:
# names[0]
# locations
# doc_vec = np.zeros((1, len(names)))

In [179]:
doc_vec = np.zeros((1, len(names)))
words = ['aacr']
for word in words:
    doc_vec[0][locations[word]] += 1
    
num_topics = 10

relations = np.round(models[f'{num_topics}'].transform(doc_vec), 5).tolist()[0]

# relations

pairs = [(i, relation) for i, relation in enumerate(relations)]
pairs.sort(reverse=True, key=lambda x: x[1])

# to_return = [[html.Br(), f'Topic{pair[0]}: {pair[1]}', html.Br()] for pair in pairs]


# list(chain(*to_return)), [{'label' : f'{researcher}', 'value' : f'{researcher}'} for researcher in set(author_list)]

pairs

[(2, 0.55),
 (0, 0.05),
 (1, 0.05),
 (3, 0.05),
 (4, 0.05),
 (5, 0.05),
 (6, 0.05),
 (7, 0.05),
 (8, 0.05),
 (9, 0.05)]

In [182]:
# positions

In [183]:
from itertools import chain
threshold = .1
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.DARKLY])

app.layout = html.Div([
  dbc.Row([
      dcc.Dropdown(
        id='graph-dropdown',
        placeholder='select number of LDA topics',
        options=[{'label' : f'{i} Topic Model', 'value' : i} for i in range(10, 35, 5)],
        style={
          'color' : 'black',
          'background-color' : '#666699',
          'width' : '200%',
          'align-items' : 'left',
          'justify-content' : 'left',
          'padding-left' : '15px'
        },
        value=10
      )
  ]),
  dbc.Row([
    dbc.Col(html.Div([
      dcc.Graph(
        id = 'graph',
        figure = figs[.1][10]
      )
      ],
      style={
        'height' : '100vh',
        'overflow-y' : 'scroll'
      }
    )
    ),
      dbc.Col(html.Div([dbc.Col([
        dcc.Dropdown(
          id='dropdown_menu',
          placeholder='Select a topic',
          options=[{'label' : f'Topic {topic}: {top_words[10][topic]}', 'value' : topic} for topic in range(10)],
          style={
            'color' : 'black',
            'background-color' : 'white'
          }
        ),
        dcc.Dropdown(
          id='researcher-dropdown',
          placeholder='Select Researchers',
          options=[{'label' : f'{researcher}', 'value' : f'{researcher}'} for researcher in set(author_list)],
          style={
            'color' : 'black',
            'background-color' : 'white'
          }
        )]),
        dbc.Col(
          dcc.Dropdown(
            id='word-search',
            placeholder='Search by word',
            options=[{'label' : word, 'value' : word} for word in names],
            style={
              'color' : 'black',
              'background-color' : 'white'
            },
            value=[],
            multi=True
          )
        ),
        html.Div(
          id='paper_container', 
          children=[
            html.P(
              children=['Top 5 Papers'],
              id='titles_and_authors', 
              draggable=False, 
              style={
                'font-size' :'150%',
                'font-family' : 'Verdana'
              }
            ),
          ],
        ),
      ], 
        style={
          'height' : '100vh',
          'overflow-y' : 'scroll'
        }
      )
      )
    ]
  )]
)

@app.callback(
  Output('titles_and_authors', 'children'),
  Output('researcher-dropdown', 'options'),
  Input('dropdown_menu', 'value'),
  Input('graph-dropdown', 'value'),
  Input('researcher-dropdown', 'value'),
  Input('word-search', 'value')
)
def update_p(topic, num_topics, author, words):
  if len(words) != 0:
    doc_vec = np.zeros((1, len(names)))
    for word in words:
      doc_vec[0][locations[word]] += 1
    relations = np.round(models[f'{num_topics}'].transform(doc_vec), 5).tolist()[0]
    pairs = [(i, relation) for i, relation in enumerate(relations)]
    pairs.sort(reverse=True, key=lambda x: x[1])
    to_return = [[html.Br(), f'Topic{pair[0]}: {pair[1]}', html.Br()] for pair in pairs]
    return list(chain(*to_return)), [{'label' : f'{researcher}', 'value' : f'{researcher}'} for researcher in set(author_list)]

  if topic == None and author == None:
    return ['Make a selection'], [{'label' : f'{researcher}', 'value' : f'{researcher}'} for researcher in set(author_list)]

  if topic != None and author == None:
    df = tops[threshold][num_topics][topic]
    # df_authors = df.HDSI_author.unique()
    max_vals = df.groupby('HDSI_author').max()[f'{topic}_relevance']

    to_return = [[f'{name}:', html.Br(), 
      f'{df[df[f"{topic}_relevance"] == max_vals.loc[name]]["title"].to_list()[0]}',
      html.Details([html.Summary('Abstract'),
                    html.Div(combined[combined.title == f'{df[df[f"{topic}_relevance"] == max_vals.loc[name]]["title"].to_list()[0]}'].abstract)],
                    style={
                      'font-size' :'80%',
                      'font-family' : 'Verdana'}),
      html.Br()] for i, name in enumerate(max_vals.index)]
    return list(chain(*to_return)), [{'label' : f'{researcher}', 'value' : f'{researcher}'} for researcher in tops[threshold][num_topics][topic].HDSI_author.unique()]

  if topic == None and author != None:
    to_return = []
    for topic_num in range(num_topics):
      df = tops[threshold][num_topics][topic_num]
      if author in df.HDSI_author.unique():
        max_vals = df.groupby('HDSI_author').max()[f'{topic_num}_relevance']
  
        to_return.append([f'Topic {topic_num}:', html.Br(), 
          f'{df[df[f"{topic_num}_relevance"] == max_vals.loc[author]]["title"].to_list()[0]}', 
          html.Details([html.Summary('Abstract'), 
                        html.Div(combined[combined.title == f'{df[df[f"{topic_num}_relevance"] == max_vals.loc[author]]["title"].to_list()[0]}'].abstract)],
                        style={
                          'font-size' :'80%',
                          'font-family' : 'Verdana'},
                        ),
          html.Br()])
    return list(chain(*to_return)), [{'label' : f'{researcher}', 'value' : f'{researcher}'} for researcher in set(author_list)]

  if topic != None and author != None:
    df = tops[threshold][num_topics][topic]
    df = df[df['HDSI_author'] == author]
    df.sort_values(by=f'{topic}_relevance', ascending=False, inplace=True)
    titles = df.head(10)['title'].to_list()
    
    to_return = [
      [f'{i} : {title}', 
      html.Details([html.Summary('Abstract'), 
                    html.Div(combined[combined.title == title].abstract)], 
                    style={
                      'font-size' :'80%',
                      'font-family' : 'Verdana'}), 
      html.Br()] for i, title in enumerate(titles)]
    return list(chain(*to_return)), [{'label' : f'{researcher}', 'value' : f'{researcher}'} for researcher in tops[threshold][num_topics][topic].HDSI_author.unique()]
    


@app.callback(
  [Output('graph', 'figure'), Output('dropdown_menu', 'options')],
  [Input('graph-dropdown', 'value'), Input('dropdown_menu', 'value'), Input('researcher-dropdown', 'value'), Input('word-search', 'value')],
  State('graph', 'figure')
)
def update_graph(value, topic, author, words, previous_fig):
  if len(previous_fig['data'][0]['node']['color']) != value + 50:
    figs[threshold][value].update_traces(node = dict(color = ['#666699' for i in range(len(labels[value]))]), link = dict(color = ['rgba(204, 204, 204, .5)' for i in range(len(sources[threshold][value]))]))
    return figs[threshold][value], [{'label' : f'Topic {topic}: {top_words[value][topic]}', 'value' : topic} for topic in range(value)]

  if len(words) != 0:
    doc_vec = np.zeros((1, len(names)))
    for word in words:
      doc_vec[0][locations[word]] += 1
    relations = np.round(models[f'{value}'].transform(doc_vec), 3).tolist()[0]
    opacity = {(i+50) : relation for i, relation in enumerate(relations) if relation > .1}
    node_colors = ['#666699' if (i not in opacity.keys()) else f'rgba(255, 255, 0, {opacity[i]})' for i in range(len(labels[value]))]
    valid_targets = [positions[value][f'Topic{i-50}'] for i in opacity.keys()]
    link_colors = ['rgba(204, 204, 204, .5)' if target not in valid_targets else f'rgba(255, 255, 0, .5)' for target in targets[threshold][value]]
    figs[threshold][value].update_traces(node = dict(color = node_colors), link = dict(color = link_colors)),
    return figs[threshold][value], [{'label' : f'Topic {topic}: {top_words[value][topic]}', 'value' : topic} for topic in range(value)]


  if topic == None and author == None:
    figs[threshold][value].update_traces(node = dict(color = ['#666699' for i in range(len(labels[value]))]), link = dict(color = ['rgba(204, 204, 204, .5)' for i in range(len(sources[threshold][value]))]))
    return figs[threshold][value], [{'label' : f'Topic {topic}: {top_words[value][topic]}', 'value' : topic} for topic in range(value)]
  
  if topic != None and author == None:
    node_colors = ['#666699' if (i != positions[value][f'Topic{topic}']) else '#ffff00' for i in range(len(labels[value]))]
    link_colors = ['rgba(204, 204, 204, .5)' if target != positions[value][f'Topic{topic}'] else 'rgba(255, 255, 0, .5)' for target in targets[threshold][value]]
    figs[threshold][value].update_traces(node = dict(color = node_colors), link = dict(color = link_colors))
    return figs[threshold][value], [{'label' : f'Topic {topic}: {top_words[value][topic]}', 'value' : topic} for topic in range(value)]

  if topic == None and author != None:
    node_colors = ['#666699' if (i != positions[value][author]) else '#ffff00' for i in range(len(labels[value]))]
    link_colors = ['rgba(204, 204, 204, .5)' if source != positions[value][author] else 'rgba(255, 255, 0, .5)' for source in sources[threshold][value]]
    figs[threshold][value].update_traces(node = dict(color = node_colors), link = dict(color = link_colors))
    return figs[threshold][value], [{'label' : f'Topic {topic}: {top_words[value][topic]}', 'value' : topic} for topic in range(value)]

  if topic != None and author != None:
    node_colors = ['#666699' if (i != positions[value][author] and i != positions[value][f'Topic{topic}']) else '#ffff00' for i in range(len(labels[value]))]
    link_colors = ['rgba(204, 204, 204, .5)' if (source != positions[value][author] or target != positions[value][f'Topic{topic}']) else 'rgba(255, 255, 0, .5)' for source, target in zip(sources[threshold][value], targets[threshold][value])]
    figs[threshold][value].update_traces(node = dict(color = node_colors), link = dict(color = link_colors))
    return figs[threshold][value], [{'label' : f'Topic {topic}: {top_words[value][topic]}', 'value' : topic} for topic in range(value)]

@app.callback(
  Output('researcher-dropdown', 'value'),
  Input('dropdown_menu', 'value'),
  State('dropdown_menu', 'value')
)
def reset_author(topic, previous):
  if topic != previous:
    return None

app.run_server()

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
127.0.0.1 - - [04/Dec/2021 23:56:53] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [04/Dec/2021 23:56:53] "GET /_dash-dependencies HTTP/1.1" 200 -
127.0.0.1 - - [04/Dec/2021 23:56:53] "GET /_dash-layout HTTP/1.1" 200 -
127.0.0.1 - - [04/Dec/2021 23:56:53] "GET /_dash-component-suites/dash_core_components/async-graph.js HTTP/1.1" 200 -
127.0.0.1 - - [04/Dec/2021 23:56:53] "GET /_dash-component-suites/dash_core_components/async-dropdown.js HTTP/1.1" 200 -
127.0.0.1 - - [04/Dec/2021 23:56:53] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [04/Dec/2021 23:56:53] "GET /_dash-component-suites/dash_core_components/async-plotlyjs.js HTTP/1.1" 200 -
127.0.0.1 - - [04/Dec/2021 23:56:54] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [04/Dec/2021 23:56:55] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [04/Dec/2021 23:56:56] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [04/Dec/2021 23:56:56] "

In [24]:
import pickle
pickle.dump(figs, open('sankey_dash\\figs.pkl', 'wb'))
pickle.dump(tops, open('sankey_dash\\tops.pkl', 'wb'))
pickle.dump(author_list, open('sankey_dash\\author_list.pkl', 'wb'))
pickle.dump(combined, open('sankey_dash\\combined.pkl', 'wb'))
pickle.dump(labels, open('sankey_dash\\labels.pkl', 'wb'))
pickle.dump(positions, open('sankey_dash\\positions.pkl', 'wb'))
pickle.dump(sources, open('sankey_dash\\sources.pkl', 'wb'))
pickle.dump(targets, open('sankey_dash\\targets.pkl', 'wb'))
pickle.dump(top_words, open('sankey_dash\\top_words.pkl', 'wb'))
pickle.dump(locations, open('sankey_dash\\locations.pkl', 'wb'))
pickle.dump(models, open('sankey_dash\\models.pkl', 'wb'))
pickle.dump(names, open('sankey_dash\\names.pkl', 'wb'))

In [50]:
locations = {}
for i, word in enumerate(names):
    locations[word] = i


In [39]:
text = ['neural', 'oscillations']

doc_vec = np.zeros((1, len(names)))
doc_vec.shape

(1, 15169)

In [51]:
text = ['neural', 'oscillations']

doc_vec = np.zeros((1, len(names)))
for word in text:
    doc_vec[0][locations[word]] += 1
models['10'].transform(doc_vec)

array([[0.03333413, 0.03333333, 0.03333529, 0.0333334 , 0.03333348,
        0.03333423, 0.03333962, 0.03334161, 0.03333402, 0.69998089]])

In [42]:
top_words[10]

['sequence cancer genome ecdna model approach result sample genes identify',
 'microbiome microbial study sample microbiota sequence human associate diversity data',
 'model time data learn network base process propose result performance',
 'data patients model study risk health clinical patient result time',
 'data model result research systems learn work workflow scientific causal',
 'model propose data learn base result network problem methods image',
 'power test field brain null approach beta procedure scan asymptotic',
 'model brain neural network spike neurons activity signal learn dynamics',
 'cell study cells data expression analysis result network gene type',
 'flow image neural patients cardiac pulmonary data oscillations measure disease']