In [None]:
import tomotopy as tp
import pandas as pd
import numpy as np
import sys
import nltk
import pickle
import random
from nltk import word_tokenize, RegexpTokenizer,PunktSentenceTokenizer, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import re

### 1. Load the models and the abstracts file

In [None]:
# Load the per-batch LDA models; their position in `models` is used as the batch id later on.
names = ['LDAbatch1.bin','LDAbatch2.bin','LDAbatch3.bin','LDAbatch4.bin']
models = [tp.LDAModel.load(model_file) for model_file in names]

In [None]:
# NOTE(review): pickle.load can execute arbitrary code - only load a file you produced yourself.
# The context manager closes the file handle that a bare open() call would leave open.
with open("collection_cleaned.p", "rb") as collection_file:
    normalised = pickle.load(collection_file)

In [None]:
# Important: Shuffle with same seed as in Streaming LDA notebook to align collection with models
# NOTE: random.shuffle reorders `normalised` in place, so this cell must run exactly once per
# kernel session; running it again would shuffle the already-shuffled list.
SEED = 11
random.seed(SEED)
random.shuffle(normalised)

### 3. Extract topics for each document 


In [None]:
def get_top_topics(document, model, min_score=0.8):
    """
    Return the most probable topics of a single document.

    The document is inferred with `model`, its topics are ranked by
    p(topic|document) in descending order, and topics are collected until the
    accumulated probability is strictly greater than `min_score`.

    Returns a list of (topic_index, probability) pairs, most probable first.
    """
    inferred = model.make_doc(document)
    # The return value is not needed; the distribution is read from the doc itself.
    _, _ = model.infer(inferred)
    topic_dist = inferred.get_topic_dist()

    selected = []
    cumulative = 0
    for topic in np.flip(np.argsort(topic_dist)):
        if cumulative > min_score:
            break
        prob = topic_dist[topic]
        cumulative += prob
        selected.append((topic, prob))
    return selected

In [None]:
# Split the shuffled collection into one contiguous batch per model and infer each batch
# with the model at the same position in `models`.
# Plain list slicing replaces np.array_split: the documents are token lists of different
# lengths, and NumPy >= 1.24 raises a ValueError when asked to build an array from such
# ragged input. The batch sizes are the ones np.array_split would produce (the first
# `len(normalised) % num_batches` batches receive one extra document).
docs2topics = []
num_batches = len(models)
base_size, extra = divmod(len(normalised), num_batches)
batches = []
start = 0
for batch_no in range(num_batches):
    stop = start + base_size + (1 if batch_no < extra else 0)
    batches.append(normalised[start:stop])
    start = stop

for i,model in enumerate(models):
    print("Working on batch "+str(i))
    docs2topics.append([get_top_topics(doc, model) for doc in batches[i]])

In [None]:
# Concatenate the per-batch results into one list with one entry per document.
docs2topics_flat = []
for batch_topics in docs2topics:
    docs2topics_flat.extend(batch_topics)
len(docs2topics_flat)

In [None]:
def get_batch_indices(batches):
    """Return, for every element of every batch, the position of its batch in `batches`."""
    return [batch_no
            for batch_no, members in enumerate(batches)
            for _ in range(len(members))]
# One batch id per document, in the order of the concatenated batches.
batch_indices = get_batch_indices(batches)

In [None]:
# Spot-check the batch id at four sample positions, then the total number of entries.
for position in (1000, 6000, 11000, 16000):
    print(batch_indices[position])
print(len(batch_indices))

In [None]:
# Raw abstracts table; the first rows are shown as the cell output.
data = pd.read_csv("abstracts_eng.csv")
data.head(n=5)

In [None]:
# `normalised` was shuffled in place, so its order no longer matches the rows of `data`.
# Shuffling the positions 0..N-1 with the same seed reproduces that permutation:
# indices[k] is the original position of normalised[k].
# NOTE(review): this assumes the pickled collection was stored in the row order of `data` - confirm.
indices = list(range(len(normalised)))
random.seed(SEED)
random.shuffle(indices)

Make sure the outputs of the two cells below correspond to the same publication!

In [None]:
# First 300 characters of the raw abstract that should match normalised[0].
data["abstract"].iloc[indices[0]][:300]

In [None]:
" ".join(normalised[0][0:100])

In [None]:
# Reorder the raw table to the shuffled document order and attach the per-document results.
# .copy() makes `enriched` an independent frame, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning on a selection taken from `data`.
assert len(indices) == len(docs2topics_flat) == len(batch_indices), \
    "documents, topic lists and batch ids must have the same length"
enriched = data.iloc[indices].copy()
enriched["topics"] = docs2topics_flat
enriched["batchID"] = batch_indices
enriched.head(3)

In [None]:
# One row per (document, topic) pair: explode the list-valued "topics" column,
# then relabel the five columns by position.
enriched = enriched.explode(column="topics")
enriched.columns = [
    "abstract",
    "publication_id",
    "publication_title",
    "topic",
    "batch_id",
]
enriched.head(n=3)

In [None]:
# Split each (topic id, probability) tuple into two scalar columns.
topic_pairs = pd.DataFrame(enriched["topic"].tolist(), index=enriched.index)
enriched[["topic_id","topic_probability"]] = topic_pairs
enriched.head(n=5)

In [None]:
# The tuple column is redundant once it has been split into two columns.
enriched = enriched.drop(columns=["topic"])
enriched.head(n=3)

In [None]:
# issue: quotes need to be escaped in neo4j
# the following function helps us avoid errors when importing

def add_quote(abstract):
    """
    Escape double quotes in an abstract by doubling them.

    A double quote preceded by a backslash is first rewritten to two double
    quotes, then every double quote in the result is doubled.

    Values that are not strings (for example the NaN that pandas uses for a
    missing abstract) are returned unchanged instead of being turned into None.
    """
    if not isinstance(abstract, str):
        return abstract
    return abstract.replace('\\"', '""').replace('"', '""')

# Escape the quotes of every abstract before the frame is written to CSV.
enriched["abstract"] = enriched["abstract"].apply(add_quote)

In [None]:
# Write the document-topic pairs to disk, leaving out the DataFrame index.
file_name = "abstract+topic.csv"
enriched.to_csv(path_or_buf=file_name, index=False)

### 4. Extract words for each topic

In [None]:
def get_top_words(topic, model, min_score=0.8):
    """
    Return the most probable words of a single topic.

    Words are ranked by p(word|topic) in descending order and collected until
    the accumulated probability is strictly greater than `min_score`.

    Returns a list of (word, probability) pairs, most probable first.
    """
    word_dist = model.get_topic_word_dist(topic)
    ranked = np.flip(np.argsort(word_dist))

    selected = []
    cumulative = 0
    for word_id in ranked:
        if cumulative > min_score:
            break
        prob = word_dist[word_id]
        cumulative += prob
        selected.append((model.used_vocabs[word_id], prob))
    return selected

In [None]:
num_topics = 125
topics2words = []
# The models are visited in list order so the collected topics stay grouped batch by batch.
for batch_no, model in enumerate(models):
    print("Working on batch "+str(batch_no))
    for topic_id in range(num_topics):
        topics2words.append(get_top_words(topic_id, model, min_score=0.25))
print(len(topics2words))

In [None]:
# Topic ids 0..num_topics-1, repeated once per model so they line up with topics2words.
# The repeat count comes from `models` instead of a literal 4, so it cannot drift from
# the number of models that were actually loaded.
topic_ids = list(range(num_topics)) * len(models)
len(topic_ids)

In [None]:
# Batch id of every topic row: each model contributes `num_topics` consecutive rows.
# Derived from num_topics and models instead of the literals 125 and 4, so the list
# always has the same length as topic_ids and topics2words.
topics_batch_indices = [batch_no for batch_no in range(len(models)) for _ in range(num_topics)]
len(topics_batch_indices)

In [None]:
# One row per (batch, topic), carrying the list of (word, probability) pairs of that topic.
topics_df = pd.DataFrame(
    data={
        "TopicID": topic_ids,
        "BatchID": topics_batch_indices,
        "TopicWords": topics2words,
    }
)

In [None]:
# Preview of the first three topic rows.
topics_df.head(n=3)

In [None]:
# One row per (topic, word) pair.
topics_exploded = topics_df.explode(column="TopicWords")

In [None]:
# Preview of the exploded topic-word rows.
topics_exploded.head(n=5)

In [None]:
# Split each (word, probability) tuple into two scalar columns.
word_pairs = pd.DataFrame(topics_exploded["TopicWords"].tolist(), index=topics_exploded.index)
topics_exploded[["word","word_probability"]] = word_pairs
topics_exploded.head(n=5)

In [None]:
# The tuple column is redundant once it has been split into two columns.
topics_exploded = topics_exploded.drop(columns=["TopicWords"])
topics_exploded.head(n=3)

In [None]:
# Write the per-topic table (word lists kept in a single column) to disk, without the index.
file_name = "topics.csv"
topics_df.to_csv(path_or_buf=file_name, index=False)

In [None]:
# Write the word-topic pairs (one row per pair) to disk, without the index.
file_name = "words.csv"
topics_exploded.to_csv(path_or_buf=file_name, index=False)

### Export to Neo4j

Topic nodes: 
- TopicID (long)
- Words (list(str)) 
<br>
<br>

        #Adding abstracts to existing publications [SHOULD HAVE BEEN DONE WITH METADATA]
        #LOAD CSV WITH HEADERS FROM "file:///abstract+topic.csv" AS line
        #WITH line WHERE line.publication_id IS NOT NULL
        #MATCH (publication:Publication {title: line.publication_title})
        #SET publication.abstract = line.abstract;
        
        #Defining the topic nodes
        CREATE CONSTRAINT ON (t:Topic) ASSERT (t.ID, t.batchID) IS NODE KEY;
        
        #Loading the topic nodes from CSV
        LOAD CSV WITH HEADERS FROM "file:///topics.csv" AS line
        WITH line where line.TopicID IS NOT NULL
        MERGE (t: Topic {ID: line.TopicID, batchID: line.BatchID})
        SET t.words= line.TopicWords;

        #Loading document<->topic relationships
        LOAD CSV WITH HEADERS FROM "file:///abstract+topic.csv" AS line
        MATCH (p:Publication {title: line.publication_title}),
               (t:Topic {ID:line.topic_id, batchID: line.batch_id})
        MERGE (p)-[r:IS_ABOUT {weight: round(1000 * toFloat(line.topic_probability)) / 1000}]->(t);
        
        #Constraint on word nodes being unique
        CREATE CONSTRAINT ON (c:Word) ASSERT c.name IS UNIQUE;
        
        #Loading the word nodes from CSV
        LOAD CSV WITH HEADERS FROM "file:///words.csv" AS line
        WITH line where line.word IS NOT NULL
        MERGE (w: Word{name: line.word});
        
        #Loading word<->topic relationships
        LOAD CSV WITH HEADERS FROM "file:///words.csv" AS line
        MATCH (t: Topic {ID: line.TopicID, batchID: line.BatchID}),
                (w: Word {name: line.word})
        MERGE (w)-[r:IS_IN {weight: round(1000 * toFloat(line.word_probability)) / 1000}]->(t);

### 5. Visualizations

Here we use the docs2topics and topics2words data extracted before to get nice visualisations of our collection 

In [None]:
# Reload the exported topic table; list-valued cells come back as plain strings.
topic_df = pd.read_csv("topics.csv")
topic_df.head(n=5)

In [None]:
def process_topic(ls):
    """
    Recover the list of words from a stringified "TopicWords" cell.

    The cell is the text form of a Python list. When the list holds
    (word, probability) tuples, only the quoted words are returned; the
    parentheses and probabilities are not part of the result. A list of
    unquoted items is split on ", ", and an empty list yields [].
    """
    # Quoted strings are the words (repr switches to double quotes for a word
    # that contains an apostrophe, so both quote styles are accepted).
    quoted = re.findall(r"'([^']*)'|\"([^\"]*)\"", ls)
    if quoted:
        return [single or double for single, double in quoted]
    # Fallback for unquoted input such as "[a, b]".
    stripped = ls.strip("][")
    return stripped.split(', ') if stripped else []

In [None]:
# Parse every stringified TopicWords cell back into a Python list.
topics2words = topic_df["TopicWords"].apply(process_topic).tolist()

In [None]:
# topic sparsity measure of our documents
# docs2topics holds one list per batch, each containing one topic list per document,
# so the length has to be taken per document (inner level), not per batch.
docs2num_topics = [len(doc_topics) for batch_topics in docs2topics for doc_topics in batch_topics]

In [None]:
import matplotlib.pyplot as plt
# Histogram of docs2num_topics. plt.show() takes no positional arguments (only the
# keyword `block`), so the tuple returned by plt.hist is kept in `h` but not passed on.
h = plt.hist(docs2num_topics, bins=15, range=(0,10))
plt.show()

In [None]:
# Normalised cumulative histogram of docs2num_topics. plt.show() takes no positional
# arguments, so the tuple returned by plt.hist is not passed to it.
h = plt.hist(docs2num_topics, bins=10, range=(0,10), density=True, cumulative=True)
plt.show()

In [None]:
# Word sparsity of the topics: number of entries in each topic's word list.
topics2num_words = list(map(len, topics2words))

In [None]:
# Distinct list lengths that occur.
{*topics2num_words}

In [None]:
import matplotlib.pyplot as plt
# Normalised histogram of topics2num_words. plt.show() takes no positional arguments
# (only the keyword `block`), so the tuple returned by plt.hist is not passed to it.
h2 = plt.hist(topics2num_words, density = True)
plt.show()

In [None]:
# Normalised cumulative histogram of topics2num_words. plt.show() takes no positional
# arguments, so the tuple returned by plt.hist is not passed to it.
h3 = plt.hist(topics2num_words, density=True, cumulative=True)
plt.show()