In [2]:
import tomotopy as tp
import pandas as pd
import numpy as np
import sys
import nltk
from nltk import word_tokenize, RegexpTokenizer,PunktSentenceTokenizer, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [3]:
# Load the previously trained tomotopy LDA model from disk.
# NOTE(review): the "150" in the file name presumably is the topic count — confirm.
model = tp.LDAModel.load("lda_model150.bin")

In [4]:
# Load the abstracts table; its 'abstract' column is what gets preprocessed further down.
documents = pd.read_csv("abstracts_eng.csv")

In [5]:
# Preprocessing text. Stemming and lemmatising are both ON by default
# (pass stemming=False to skip stemming). Bigram does not seem to work
word_stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()
stop_words = stopwords.words('english')
def normalisation(document, stemming = True, lemmatising = True, min_word_len = 3):
    """Turn one raw text into a list of cleaned tokens.

    The input is converted with ``str()``, tokenised by gensim's
    ``simple_preprocess`` (accents stripped, no upper bound on token length),
    stripped of English stop words, optionally stemmed and/or lemmatised, and
    finally filtered by minimum token length.

    Parameters
    ----------
    document : object
        The text to clean; anything accepted by ``str()``.
    stemming : bool
        Apply the Porter stemmer to every remaining token.
    lemmatising : bool
        Apply the WordNet lemmatiser to every remaining token. When both
        flags are set, lemmatisation runs on the already-stemmed tokens.
    min_word_len : int
        Tokens shorter than this (after stemming/lemmatising) are dropped.
        ``simple_preprocess`` applies its own default minimum length first,
        so values below that default have no additional effect.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    # Build a set from the module-level list on each call: membership tests are
    # then O(1) per token instead of a scan of the whole stop-word list, and any
    # later change to `stop_words` is still picked up.
    excluded = set(stop_words)
    tokens = gensim.utils.simple_preprocess(str(document), deacc=True, max_len = sys.maxsize)
    cleaned = [word for word in tokens if word not in excluded]
    if stemming:
        cleaned = [word_stemmer.stem(word) for word in cleaned]
    if lemmatising:
        cleaned = [lemmatiser.lemmatize(word) for word in cleaned]
    # Length filter runs last so it sees the stemmed/lemmatised form.
    cleaned = [word for word in cleaned if (min_word_len<=len(word))]

    return cleaned

In [6]:
# Three hand-written sample texts used to smoke-test the preprocessing step.
document = [
    'hello hi I am a Data Science STUDENT here with Giulia and Andreas!U.S.A, United States of AMerica dixterochlomaterine hreoihso my dog ran away yesterday?! I will fly, to space tomorrow...',
    'space tomorrow space tomorrow if the dog ran away fly to space',
    ' fly to space  with my dog ran away',
]
# min_word_len = -1 makes the final length filter in normalisation a no-op.
lis = [
    normalisation(sample, stemming = True, lemmatising = True, min_word_len = -1)
    for sample in document
]


In [21]:
def ngram(cleaned_docs, do_trigram = True, min_count_bigram = 5, threshold_bigram = 50, min_count_trigram = 5, threshold_trigram=50):
    """Apply gensim phrase detection to tokenised documents.

    A first ``Phrases`` pass is fitted on ``cleaned_docs`` and applied to each
    document. When ``do_trigram`` is true, a second pass is fitted on, and
    applied to, the output of the first pass.

    Parameters
    ----------
    cleaned_docs : list of list of str
        Tokenised documents; iterated once to fit and once to transform.
    do_trigram : bool
        Whether to run the second phrase pass.
    min_count_bigram, threshold_bigram
        ``min_count`` / ``threshold`` passed to the first ``Phrases`` model.
    min_count_trigram, threshold_trigram
        ``min_count`` / ``threshold`` passed to the second ``Phrases`` model.

    Returns
    -------
    list
        One transformed token sequence per input document.
    """
    def _phrase_pass(docs, min_count, threshold):
        # Fit a phrase model on `docs`, freeze it, then rewrite every document with it.
        fitted = gensim.models.Phrases(docs, min_count=min_count, threshold=threshold)
        frozen = gensim.models.phrases.Phraser(fitted)
        return [frozen[tokens] for tokens in docs]

    merged = _phrase_pass(cleaned_docs, min_count_bigram, threshold_bigram)
    if not do_trigram:
        return merged
    return _phrase_pass(merged, min_count_trigram, threshold_trigram)

In [9]:
# Wrap each normalised sample in a Document created by the loaded model.
new_docs = [model.make_doc(doc) for doc in lis]
# Run inference here: later cells read `topic_dist[0]` and
# `new_docs[0].get_topic_dist()`, and `topic_dist` is assigned nowhere else at
# notebook level, so leaving this call commented out breaks a fresh-kernel run.
topic_dist, likelihood = model.infer(new_docs)
    

In [8]:
# Pull the 'abstract' column out of the loaded table as a plain Python list.
# NOTE(review): a missing abstract would presumably arrive as NaN and be turned
# into text by normalisation's str(document) — confirm the column has no gaps.
abs_list = list(documents['abstract'])

In [9]:
# Run every abstract through normalisation with its default arguments.
normalised = list(map(normalisation, abs_list))

In [1]:
def get_top_topics(document, model, min_score):
    """Return the highest-weighted topics of a document, up to a cumulative weight.

    The document is wrapped with ``model.make_doc``, inference is run on it,
    and its topic distribution is read back. Topics are visited from largest
    to smallest weight and collected until the running total of collected
    weights exceeds ``min_score`` (or every topic has been collected).

    Parameters
    ----------
    document : list of str
        Tokens of the document, as accepted by ``model.make_doc``.
    model
        Object providing ``make_doc`` and ``infer`` (the loaded LDA model).
    min_score : float
        Cumulative weight to exceed. A negative value yields an empty list,
        because the running total starts at 0.

    Returns
    -------
    list
        Topic indices (numpy integers from ``argsort``), heaviest first.
    """
    doc = model.make_doc(document)
    # Return values are unused; the call is made for its effect on `doc`.
    _dist_unused, _ll_unused = model.infer(doc)
    weights = doc.get_topic_dist()

    ranked = np.argsort(weights)[::-1]  # descending by weight
    selected = []
    running_total = 0
    for topic_id in ranked:
        # Checked before adding, so the topic that crosses the threshold is kept.
        if running_total > min_score:
            break
        running_total += weights[topic_id]
        selected.append(topic_id)
    return selected
    
    

In [13]:
# Inspect the model's vocabulary; get_top_words indexes into it using
# positions taken from a topic's word distribution.
model.used_vocabs

['differ', 'measur', 'data', 'system', 'effect', 'show', 'method', 'process', 'gener', 'increas', 'observ', 'perform', 'develop', 'two', 'present', 'compar', 'function', 'structur', 'approach', 'activ', 'chang', 'also', 'howev', 'analysi', 'time', 'control', 'well', 'includ', 'provid', 'product', 'level', 'simul', 'specif', 'estim', 'high', 'cell', 'one', 'design', 'combin', 'potenti', 'interact', 'identifi', 'respons', 'addit', 'requir', 'investig', 'test', 'ass', 'find', 'associ', 'relat', 'applic', 'region', 'predict', 'dynam', 'condit', 'new', 'complex', 'network', 'limit', 'set', 'found', 'depend', 'experi', 'inform', 'within', 'suggest', 'sampl', 'indic', 'reduc', 'demonstr', 'improv', 'import', 'evalu', 'determin', 'larg', 'mechan', 'may', 'sever', 'distribut', 'valu', 'individu', 'appli', 'effici', 'rate', 'paramet', 'respect', 'first', 'impact', 'surfac', 'integr', 'local', 'allow', 'work', 'select', 'consid', 'protein', 'propos', 'group', 'optim', 'variabl', 'report', 'detect

In [8]:
# Inspect the word distribution of topic 0 (the array that get_top_words ranks).
model.get_topic_word_dist(0)

array([8.97087034e-07, 1.02276895e-02, 8.97087034e-07, ...,
       8.97087034e-07, 8.97087034e-07, 8.97087034e-07], dtype=float32)

In [18]:
def get_top_words(topic, model, min_score):
    """Return the highest-weighted words of a topic, up to a cumulative weight.

    Words are visited from largest to smallest weight in the topic's word
    distribution and collected until the running total of collected weights
    exceeds ``min_score`` (or every word has been collected).

    Parameters
    ----------
    topic : int
        Topic index passed to ``model.get_topic_word_dist``.
    model
        Object providing ``get_topic_word_dist`` and ``used_vocabs``
        (the loaded LDA model).
    min_score : float
        Cumulative weight to exceed. A negative value yields an empty list,
        because the running total starts at 0.

    Returns
    -------
    list
        The selected vocabulary entries, heaviest first.
    """
    dist = model.get_topic_word_dist(topic)
    # Look the vocabulary up once rather than once per kept word; presumably
    # `used_vocabs` is computed on access by tomotopy, which would make the
    # repeated lookup inside the loop needlessly expensive.
    vocab = model.used_vocabs
    indices = np.flip(np.argsort(dist))  # descending by weight
    score = 0
    word_kept = []
    for index in indices:
        # Checked before adding, so the word that crosses the threshold is kept.
        if score > min_score:
            break
        score += dist[index]
        word_kept.append(vocab[index])
    return word_kept

TO-DO
1. Create a topics DataFrame with `id` and `words` columns, using `get_top_words` to fill in the words.
2. Add a topics column to the DataFrame of abstracts and ids, then explode it so that each row holds a single topic.
...
<br>n. Import into Neo4j


In [19]:
# Example: the heaviest words of topic 2, collected until their cumulative weight exceeds 0.8.
get_top_words(2, model, 0.8)


['tc',
 'spectrum',
 'peak',
 'ce',
 'ab',
 'spectral',
 'band',
 'deriv',
 'sa',
 'transit',
 'experi',
 'signal',
 'inclus',
 'cen',
 'atc',
 'pin',
 'nm',
 'specif_heat',
 'temperatur',
 'irradi',
 'muon_spin',
 'ihm',
 'baryon',
 'compound',
 'mass',
 'reconstruct',
 'μsr',
 'fit',
 'co',
 'temperatur_depend',
 'imprecis',
 'ft',
 'eno',
 'cef',
 'elast',
 'seri',
 'heat_rate',
 'nu',
 'site_specif',
 'ga',
 'compon',
 'myo_ip',
 'iaa',
 'chlamydia',
 'critic_temperatur',
 'amplif',
 'zero',
 'background',
 'channel',
 'crossov',
 'epr',
 'gamma',
 'axi',
 'liquid_water',
 'superfluid_densiti',
 'cross_section',
 'ion',
 'ceo_nm',
 'rotat_relax',
 'broad',
 'employ',
 'base_superconductor',
 'famili',
 'lat',
 'one_nucleu',
 'high_resolut',
 'iodin_intak',
 'axl',
 'epidot',
 'anthropogen_ce',
 'mu_sr',
 'ensembl',
 'fd',
 'superconduct_gap',
 'hyscor',
 'soc',
 'hadron',
 'ptfe',
 'amorph_ice',
 'calcit_wedg',
 'site_amplif',
 'sewag_sludg',
 'anisotrop',
 'wavenumb',
 'bedrock',


In [15]:
# Scratch check of the descending-argsort idiom used by get_top_topics and
# get_top_words. No seed is set, so the displayed order changes on every run.
a = np.random.randn(150)
np.flip(np.argsort(a))

array([118, 111,  32,  98,  29, 116, 107,  63,  54, 141, 119,  23,  50,
        73, 124, 139,  78,   9, 123,  56,  89,  88,  62,  33, 145,  36,
         3,  47, 100,  70,  69,  14, 146, 132, 102,  21, 138,  16, 134,
        74, 106,  43, 144,  92,  27,  15,  66, 126,  42,  13,   4,  71,
        18, 120,  38,  45,  49,  64,   2,  48, 101,  10, 114, 128,  61,
       109,  51, 147,  39,  65,  37,  68,   5,   8, 137,  58,  41, 110,
       149, 105,  90,  67, 113,  80,  34,  17,  72, 115,   6,  95,  94,
       131, 133,  97,  35,  82,  12, 122, 140, 117,  22,  31,  84,  25,
        30,  81, 121,  79,  44,  53, 127,  86,  19,  96,  40, 135,  87,
        57,  20, 103,  55,  26, 142, 136,  59, 125,  85,   0,   7,  77,
       143, 129,  60, 130,  93, 108,  91,  46,  76, 112,  83,  99, 148,
        28, 104,   1,  11,  75,  52,  24], dtype=int64)

In [7]:
# Example: the heaviest topics of the first sample text, collected until their cumulative weight exceeds 0.5.
get_top_topics(lis[0], model, 0.5)

getting warmed up
bye
okay
hey


[61, 41, 6]

In [12]:
# Topic distribution of the first sample document.
# NOTE(review): presumably only meaningful once model.infer has been run on new_docs — confirm.
new_docs[0].get_topic_dist()

<tomotopy.Document with words="hi data scienc student giulia andrea unit state america dog ran away fli space tomorrow">

In [65]:
# First entry of the per-document topic distributions returned by
# model.infer(new_docs); `topic_dist` must have been assigned by that call
# before this cell runs.
topic_dist[0]


array([0.00021095, 0.00022266, 0.00024476, 0.00025742, 0.00026388,
       0.00027699, 0.00031298, 0.00031747, 0.00033379, 0.00033483,
       0.00033894, 0.00035161, 0.00035175, 0.00035621, 0.00035923,
       0.00036826, 0.00037274, 0.00037918, 0.0003797 , 0.00038702,
       0.00039003, 0.00039752, 0.00039883, 0.0004023 , 0.00040487,
       0.0004055 , 0.00040712, 0.0004217 , 0.00042649, 0.00043166,
       0.0004379 , 0.00043901, 0.00044091, 0.00044337, 0.00044419,
       0.00044908, 0.00045016, 0.00045125, 0.00045322, 0.00045856,
       0.00045867, 0.00045871, 0.00045909, 0.00045925, 0.00046154,
       0.00046228, 0.00046233, 0.00046255, 0.00047902, 0.00048124,
       0.00048155, 0.00048256, 0.00048497, 0.0004939 , 0.00049755,
       0.00049775, 0.00049986, 0.00050029, 0.00050887, 0.00051399,
       0.00051472, 0.00051829, 0.00051945, 0.00052012, 0.00052204,
       0.00052244, 0.00053155, 0.00053449, 0.00053821, 0.00054281,
       0.00054418, 0.00055921, 0.00055956, 0.00056304, 0.00057

In [None]:
def get_test_LL(test_docs, model):
    """Score held-out documents by their mean log-likelihood under ``model``.

    Parameters
    ----------
    test_docs : iterable of list of str
        Tokenised documents; each is wrapped with ``model.make_doc``.
    model
        Object providing ``make_doc`` and ``infer``; ``infer`` is expected to
        return a ``(topic distributions, log-likelihoods)`` pair.

    Returns
    -------
    float
        ``np.mean`` of the per-document log-likelihoods.
    """
    # The model only accepts its own document type, so convert every input first.
    prepared = [model.make_doc(tokens) for tokens in test_docs]

    # The topic distributions are not needed for this metric.
    _topic_dists, log_likelihoods = model.infer(prepared)

    return np.mean(log_likelihoods)