# Topic modelling med gensim og dhlab

In [1]:
import dhlab as dh
import pandas as pd
import gensim
import pyLDAvis
from pprint import pprint
import pyLDAvis.gensim_models as genvis

In [2]:
# sprit urn URN:NBN:no-nb_digibok_2020090207537

Finn en bok

In [3]:
# Build a dhlab corpus restricted to digitised books ("digibok"), capped at a
# single document, using "Norgeshistorie" as the title criterion.
# NOTE(review): the returned title is "Norgeshistorie for realskolen", so
# `title` evidently is not an exact-match filter — confirm in the dhlab docs.
bok = dh.Corpus(doctype="digibok", limit=1, title="Norgeshistorie")

In [4]:
# Show the corpus; the output below is its one-row metadata table.
bok

Unnamed: 0,dhlabid,urn,title,authors,oaiid,sesamid,isbn10,city,timestamp,year,publisher,langs,subjects,ddc,genres,literaryform,doctype,ocr_creator,ocr_timestamp
0,100443356,URN:NBN:no-nb_digibok_2008071500119,Norgeshistorie for realskolen,"Jensen , Magnus",oai:nb.bibsys.no:999825090494702202,cc8bff9c30101cd12ecf5d436e91d30d,,Oslo,19530101,1953,,nob,,,,Uklassifisert,digibok,nb,20060101


In [5]:
# Pull the URN of the first row of the corpus metadata frame; it identifies
# the book in the dhlab calls that follow.
urn = bok.frame.urn.tolist()[0]
urn

'URN:NBN:no-nb_digibok_2008071500119'

## Chunking

In [6]:
# Fetch the book as a sequence of chunks. `res.chunks` is used below as a
# list with one {token: count} mapping per chunk.
# NOTE(review): chunks=1000 presumably sets the chunk size in tokens —
# TODO confirm against the dhlab documentation.
res = dh.Chunks(chunks=1000, urn=urn)

In [7]:
# Number of chunks the book was split into.
len(res.chunks)

62

In [8]:
def chunks_to_corpus(chunks_list):
    """Expand per-chunk token counts into space-separated pseudo-texts.

    Parameters
    ----------
    chunks_list : iterable of mappings
        One mapping per chunk, from token (str) to its integer count.

    Returns
    -------
    list of str
        One string per chunk. Every token is written out ``count`` times in
        the mapping's iteration order, and each occurrence is followed by a
        single space (so a non-empty result ends with a trailing space).
    """
    return [
        "".join((token + " ") * chunk[token] for token in chunk)
        for chunk in chunks_list
    ]

In [9]:
# Expand every chunk's token counts into one plain-text string per chunk.
# NOTE(review): `texts` is not referenced by any later cell visible in this
# notebook; the LDA input is built from `outer_lst` instead.
texts = chunks_to_corpus(res.chunks)

## Find delta TFIDF

In [10]:
# Token-by-chunk count matrix: after the transpose each row is a token and
# each column a chunk; tokens absent from a chunk are filled with 0.
df = pd.DataFrame(res.chunks).transpose().fillna(0)

In [11]:
# Inspect the count matrix (pandas truncates the display).
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,52,53,54,55,56,57,58,59,60,61
(,2.0,0.0,2.0,1.0,2.0,1.0,4.0,6.0,0.0,3.0,...,2.0,0.0,2.0,0.0,3.0,3.0,0.0,0.0,5.0,0.0
),2.0,0.0,4.0,1.0,2.0,1.0,4.0,6.0,0.0,3.0,...,2.0,0.0,2.0,0.0,5.0,3.0,0.0,1.0,5.0,0.0
",",30.0,105.0,34.0,44.0,41.0,30.0,43.0,35.0,47.0,41.0,...,44.0,39.0,39.0,39.0,41.0,43.0,45.0,80.0,43.0,38.0
.,63.0,2.0,45.0,60.0,61.0,53.0,58.0,75.0,55.0,58.0,...,64.0,71.0,56.0,49.0,51.0,70.0,57.0,42.0,51.0,56.0
/,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
trått,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
vanskene,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Übåtkrig,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
übåt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [12]:
# Reference counts for the 50,000 most frequent tokens in the NB collection;
# the `freq` column is the denominator in the ratio computed next.
tot = dh.totals(50000)

In [13]:
# Total count of each token in the book (summed over all chunks) divided by
# its reference frequency in `tot`.
# NOTE(review): this rebinds `res`, which until now held the dh.Chunks
# result. Re-running the cell that builds `df` from `res.chunks` after this
# one will fail; a distinct name would keep the cells re-runnable.
res = df.sum(axis=1) /  tot.freq

In [14]:
# Rank tokens by the ratio, highest first, and keep the index labels of the
# first 1000. NaN ratios that end up in that slice are dropped, so fewer than
# 1000 tokens may remain.
target_tokens = res.sort_values(ascending=False).iloc[:1000].dropna().index

In [15]:
# Per-chunk counts restricted to the selected tokens.
df.loc[target_tokens]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,52,53,54,55,56,57,58,59,60,61
årh.,0.0,0.0,0.0,0.0,1.0,2.0,3.0,0.0,0.0,1.0,...,2.0,2.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0
riker,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,...,1.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
adel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,3.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
Valdemar,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
valte,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vernepliktige,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
blodig,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dyrkede,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [16]:
# Rebuild one pseudo-document per chunk that contains only the selected
# tokens, each repeated as many times as it occurs in that chunk.

# The row selection does not depend on the column, so slice the frame once
# instead of re-slicing it for the loop header and again for every column.
target_counts = df.loc[target_tokens]

outer_lst = []

for col in target_counts.columns:
    # Counts are floats after fillna(0) on the frame, hence the int() cast
    # before repeating the token. Tokens with a zero count are skipped.
    inner_str = "".join(
        (token + " ") * int(count)
        for token, count in target_counts[col].items()
        if count > 0
    )
    outer_lst.append(inner_str)


In [17]:
# Sanity check: one pseudo-document per chunk.
len(outer_lst)

62

## Prep for LDA

In [18]:
# Split every pseudo-document on whitespace into a list of tokens.
data = [x.split() for x in outer_lst]

In [19]:
# gensim dictionary built from the tokenised chunks (token <-> integer id).
id2word = gensim.corpora.Dictionary(data)

In [20]:
# Bag-of-words representation of every chunk, expressed with the ids from
# `id2word`; this is the corpus format passed to the LDA model.
corpus = [id2word.doc2bow(chunk) for chunk in data]

## Make model

In [22]:
# Train a 10-topic LDA model on the bag-of-words corpus.
# LDA is stochastic: without a fixed seed every Restart & Run All yields
# different topics. `random_state` pins the initialisation; with several
# worker processes small run-to-run differences may still occur.
lda_model = gensim.models.LdaMulticore(
    corpus = corpus,
    id2word = id2word,
    num_topics = 10,
    random_state = 42
)

In [23]:
# Pretty-print each topic with its highest-weighted terms.
pprint(lda_model.print_topics())

[(0,
  '0.023*"Sverige" + 0.019*"kongen" + 0.019*"bøndene" + 0.018*"konge" + '
  '0.015*"Kristian" + 0.014*"Danmark" + 0.014*"danske" + 0.014*"svenske" + '
  '0.013*"makt" + 0.012*"gav"'),
 (1,
  '0.016*"Sverige" + 0.015*"lærte" + 0.015*"Danmark" + 0.012*"Norden" + '
  '0.011*"Russland" + 0.011*"krig" + 0.010*"romerne" + 0.009*"skapte" + '
  '0.009*"nordiske" + 0.008*"gav"'),
 (2,
  '0.032*"Håkon" + 0.023*"kongen" + 0.013*"Eirik" + 0.013*"Kristian" + '
  '0.013*"bøndene" + 0.013*"Sverdrup" + 0.012*"Sverige" + 0.009*"danske" + '
  '0.009*"Karl" + 0.008*"sønn"'),
 (3,
  '0.023*"Fredrik" + 0.019*"Finnland" + 0.015*"Kristian" + 0.014*"Sverige" + '
  '0.012*"Danmark" + 0.012*"gav" + 0.011*"svensk" + 0.011*"hær" + '
  '0.011*"kongen" + 0.010*"Gustav"'),
 (4,
  '0.014*"kongen" + 0.014*"gav" + 0.014*"satte" + 0.014*"bøndene" + '
  '0.013*"Kristian" + 0.011*"Gustav" + 0.010*"Sverige" + 0.009*"danske" + '
  '0.009*"skip" + 0.008*"tyskerne"'),
 (5,
  '0.029*"Danmark" + 0.028*"Sverige" + 0.016*"da

In [24]:
# Prepare the pyLDAvis data from the trained model, the bag-of-words corpus
# and the dictionary.
prep = genvis.prepare(lda_model, corpus, id2word)

In [25]:
# Optional: uncomment to export the visualisation as a standalone HTML file.
# pyLDAvis.save_html(prep, "result.html")

In [26]:
# Switch pyLDAvis to inline notebook rendering before displaying anything.
pyLDAvis.enable_notebook()

In [27]:
# Render the interactive topic visualisation in the notebook.
pyLDAvis.display(prep)