# Topic modelling med gensim og dhlab

In [18]:
import dhlab as dh
import pandas as pd
import gensim
import pyLDAvis
from pprint import pprint
import pyLDAvis.gensim_models as genvis

In [19]:
# sprit urn URN:NBN:no-nb_digibok_2020090207537

Finn en bok

In [20]:
# Build a dhlab Corpus of digitised books (doctype="digibok") using the title
# query "Norgeshistorie", capped at a single document (limit=1).
bok = dh.Corpus(doctype="digibok", limit=1, title="Norgeshistorie")

In [21]:
# Display the corpus so the metadata of the selected book can be checked.
bok

Unnamed: 0,dhlabid,urn,title,authors,oaiid,sesamid,isbn10,city,timestamp,year,publisher,langs,subjects,ddc,genres,literaryform,doctype,ocr_creator,ocr_timestamp
0,100284219,URN:NBN:no-nb_digibok_2016021608256,"Norgeshistorie etter 1850, VK II : lærebok","Olstad , Finn",oai:nb.bibsys.no:999408838804702202,784f7a0ffde006a7f606ed4f65b29202,8200408442,,19940101,1994,Universitetsforl.,nob,Historie / historie / norge,,,Faglitteratur,digibok,nb,20060101


In [22]:
# Take the URN from the first row of the corpus frame (the corpus was built
# with limit=1) and echo it as the cell output.
urn = bok.frame["urn"].tolist()[0]
urn

'URN:NBN:no-nb_digibok_2016021608256'

## Chunking

In [23]:
# Fetch the book as a sequence of chunks via dhlab.
# chunks=1000 presumably sets the chunk size in tokens — TODO confirm against the dhlab docs.
res = dh.Chunks(chunks=1000, urn=urn)

In [24]:
# Number of chunks returned for the book.
len(res.chunks)

116

In [25]:
def chunks_to_corpus(chunks_list):
    """Expand per-chunk token counts into space-separated text strings.

    Args:
        chunks_list: Iterable of mappings from token (str) to an integer
            count, one mapping per chunk.

    Returns:
        A list with one string per chunk. Each token appears as many times
        as its count, every occurrence followed by a single space (so a
        non-empty string ends with a trailing space). Tokens keep the
        iteration order of their mapping.
    """
    return [
        "".join((token + " ") * counts[token] for token in counts)
        for counts in chunks_list
    ]

In [26]:
# Expand every chunk's token counts into a plain text string.
# NOTE(review): `texts` is not referenced again in the cells visible here —
# confirm whether this cell is still needed.
texts = chunks_to_corpus(res.chunks)

## Find delta TFIDF

In [27]:
# Term-by-chunk count matrix: one row per token, one column per chunk.
# Tokens that do not occur in a chunk get a count of 0 instead of NaN.
df = pd.DataFrame(res.chunks).T.fillna(0)

In [28]:
# Inspect the term-by-chunk matrix (tokens as rows, chunk numbers as columns).
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,106,107,108,109,110,111,112,113,114,115
(,3.0,2.0,2.0,4.0,0.0,2.0,6.0,0.0,3.0,0.0,...,1.0,5.0,3.0,5.0,4.0,2.0,4.0,0.0,7.0,1.0
),3.0,2.0,2.0,3.0,1.0,2.0,5.0,0.0,3.0,2.0,...,1.0,5.0,3.0,5.0,4.0,2.0,4.0,0.0,7.0,1.0
",",29.0,24.0,38.0,29.0,30.0,22.0,38.0,35.0,26.0,36.0,...,34.0,32.0,26.0,40.0,42.0,310.0,24.0,12.0,38.0,28.0
-,26.0,11.0,2.0,8.0,5.0,20.0,12.0,3.0,16.0,0.0,...,7.0,7.0,15.0,16.0,8.0,1.0,16.0,3.0,8.0,13.0
.,60.0,32.0,60.0,56.0,27.0,52.0,43.0,66.0,39.0,35.0,...,66.0,49.0,60.0,47.0,54.0,6.0,58.0,23.0,60.0,51.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
virkernidler,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
vitnet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Ågotnes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Åttetimersdag,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [29]:
# Fetch frequency totals for the top 50000 tokens in the NB collection;
# these serve as the reference frequencies in the next step.
tot = dh.totals(50000)

In [30]:
# Ratio of each token's total count in the book (summed over all chunks) to
# its frequency in the reference totals. The division aligns on the index, so
# tokens missing from either side come out as NaN.
# NOTE(review): this rebinds `res`, which until now held the dh.Chunks result;
# after this cell, re-running any earlier cell that reads `res.chunks` fails.
# Consider a distinct name here and in the following cell.
res = df.sum(axis=1) /  tot.freq

In [31]:
# Keep (at most) the 1000 tokens with the highest book-to-reference ratio.
# sort_values() places NaN last by default; dropna() removes any NaN entries
# that still fall inside the first 1000 positions.
target_tokens = res.sort_values(ascending=False).iloc[:1000].dropna().index

In [32]:
# Term-by-chunk counts restricted to the selected target tokens.
df.loc[target_tokens]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,106,107,108,109,110,111,112,113,114,115
bidrog,0.0,0.0,1.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
Eyde,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
regjeringsmakten,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
samfunnsutviklingen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Nygaardsvold,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
harme,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
statlige,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0
oppløst,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
spasere,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [43]:
# Rebuild one pseudo-document per chunk using only the selected target tokens:
# each token is written out once per occurrence in that chunk, followed by a
# space. Counts are floats in `df` (because of fillna), hence the int() cast.
# The row selection is done once up front instead of once per chunk column.
selected_counts = df.loc[target_tokens]

outer_lst = [
    "".join(
        (token + " ") * int(count)
        for token, count in column.items()
        if count > 0
    )
    for _, column in selected_counts.items()
]


In [34]:
# Sanity check: there should be one string per chunk.
len(outer_lst)

116

## Prep for LDA

In [35]:
# Split every chunk string on whitespace: one list of tokens per chunk.
data = list(map(str.split, outer_lst))

In [36]:
# Build the gensim Dictionary (token <-> integer id mapping) from the token lists.
id2word = gensim.corpora.Dictionary(data)

In [37]:
# Convert every token list into gensim's bag-of-words representation.
corpus = list(map(id2word.doc2bow, data))

In [38]:
# Inspect the tokenised chunks.
# NOTE(review): this displays the whole nested list (one token list per
# chunk); a slice such as data[0][:20] would keep the output readable.
data

[['stemmerett',
  'stemmerett',
  'stemmerett',
  'kvinnenes',
  'kvinnenes',
  'kvinnenes',
  '1884',
  '1884',
  '1884',
  '1884',
  'Hansteen',
  '1880',
  '1880',
  '1880',
  'innebar',
  'århundreskiftet',
  '1884.',
  'arbeidslivet',
  '1850',
  'ugifte',
  'ugifte',
  'Hagbart',
  'Hagbart',
  '1920',
  'stemmeretten',
  'stemmeretten',
  'Arbeiderpartiet',
  'telegraf',
  'statsråder',
  'kvinnene',
  'kvinnene',
  'kvinnene',
  'kvinnene',
  'kvinnene',
  'årene',
  'årene',
  'årene',
  'årene',
  'årene',
  'brot',
  'dominert',
  'Kvinners',
  'borgerlige',
  'borgerlige',
  '1860',
  'likestilling',
  'likestilling',
  'arbeidsinntekt',
  'samfunnet',
  '1945',
  'preget',
  '1882.',
  'myndige',
  'Gina',
  'Gina',
  'Gina',
  'Gina',
  'Gina',
  'Rogstad',
  'arbeidere',
  'bestod',
  'bestod',
  'sosiale',
  'sosiale',
  'examen',
  'examen',
  'examen',
  'gav',
  'åta',
  'åta',
  'åta',
  'regjeringen',
  '1910',
  '1972',
  '1882',
  'Stortinget',
  'Stortinget',
  

## Make model

In [39]:
# Train a 10-topic LDA model on the bag-of-words corpus.
# random_state seeds gensim's random number generator so the topics are
# repeatable between runs instead of changing on every execution. Because
# LdaMulticore trains in worker processes, small run-to-run differences may
# remain — TODO confirm on the target platform.
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=42,
)

In [40]:
# Pretty-print the topics as weighted word lists.
pprint(lda_model.print_topics())

[(0,
  '0.030*"årene" + 0.026*"Stortinget" + 0.019*"regjeringen" + 0.014*"tyskerne" '
  '+ 0.012*"krigen" + 0.011*"Venstre" + 0.011*"Hvorfor" + 0.011*"Sverdrup" + '
  '0.011*"1880" + 0.009*"kvinnene"'),
 (1,
  '0.024*"årene" + 0.016*"regjeringen" + 0.015*"Stortinget" + 0.011*"gav" + '
  '0.010*"Arbeiderpartiet" + 0.010*"skapte" + 0.009*"økte" + '
  '0.009*"Gerhardsen" + 0.008*"samfunnet" + 0.008*"EF"'),
 (2,
  '0.024*"0,1" + 0.023*"NS" + 0.021*"årene" + 0.018*"krigen" + 0.018*"Kilde" + '
  '0.013*"Hvordan" + 0.013*"0,2" + 0.012*"tyskerne" + 0.012*"Nasjonal" + '
  '0.010*"0,3"'),
 (3,
  '0.018*"Arbeiderpartiet" + 0.017*"årene" + 0.009*"1920" + 0.009*"181" + '
  '0.009*"Hvordan" + 0.008*"257" + 0.008*"256" + 0.008*"gav" + 0.007*"Hvorfor" '
  '+ 0.007*"Gerhardsen"'),
 (4,
  '0.013*"årene" + 0.011*"stemmerett" + 0.011*"122" + 0.010*"194" + '
  '0.010*"regjeringen" + 0.009*"Backer" + 0.009*"abort" + 0.009*"1800" + '
  '0.009*"Stortinget" + 0.008*"Harriet"'),
 (5,
  '0.045*"årene" + 0.016*"1

In [41]:
# Prepare the pyLDAvis visualisation data from the trained model, the
# bag-of-words corpus and the dictionary.
prep = genvis.prepare(lda_model, corpus, id2word)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [42]:
# Save the prepared visualisation to result.html (path is relative to the
# notebook's working directory).
pyLDAvis.save_html(prep, "result.html")

In [44]:
# Switch pyLDAvis to notebook mode so visualisations render inline.
pyLDAvis.enable_notebook()

In [47]:
# Render the interactive topic visualisation in the notebook.
pyLDAvis.display(prep)