In [2]:
import dhlab as dh
from pymongo import MongoClient
import pandas as pd
from dotenv import load_dotenv
import os
from typing import Generator, List, Tuple, Union
from pymongo.collection import Collection


load_dotenv()

connection_string = os.getenv('MONGODB_LOCAL_URI')

db = MongoClient(connection_string).norn # Connect to database
tok = dh.nbtokenizer.tokenize # Tokenizer
collection = db['poems']

## Korpusmetrikker

In [3]:
def count_token(token: str, collection: Collection) -> int:
    total_count = 0
    for doc in collection.find():
        total_count += doc['text'].count(token)  # Replace 'text_field' with the name of your field containing the text
    return total_count

In [4]:
count_token('jeg', collection)

7344

In [5]:
# Ordfrekvens

from collections import Counter

# Initialize a counter for all texts
total_frequency = Counter()

for doc in collection.find():
    #words = doc['text'].split()  # Tokenize the text
    words = doc["tokens"]
    total_frequency.update(words)
    
pd.DataFrame.from_dict(total_frequency, orient='index', columns=['frequency']).sort_values(by='frequency', ascending=False).head(10)

Unnamed: 0,frequency
",",69401
.,36224
og,25679
i,15608
som,9880
!,9539
er,9313
det,9069
du,7909
jeg,7291


## Konkordans

In [6]:
from norn_dh.text.concordance import get_concordance

get_concordance('jeg', collection, before=10, after=10)

Unnamed: 0,text_id,before,token,after
0,65a929105a5e71a3c6f4f2b8,men hvor blev der av den ene ? Nu ser,jeg,ei mer end to. 64 Mama ! Tre smaagutter
1,65a929105a5e71a3c6f4f2bc,mot himlen vinked . 1 ) ved de følgende maa,jeg,nok i særlig grad paakalde læserens overbærenh...
2,65a929105a5e71a3c6f4f2bc,grad paakalde læserens overbærenhet med det pe...,jeg,faar haabe han godskriver mig paa sønlighetens...
3,65a929105a5e71a3c6f4f2bf,"stunder , og Herren os gjensynet under , nokk vet",jeg,", hvem da oven sky vil kjærlig i møte"
4,65a929105a5e71a3c6f4f2bf,og yndigt trilled dine toner – nu vil for dig,jeg,"synge da her for siste gang , om lytte"
...,...,...,...,...
7286,65a929125a5e71a3c6f4fe2f,"den store dag faar jeg hans krans , det har",jeg,hørt af fanden . q1
7287,65a929125a5e71a3c6f4fe30,"mine tanker : Al glød har himlen skabt , og",jeg,"har ranet fra den en stjerne , den har"
7288,65a929125a5e71a3c6f4fe31,"og varm , vær ikke saa bedrøvet , kom ,",jeg,"skal gjemme dig ved min barm , til solen"
7289,65a929125a5e71a3c6f4fe32,de sorte bjerge ligger en stille sjø . Der har,jeg,"ofte ønsket , jeg maatte faa lov til at"


## Kollokasjon

In [24]:
from collections import Counter
from norn_dh.text.concordance import concordance

res = []
for text in collection.find():
        conc_generator = concordance("kvinde", text['tokens'], 10, 10, return_tokens=True)
        res.append(conc_generator)


counter = Counter()

for gen in res:
    for before, token, after in gen:    
        counter.update(before)
        counter.update(after)
        
koll = pd.DataFrame.from_dict(counter, orient='index', columns=['frequency'])

In [26]:
koll

Unnamed: 0,frequency
",",219
sove,1
.,110
66,1
-,7
...,...
fandtes,1
bølgernes,1
skjød,1
vugger,1


In [20]:
# Total

from collections import Counter

# Initialize a counter for all texts
total_frequency = Counter()

for doc in collection.find():
    #words = doc['text'].split()  # Tokenize the text
    words = doc["tokens"]
    total_frequency.update(words)
    
tot = pd.DataFrame.from_dict(total_frequency, orient='index', columns=['frequency']).sort_values(by='frequency', ascending=False).head(10)

In [25]:
# Relative frequency
( koll / tot).sort_values(by='frequency', ascending=False).head(10)

Unnamed: 0,frequency
jeg,0.004663
du,0.004172
er,0.003973
!,0.003669
og,0.003388
som,0.00334
",",0.003156
.,0.003037
det,0.002316
i,0.001474


## Document Term Matrix

In [17]:
# DTM

documents = collection.find({}, {"tokens" : 1}) # Get all documents

df = pd.DataFrame(documents) # Create dataframe
s = df.set_index("_id").tokens.explode() # Explode tokens
t = s.to_frame().reset_index().reset_index().rename(columns={"index" : "n"}).groupby(["_id", "tokens"]).count() # Count tokens
r = t.reset_index().sort_values(by=["n"], ascending=False) # Sort by count
dtm = r.pivot(columns="_id", values="n", index="tokens").fillna(0) # Pivot table, fill NaN with 0

In [18]:
dtm

_id,65a929105a5e71a3c6f4f2b7,65a929105a5e71a3c6f4f2b8,65a929105a5e71a3c6f4f2b9,65a929105a5e71a3c6f4f2ba,65a929105a5e71a3c6f4f2bb,65a929105a5e71a3c6f4f2bc,65a929105a5e71a3c6f4f2bd,65a929105a5e71a3c6f4f2be,65a929105a5e71a3c6f4f2bf,65a929105a5e71a3c6f4f2c0,...,65a929125a5e71a3c6f4fe29,65a929125a5e71a3c6f4fe2a,65a929125a5e71a3c6f4fe2b,65a929125a5e71a3c6f4fe2c,65a929125a5e71a3c6f4fe2d,65a929125a5e71a3c6f4fe2e,65a929125a5e71a3c6f4fe2f,65a929125a5e71a3c6f4fe30,65a929125a5e71a3c6f4fe31,65a929125a5e71a3c6f4fe32
tokens,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
!,8.0,8.0,14.0,8.0,12.0,4.0,1.0,4.0,7.0,12.0,...,1.0,1.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""",0.0,1.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
%,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
&,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
⁹le,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
€,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
⸗,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ꝛ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Dtm sum by document
dtm.sum().to_frame().sort_values(by=0, ascending=False)

Unnamed: 0_level_0,0
_id,Unnamed: 1_level_1
65a929115a5e71a3c6f4f876,7176.0
65a929115a5e71a3c6f4f3cd,4612.0
65a929125a5e71a3c6f4fbf8,3840.0
65a929115a5e71a3c6f4f3d3,3462.0
65a929125a5e71a3c6f4fbc1,3232.0
...,...
65a929125a5e71a3c6f4fd0a,28.0
65a929115a5e71a3c6f4f590,27.0
65a929115a5e71a3c6f4f44f,24.0
65a929125a5e71a3c6f4fd01,22.0


In [21]:
# Percentage
dtm_percent = dtm.apply(lambda x: 100 * (x / x.sum()), axis=1)
dtm_percent.head(5)

_id,65a929105a5e71a3c6f4f2b7,65a929105a5e71a3c6f4f2b8,65a929105a5e71a3c6f4f2b9,65a929105a5e71a3c6f4f2ba,65a929105a5e71a3c6f4f2bb,65a929105a5e71a3c6f4f2bc,65a929105a5e71a3c6f4f2bd,65a929105a5e71a3c6f4f2be,65a929105a5e71a3c6f4f2bf,65a929105a5e71a3c6f4f2c0,...,65a929125a5e71a3c6f4fe29,65a929125a5e71a3c6f4fe2a,65a929125a5e71a3c6f4fe2b,65a929125a5e71a3c6f4fe2c,65a929125a5e71a3c6f4fe2d,65a929125a5e71a3c6f4fe2e,65a929125a5e71a3c6f4fe2f,65a929125a5e71a3c6f4fe30,65a929125a5e71a3c6f4fe31,65a929125a5e71a3c6f4fe32
tokens,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
!,0.083866,0.083866,0.146766,0.083866,0.125799,0.041933,0.010483,0.041933,0.073383,0.125799,...,0.010483,0.010483,0.083866,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""",0.0,0.47619,0.0,0.0,0.0,0.0,1.428571,0.952381,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$,0.0,0.0,1.298701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
%,0.0,0.0,0.0,1.470588,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
&,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
dtm_percent.transpose()

tokens,!,"""",$,%,&,',(,),*,+,...,⁸g,⁸tille,⁸⁷,⁹,⁹l,⁹le,€,⸗,ꝛ,ꝛ8
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
65a929105a5e71a3c6f4f2b7,0.083866,0.00000,0.000000,0.000000,0.0,0.000000,0.19802,0.483481,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9.090909,0.0,0.0,0.0
65a929105a5e71a3c6f4f2b8,0.083866,0.47619,0.000000,0.000000,0.0,0.000000,0.19802,0.161160,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
65a929105a5e71a3c6f4f2b9,0.146766,0.00000,1.298701,0.000000,0.0,0.000000,0.00000,0.241741,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
65a929105a5e71a3c6f4f2ba,0.083866,0.00000,0.000000,1.470588,0.0,0.000000,0.09901,0.322321,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
65a929105a5e71a3c6f4f2bb,0.125799,0.00000,0.000000,0.000000,0.0,0.827653,0.09901,0.483481,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65a929125a5e71a3c6f4fe2e,0.000000,0.00000,0.000000,0.000000,0.0,0.048685,0.00000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
65a929125a5e71a3c6f4fe2f,0.000000,0.00000,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
65a929125a5e71a3c6f4fe30,0.000000,0.00000,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
65a929125a5e71a3c6f4fe31,0.000000,0.00000,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [23]:
# Total percentages
s = dtm.sum(axis=1)
tot = s.sum()
sum_percent = s.apply(lambda x: 100 *(x / tot))

sum_percent.sort_values(ascending=False).head(20)

tokens
,      8.225265
.      4.293194
og     3.043423
i      1.849828
som    1.170957
!      1.130543
er     1.103758
det    1.074839
du     0.937359
jeg    0.864114
den    0.806633
;      0.780441
med    0.755315
til    0.748678
en     0.637508
paa    0.607997
for    0.585123
han    0.574694
:      0.573627
de     0.549923
dtype: float64

### Dokumentlikhet

In [27]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Create a sparse matrix
sparse_matrix = csr_matrix(dtm_percent.transpose().values)

# Compute the cosine similarity
cosine_sim = cosine_similarity(sparse_matrix)

# To dataframe
df = pd.DataFrame(cosine_sim, columns=list(dtm_percent.columns), index=list(dtm_percent.columns))

df.sample(5)

Unnamed: 0,65a929105a5e71a3c6f4f2b7,65a929105a5e71a3c6f4f2b8,65a929105a5e71a3c6f4f2b9,65a929105a5e71a3c6f4f2ba,65a929105a5e71a3c6f4f2bb,65a929105a5e71a3c6f4f2bc,65a929105a5e71a3c6f4f2bd,65a929105a5e71a3c6f4f2be,65a929105a5e71a3c6f4f2bf,65a929105a5e71a3c6f4f2c0,...,65a929125a5e71a3c6f4fe29,65a929125a5e71a3c6f4fe2a,65a929125a5e71a3c6f4fe2b,65a929125a5e71a3c6f4fe2c,65a929125a5e71a3c6f4fe2d,65a929125a5e71a3c6f4fe2e,65a929125a5e71a3c6f4fe2f,65a929125a5e71a3c6f4fe30,65a929125a5e71a3c6f4fe31,65a929125a5e71a3c6f4fe32
65a929115a5e71a3c6f4f57e,7.916359e-07,3.728405e-06,5.096515e-07,1e-06,2e-06,4.155875e-07,7.462132e-07,5e-06,4e-06,3.668746e-06,...,6.606717e-06,1.073882e-06,8.319638e-07,2.744097e-06,2.272936e-06,4.565707e-06,5.597311e-07,3.484334e-06,5.477286e-07,3.072413e-06
65a929115a5e71a3c6f4f474,2.57775e-06,5.991659e-05,9.502549e-06,1e-06,1.2e-05,1.196682e-07,9.078204e-05,6.1e-05,7.2e-05,9.744757e-07,...,8.704336e-07,8.739968e-06,1.607401e-07,1.589238e-05,8.483938e-07,5.374195e-08,0.0004407917,3.025195e-07,1.550074e-07,6.852658e-08
65a929115a5e71a3c6f4f89e,1.185351e-07,5.393737e-07,1.363878e-06,0.00075,7e-06,9.421539e-07,9.58528e-06,1.8e-05,1.2e-05,8.950041e-07,...,2.050635e-06,1.835821e-06,1.587652e-07,7.502376e-07,1.691032e-05,1.39631e-05,1.81899e-07,4.678152e-08,2.363027e-07,6.667207e-06
65a929125a5e71a3c6f4fad6,8.595741e-07,4.261962e-06,1.844713e-06,2.9e-05,2.4e-05,4.601509e-06,1.077574e-05,8e-06,4.9e-05,7.780069e-06,...,3.639091e-07,5.55395e-07,1.830115e-06,6.66671e-06,3.673634e-07,3.178977e-07,1.291571e-06,1.409926e-07,5.512411e-06,1.160345e-06
65a929115a5e71a3c6f4f419,3.132454e-06,3.627768e-06,4.522224e-06,4.4e-05,2e-06,7.868053e-05,3.834425e-05,3e-06,5e-06,2.703903e-05,...,1.459365e-05,7.098306e-06,1.042968e-06,0.0002539497,3.351914e-06,8.255474e-06,9.406954e-07,1.209585e-06,2.041424e-06,1.027077e-06


In [39]:
# Find different dokuments that are similar
different_documents = df.stack().reset_index().loc[lambda x: x.level_0 != x.level_1]


different_documents.sort_values(by=0, ascending=False).head(10)

Unnamed: 0,level_0,level_1,0
2423383,65a929115a5e71a3c6f4f5ef,65a929115a5e71a3c6f4f5ee,1.0
2458706,65a929115a5e71a3c6f4f5fb,65a929115a5e71a3c6f4f619,1.0
2546876,65a929115a5e71a3c6f4f619,65a929115a5e71a3c6f4f5fb,1.0
2420444,65a929115a5e71a3c6f4f5ee,65a929115a5e71a3c6f4f5ef,1.0
1488131,65a929115a5e71a3c6f4f4b1,65a929115a5e71a3c6f4f4a2,1.0
1444046,65a929115a5e71a3c6f4f4a2,65a929115a5e71a3c6f4f4b1,1.0
3361524,65a929115a5e71a3c6f4f72e,65a929115a5e71a3c6f4f707,1.0
3246903,65a929115a5e71a3c6f4f707,65a929115a5e71a3c6f4f72e,1.0
7614213,65a929125a5e71a3c6f4fcd4,65a929125a5e71a3c6f4fcb0,1.0
7631791,65a929125a5e71a3c6f4fcda,65a929125a5e71a3c6f4fc72,1.0


In [37]:
# Identiske dokumenter
len(different_documents.loc[lambda x: x[0] == 1.0].sort_values(by=0, ascending=False))

16

In [36]:
# Veldig like dokumenter
len(different_documents.loc[lambda x: x[0] >= 0.999].sort_values(by=0, ascending=False))

82

In [41]:
different_documents.loc[lambda x: x[0] >= 0.999].sort_values(by=0, ascending=False).head(5)

Unnamed: 0,level_0,level_1,0
2420444,65a929115a5e71a3c6f4f5ee,65a929115a5e71a3c6f4f5ef,1.0
2458706,65a929115a5e71a3c6f4f5fb,65a929115a5e71a3c6f4f619,1.0
2423383,65a929115a5e71a3c6f4f5ef,65a929115a5e71a3c6f4f5ee,1.0
2546876,65a929115a5e71a3c6f4f619,65a929115a5e71a3c6f4f5fb,1.0
3246903,65a929115a5e71a3c6f4f707,65a929115a5e71a3c6f4f72e,1.0


In [46]:
from bson import ObjectId

In [47]:
collection.find_one({"_id" : ObjectId("65a929115a5e71a3c6f4f5ee")})

{'_id': ObjectId('65a929115a5e71a3c6f4f5ee'),
 'urn': 'URN:NBN:no-nb_digibok_2013070308031',
 'dhlabid': 'dhlab_norn_poem_00583',
 'text': "Vær taus. \n\n\nOm høit din aand vil stræbe, \ndit hjerte livslydt banke, \n\nsaa luk, saa luk din læbe, \nforraad ei sjelens tanke. \n\n\nThi al den sang derinde, \nsom tonefyldt vil bæve, \nfaar aldrig ekkos vinge; \nthi luk, thi luk din læbe. \n\n\nMar du levet? \n\n\n9 — \n\n\n'il du leve rigt og fyldigt, \n\nnaa det maal, dit liv er værdigt, \nnaa saalangt, som du kan komme, \nmaa du aldrig tro dig færdig; \n\nfør du intet krav er skyldig, \n\nfør dit navn er kjendt for gyldigt, \nda er livets dag først omme, \n\nog dens hviles fred tør komme. \n\n\n\n\n",
 'comment': nan,
 'digital_visning': nan,
 'overlapp': 'x',
 'page_end': 24,
 'page_start': 24,
 'title': 'Har du levet?',
 'author': 'Edith [Sigurdsen, Sofie]',
 'book_title': 'Sange',
 'year': 1895,
 'tokens': ['Vær',
  'taus',
  '.',
  'Om',
  'høit',
  'din',
  'aand',
  'vil',
  'stræbe