In [1]:
import dhlab as dh
from pymongo import MongoClient
import pandas as pd
from dataclasses import dataclass
from scipy.spatial.distance import cosine


db = MongoClient().norn # Connect to database
tok = dh.nbtokenizer.tokenize # Tokenizer
collection = db['poems']

In [2]:
def get_text_from_token(token, collection):
    """Get all texts containing a token from a MongoDB collection"""
    return list(collection.find({'text': {'$regex': '.*' + token + '.*'}}))

def tokenize_lst_of_texts(lst_of_text_dct):
    """Tokenize a list of texts"""
    for dct in lst_of_text_dct:
        tokenized = tok(dct['text'])
        dct["tokens"] = tokenized
        
def find_indexes(lst, element):
    return [index for index, value in enumerate(lst) if value == element]


def concordance(token, token_lst, before = 5, after = 5, return_tokens = False):
    """Get a concordance for a token in a text"""
    
    token_indexes = find_indexes(token_lst, token)
    
    for token_index in token_indexes:
        start = token_index - before  # Find the start of the concordance
        end = token_index + after  # Find the end of the concordance
        # yield token_lst[start:end]  # Return the concordance
        before_tokens = token_lst[start:token_index]
        after_tokens = token_lst[token_index + 1:end]
        #yield (before_tokens, token, after_tokens)
        if return_tokens:
            yield before_tokens, token, after_tokens
        else:        
            yield " ".join(before_tokens), token, " ".join(after_tokens)

@dataclass
class ConcRes:
    text_id: str
    before: str
    token: str
    after: str
    
@dataclass
class ConcToks:
    text_id: str
    before: list
    token: str
    after: list


        
def get_concordance(token, collection, before = 5, after = 5):
    """Get concordance for a token in a MongoDB collection"""
    texts = get_text_from_token(token, collection)
    # tokenize_lst_of_texts(texts)
    
    concs = []
    for text in texts:
        conc_generator = concordance(token, text['tokens'], before, after)
        conc_list = [x for x in conc_generator]
        for x in conc_list:
            before_txt, token, after_txt = x
            concs.append(ConcRes(text["_id"], before_txt, token, after_txt))
    
    # return concs
    return pd.DataFrame(concs)


In [3]:
res = get_concordance(",", collection=collection, before=10, after=10)

In [4]:
res

Unnamed: 0,text_id,before,token,after
0,65a550f94a06fb61333ecc97,,",","niece "" p. M. Drost ( , Pip """
1,65a550f94a06fb61333ecc97,"Til min danske , niece "" p. M. Drost (",",","Pip "" ) , født Aabel . ( aar"
2,65a550f94a06fb61333ecc97,"niece "" p. M. Drost ( , Pip "" )",",",født Aabel . ( aar folen ganger til hvile
3,65a550f94a06fb61333ecc97,folen ganger til hvile . ) Min kære Petra Math...,",","som færdes paa Danmarks vang , saa gerne din"
4,65a550f94a06fb61333ecc97,"Min kære Petra Mathilde , som færdes paa Danma...",",",saa gerne din « onkel » vilde dig sende
...,...,...,...,...
68436,65a550fb4a06fb61333ed7da,,",",tæt under lien en rislende bæk . Solen i
68437,65a550fb4a06fb61333ed7da,lister sig væk . Aftenen sænker sig lurer og l...,",",soldagen dør i et smil . Her sukkes der
68438,65a550fb4a06fb61333ed7da,smil . Her sukkes der ikke i slotte og hytter,",","her er ingen nagende , slidende tvil . 31"
68439,65a550fb4a06fb61333ed7da,"ikke i slotte og hytter , her er ingen nagende",",",slidende tvil . 31 –


In [6]:
# TEST CONCORDANCE

test_txt = "'62 \n\n\nNaar en gang mot natten det stunder, \nog Herren os gjensynet under, \n\nnokk vet jeg, hvem da oven sky \n\nvil kjærlig i møte os fly. \n\n\nOver vore to smaa døde. \n1. Lovise Margrete, \nfød den 14de januar 1864, død 12te november 1874. \n(Nu titte til hinanden de fagre blomfter fmaa.) \nDu vesle «Visestubben» for os saa ofte sang – \nsaa blødt og yndigt trilled dine toner – \nnu vil for dig jeg synge da her for siste gang, \nom lytte du kan fra fjerne zoner. \n\n\nThi der saa visst jeg haaber, at nu du flagrer om \nfra favn til favn imellem flægt og venner. \n\nSaa mange kjære sjæle dig der i møte kom, \n\nsaa mange smaaengle alt du fjender. \n\n\n3a slaa da dine triller nu fuldt i englekor! \n\nSvng sødt for Krift om rette barneglæden! \n\nDit livsensskudd det brødes saa braatt her paa jord, – \nnu blomstrer det blidt i himlens eden. \n\n\n\n\n65 \n\n\n2. Peder Pavels Aabel. \nfød 15 mai 1867, død 29de november 1874. \n«Jesus, lukk opp for mig! –» saa lød hans ord, \nmens han snart skalv og snart brann her paa jord. \nHerre du milde, ham tag i din arm, \nkvæg du den lille ved kjærlige barm! \n\n\nNu har hans nagende lidelser slutt, \nnu er han trøstet, vor trofaste gutt: \nHan skal faa leke med systeren sin – \nlukk ham, o Jesus i himmelen ind! \n\n\nTil Mama! \nJannar 1875. \n(Kommen, vakra blommor) \n\nMama! Tre smaajenter \nfløj ifjor dig glad i fang: \nDen som havde dem omkring sig, \naldrig savned spil og sang. \nSom lysalfer over enge \ndansed de saa lett og lo – \nmen hvor blev der av den ene? \nNu ser jeg ei mer end to. \n\n\n\n\n'"
tokens = tok(test_txt)


list(concordance("jeg", tokens))

[('gjensynet under , nokk vet', 'jeg', ', hvem da oven'),
 ('– nu vil for dig', 'jeg', 'synge da her for'),
 ('. Thi der saa visst', 'jeg', 'haaber , at nu'),
 ('den ene ? Nu ser', 'jeg', 'ei mer end to.')]

In [7]:
documents = collection.find()

## Various metrics

In [8]:
# count token

collection = db.poems

# Token to search for
token = "jeg"  # Replace with the token you're interested in

# Query the database and process text
total_count = 0
for doc in collection.find():
    total_count += doc['text'].count(token)  # Replace 'text_field' with the name of your field containing the text
total_count

7184

In [9]:
# Count documents

collection.count_documents({})

2885

In [20]:
from collections import Counter

# Initialize a counter for all texts
total_frequency = Counter()

for doc in collection.find():
    #words = doc['text'].split()  # Tokenize the text
    words = doc["tokens"]
    total_frequency.update(words)
    
pd.DataFrame.from_dict(total_frequency, orient='index', columns=['frequency']).sort_values(by='frequency', ascending=False).head(10)

Unnamed: 0,frequency
",",68441
.,35793
og,25413
i,15450
som,9755
!,9411
er,9185
det,8985
du,7795
jeg,7131


In [21]:
all_docs = pd.DataFrame(collection.find())

## DTM

In [23]:
df = pd.DataFrame(documents)
s = df.set_index("_id").tokens.explode()
t = s.to_frame().reset_index().reset_index().rename(columns={"index" : "n"}).groupby(["_id", "tokens"]).count()
r = t.reset_index().sort_values(by=["n"], ascending=False)
dtm = r.pivot(columns="_id", values="n", index="tokens").fillna(0)

In [24]:
dtm.sum()

_id
65a550f94a06fb61333ecc97    291.0
65a550fa4a06fb61333ecc98    334.0
65a550fa4a06fb61333ecc99    311.0
65a550fa4a06fb61333ecc9a    512.0
65a550fa4a06fb61333ecc9b    449.0
                            ...  
65a550fb4a06fb61333ed7d7    194.0
65a550fb4a06fb61333ed7d8    194.0
65a550fb4a06fb61333ed7d9     60.0
65a550fb4a06fb61333ed7da     62.0
65a550fb4a06fb61333ed7db     75.0
Length: 2885, dtype: float64

In [25]:
dtm.sum(axis=1).sort_values(ascending=False).head(20)

tokens
,      68441.0
.      35793.0
og     25413.0
i      15450.0
som     9755.0
!       9411.0
er      9185.0
det     8985.0
du      7795.0
jeg     7131.0
den     6724.0
;       6492.0
med     6302.0
til     6235.0
en      5296.0
paa     5071.0
for     4855.0
han     4816.0
:       4763.0
de      4566.0
dtype: float64

In [43]:
dtm_percent = dtm.apply(lambda x: 100 * (x / x.sum()), axis=1)


In [42]:


s = dtm.sum(axis=1)
tot = s.sum()
sum_percent = s.apply(lambda x: 100 *(x / tot))

In [44]:
dtm_percent

_id,65a550f94a06fb61333ecc97,65a550fa4a06fb61333ecc98,65a550fa4a06fb61333ecc99,65a550fa4a06fb61333ecc9a,65a550fa4a06fb61333ecc9b,65a550fa4a06fb61333ecc9c,65a550fa4a06fb61333ecc9d,65a550fa4a06fb61333ecc9e,65a550fa4a06fb61333ecc9f,65a550fa4a06fb61333ecca0,...,65a550fb4a06fb61333ed7d2,65a550fb4a06fb61333ed7d3,65a550fb4a06fb61333ed7d4,65a550fb4a06fb61333ed7d5,65a550fb4a06fb61333ed7d6,65a550fb4a06fb61333ed7d7,65a550fb4a06fb61333ed7d8,65a550fb4a06fb61333ed7d9,65a550fb4a06fb61333ed7da,65a550fb4a06fb61333ed7db
tokens,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
!,0.010626,0.074381,0.031878,0.074381,0.191266,0.085007,0.021252,0.053129,0.063755,0.095633,...,0.042503,0.0,0.00000,0.010626,0.085007,0.021252,0.031878,0.0,0.0,0.0
"""",1.428571,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.47619,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
$,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.298701,0.000000,...,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
%,0.000000,0.000000,0.000000,0.000000,1.470588,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
&,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
⁹le,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
€,0.000000,0.000000,9.090909,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
⸗,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
ꝛ,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0


In [47]:
sum_percent.sort_values(ascending=False).head(20)

tokens
,      8.216858
.      4.297219
og     3.051022
i      1.854889
som    1.171161
!      1.129861
er     1.102728
det    1.078717
du     0.935848
jeg    0.856130
den    0.807267
;      0.779413
med    0.756603
til    0.748559
en     0.635825
paa    0.608812
for    0.582879
han    0.578197
:      0.571834
de     0.548183
dtype: float64

In [56]:
dtm_percent.columns

Index([65a550f94a06fb61333ecc97, 65a550fa4a06fb61333ecc98,
       65a550fa4a06fb61333ecc99, 65a550fa4a06fb61333ecc9a,
       65a550fa4a06fb61333ecc9b, 65a550fa4a06fb61333ecc9c,
       65a550fa4a06fb61333ecc9d, 65a550fa4a06fb61333ecc9e,
       65a550fa4a06fb61333ecc9f, 65a550fa4a06fb61333ecca0,
       ...
       65a550fb4a06fb61333ed7d2, 65a550fb4a06fb61333ed7d3,
       65a550fb4a06fb61333ed7d4, 65a550fb4a06fb61333ed7d5,
       65a550fb4a06fb61333ed7d6, 65a550fb4a06fb61333ed7d7,
       65a550fb4a06fb61333ed7d8, 65a550fb4a06fb61333ed7d9,
       65a550fb4a06fb61333ed7da, 65a550fb4a06fb61333ed7db],
      dtype='object', name='_id', length=2885)

In [62]:
dtm_percent.transpose()

tokens,!,"""",$,%,&,',(,),*,+,...,⁸g,⁸tille,⁸⁷,⁹,⁹l,⁹le,€,⸗,ꝛ,ꝛ8
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
65a550f94a06fb61333ecc97,0.010626,1.428571,0.0,0.000000,0.0,0.000000,0.30426,0.246103,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
65a550fa4a06fb61333ecc98,0.074381,0.000000,0.0,0.000000,0.0,0.000000,0.20284,0.164069,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
65a550fa4a06fb61333ecc99,0.031878,0.000000,0.0,0.000000,0.0,0.000000,0.10142,0.246103,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9.090909,0.0,0.0,0.0
65a550fa4a06fb61333ecc9a,0.074381,0.000000,0.0,0.000000,0.0,0.000000,0.30426,0.574241,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
65a550fa4a06fb61333ecc9b,0.191266,0.000000,0.0,1.470588,0.0,0.000000,0.00000,0.246103,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65a550fb4a06fb61333ed7d7,0.021252,0.000000,0.0,0.000000,0.0,0.000000,0.00000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
65a550fb4a06fb61333ed7d8,0.031878,0.000000,0.0,0.000000,0.0,0.049044,0.00000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
65a550fb4a06fb61333ed7d9,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.00000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
65a550fb4a06fb61333ed7da,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.00000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [63]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

sparse_matrix = csr_matrix(dtm_percent.transpose().values)

# Compute the cosine similarity
cosine_sim = cosine_similarity(sparse_matrix)

In [52]:
sparse_matrix

<64191x2885 sparse matrix of type '<class 'numpy.float64'>'
	with 458654 stored elements in Compressed Sparse Row format>

In [82]:
df = pd.DataFrame(cosine_sim, columns=list(dtm_percent.columns), index=list(dtm_percent.columns))

In [83]:
df

Unnamed: 0,65a550f94a06fb61333ecc97,65a550fa4a06fb61333ecc98,65a550fa4a06fb61333ecc99,65a550fa4a06fb61333ecc9a,65a550fa4a06fb61333ecc9b,65a550fa4a06fb61333ecc9c,65a550fa4a06fb61333ecc9d,65a550fa4a06fb61333ecc9e,65a550fa4a06fb61333ecc9f,65a550fa4a06fb61333ecca0,...,65a550fb4a06fb61333ed7d2,65a550fb4a06fb61333ed7d3,65a550fb4a06fb61333ed7d4,65a550fb4a06fb61333ed7d5,65a550fb4a06fb61333ed7d6,65a550fb4a06fb61333ed7d7,65a550fb4a06fb61333ed7d8,65a550fb4a06fb61333ed7d9,65a550fb4a06fb61333ed7da,65a550fb4a06fb61333ed7db
65a550f94a06fb61333ecc97,1.000000e+00,0.005869,1.496246e-05,0.005858,8.819824e-05,3.827646e-04,0.002177,1.160571e-04,7.673607e-05,0.000252,...,2.893621e-05,4.535346e-06,2.389290e-05,9.292207e-06,8.338369e-07,0.000001,0.000024,0.000037,5.503523e-07,1.307436e-05
65a550fa4a06fb61333ecc98,5.869342e-03,1.000000,5.764397e-05,0.423763,4.917344e-05,1.018795e-03,0.000506,3.059971e-04,4.650922e-04,0.000083,...,1.802666e-06,1.557216e-04,9.725756e-06,2.155380e-05,2.474850e-06,0.000038,0.000031,0.000002,3.459646e-03,5.668403e-06
65a550fa4a06fb61333ecc99,1.496246e-05,0.000058,1.000000e+00,0.000050,2.001741e-04,5.559722e-05,0.000022,5.573492e-05,5.951901e-04,0.000021,...,3.261198e-06,2.359334e-06,6.927202e-07,8.284296e-06,4.480324e-05,0.000015,0.000001,0.000021,6.738780e-07,9.329065e-07
65a550fa4a06fb61333ecc9a,5.857554e-03,0.423763,4.979277e-05,1.000000,5.245776e-05,6.200159e-03,0.003817,6.124532e-04,1.033998e-04,0.000047,...,5.225687e-06,1.962974e-05,5.562919e-05,5.920385e-05,6.741526e-06,0.000030,0.000023,0.000018,7.809279e-05,2.872432e-04
65a550fa4a06fb61333ecc9b,8.819824e-05,0.000049,2.001741e-04,0.000052,1.000000e+00,7.480404e-05,0.000029,6.474298e-04,1.830230e-04,0.000297,...,2.447211e-06,2.022416e-03,9.154285e-06,3.771319e-06,1.570715e-04,0.000109,0.000001,0.000008,1.219457e-04,3.068378e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65a550fb4a06fb61333ed7d7,1.300767e-06,0.000038,1.452649e-05,0.000030,1.092947e-04,3.003424e-05,0.000057,5.239894e-06,1.047444e-06,0.000005,...,2.024320e-06,2.421867e-05,5.217514e-04,8.264549e-07,5.953057e-06,1.000000,0.000216,0.011271,1.239664e-06,1.072017e-05
65a550fb4a06fb61333ed7d8,2.428988e-05,0.000031,1.226270e-06,0.000023,1.233204e-06,5.228408e-06,0.000007,6.079376e-06,1.751739e-06,0.000026,...,2.580145e-07,1.631262e-04,1.070415e-04,4.091331e-06,5.780263e-07,0.000216,1.000000,0.000018,5.157412e-05,2.115255e-03
65a550fb4a06fb61333ed7d9,3.706130e-05,0.000002,2.071997e-05,0.000018,7.931544e-06,1.933432e-05,0.000014,2.270576e-04,1.022180e-06,0.000016,...,2.192246e-07,2.360213e-05,4.542782e-04,7.777239e-07,8.300851e-07,0.011271,0.000018,1.000000,2.668133e-05,6.696796e-06
65a550fb4a06fb61333ed7da,5.503523e-07,0.003460,6.738780e-07,0.000078,1.219457e-04,6.589762e-07,0.000150,2.929168e-07,2.058523e-07,0.000008,...,9.645691e-08,1.264551e-07,1.587897e-05,2.102318e-07,8.389673e-05,0.000001,0.000052,0.000027,1.000000e+00,2.547558e-07


In [84]:
df.stack().sort_values(ascending=False)

65a550fa4a06fb61333eced2  65a550fa4a06fb61333eced2    1.0
65a550fb4a06fb61333ed567  65a550fb4a06fb61333ed567    1.0
65a550fa4a06fb61333ecd8e  65a550fa4a06fb61333ecd8e    1.0
65a550fa4a06fb61333ecf60  65a550fa4a06fb61333ecf60    1.0
65a550fa4a06fb61333ecf32  65a550fa4a06fb61333ecf32    1.0
                                                     ... 
65a550fb4a06fb61333ed429  65a550fa4a06fb61333ecd89    0.0
                          65a550fa4a06fb61333ecd88    0.0
                          65a550fa4a06fb61333ecd87    0.0
                          65a550fa4a06fb61333ecd86    0.0
                          65a550fa4a06fb61333ecdaf    0.0
Length: 8323225, dtype: float64

In [99]:
df.stack().reset_index().loc[lambda x: x.level_0 != x.level_1].sort_values(by=0, ascending=False).head(30)

Unnamed: 0,level_0,level_1,0
2314599,65a550fa4a06fb61333ecfb9,65a550fa4a06fb61333ecfd4,1.0
2392467,65a550fa4a06fb61333ecfd4,65a550fa4a06fb61333ecfb9,1.0
3284125,65a550fa4a06fb61333ed109,65a550fa4a06fb61333ed07a,1.0
2871713,65a550fa4a06fb61333ed07a,65a550fa4a06fb61333ed109,1.0
3102424,65a550fa4a06fb61333ed0ca,65a550fa4a06fb61333ed0b0,1.0
2424260,65a550fa4a06fb61333ecfdf,65a550fa4a06fb61333ecff3,1.0
2481940,65a550fa4a06fb61333ecff3,65a550fa4a06fb61333ecfdf,1.0
3027440,65a550fa4a06fb61333ed0b0,65a550fa4a06fb61333ed0ca,1.0
3056189,65a550fa4a06fb61333ed0ba,65a550fa4a06fb61333ed065,1.0
7018848,65a550fb4a06fb61333ed617,65a550fb4a06fb61333ed677,1.0


In [102]:
stacked = df.stack().reset_index().loc[lambda x: x.level_0 != x.level_1].sort_values(by=0, ascending=False)

In [107]:
stacked[0].value_counts().sort_index(ascending=False).to_frame().head(50)

Unnamed: 0_level_0,count
0,Unnamed: 1_level_1
1.0,2
1.0,6
1.0,6
1.0,16
1.0,8
1.0,6
1.0,6
1.0,18
1.0,2
1.0,2


In [91]:
from bson import ObjectId

In [98]:
collection.find_one({"_id": ObjectId("65a550fa4a06fb61333ed07a")})

{'_id': ObjectId('65a550fa4a06fb61333ed07a'),
 'urn': 'URN:NBN:no-nb_digibok_2016051048054',
 'title': 'Atter til en søn',
 'text': '224 \n\n\nGuds fred jeg ønsker Dig idag, \nmin hilsen og Guds fred modkag; \ndin mage kjær og al din slegt \nGud fader tag i varetægt. \n\n\n218. Faderen til sin søn. \n\n\nNaar fienden synden dig maler saa skjøn, \nhusk døden sig skjuler deri da, o søn; \n\nfor glæden i synden er prisen for stor; \nthi evigt betales med pine derfor. \n\n\nMen luk du dit øie og øre igjen \n\nfor alt, som til synden vil drage dig hen, \nog blikket høit hæv over taage og sky \nmod Salems den skjønne og herlige by. \n\n\n219. AMffer kil en søn. \n\n\nHerren lede dig hernede \n\nalle dine aar. \n\nHusk din moders bønner \n\nfor de kjære sønner. \n\nKort er tiden, haard er striden, \nspænd dit sværd om lænd. \n\n\n220. Til en agronom. \n(Tone: Nit allerførste kongeord.) \nNu midt i livets lyse vaar \ndu staar idag \nAf haab og længsel hjertet slaar \nsaa varme slag. \n\n\n\n\n

In [97]:
collection.find_one({"_id": ObjectId("65a550fa4a06fb61333ed109")})

{'_id': ObjectId('65a550fa4a06fb61333ed109'),
 'urn': 'URN:NBN:no-nb_digibok_2016051048054',
 'title': 'Faderen til sin søn',
 'text': '224 \n\n\nGuds fred jeg ønsker Dig idag, \nmin hilsen og Guds fred modkag; \ndin mage kjær og al din slegt \nGud fader tag i varetægt. \n\n\n218. Faderen til sin søn. \n\n\nNaar fienden synden dig maler saa skjøn, \nhusk døden sig skjuler deri da, o søn; \n\nfor glæden i synden er prisen for stor; \nthi evigt betales med pine derfor. \n\n\nMen luk du dit øie og øre igjen \n\nfor alt, som til synden vil drage dig hen, \nog blikket høit hæv over taage og sky \nmod Salems den skjønne og herlige by. \n\n\n219. AMffer kil en søn. \n\n\nHerren lede dig hernede \n\nalle dine aar. \n\nHusk din moders bønner \n\nfor de kjære sønner. \n\nKort er tiden, haard er striden, \nspænd dit sværd om lænd. \n\n\n220. Til en agronom. \n(Tone: Nit allerførste kongeord.) \nNu midt i livets lyse vaar \ndu staar idag \nAf haab og længsel hjertet slaar \nsaa varme slag. \n\n\n\

In [95]:
collection.find_one({"_id": ObjectId("65a550fa4a06fb61333ecfd4")})

{'_id': ObjectId('65a550fa4a06fb61333ecfd4'),
 'urn': 'URN:NBN:no-nb_digibok_2013070308031',
 'title': 'Har du levet?',
 'text': "Vær taus. \n\n\nOm høit din aand vil stræbe, \ndit hjerte livslydt banke, \n\nsaa luk, saa luk din læbe, \nforraad ei sjelens tanke. \n\n\nThi al den sang derinde, \nsom tonefyldt vil bæve, \nfaar aldrig ekkos vinge; \nthi luk, thi luk din læbe. \n\n\nMar du levet? \n\n\n9 — \n\n\n'il du leve rigt og fyldigt, \n\nnaa det maal, dit liv er værdigt, \nnaa saalangt, som du kan komme, \nmaa du aldrig tro dig færdig; \n\nfør du intet krav er skyldig, \n\nfør dit navn er kjendt for gyldigt, \nda er livets dag først omme, \n\nog dens hviles fred tør komme. \n\n\n\n\n",
 'tokens': ['Vær',
  'taus',
  '.',
  'Om',
  'høit',
  'din',
  'aand',
  'vil',
  'stræbe',
  ',',
  'dit',
  'hjerte',
  'livslydt',
  'banke',
  ',',
  'saa',
  'luk',
  ',',
  'saa',
  'luk',
  'din',
  'læbe',
  ',',
  'forraad',
  'ei',
  'sjelens',
  'tanke',
  '.',
  'Thi',
  'al',
  'den',
 

In [96]:
collection.find_one({"_id": ObjectId("65a550fa4a06fb61333ecfb9")})

{'_id': ObjectId('65a550fa4a06fb61333ecfb9'),
 'urn': 'URN:NBN:no-nb_digibok_2013070308031',
 'title': 'Vær taus',
 'text': "Vær taus. \n\n\nOm høit din aand vil stræbe, \ndit hjerte livslydt banke, \n\nsaa luk, saa luk din læbe, \nforraad ei sjelens tanke. \n\n\nThi al den sang derinde, \nsom tonefyldt vil bæve, \nfaar aldrig ekkos vinge; \nthi luk, thi luk din læbe. \n\n\nMar du levet? \n\n\n9 — \n\n\n'il du leve rigt og fyldigt, \n\nnaa det maal, dit liv er værdigt, \nnaa saalangt, som du kan komme, \nmaa du aldrig tro dig færdig; \n\nfør du intet krav er skyldig, \n\nfør dit navn er kjendt for gyldigt, \nda er livets dag først omme, \n\nog dens hviles fred tør komme. \n\n\n\n\n",
 'tokens': ['Vær',
  'taus',
  '.',
  'Om',
  'høit',
  'din',
  'aand',
  'vil',
  'stræbe',
  ',',
  'dit',
  'hjerte',
  'livslydt',
  'banke',
  ',',
  'saa',
  'luk',
  ',',
  'saa',
  'luk',
  'din',
  'læbe',
  ',',
  'forraad',
  'ei',
  'sjelens',
  'tanke',
  '.',
  'Thi',
  'al',
  'den',
  'san

In [None]:


for x in dtm_percent:
    for y in dtm_percent:
        

tokens
!      1.129861
"      0.025212
$      0.009244
%      0.008164
&      0.003602
         ...   
⁹le    0.000120
€      0.001321
⸗      0.000240
ꝛ      0.000120
ꝛ8     0.000120
Length: 64191, dtype: float64