In [1]:
from urllib.parse import urljoin
import requests
import pandas as pd

# Base URL of the NORN API.
api_path = "http://dhlab2.nb.no:5004/norn"

# The base URL has no trailing slash, so urljoin replaces its last path
# segment: the result is ".../norn/poems", not ".../norn/norn/poems".
poem_path = urljoin(api_path, "norn/poems")

# Use a timeout so an unreachable server cannot hang the kernel, and surface
# HTTP errors directly instead of as a confusing JSON decode failure.
response = requests.get(poem_path, timeout=60)
response.raise_for_status()
poems = response.json()

In [25]:
# Tabulate the API response; later cells rely on the `dhlabid`, `text` and
# `tokens` columns of this frame.
df = pd.DataFrame(poems)

In [33]:
def get_window(token, df, window):
    """Collect every occurrence of ``token`` together with its context.

    Args:
        token: Token to look for; compared to each corpus token with ``==``.
        df (pd.DataFrame): Frame with a ``dhlabid`` column and a ``tokens``
            column holding one list of tokens per row.
        window (int): Maximum number of tokens kept on each side of a hit.

    Returns:
        pd.DataFrame: One row per occurrence, with the columns ``dhlabid``,
        ``concordance_index`` (1-based number of the hit within its row),
        ``word_index`` (0-based position in the token list), ``left`` and
        ``right`` (lists of context tokens) and ``token``. The columns are
        present even when there are no hits, so callers can rely on them.
    """
    columns = ["dhlabid", "concordance_index", "word_index", "left", "token", "right"]

    # An empty input frame may not even have the expected columns.
    if df.empty:
        return pd.DataFrame(columns=columns)

    conc = []
    for dhlabid, tokens in zip(df["dhlabid"], df["tokens"]):
        positions = [i for i, x in enumerate(tokens) if x == token]
        for hit_number, idx in enumerate(positions, start=1):
            conc.append({
                "dhlabid": dhlabid,
                "concordance_index": hit_number,
                "word_index": idx,
                # max() stops a negative start from wrapping around the list;
                # slicing already clamps the upper bound.
                "left": tokens[max(0, idx - window):idx],
                "token": token,
                "right": tokens[idx + 1:idx + window + 1],
            })

    # Passing `columns` keeps the schema stable when `conc` is empty.
    return pd.DataFrame(conc, columns=columns)

In [4]:
def concordance(token, df, window=5, join=False):
    """Build a concordance table for ``token``.

    Args:
        token: Token to look up.
        df (pd.DataFrame): Corpus frame passed on to ``get_window``.
        window (int): Number of context tokens on each side of a hit.
        join (bool): If true, merge left context, token and right context
            into a single ``concordance`` string with the token wrapped in
            ``<b>`` tags, and drop the three separate columns.

    Returns:
        pd.DataFrame: One row per hit. The context columns hold
        space-joined strings rather than token lists. If the token does not
        occur, an empty frame with the expected columns is returned.
    """
    res = get_window(token, df, window)

    if res.empty:
        # Without this guard, a frame with no hits may lack the `left` and
        # `right` columns and the lookups below would fail.
        columns = ["dhlabid", "concordance_index", "word_index"]
        columns += ["concordance"] if join else ["left", "token", "right"]
        return pd.DataFrame(columns=columns)

    res["left"] = res["left"].apply(" ".join)
    res["right"] = res["right"].apply(" ".join)

    if join:
        res["concordance"] = res["left"] + " <b>" + res["token"] + "</b> " + res["right"]
        res = res.drop(columns=["left", "token", "right"])

    return res

In [6]:
from typing import List
from collections import Counter

def find_index_distance(lst: List[int]):
    """Return the gaps between consecutive elements of a sequence.

    Args:
        lst (List[int]): Sequence of numbers, e.g. token positions.

    Returns:
        list: ``lst[k + 1] - lst[k]`` for every adjacent pair, which is empty
        when ``lst`` has fewer than two elements.
    """
    earlier = lst[:-1]
    later = lst[1:]
    gaps = []
    for before, after in zip(earlier, later):
        gaps.append(after - before)
    return gaps

def collocation(token, df, window=10):
    """Count the tokens that occur within ``window`` positions of ``token``.

    Args:
        token: Token whose neighbours are counted.
        df (pd.DataFrame): Corpus frame passed on to ``get_window``.
        window (int): Number of context tokens on each side of a hit.

    Returns:
        pd.DataFrame: Indexed by neighbouring token, with a single
        ``frequency`` column sorted in descending order.
    """
    contexts = get_window(token, df, window)

    neighbours = Counter()
    for _, hit in contexts.iterrows():
        # Left context first, then right, for every hit.
        for side in (hit.left, hit.right):
            neighbours.update(side)

    table = pd.DataFrame.from_dict(neighbours, orient="index", columns=["frequency"])
    return table.sort_values(by="frequency", ascending=False)
    

In [53]:
def total_word_count(df):
    """Count how often each token occurs across the whole corpus.

    Args:
        df (pd.DataFrame): Frame with a ``tokens`` column holding one list of
            tokens per row.

    Returns:
        pd.DataFrame: Indexed by token, with a single ``frequency`` column
        sorted in descending order.
    """
    total = Counter()
    # Iterate the column directly; iterrows() would build a Series per row
    # only to read this one field.
    for tokens in df["tokens"]:
        total.update(tokens)

    table = pd.DataFrame.from_dict(total, orient="index", columns=["frequency"])
    return table.sort_values(by="frequency", ascending=False)

In [62]:
def get_dtm(df):
    """Build a document-term matrix from the ``tokens`` column.

    Args:
        df (pd.DataFrame): Frame with a ``tokens`` column holding one list of
            tokens per row.

    Returns:
        pd.DataFrame: One row per row of ``df`` (same index) and one column
        per distinct token, in order of first appearance. Cells hold integer
        counts, with 0 where a token does not occur in that row.
    """
    # Build the frame once from plain Counter mappings instead of creating a
    # pd.Series for every row inside apply(), which is far slower.
    counts = [Counter(tokens) for tokens in df["tokens"]]
    return pd.DataFrame(counts, index=df.index).fillna(0).astype(int)

In [91]:
# Document-term matrix keyed by dhlabid: one row per poem, one column per
# token, under an outer "frequency" column level.
dtm = (
    df.set_index("dhlabid")["tokens"]
    .explode()                                # one row per token occurrence
    .reset_index()                            # columns: dhlabid, tokens
    .reset_index()                            # adds an "index" column to count
    .rename(columns={"index": "frequency"})
    .groupby(["dhlabid", "tokens"])
    .count()
    .unstack()
    .fillna(0)
    .astype(int)
)

In [96]:
# Tokens as rows and poems as columns; droplevel(0) removes the outer
# "frequency" level that the transposed index would otherwise carry.
dtm.transpose().droplevel(0)

dhlabid,dhlab_norn_poem_00001,dhlab_norn_poem_00002,dhlab_norn_poem_00003,dhlab_norn_poem_00004,dhlab_norn_poem_00005,dhlab_norn_poem_00006,dhlab_norn_poem_00007,dhlab_norn_poem_00008,dhlab_norn_poem_00009,dhlab_norn_poem_00010,...,dhlab_norn_poem_02931,dhlab_norn_poem_02932,dhlab_norn_poem_02933,dhlab_norn_poem_02934,dhlab_norn_poem_02935,dhlab_norn_poem_02936,dhlab_norn_poem_02937,dhlab_norn_poem_02938,dhlab_norn_poem_02939,dhlab_norn_poem_02940
tokens,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
!,0,0,0,0,0,1,0,2,9,1,...,0,0,0,0,0,0,0,4,0,1
"""",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,3,0,0
$,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
%,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
&,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
⁹le,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
€,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
⸗,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ꝛ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [101]:
# Token counts for the poem at positional row 1 of dtm, first 30 tokens only.
dtm.iloc[1].to_frame().head(30).style

Unnamed: 0_level_0,Unnamed: 1_level_0,dhlab_norn_poem_00002
Unnamed: 0_level_1,tokens,Unnamed: 2_level_1
frequency,!,0
frequency,"""",0
frequency,$,0
frequency,%,0
frequency,&,0
frequency,',0
frequency,(,1
frequency,),1
frequency,*,1
frequency,+,0


In [83]:
# Tokens per poem: explode() gives every token its own row, so counting the
# dhlabid index labels counts the tokens of each poem.
df.set_index("dhlabid").tokens.explode().index.value_counts()

dhlabid
dhlab_norn_poem_01104    7176
dhlab_norn_poem_02444    4612
dhlab_norn_poem_02504    3840
dhlab_norn_poem_02466    3462
dhlab_norn_poem_00967    3232
                         ... 
dhlab_norn_poem_01857      28
dhlab_norn_poem_01962      27
dhlab_norn_poem_01381      24
dhlab_norn_poem_01835      22
dhlab_norn_poem_01569       1
Name: count, Length: 2940, dtype: int64

In [63]:
# Document-term matrix via get_dtm; rows keep df's own index, not dhlabid.
get_dtm(df)

Unnamed: 0,51,Forsvarssange,.,1,Opfang,til,Totens,forsvarsforening,1889.,(,...,q1,duftglød,VISE,fioler,silkedyne,LANGT,BAG,SORTE,BJERGE,Ligge
0,1,1,13,2,1,2,1,1,1,2,...,0,0,0,0,0,0,0,0,0,0
1,0,0,9,0,0,1,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,1,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,8,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,0,37,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2935,0,0,8,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2936,0,0,7,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2937,0,0,3,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2938,0,0,4,0,0,1,0,0,0,0,...,0,0,1,1,1,0,0,0,0,0


In [54]:
# Corpus-wide token frequencies, most frequent first.
total_word_count(df)

Unnamed: 0,frequency
",",69401
.,36224
og,25679
i,15608
som,9880
...,...
lefled,1
g6,1
JA,1
fløilsøine,1


In [61]:
# Co-occurrence count within 10 tokens of "havet", divided by each token's
# corpus frequency (the division aligns on the token index). A ratio of 1.0
# means every occurrence of that token was counted near "havet".
# NOTE(review): overlapping windows can presumably count one occurrence more
# than once, so ratios above 1.0 may be possible — confirm.
(collocation("havet", df, window=10)  / total_word_count(df)).sort_values(by="frequency", ascending=False).head(20)

Unnamed: 0,frequency
landsen,1.0
vikings,1.0
Blygraat,1.0
Bredt,1.0
medynkstaarer,1.0
vånde,1.0
vældede,1.0
voggad,1.0
gnikar,1.0
gjennemsigtig,1.0


In [60]:
# Show every hit for "vikings" with 10 tokens of context on each side.
concordance("vikings", df, window=10)

Unnamed: 0,dhlabid,concordance_index,word_index,left,token,right
0,dhlab_norn_poem_00961,1,1317,", lav og gold , af havet skyllet blank som",vikings,skjold . Igjennem regnfuld luft og tunge taage...


In [34]:
# Raw hits for "havet" with 5 tokens of context per side; left/right stay as
# token lists here, unlike in concordance().
res = get_window("havet", df, 5)

res

Unnamed: 0,dhlabid,concordance_index,word_index,left,token,right
0,dhlab_norn_poem_02690,1,146,"[14, Sammen, vi, maa, mot]",havet,"[gaa, –, Endnu, er, færden]"
1,dhlab_norn_poem_02691,1,5,"[14, Sammen, vi, maa, mot]",havet,"[gaa, –, Endnu, er, færden]"
2,dhlab_norn_poem_02639,1,42,"[høsttunge, ,, lange, kvæld, ved]",havet,"[det, gamle, ,, evige, unge]"
3,dhlab_norn_poem_02639,2,80,"[kamp, med, forlevede, former, ,]",havet,"[derude, kalder, til, strid, og]"
4,dhlab_norn_poem_02638,1,221,"[høsttunge, ,, lange, kvæld, ved]",havet,"[det, gamle, ,, evige, unge]"
...,...,...,...,...,...,...
178,dhlab_norn_poem_00469,1,62,"[gled, de, om, pynten, mod]",havet,"[,, det, fril, Jeg, saa]"
179,dhlab_norn_poem_00482,1,65,"[Men, langt, histude, ,, hvor]",havet,"[gaar, i, dønninger, tunge, og]"
180,dhlab_norn_poem_00482,2,267,"[om, sin, pande, ., Og]",havet,"[hugger, ,, og, havet, slaar]"
181,dhlab_norn_poem_00482,3,271,"[Og, havet, hugger, ,, og]",havet,"[slaar, langt, ude, fra, verdens]"


In [35]:
# Number of "havet" hits per poem.
res.groupby("dhlabid")["word_index"].count()

dhlabid
dhlab_norn_poem_00034    1
dhlab_norn_poem_00058    1
dhlab_norn_poem_00060    3
dhlab_norn_poem_00064    2
dhlab_norn_poem_00086    2
                        ..
dhlab_norn_poem_02886    3
dhlab_norn_poem_02888    1
dhlab_norn_poem_02891    1
dhlab_norn_poem_02910    2
dhlab_norn_poem_02914    1
Name: word_index, Length: 105, dtype: int64

In [37]:
# Token positions of the "havet" hits, collected per poem.
vals = res.groupby("dhlabid")["word_index"].unique()
vals

dhlabid
dhlab_norn_poem_00034             [56]
dhlab_norn_poem_00058             [52]
dhlab_norn_poem_00060     [8, 52, 136]
dhlab_norn_poem_00064        [83, 107]
dhlab_norn_poem_00086         [52, 91]
                             ...      
dhlab_norn_poem_02886    [40, 73, 215]
dhlab_norn_poem_02888            [237]
dhlab_norn_poem_02891             [58]
dhlab_norn_poem_02910        [42, 159]
dhlab_norn_poem_02914             [92]
Name: word_index, Length: 105, dtype: object

In [46]:
# List poems in which two consecutive hits lie fewer than 10 tokens apart.
for i, val in vals.items():
    distances = find_index_distance(val)
    for distance in distances:
        if distance >= 10:
            continue
        # Poem id, all hit positions, all gaps, and the gap that qualified.
        print(i, val, distances, distance)

dhlab_norn_poem_00482 [ 65 267 271] [202, 4] 4
dhlab_norn_poem_01107 [1 4] [3] 3
dhlab_norn_poem_01340 [205 208 299] [3, 91] 3
dhlab_norn_poem_01898 [ 22 102 110] [80, 8] 8
dhlab_norn_poem_02333 [16 22 58] [6, 36] 6
dhlab_norn_poem_02352 [ 30  38 148] [8, 110] 8
dhlab_norn_poem_02427 [  37  219  569 1255 1258] [182, 350, 686, 3] 3


In [19]:
def count_token(token, df):
    """Return the total number of occurrences of ``token`` in ``df.tokens``.

    Args:
        token: Token to count; matched with ``list.count`` semantics.
        df (pd.DataFrame): Frame with a ``tokens`` column of token lists.

    Returns:
        The sum of the per-row occurrence counts.
    """
    def occurrences(row_tokens):
        return row_tokens.count(token)

    per_row = df.tokens.apply(occurrences)
    return per_row.sum()

def count_tokens(tokens, df):
    """Count corpus occurrences of several tokens in one pass.

    Args:
        tokens: Iterable of tokens to count; duplicates are collapsed.
        df (pd.DataFrame): Frame with a ``tokens`` column of token lists.

    Returns:
        pd.DataFrame: Indexed by the requested tokens, with a single
        ``frequency`` column sorted in descending order. Tokens that never
        occur are kept with a frequency of 0.
    """
    # Start every requested token at 0 so absent tokens still get a row.
    wanted = dict.fromkeys(tokens, 0)

    # One pass over the corpus, rather than one full pass per requested token.
    for row_tokens in df["tokens"]:
        for tok in row_tokens:
            if tok in wanted:
                wanted[tok] += 1

    table = pd.DataFrame.from_dict(wanted, orient="index", columns=["frequency"])
    return table.sort_values(by="frequency", ascending=False)



In [12]:
# Total occurrences of "havet" in the corpus.
count_token("havet", df)

183

In [20]:
# Corpus frequencies of the two tokens, side by side.
count_tokens(["havet", "skogen"], df)

Unnamed: 0,frequency
havet,183
skogen,81


In [76]:
# Tokens found within the default window (10 per side) of "hei".
collocation("hei", df)

Unnamed: 0,frequency
",",195
og,69
.,65
!,61
i,30
...,...
gaamo,1
kaane,1
Daganna,1
styttast,1


In [25]:
# Raw text next to its token list, indexed by poem id.
df.set_index("dhlabid").loc[:, ["text", "tokens"]]

Unnamed: 0_level_0,text,tokens
dhlabid,Unnamed: 1_level_1,Unnamed: 2_level_1
dhlab_norn_poem_02715,51 \n\n\nForsvarssange. \n1. Opfang til Totens...,"[51, Forsvarssange, ., 1, ., Opfang, til, Tote..."
dhlab_norn_poem_02723,65 \n\n\n2. Peder Pavels Aabel. \nfød 15 mai 1...,"[65, 2, ., Peder, Pavels, Aabel, ., fød, 15, m..."
dhlab_norn_poem_02693,"16 \n\n\nSend da opp, højere opp, \n\ntankens ...","[16, Send, da, opp, ,, højere, opp, ,, tankens..."
dhlab_norn_poem_02709,"45 \n\n\nJa, var ei Vesterhavets rand, \n\nhvo...","[45, Ja, ,, var, ei, Vesterhavets, rand, ,, hv..."
dhlab_norn_poem_02702,"50 \n\n\nStilt laag ,hinn mjove» bak Stavanesl...","[50, Stilt, laag, ,, hinn, mjove, », bak, Stav..."
...,...,...
dhlab_norn_poem_00488,DOMMEN. \n\n\nEn dag er dødl \n\nEn nat er kom...,"[DOMMEN, ., En, dag, er, dødl, En, nat, er, ko..."
dhlab_norn_poem_00494,I ØRKNEN. \n\n\nEn dødstræt palme skygger mig ...,"[I, ØRKNEN, ., En, dødstræt, palme, skygger, m..."
dhlab_norn_poem_00480,TIL - \n\n\nMod dig gaar al min længsel \nog a...,"[TIL, -, Mod, dig, gaar, al, min, længsel, og,..."
dhlab_norn_poem_00484,VISE: \n\n\nGaa med mig over fjeld og hei \nog...,"[VISE, :, Gaa, med, mig, over, fjeld, og, hei,..."


In [10]:
# First five "havet" hits with 20 tokens of context per side, joined into one
# string with the hit wrapped in <b> tags.
# NOTE(review): .style is presumably used so the <b> markup renders — confirm.
concordance("havet", df, 20, join = True).head().style

Unnamed: 0,0
0,"b'[{""author"":""Andreas Aabel"",""book_title"":""H\\..."
1,"b'dhlab_norn_poem_02715"",""digital_visning"":NaN..."
2,b'pfang til Totens forsvarsforening 1889. \\n(...
3,b'nat du snart er overfl\\u00f8jet \\nav din e...
4,b'ens hele stasen mu tter \\nfrir fra \\u00abC...
...,...
89355,"b'ed Gud. \\n\\n\\n\\n\\n"",""title"":""Langt bag ..."
89356,"b'rte"",""bjerge"",""ligger"",""en"",""stille"",""sj\\u0..."
89357,"b'""at"",""d\\u00f8"",""."",""Ligge"",""der"",""strakt"",""..."
89358,"b',""og"",""livet"",""rulle"",""mig"",""langsomt"",""forb..."
