In [1]:
from collections import Counter, defaultdict
from itertools import combinations, product
import time
from nltk.corpus import stopwords
from nltk import ngrams
import numpy as np
import re
import math
import pprint

In [2]:
# Fetch the NLTK stop-word corpus that stopwords.words(...) reads from.
# The call reports True once the data is available locally.
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\monox\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [3]:
# Token length limits, in characters.
# NOTE(review): token_max_len is not applied anywhere in the visible cells — confirm whether it should be.
token_min_len = 2
token_max_len = 64

# German stop-word list from NLTK; the only visible use is a commented-out filter in the tokenising cell.
stop_words = stopwords.words("german")

In [4]:
# Read the whole corpus; the context manager closes the file handle again
# (the previous bare open(...).read() left it to the garbage collector).
# NOTE(review): no encoding is passed, so the platform default applies — confirm
# the file's encoding and pass it explicitly if the notebook must run elsewhere.
with open('Schamanismus.txt', "r") as corpus_file:
    content = corpus_file.read()

# Collapse the text onto a single line so sentences may span line breaks.
content = " ".join([s.strip() for s in content.split("\n")])
sentences = []

token_count = 0
# Naive sentence segmenter: every "." ends a sentence, which also splits at
# abbreviations and ordinals such as "19.".
for sent in content.split("."):
    # Replace everything except letters, digits, umlauts/ß and . ? ! - with a space.
    sent = re.sub(r"[^a-zA-Z0-9öäüÖÄÜß\.\?!-]", " ", sent)
    tokens = [t.strip().lower() for t in sent.split(" ")]
    # Drop empty strings and tokens shorter than the configured minimum length.
    tokens = [t for t in tokens if len(t) >= token_min_len]
    # Stop-word filtering is currently disabled:
    # tokens = [t for t in tokens if t.lower() not in stop_words]
    sentences.append(tokens)
    token_count += len(tokens)

print(f"Tokens: {token_count}")
print(f"Sents: {len(sentences)}")

FileNotFoundError: [Errno 2] No such file or directory: 'Schamanismus.txt'

In [16]:
bigrams = []
unigrams = []  # flat list of all tokens, sentence markers included
for sent in sentences:

    # A sentence needs at least two tokens to be kept.
    if len(sent) < 2:
        continue

    # Pad a *copy* with the sentence markers. Inserting into `sent` itself would
    # modify `sentences`, so re-running this cell would stack additional
    # "<s>" / "</s>" markers onto every sentence.
    padded = ["<s>", *sent, "</s>"]

    bigrams.extend(ngrams(padded, 2))
    unigrams.extend(padded)

In [6]:
# Preview only the first pairs; echoing the full list floods the notebook output.
bigrams[:40]

[('<s>', 'schamanismus'),
 ('schamanismus', 'bezeichnet'),
 ('bezeichnet', 'im'),
 ('im', 'engeren'),
 ('engeren', 'sinne'),
 ('sinne', 'die'),
 ('die', 'traditionellen'),
 ('traditionellen', 'ethnischen'),
 ('ethnischen', 'religionen'),
 ('religionen', 'des'),
 ('des', 'kulturareales'),
 ('kulturareales', 'sibirien'),
 ('sibirien', 'nenzen'),
 ('nenzen', 'jakuten'),
 ('jakuten', 'altaier'),
 ('altaier', 'burjaten'),
 ('burjaten', 'ewenken'),
 ('ewenken', 'auch'),
 ('auch', 'europäische'),
 ('europäische', 'samen'),
 ('samen', 'und'),
 ('und', 'andere'),
 ('andere', 'bei'),
 ('bei', 'denen'),
 ('denen', 'das'),
 ('das', 'vorhandensein'),
 ('vorhandensein', 'von'),
 ('von', 'schamanen'),
 ('schamanen', 'von'),
 ('von', 'europäischen'),
 ('europäischen', 'forschern'),
 ('forschern', 'der'),
 ('der', 'expansionszeit'),
 ('expansionszeit', 'als'),
 ('als', 'wesentliches'),
 ('wesentliches', 'gemeinsames'),
 ('gemeinsames', 'kennzeichen'),
 ('kennzeichen', 'erachtet'),
 ('erachtet', 'wurde'

In [17]:
# Frequency of every token in `unigrams`; the cell displays the 25 most frequent entries.
unigram_counter = Counter(unigrams)
unigram_counter.most_common(25)

[('<s>', 358),
 ('</s>', 358),
 ('der', 285),
 ('und', 283),
 ('die', 274),
 ('in', 146),
 ('den', 96),
 ('des', 87),
 ('oder', 76),
 ('schamanismus', 67),
 ('als', 67),
 ('von', 63),
 ('sich', 62),
 ('mit', 60),
 ('eine', 58),
 ('das', 57),
 ('auf', 53),
 ('schamanen', 51),
 ('zu', 51),
 ('bei', 50),
 ('im', 49),
 ('es', 38),
 ('für', 38),
 ('auch', 37),
 ('ist', 37)]

In [18]:
# Frequency of every (previous_word, word) pair in `bigrams`; the cell displays the 25 most frequent pairs.
bigram_counter = Counter(bigrams)
bigram_counter.most_common(25)

[(('in', 'der'), 29),
 (('<s>', 'die'), 24),
 (('in', 'den'), 20),
 (('vor', 'allem'), 16),
 (('und', 'die'), 14),
 (('auf', 'die'), 12),
 (('für', 'die'), 11),
 (('bei', 'den'), 11),
 (('<s>', 'in'), 10),
 (('in', 'die'), 10),
 (('und', 'der'), 9),
 (('der', 'schamanen'), 9),
 (('<s>', 'es'), 9),
 (('<s>', 'jahrhunderts'), 9),
 (('mit', 'der'), 8),
 (('<s>', 'der'), 8),
 (('schamanismus', 'als'), 8),
 (('mit', 'dem'), 8),
 (('des', 'schamanismus'), 7),
 (('bei', 'der'), 7),
 (('<s>', 'müller'), 7),
 (('<s>', 'dies'), 7),
 (('darüber', 'hinaus'), 7),
 (('bei', 'denen'), 6),
 (('<s>', 'im'), 6)]

In [20]:
# Maximum-likelihood bigram model:
#   bigram_model[previous_word][word] = count(previous_word, word) / count(previous_word)
bigram_model = defaultdict(dict)

for (prev_word, word), pair_count in bigram_counter.items():
    bigram_model[prev_word][word] = pair_count / unigram_counter[prev_word]

In [30]:
pprint.pprint(bigram_model["bezeichnet"])

{'</s>': 0.14285714285714285,
 'im': 0.14285714285714285,
 'schamanismus': 0.14285714285714285,
 'werden': 0.2857142857142857,
 'wird': 0.2857142857142857}


In [31]:
def eval_model(model, sentence, unseen_prob=0.00001):
    """Score a whitespace-tokenised sentence with a bigram model.

    Args:
        model: Mapping ``previous_word -> {word: probability}``.
        sentence: Text to score. It is split on whitespace and used as-is
            (no lower-casing, no sentence-marker padding).
        unseen_prob: Probability substituted for bigrams missing from the model.

    Returns:
        The product of the bigram probabilities (1.0 when the sentence has
        fewer than two tokens). The value is also printed.

    Raises:
        ValueError: If a probability (or ``unseen_prob``) is not positive,
            because its logarithm is undefined.
    """
    tokens = sentence.split()

    log_prob = 0.0
    # Pairing the token list with itself shifted by one yields the adjacent pairs.
    for bigram in zip(tokens, tokens[1:]):
        t_1, t = bigram

        # .get() on both levels: indexing a defaultdict would insert an empty
        # entry for every unknown first word and thereby modify the model.
        probability = model.get(t_1, {}).get(t)
        if probability is None:
            print("Prob not found for", bigram)
            probability = unseen_prob

        # Sum logarithms instead of multiplying the raw probabilities.
        log_prob += math.log(probability)

    sentence_prob = math.exp(log_prob)
    print(f"Probability for '{sentence}' is: {sentence_prob}")
    print("\n\n")
    return sentence_prob


# Score a few example sentences; each is lower-cased before evaluation.
demo_sentences = (
    "sibirischer schamanismus ist entscheidend",
    "Schamanismus bezeichnet auch",
    "im engeren Sinne",
)
for demo_sentence in demo_sentences:
    eval_model(bigram_model, demo_sentence.lower())

Prob not found for ('sibirischer', 'schamanismus')
Prob not found for ('ist', 'entscheidend')
Probability for 'sibirischer schamanismus ist entscheidend' is: 4.477611940298505e-08



Probability for 'schamanismus bezeichnet schamanismus' is: 0.0021321961620469087



Probability for 'im engeren sinne' is: 0.010204081632653057



