In [8]:
import spacy

# Registry of loaded spaCy pipelines, keyed by language code.
nlp: dict[str, spacy.Language] = {"en": spacy.load("en_core_web_sm")}

# Raise the pipeline's per-document character limit so a whole book can be
# processed as a single document.
nlp["en"].max_length = 2_000_000

# Corpus file for each language code; keys must match the keys of `nlp`.
sources = {
    "en": "../data/moby_dick.txt",
}

# Raw text of every corpus, keyed by language code.
sources_data = {}
for language, path in sources.items():
    # The context manager closes the handle even if the read fails; the
    # explicit encoding keeps the result independent of the platform locale
    # (adjust it if a corpus is stored in a different encoding).
    with open(path, "r", encoding="utf-8") as file:
        sources_data[language] = file.read()


In [9]:
def tokenizer(text: str, model_lang: spacy.Language) -> list[str]:
    """Split ``text`` into word tokens using the given spaCy pipeline.

    Args:
        text: Raw text to process.
        model_lang: Pipeline that is called on ``text`` to produce the tokens.

    Returns:
        The ``text`` attribute of every token, in document order, skipping
        tokens the pipeline flags as whitespace, punctuation, or digits.
    """
    kept: list[str] = []
    for token in model_lang(text):
        # Drop anything that is not a word-like token.
        if token.is_space or token.is_punct or token.is_digit:
            continue
        kept.append(token.text)
    return kept

def tokens(sources_data: dict[str, str]) -> dict[str, list[str]]:
    """Tokenize every corpus with the pipeline registered for its language.

    Args:
        sources_data: Raw text keyed by language code; each key must also be
            present in the module-level ``nlp`` registry.

    Returns:
        A dict with the same keys, mapping each language code to the token
        list produced by ``tokenizer``.
    """
    return {
        language: tokenizer(text, nlp[language])
        for language, text in sources_data.items()
    }

# Token list per language code, produced by running each loaded corpus through
# the pipeline registered for that language in `nlp`.
tokenized_sources = tokens(sources_data)

en
218310


In [20]:
import math
import matplotlib.pyplot as plt
import time

def process_data_d(data: list[str], d: int) -> dict[str, dict[str, int]]:
    """Count how often token y occurs exactly ``d`` positions after token x.

    Args:
        data: Token sequence.
        d: Non-negative distance between the two positions of a pair.

    Returns:
        Nested dict ``x -> y -> f(x, y)``, where ``f(x, y)`` is the number of
        positions ``i`` with ``data[i] == x`` and ``data[i + d] == y``.
        Pairs with ``x == y`` are not counted, but ``x`` still receives an
        entry (possibly empty) once it has been seen as the first element of
        a pair.

    Raises:
        ValueError: If ``d`` is negative. Without this guard ``data[i + d]``
            would wrap around to the end of the list and count bogus pairs.
    """
    if d < 0:
        raise ValueError(f"d must be non-negative, got {d}")

    # dict of token x -> token y -> f(x,y)
    processed_data: dict[str, dict[str, int]] = {}
    # Only positions whose partner i + d is still inside the sequence.
    for i in range(len(data) - d):
        token_x = data[i]
        token_y = data[i + d]
        token_data = processed_data.setdefault(token_x, {})
        # TODO: should x == y be excluded?
        if token_x != token_y:
            # add 1 to the frequency of the token y that appears d tokens after the current token x
            token_data[token_y] = token_data.get(token_y, 0) + 1
    return processed_data

def calculate_I_d(data: dict[str, dict[str, int]]) -> float:
    """Compute the mutual information I(d) from a table of pair counts.

    Args:
        data: Nested dict ``x -> y -> f(x, y)`` of pair frequencies.

    Returns:
        ``sum over x, y of p(x,y) * log(p(x,y) / (p(x) * p(y)))`` using the
        natural logarithm, where every probability is the corresponding count
        divided by the total number of counted pairs. Returns ``0`` when the
        table holds no pairs.
    """
    # Marginal counts f(x|d), f(y|d) and the total pair count F.
    row_totals: dict[str, int] = {}
    col_totals: dict[str, int] = {}
    total_pairs = 0
    for token_x, followers in data.items():
        row_sum = 0
        for token_y, count in followers.items():
            col_totals[token_y] = col_totals.get(token_y, 0) + count
            row_sum += count
            total_pairs += count
        row_totals[token_x] = row_sum

    # Accumulate p(x,y|d) * log(p(x,y|d) / (p(x|d) * p(y|d))) over all pairs.
    mutual_information = 0
    for token_x, followers in data.items():
        for token_y, count in followers.items():
            p_joint = count / total_pairs
            p_first = row_totals[token_x] / total_pairs
            p_second = col_totals[token_y] / total_pairs
            mutual_information += p_joint * math.log(p_joint / (p_first * p_second))
    return mutual_information

# Optional cap on the largest distance d to evaluate. None keeps the full
# sweep d = 1 .. n-2. Every d costs one pass over the corpus, so the full sweep
# is quadratic in the number of tokens; set an integer to make a run feasible.
MAX_DISTANCE = None

for language in tokenized_sources:
    corpus_tokens = tokenized_sources[language]
    # Exclusive upper bound of the sweep over d.
    d_stop = len(corpus_tokens) - 1
    if MAX_DISTANCE is not None:
        d_stop = min(d_stop, MAX_DISTANCE + 1)

    # Mutual information per distance, plus the matching plot coordinates.
    I_d: dict[int, float] = {}
    x = []
    y = []
    start = time.time()
    for d in range(1, d_stop):
        data_d = process_data_d(corpus_tokens, d)
        I_d[d] = calculate_I_d(data_d)
        if d % 500 == 0:
            # Report the wall-clock time taken by the last 500 distances.
            end = time.time()
            print(f"Progress: {d}/{d_stop - 1} (elapsed time: {end - start})")
            start = time.time()
        x.append(d)
        y.append(I_d[d])

    print(f"Language: {language}")
    # Show the first few distances; skip any that a short corpus never reached.
    for d in range(1, 6):
        if d in I_d:
            print(f"I(d) for d={d}: {I_d[d]}")
    print()

    plt.plot(x, y)
    plt.xlabel("d (token distance)")
    plt.ylabel("I(d)")
    plt.title(f"Mutual information by distance ({language})")
    plt.show()


Progress: 50/218309
Progress: 100/218309
Progress: 150/218309
Progress: 200/218309
Progress: 250/218309
Progress: 300/218309
Progress: 350/218309
Progress: 400/218309
Progress: 450/218309
Progress: 500/218309
Progress: 550/218309
Progress: 600/218309
Progress: 650/218309
Progress: 700/218309
Progress: 750/218309
Progress: 800/218309
Progress: 850/218309
Progress: 900/218309
Progress: 950/218309
Progress: 1000/218309
Progress: 1050/218309
Progress: 1100/218309
Progress: 1150/218309
Progress: 1200/218309
Progress: 1250/218309
Progress: 1300/218309
Progress: 1350/218309
Progress: 1400/218309
Progress: 1450/218309
Progress: 1500/218309
Progress: 1550/218309
Progress: 1600/218309
Progress: 1650/218309
Progress: 1700/218309
Progress: 1750/218309
Progress: 1800/218309
Progress: 1850/218309
Progress: 1900/218309
Progress: 1950/218309
Progress: 2000/218309
Progress: 2050/218309
Progress: 2100/218309
Progress: 2150/218309
Progress: 2200/218309
Progress: 2250/218309
Progress: 2300/218309
Progress

KeyboardInterrupt: 