In [None]:
# imports
from typing import List
from datetime import datetime

import cv2
import os
import re
import requests
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# constants
# character classes (regex) describing which characters count as part of a word
VALID_CHAR_REGEX = "[A-Z0-9]"
VALID_CHAR_CZECH_REGEX = "[a-zA-ZáčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ]" # czech lang
VALID_CHAR_POLISH_REGEX = "[a-zA-ZąćęłńóśżźĄĆĘŁŃÓŚŻŹ]" # polish lang
VALID_CHAR_GERMAN_REGEX = "[a-zA-ZÄäÖöÜüẞß]" # german lang
VALID_CHAR_ENGLISH_REGEX = "[a-zA-Z]"
# the hyphen is escaped so it is matched literally; unescaped, "!-—" is a
# range from "!" to the em dash, which also covers every letter and digit
INVALID_CHAR_REGEX = "[.,()«»?!\\-—:;…]"
WOJNICZ_INPUT_FILE = "inputs\\wojnicz.txt"
REAL_INPUT_FILE = "inputs\\dinosauri-clean.txt"
DPI = 1
FIGSIZE = (1200/DPI, 400/DPI)

# character class used when parsing the real-language sample
VALID_CHAR_REAL_REGEX = VALID_CHAR_CZECH_REGEX

In [None]:
# def function: save list of strings to file, separated by newlines
def save_to_file(input: List[str], path: str):
    """Write the strings of `input` to `path`, one per line.

    The lines are separated by newline characters and no newline is written
    after the last one. An empty list produces an empty file. The file is
    written as UTF-8 (the same encoding the real-language input is read
    with) so that non-ASCII words can be saved regardless of the platform's
    default encoding.

    Args:
        input: lines to write.
        path: destination file; it is created or overwritten.
    """
    # `with` closes the handle even if the write fails
    with open(path, "w", encoding="utf8") as file:
        # join() puts the separator only between lines and copes with an empty list
        file.write("\n".join(input))
    

In [None]:
def plot_occ(input: dict[str, int], filename: str):
    """Plot the occurrence counts of the leading words and save the figure.

    Up to the first 500 values of `input` are drawn as dots; the first 200
    of them are drawn again in a lighter colour as a highlight. The figure
    is saved as a PNG and then shown.

    Args:
        input: word -> occurrence count; values are plotted in the dict's
            iteration order.
        filename: suffix of the output file name; when empty, the current
            timestamp is used instead.
    """
    counts = list(input.values())
    shown = counts[:500]
    highlighted = counts[:200]

    # generate occurrence count graph; the x axis is sized from the data so
    # that inputs with fewer than 500 (or 200) words still plot instead of
    # failing on mismatched x/y lengths
    plt.plot(range(len(shown)), shown, 'o', color='#444499')
    plt.plot(range(len(highlighted)), highlighted, 'o', color='#8888ff')

    # save plot to file and show
    if filename == "":
        filename = datetime.now().strftime("%Y%m%d-%H%M%S")
    plt.savefig("outputs\\occurance_" + filename + ".png")
    plt.show()
    

In [None]:
# def function: plot zipf numbers
def plot_zipf(input: dict[str, float], filename: str):
    """Print summary figures for the Zipf values and plot them.

    Prints the average of the first 200 values, the average of the 50th to
    200th value, and the 1-based position of the first value that reaches
    the first average. Then plots up to the first 500 values, highlights
    the first 200, draws both averages as horizontal lines, saves the
    figure as a PNG and shows it.

    Args:
        input: word -> Zipf value; values are used in the dict's iteration
            order.
        filename: suffix of the output file name; when empty, the current
            timestamp is used instead.
    """
    # snapshot the values once instead of rebuilding the list for every use
    values = list(input.values())
    avg = np.average(values[:200])
    avg_mid = np.average(values[50:200])

    # 1-based position of the first value that is at least `avg`
    first = 0
    for z in values:
        first += 1
        if z >= avg:
            break

    print("Average value of first 200 items (highlighted):  ", avg)
    print("Average value of 50th to 200th item:             ", avg_mid)
    print("Index of first item to reach the average:        ", first)

    plt.plot(values[:500], "#444499", label="Zipf values")
    plt.plot(values[:200], "#8888ff", label="Zipf values (first 200)")
    # the averages are drawn as constant lines across 500 x positions
    plt.plot([avg for i in range(500)], "#ff1155", label="Average of first 200 Zipf values")
    plt.plot([avg_mid for i in range(500)], "#ff9944", label="Average of 50th to 200th Zipf values")
    plt.legend()

    if filename == "":
        filename = datetime.now().strftime("%Y%m%d-%H%M%S")
    plt.savefig("outputs\\zipf_" + filename + ".png")
    plt.show()

## Cleaning text file

Removing all unnecessary lines and characters from the input file.

The end result is a text file with only lines containing words and no dashes at the end of the line.  
Each line is separated by newline '<code>\n</code>'  
Each word within line is separated by comma '<code>,</code>'  
  
Output is saved to <code>cleaned.txt</code>

In [None]:
# open input text
# https://www.ic.unicamp.br/~stolfi/voynich/mirror/reeds/docs/FSG.txt
# read inside `with` so the file handle is closed once the content is loaded
with open(WOJNICZ_INPUT_FILE, 'r') as source_file:
    text = source_file.read().split("\n")

In [None]:
# keep only the lines that carry text
parsed = []

for line in text:
    # empty lines, lone form feeds and lines starting with "#" are dropped
    if line in ("", "\x0c") or line.startswith("#"):
        continue
    # so is every line without at least one valid character
    if re.search(VALID_CHAR_REGEX, line):
        parsed.append(line)


In [None]:
# remove dashes and equality signs from end of each line
cleaned = []

for line in parsed:
    # position of the last valid char; -1 means the line has none,
    # in which case nothing of it is kept
    endpos = -1
    for i, char in enumerate(line):
        if re.match(VALID_CHAR_REGEX, char):
            endpos = i

    # cut right after the last valid char (inclusive); slicing with the loop
    # index instead would always drop exactly one character, valid or not
    cleaned.append(line[:endpos + 1])

In [None]:
# write the cleaned lines to disk
save_to_file(input=cleaned, path="cleaned.txt")

## Extracting valid words
This step further facilitates the analysis of the text.

Separate words are now extracted to a single list of words.  
Some words have not been transcribed fully and some characters may not have been identified.  
Since it is not certain what these words actually are, they are omitted. 
  
Output is saved to <code>words.txt</code>, one word per line.

In [None]:
# extract valid words from each line
words = []

for line in cleaned:
    for word in line.split(","):
        # a word must consist of one or more valid chars and nothing else;
        # requiring at least one char keeps empty tokens (e.g. from ",,")
        # from being counted as words
        if re.fullmatch(VALID_CHAR_REGEX + "+", word):
            words.append(word)

In [None]:
# write the extracted words to disk, one per line
save_to_file(input=words, path="words.txt")

## Analyzing words

Each valid word is mapped to its number of occurrences within the text.  

Based on this data, Zipf's law is applied to check if the text is written in a realistic human language.  
Further analysis includes graphing and visualising the data.

In [None]:
# def function: map occurrence count for each word and sort by occurrence count descending
def occurance_dict(input: List[str]) -> dict[str, int]:
    """Count how often each word occurs, most frequent first.

    Args:
        input: the words to count.

    Returns:
        A dict mapping word -> number of occurrences, ordered by count
        descending. Words with equal counts keep the order in which they
        first appear in `input`. An empty list gives an empty dict.
    """
    # imported locally so the function does not depend on another cell's imports
    from collections import Counter

    # most_common() sorts by count descending with a stable sort, so ties
    # stay in first-seen order
    return dict(Counter(input).most_common())

In [None]:
# def function: get the share of the text taken by each word
def occurance_percentage(input: List[str]):
    """Return each word's occurrence count divided by the total word count.

    The result keeps the ordering produced by `occurance_dict`. The values
    are fractions of the total (count / len(input)), not multiplied by 100.
    """
    total = len(input)
    return {word: count / total for word, count in occurance_dict(input).items()}

In [None]:
# def function: calculate zipf value for each word (occurrence percentage * index)
def zipf_values(input: List[str]):
    """Return the Zipf value of every distinct word in `input`.

    A word's Zipf value is its share of the text (as returned by
    `occurance_percentage`) multiplied by its 1-based position in that
    result and by 100.

    Args:
        input: the words of the text.

    Returns:
        A dict mapping word -> Zipf value, in the same order as the result
        of `occurance_percentage`.
    """
    shares = occurance_percentage(input)
    return {
        word: share * rank * 100
        for rank, (word, share) in enumerate(shares.items(), start=1)
    }

In [None]:
# word count plus the three per-word statistics of the manuscript text
wojnicz_count = len(words)
wojnicz_occ, wojnicz_perc, wojnicz_zipf = [
    stat(words) for stat in (occurance_dict, occurance_percentage, zipf_values)
]
wojnicz_zipf

In [None]:
# report the number of words, then draw the Zipf and occurrence graphs
print("Number of all valid words found in text:         ", wojnicz_count)
plot_zipf(input=wojnicz_zipf, filename="wojnicz")
plot_occ(input=wojnicz_occ, filename="wojnicz")

## Converting and analyzing real-world language

The text content of a selected source is converted to a format similar to that of the source text so that it can be analyzed in the same way.  
After that, it is used to calculate the same statistics as with the previous text.

Converted text is saved to <code>words_real.txt</code>, one word per line.

In [None]:
# load text from file, turning line breaks into spaces
# read inside `with` so the file handle is closed once the content is loaded
with open(REAL_INPUT_FILE, 'r', encoding="utf8") as source_file:
    text = source_file.read().replace("\n", " ")

In [None]:
# convert text to list of words
# keep only spaces and valid letters; every other character is dropped.
# join() builds the filtered string in one pass instead of growing it
# character by character
parsed = "".join(
    char for char in text
    if char == ' ' or re.match(VALID_CHAR_REAL_REGEX, char)
)

parsed = parsed.split(" ")

# keep the non-empty tokens made only of valid letters, upper-cased
words = [
    word.upper()
    for word in parsed
    if re.fullmatch(VALID_CHAR_REAL_REGEX + "+", word)
]


In [None]:
# write the real-language words to disk, one per line
save_to_file(input=words, path="words_real.txt")

In [None]:
# word count plus the three per-word statistics of the real-language text
real_count = len(words)
real_occ, real_perc, real_zipf = [
    stat(words) for stat in (occurance_dict, occurance_percentage, zipf_values)
]
real_zipf 

In [None]:
# derive the graph name from the input path: drop the ".txt" suffix first,
# then the leading "inputs\" directory
graph_name = re.sub("\\.txt$", "", REAL_INPUT_FILE)
graph_name = re.sub("^inputs\\\\", "", graph_name)

# report the number of words, then draw the Zipf and occurrence graphs
print("Number of all valid words found in text:         ", real_count)
plot_zipf(input=real_zipf, filename=graph_name)
plot_occ(input=real_occ, filename=graph_name)

## Common analysis

Statistics are analyzed together for comparison.  
In addition, the real-world language source is trimmed to contain exactly the same number of words as the Wojnicz manuscript.

In [None]:
# statistics for the leading part of the real text, cut to the same number
# of words as the Wojnicz manuscript
rshort_occ, rshort_perc, rshort_zipf = [
    stat(words[:wojnicz_count])
    for stat in (occurance_dict, occurance_percentage, zipf_values)
]

In [None]:
# draw the first 100 Zipf values of each data set as three labelled lines:
# the manuscript, the full real-language sample and the real-language
# sample limited to the manuscript's word count
plt.plot(list(wojnicz_zipf.values())[:100], "#0000ff", label="Wojnicz language")
plt.plot(list(real_zipf.values())[:100], "#ee6666", label="Real language - full sample")
plt.plot(list(rshort_zipf.values())[:100], "#ff0000", label="Real language - limited sample")
plt.legend()

# the file name reuses graph_name, which an earlier cell derives from REAL_INPUT_FILE
plt.savefig("outputs\\comparison_" + graph_name + ".png")

plt.show()