In [1375]:
# imports
from typing import List

import re
import matplotlib.pyplot as plt

In [1376]:
# constants
# Single-character regex classes used to decide which characters count as text.
VALID_CHAR_REGEX = "[A-Z0-9]"
VALID_CHAR_CZECH_REGEX = "[a-zA-ZáčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ]" # czech lang
VALID_CHAR_POLISH_REGEX = "[a-zA-ZąćęłńóśżźĄĆĘŁŃÓŚŻŹ]" # polish lang
VALID_CHAR_GERMAN_REGEX = "[a-zA-ZÄäÖöÜüẞß]" # german lang
VALID_CHAR_ENGLISH_REGEX = "[a-zA-Z]"
# The hyphen is escaped on purpose: an unescaped "!-—" inside a character class
# is a range from '!' to '—', which would also match every letter and digit.
INVALID_CHAR_REGEX = "[.,()«»?!\\-—:;…]"
# NOTE(review): these paths use Windows-style separators; confirm the notebook
# is only expected to run on Windows.
WOJNICZ_INPUT_FILE = "inputs\\wojnicz.txt"
REAL_INPUT_FILE = "inputs\\dinosauri-clean.txt"
# NOTE(review): with DPI = 1 FIGSIZE evaluates to (1200.0, 400.0); if it is
# passed to matplotlib as a figsize in inches, confirm that this is intended.
DPI = 1
FIGSIZE = (1200/DPI, 400/DPI)

# Characters allowed in a transcribed token, and the sentence-end marker.
VALID_CHAR_PUNC_REGEX = "[a-zA-Z0-9/=]"
PUNCT_REGEX = "[=]"

In [1377]:
# def function: save list of strings to file, separated by newlines
def save_to_file(input: List[str], path: str, encoding: str = "utf-8") -> None:
    """Write the strings in ``input`` to ``path``, one per line.

    Lines are joined with a newline; no newline is written after the last
    entry. An empty list produces an empty file.

    Args:
        input: The lines to write.
        path: Destination file; it is created or overwritten.
        encoding: Text encoding of the output file. An explicit default is
            used so that non-ASCII words do not depend on the platform's
            default encoding.
    """
    # The context manager closes the file even if writing fails.
    with open(path, "w", encoding=encoding) as file:
        file.write("\n".join(input))

## Cleaning the text file
Just like in the Zipf's law analysis, we have to clean the text file properly.
The difference is that now we have to keep the punctuation marks,
which in the case of the Voynich Manuscript means that we have to keep
all the dots (represented as equality signs) separating the
sentences.

Output is saved to <code>cleaned_with_punc.txt</code>

In [1378]:
# open input text
# https://www.ic.unicamp.br/~stolfi/voynich/mirror/reeds/docs/FSG.txt
# The context manager closes the file handle as soon as the content is read.
with open(WOJNICZ_INPUT_FILE, 'r') as wojnicz_file:
    text = wojnicz_file.read().split("\n")

In [1379]:
# keep only the lines that carry text: skip empty lines, bare form feeds,
# '#' comment lines and lines without a single valid character
parsed = [
    line
    for line in text
    if line not in ("", "\x0c")
    and not line.startswith("#")
    and re.search(VALID_CHAR_PUNC_REGEX, line)
]

In [1380]:
# remove dashes from end of each line
# Every line is cut right after its last valid character, so a trailing dash
# (or any other trailing junk) is removed while a closing '=' is kept.
valid_char = re.compile(VALID_CHAR_PUNC_REGEX)
cleaned = []

for line in parsed:
    # last valid char position (-1 when the line has none, giving "")
    endpos = -1
    for pos, char in enumerate(line):
        if valid_char.match(char) or char == '=':
            endpos = pos

    # Slice at endpos, not at the index of the final character: slicing at the
    # final index would also drop a valid last character when the line has no
    # trailing dash.
    cleaned.append(line[:endpos + 1])

In [1381]:
# write the cleaned lines to disk
save_to_file(cleaned, path="cleaned_with_punc.txt")

## Extracting valid words separated with punctuation
Similarly to the Zipf's law analysis, we have to properly
prepare our words, but this time without removing
the equality signs, which serve as punctuation
in the manuscript.

In [1382]:
# extract valid words from each line
# Tokens are comma separated; a token ending in '=' is split into the word
# itself and a separate '=' sentence-end marker.
word_pattern = re.compile("^" + VALID_CHAR_PUNC_REGEX + "+$")
words_punc = []
for line in cleaned:
    for word in line.split(","):
        # skip empty tokens and tokens containing any invalid character
        if not word_pattern.match(word):
            continue
        if word[-1] == '=':
            stem = word[:-1]
            # a bare "=" token has no word in front of it; do not emit ""
            if stem:
                words_punc.append(stem)
            words_punc.append('=')
        else:
            words_punc.append(word)

In [1383]:
# write the extracted words and '=' markers to disk
save_to_file(words_punc, path="words_punc.txt")

In [1384]:
# get all neighbors from one side of a word
def get_side(words: List[str], index: int, side: int, window: int = 2) -> List[str]:
    """Collect the neighbors of ``words[index]`` on one side.

    Walks away from ``index`` in steps of ``side`` and stops early when an
    '=' marker is reached, so neighbors never cross that marker.

    Args:
        words: Token list; '=' entries act as boundaries.
        index: Position of the word whose neighbors are collected.
        side: Step direction, -1 for the left side and 1 for the right side.
        window: Maximum number of steps taken away from ``index``.

    Returns:
        The neighbors ordered from nearest to farthest; positions outside
        ``words`` are skipped.
    """
    ret_list = []

    for step in range(1, window + 1):
        pos = index + side * step
        if not 0 <= pos < len(words):
            continue
        if words[pos] == '=':
            # boundary reached: nothing beyond it is a neighbor
            return ret_list
        ret_list.append(words[pos])

    return ret_list

# get all neighbors of a word
def get_neighbors(words, index):
    """Return the neighbors of ``words[index]`` in reading order.

    The left-hand neighbors come first (reversed from the order in which
    ``get_side`` returns them), followed by the right-hand neighbors.
    """
    before = get_side(words, index, -1)[::-1]
    after = get_side(words, index, 1)
    return before + after

## Grouping the neighbors for each word in text
Next, we iterate through every word in our text to get its neighbors
and add them to the dictionary. In our dictionary, words serve as
keys, and each key maps to a list of every neighboring word occurring in the text.
We also have to make sure that we don't include any word more than
once.

In [1385]:
# create a dictionary of neighbors
# Neighbors are accumulated in sets so every neighbor is stored once per word,
# including for words that occur only a single time in the text.
neighbor_sets = {}
for position, word in enumerate(words_punc):
    neighbor_sets.setdefault(word, set()).update(get_neighbors(words_punc, position))

# Convert to lists once at the end; the cells below compare the values to [].
neighbors = {word: list(found) for word, found in neighbor_sets.items()}


In [1386]:
# show neighbors of each word (displays the whole dictionary as the cell output)
neighbors

{'FGAG2': ['GDAE', 'AR'],
 'GDAE': ['OHAR',
  'ODAEAE',
  'T8G',
  '4ODG',
  'FGAG2',
  'OEG',
  'ODAN',
  'GDAR',
  '4ODZG',
  '4ODAE8AR',
  'OE',
  '8AET8G',
  'TAE',
  'AR',
  '8AM',
  '8G8AN',
  'OEDAM',
  '4OHOEG',
  'GHAM',
  '8AE'],
 'AR': ['HTCO2',
  '2AM',
  'GDCCO8AK',
  'OHCCOR',
  'OE8G',
  '4ODAEOR',
  'TG',
  'GDAE',
  '2',
  'T8G',
  'GO8AK',
  'THZ',
  'OEANR',
  'GSCO2',
  'FAII2',
  'GHCCG',
  'THAM',
  'TCOCDAR',
  'GHC8G',
  'PZCM',
  'G8AN',
  'ODAN',
  'TOHTG',
  '8',
  'HAE',
  'GHOR',
  'O8',
  'OFTG',
  'OESC8G',
  'DAM',
  '8AR',
  'SG',
  '8AN',
  'SODORAK',
  'AEOK',
  'K',
  'ODAR',
  'SCCCHTG',
  'AE8AR',
  'AIDZG',
  'OHC8G',
  'TOE',
  '8AE',
  'OHAR',
  'ODCOAK',
  '4ODCG',
  'SAR',
  'TCCOG',
  'SDOF',
  'RAM',
  'GDT8G',
  'OHCOE',
  'TCAR',
  'DCO8G',
  '8AEG',
  'GHASG',
  'GHCO2',
  'ARG',
  'HGPTCG',
  'TCCG',
  'TORAEG',
  'OR',
  'OF',
  '8AIR',
  'DZCO2',
  'R',
  'AEDCG',
  'OR8AM',
  'OPTC2',
  'OEDC8G',
  'AR',
  '8OR',
  'PT8AR',
  'TAE',
 

We have to clean our dictionary to remove punctuation marks counted
as words as well as words with empty lists of neighbors.

In [1387]:
# clean the dictionary: collect keys that start with a punctuation mark or
# have an empty neighbor list, then delete them in a second pass
to_pop = [
    key
    for key, found in neighbors.items()
    if re.match(PUNCT_REGEX, key) or found == []
]

for key in to_pop:
    del neighbors[key]

## Counting each word's possible neighbors
Now that we have extracted a list of distinct
neighbors for each word, we can easily count
how many unique neighbors each word has
and sort the words by that count.

In [1388]:
# number of stored neighbors per word, ordered from the highest count down
neighbor_count = dict(
    sorted(
        ((word, len(found)) for word, found in neighbors.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
)

neighbor_count

{'8AM': 794,
 'TOE': 409,
 'TOR': 340,
 '8G': 298,
 '8AR': 297,
 'SOE': 274,
 'TG': 249,
 '8AE': 240,
 'OR': 238,
 'HZG': 225,
 'SO': 225,
 '2': 214,
 'TCG': 202,
 'SOR': 201,
 '8AN': 192,
 'SG': 191,
 'AR': 179,
 'OE': 166,
 'AM': 166,
 'TC8G': 163,
 '8AK': 147,
 '2AM': 147,
 'ODG': 146,
 'ODAM': 144,
 'SCG': 140,
 'T8G': 139,
 '4ODTG': 131,
 '8OE': 130,
 '4OHTG': 130,
 'TDZG': 128,
 'OHG': 126,
 'TO8G': 125,
 '8OR': 122,
 'SC8G': 120,
 'HZOE': 117,
 'HZOR': 112,
 'G': 111,
 '8AIR': 110,
 'ODAE': 110,
 '4ODG': 110,
 'TCOR': 110,
 'TAR': 107,
 '4ODC8G': 105,
 'DZG': 103,
 'TAM': 102,
 'TO': 101,
 'OHAM': 100,
 '4ODAM': 98,
 'R': 98,
 'ODAR': 95,
 'OHTG': 94,
 'TCDG': 90,
 '4ODAR': 90,
 '4OHG': 89,
 'TCOE': 88,
 'O8AM': 86,
 'DTG': 85,
 'OHOE': 82,
 '4ODOE': 82,
 'ODC8G': 82,
 'HZCG': 81,
 'DAM': 81,
 'HTG': 81,
 'K': 81,
 '8': 79,
 'OHAE': 77,
 '2G': 76,
 'TOHG': 75,
 'SO8G': 74,
 '8TG': 74,
 'SCCG': 74,
 'OHTOE': 73,
 'TO8AM': 72,
 'ODOE': 69,
 'TODG': 68,
 '8TOR': 67,
 'ODTG': 66,
 '

## Checking the values for another text written in real-world language

In [1389]:
# load text from file
# The context manager closes the file handle as soon as the content is read;
# newlines are replaced by spaces so the text becomes one long line.
with open(REAL_INPUT_FILE, 'r', encoding="utf8") as real_file:
    text = real_file.read().replace("\n", " ")

# NOTE: these rebind the constants of the same name defined in the constants
# cell, so the cells above behave differently if re-run after this one.
VALID_CHAR_PUNC_REGEX = "[a-zA-ZáčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ.!?()=]"
PUNCT_REGEX = "[.!?()=]"

In [1390]:
# convert text to list of words
# Every punctuation mark becomes a standalone " = " token, spaces and valid
# characters are kept, and every other character is dropped.
pieces = []
for char in text:
    if re.match(PUNCT_REGEX, char):
        pieces.append(" = ")
    elif char == ' ' or re.match(VALID_CHAR_PUNC_REGEX, char):
        pieces.append(char)

parsed = "".join(pieces).split(" ")

# keep the non-empty tokens made only of valid characters, upper-cased
words = [
    token.upper()
    for token in parsed
    if re.match("^"+VALID_CHAR_PUNC_REGEX+"+$", token)
]

In [1391]:
# write the word list of the real-language text to disk
save_to_file(words, path="words_real_punc.txt")

In [1392]:
# create a dictionary of neighbors
# Neighbors are accumulated in sets so every neighbor is stored once per word,
# including for words that occur only a single time in the text.
neighbor_sets = {}
for position, word in enumerate(words):
    neighbor_sets.setdefault(word, set()).update(get_neighbors(words, position))

# Convert to lists once at the end; the cells below compare the values to [].
neighbors = {word: list(found) for word, found in neighbor_sets.items()}

In [1393]:
# show neighbors of each word (displays the whole dictionary as the cell output)
neighbors

{'DINOSAUŘI': ['APOD',
  'POKUD',
  'HNÍZDA',
  'NEPTAČÍ',
  'ZÁSTUPCI',
  'NEUROVASKULÁRNÍM',
  'HYPOTÉZ',
  'SCHOPNI',
  'MEZIOBRATLOVÉ',
  'JAKKOLIV',
  'MEZI',
  'OBECNĚ',
  'KTEŘÍ',
  'MALÍ',
  'JISTĚ',
  'DOMNĚNKY',
  'PŘEŽILI',
  'AGILNÍ',
  'VŠICHNI',
  'POPULÁRNÍ',
  'NAPŘÍKLAD',
  'LEONARDO',
  'PRAVDĚPODOBNĚ',
  'JAKÉ',
  'POSTUPNĚ',
  'DROBNÍ',
  'ROHATÍ',
  'TOM',
  'ČETNÍ',
  'MNOZÍ',
  'ŘÍCI',
  'DRUHOU',
  'EXISTOVAT',
  'BY',
  'V',
  'ROVNĚŽ',
  'TAK',
  'PSITTACOSAURUS',
  'NEBO',
  'PRO',
  'NADŘÁD',
  'OBVYKLE',
  'VE',
  'Z',
  'PROVOZOVAT',
  'VELMI',
  'TAKÉ',
  'BAREVNÝMI',
  'PŘEDMĚTEM',
  'MĚLI',
  'NA',
  'DINOSAURŮ',
  'SKUPINA',
  'VÝVOJOVĚ',
  'NICH',
  'PATŘÍ',
  'VŠEŽRAVÍ',
  'SE',
  'VYDÁVALI',
  'PLAZOPÁNVÍ',
  'BĚŽNOU',
  'PŠTROSÍ',
  'ZVUK',
  'DRUHOHORNÍCH',
  'SKUTEČNOSTI',
  'LITERATUŘE',
  'TYRANOSAURIDI',
  'ŽÁDNÝ',
  'EVOLUCE',
  'TAKTO',
  'JEDINÝMI',
  'JSOU',
  'SYSTÉM',
  'SVÁ',
  'EVOLUČNĚ',
  'BÝLOŽRAVÍ',
  'ZTRATILI',
  'ZATÍMCO',
  'UZ

In [1394]:
# clean the dictionary: collect keys that start with a punctuation mark or
# have an empty neighbor list, then delete them in a second pass
to_pop = [
    key
    for key, found in neighbors.items()
    if re.match(PUNCT_REGEX, key) or found == []
]

for key in to_pop:
    del neighbors[key]

In [1395]:
# number of stored neighbors per word, ordered from the highest count down
neighbor_count = dict(
    sorted(
        ((word, len(found)) for word, found in neighbors.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
)

neighbor_count

{'A': 677,
 'V': 490,
 'SE': 389,
 'NA': 367,
 'DINOSAURŮ': 311,
 'JE': 261,
 'S': 246,
 'Z': 235,
 'O': 225,
 'I': 201,
 'ŽE': 200,
 'DINOSAUŘI': 198,
 'VŠAK': 172,
 'U': 172,
 'JIŽ': 151,
 'TAKÉ': 147,
 'JAKO': 144,
 'JSOU': 133,
 'JEJICH': 129,
 'NAPŘÍKLAD': 128,
 'K': 122,
 'AŽ': 120,
 'NEBO': 114,
 'VE': 112,
 'DO': 110,
 'BYLY': 109,
 'ALE': 101,
 'BYLA': 90,
 'ZA': 88,
 'BYLI': 83,
 'DRUHŮ': 78,
 'PO': 76,
 'OD': 76,
 'ZE': 73,
 'DINOSAURA': 73,
 'MEZI': 71,
 'BÝT': 71,
 'ROKU': 71,
 'KTERÉ': 70,
 'BY': 69,
 'PRO': 68,
 'LET': 66,
 'BYLO': 66,
 'OBDOBÍ': 65,
 'ASI': 65,
 'NĚKTERÝCH': 65,
 'NEŽ': 64,
 'BYL': 64,
 'PŘED': 60,
 'KTERÝ': 60,
 'DRUHU': 58,
 'TO': 57,
 'PRVNÍ': 57,
 'TAK': 57,
 'DINOSAURY': 56,
 'KTEŘÍ': 54,
 'VELMI': 54,
 'ZŘEJMĚ': 54,
 'TEDY': 54,
 'KDY': 54,
 'FOSILNÍ': 52,
 'JEN': 51,
 'PAK': 50,
 'PODLE': 50,
 'KOLEM': 50,
 'PRAVDĚPODOBNĚ': 49,
 'JAK': 49,
 'OBJEVY': 48,
 'FOSILNÍCH': 48,
 'POUZE': 47,
 'DINOSAUŘÍ': 47,
 'DRUHOHORNÍCH': 46,
 'COŽ': 46,
 'RODU': 4