In [92]:
# imports
from typing import List
from datetime import datetime

import cv2
import os
import re
import requests
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx

In [93]:
# constants
# Character classes describing which single characters count as "valid" for
# each alphabet. They are plain pattern strings meant to be passed to `re`.
VALID_CHAR_REGEX = "[A-Z0-9]"
VALID_CHAR_CZECH_REGEX = "[a-zA-ZáčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ]" # czech lang
VALID_CHAR_POLISH_REGEX = "[a-zA-ZąćęłńóśżźĄĆĘŁŃÓŚŻŹ]" # polish lang
VALID_CHAR_GERMAN_REGEX = "[a-zA-ZÄäÖöÜüẞß]" # german lang
VALID_CHAR_ENGLISH_REGEX = "[a-zA-Z]"
# Punctuation characters to reject. The hyphen is escaped on purpose: an
# unescaped "!-—" inside a character class is a RANGE from U+0021 to U+2014,
# which would also match every ASCII letter and digit.
INVALID_CHAR_REGEX = r"[.,()«»?!\-—:;…]"
# NOTE(review): backslash separators are Windows-specific; these relative
# paths will not resolve to files inside "inputs" on POSIX systems.
WOJNICZ_INPUT_FILE = "inputs\\wojnicz.txt"
REAL_INPUT_FILE = "inputs\\dinosauri-clean.txt"
# Figure size is expressed as a pixel size divided by DPI.
DPI = 1
FIGSIZE = (1200/DPI, 400/DPI)

# Character class for tokens that may include the "/" and "=" symbols, and
# the class that identifies the punctuation marker ("=") on its own.
VALID_CHAR_PUNC_REGEX = "[a-zA-Z0-9/=]"
PUNCT_REGEX = "[=]"

In [94]:
# def function: save list of strings to file, separated by newlines
def save_to_file(input: List[str], path: str):
    """Write the strings in ``input`` to ``path``, separated by newlines.

    The file is opened in text write mode, so any existing content is
    replaced. No newline is written after the last string. An empty list
    produces an empty file.

    Args:
        input: Strings to write, in order.
        path: Destination file path.
    """
    # The context manager closes the handle even if a write fails.
    with open(path, "w") as file:
        # join() puts the separator only between items, so there is no
        # trailing newline, and it handles the empty list without indexing.
        file.write("\n".join(input))

In [95]:
# open text prepared in connetwork file
# https://www.ic.unicamp.br/~stolfi/voynich/mirror/reeds/docs/FSG.txt
# Read the whole file and split it into one token per line; the context
# manager closes the file handle as soon as the content has been read.
with open("words_punc.txt", 'r') as words_file:
    text = words_file.read().split("\n")

In [96]:
def find_bigrams(text, punct_regex=None):
    """Return all pairs of adjacent tokens that are not punctuation.

    Each token is paired with the token that follows it. A pair is kept only
    when neither token matches the punctuation pattern. The pattern is
    applied with ``re.match``, so it is tested at the start of the token.

    Args:
        text: Sequence of tokens (strings).
        punct_regex: Pattern identifying punctuation tokens. Defaults to the
            module-level ``PUNCT_REGEX``.

    Returns:
        List of ``(token, next_token)`` tuples in their original order. A
        sequence with fewer than two tokens yields an empty list.
    """
    if punct_regex is None:
        punct_regex = PUNCT_REGEX
    is_punct = re.compile(punct_regex).match

    # zip() stops at the shorter sequence, so the last token is never paired
    # with a non-existent successor (indexing text[i+1] raised IndexError
    # whenever the final token was not punctuation).
    return [
        (word, next_word)
        for word, next_word in zip(text, text[1:])
        if not is_punct(word) and not is_punct(next_word)
    ]

# Build the bigram list from the tokens read from words_punc.txt; the bare
# name on the last line makes the notebook display the result.
bigrams = find_bigrams(text)
bigrams

[('FGAG2', 'GDAE'),
 ('GDAE', 'AR'),
 ('AR', 'GHAM'),
 ('GHAM', 'SOE'),
 ('SOE', 'SORG'),
 ('SORG', '0D0RC2'),
 ('0D0RC2', 'GDOR'),
 ('GDOR', 'SOE8G'),
 ('SOE8G', '2ORG'),
 ('2ORG', 'DZAR'),
 ('DZAR', 'ORG'),
 ('ORG', 'DAIR'),
 ('DAIR', 'THAM'),
 ('THAM', 'SOR'),
 ('SOR', 'AR'),
 ('AR', 'HZAR'),
 ('HZAR', 'HZAR'),
 ('HZAR', '8ALA'),
 ('8ALA', '2GAIIR'),
 ('2GAIIR', 'SCDG'),
 ('SCDG', 'OR'),
 ('OR', 'GDAM'),
 ('GDAM', 'SO8'),
 ('SO8', 'HZOARG'),
 ('HZOARG', 'HZC2'),
 ('HZC2', '8ARAM'),
 ('8ARAM', '2G'),
 ('2G', '0OM'),
 ('0OM', 'OHCCG'),
 ('OHCCG', 'OHCAR'),
 ('OHCAR', 'ROEOHG'),
 ('ROEOHG', 'HZAAR'),
 ('HZAAR', '8AM'),
 ('8AM', 'ODAM'),
 ('ODAM', 'OR'),
 ('OR', 'ODAL'),
 ('ODAL', '2AIRG'),
 ('2AIRG', 'TCAR'),
 ('TCAR', 'HZAM'),
 ('HZAM', 'PZAR'),
 ('PZAR', 'FZAM'),
 ('O8AR', 'SG'),
 ('SG', 'SOE'),
 ('SOE', 'PZOG'),
 ('PZOG', 'OG8AR'),
 ('OG8AR', 'S'),
 ('S', '2'),
 ('2', 'FZOAM'),
 ('FZOAM', 'SO8ARG'),
 ('SO8ARG', 'GSCG'),
 ('GSCG', 'SO8G'),
 ('SO8G', 'ODTO'),
 ('ODTO', 'G'),
 ('G', 'O

In [97]:
def count_occurance(bigrams):
    """Tally how many times each bigram appears.

    Args:
        bigrams: Iterable of hashable items (here, 2-tuples of words).

    Returns:
        Dict mapping each distinct bigram to its number of occurrences,
        ordered from most to least frequent. Bigrams with equal counts keep
        the order in which they were first seen.
    """
    tally = {}
    for pair in bigrams:
        tally[pair] = tally.get(pair, 0) + 1

    # sorted() is stable even with reverse=True, so ties retain first-seen order.
    ranked_pairs = sorted(tally, key=tally.get, reverse=True)
    return {pair: tally[pair] for pair in ranked_pairs}

In [98]:
# Count how often each bigram occurs (sorted by descending count) and
# display the resulting dict.
bigrams_count = count_occurance(bigrams)
bigrams_count

{('TOE', '8AM'): 30,
 ('TOE', 'TOE'): 19,
 ('8AM', '8AM'): 12,
 ('8AM', 'HZG'): 11,
 ('TOR', '8AM'): 10,
 ('TG', '8AM'): 9,
 ('SOE', '8AM'): 9,
 ('TOR', 'TOE'): 8,
 ('TOE', 'SOE'): 8,
 ('8AM', 'HZOR'): 8,
 ('8AM', 'SO'): 7,
 ('TOE', 'TOR'): 7,
 ('8AM', '8AE'): 7,
 ('OR', 'AM'): 7,
 ('8AM', 'TOE'): 7,
 ('TOE', 'HZOE'): 6,
 ('8AM', 'HZOE'): 6,
 ('HZG', '8AM'): 6,
 ('8AM', 'TOR'): 6,
 ('OHOE', 'TOE'): 6,
 ('8AN', '8AM'): 6,
 ('8AM', '8AN'): 6,
 ('TG', 'DTG'): 6,
 ('8AM', '8AK'): 6,
 ('SG', '8AM'): 6,
 ('8AM', 'SOR'): 5,
 ('4ODOE', '8AM'): 5,
 ('TOR', 'TOR'): 5,
 ('HZOE', 'TOE'): 5,
 ('8G', '8AM'): 5,
 ('SOR', '8AM'): 5,
 ('8AM', 'TG'): 5,
 ('8G', '8G'): 5,
 ('TOE', 'HZG'): 5,
 ('8AM', '4ODTG'): 5,
 ('4ODG', '8AM'): 5,
 ('8AM', '8G'): 5,
 ('OHOE', '8AM'): 5,
 ('8AM', 'OR'): 5,
 ('8AM', '2'): 5,
 ('TG', 'TG'): 5,
 ('8AM', 'ODAM'): 4,
 ('HZOE', '8AM'): 4,
 ('8AR', 'TCG'): 4,
 ('TOR', 'TG'): 4,
 ('2', 'AM'): 4,
 ('8AM', 'THZG'): 4,
 ('TOE', 'TO8G'): 4,
 ('8OR', 'TOE'): 4,
 ('TOE', 'TG'): 4,
 

## Compare with a real-world language

In [99]:
# open text
# Read the whole file and split it into one token per line; the context
# manager closes the file handle as soon as the content has been read.
with open("words_real_punc.txt", 'r') as words_real_file:
    text_real = words_real_file.read().split("\n")

In [100]:
# Build the bigram list from the tokens read from words_real_punc.txt; the
# bare name on the last line makes the notebook display the result.
bigrams_real = find_bigrams(text_real)
bigrams_real

[('Z', 'ŘEC'),
 ('DEINOS', 'SAUROS'),
 ('STRAŠNÝ', 'JEŠTĚR'),
 ('JEŠTĚR', 'NĚKDY'),
 ('NĚKDY', 'PŘEKLÁDÁNO'),
 ('PŘEKLÁDÁNO', 'HROZNÝ'),
 ('HROZNÝ', 'PLAZ'),
 ('JSOU', 'ROZMANITOU'),
 ('ROZMANITOU', 'SKUPINOU'),
 ('SKUPINOU', 'OBRATLOVCŮ'),
 ('OBRATLOVCŮ', 'ŘAZENOU'),
 ('ŘAZENOU', 'MEZI'),
 ('MEZI', 'PLAZY'),
 ('KTERÁ', 'DOMINOVALA'),
 ('DOMINOVALA', 'FAUNĚ'),
 ('FAUNĚ', 'NA'),
 ('NA', 'PEVNINÁCH'),
 ('PEVNINÁCH', 'TÉTO'),
 ('TÉTO', 'PLANETY'),
 ('PLANETY', 'PŘES'),
 ('PŘES', 'MILIONŮ'),
 ('MILIONŮ', 'LET'),
 ('LET', 'V'),
 ('V', 'OBDOBÍ'),
 ('OBDOBÍ', 'DRUHOHOR'),
 ('ZEJMÉNA', 'V'),
 ('V', 'JURSKÉ'),
 ('JURSKÉ', 'A'),
 ('A', 'KŘÍDOVÉ'),
 ('KŘÍDOVÉ', 'PERIODĚ'),
 ('PERIODĚ', 'ASI'),
 ('ASI', 'PŘED'),
 ('PŘED', 'AŽ'),
 ('AŽ', 'MILIONY'),
 ('MILIONY', 'LET'),
 ('DLE', 'SOUČASNÉ'),
 ('SOUČASNÉ', 'SYSTEMATIKY'),
 ('SYSTEMATIKY', 'PATŘÍ'),
 ('PATŘÍ', 'MEZI'),
 ('MEZI', 'DINOSAURY'),
 ('DINOSAURY', 'TAKÉ'),
 ('TAKÉ', 'PTÁCI'),
 ('PTÁCI', 'KTEŘÍ'),
 ('KTEŘÍ', 'JSOU'),
 ('JSOU', 'JEJICH'),
 ('

In [101]:
# Count how often each bigram of the real-language text occurs (sorted by
# descending count) and display the resulting dict.
bigrams_real_count = count_occurance(bigrams_real)
bigrams_real_count

{('V', 'ROCE'): 22,
 ('MILIONY', 'LET'): 16,
 ('DRUHŮ', 'DINOSAURŮ'): 16,
 ('NA', 'KONCI'): 14,
 ('DRUHOHORNÍCH', 'DINOSAURŮ'): 14,
 ('S', 'M'): 13,
 ('V', 'OBDOBÍ'): 11,
 ('PŘED', 'MILIONY'): 11,
 ('DINOSAURŮ', 'V'): 11,
 ('DINOSAUŘI', 'BYLI'): 11,
 ('V', 'SOUČASNOSTI'): 11,
 ('SE', 'V'): 11,
 ('DINOSAUŘI', 'SE'): 10,
 ('NA', 'ÚZEMÍ'): 10,
 ('JEDNÁ', 'SE'): 10,
 ('ŽE', 'DINOSAUŘI'): 10,
 ('JE', 'VŠAK'): 10,
 ('S', 'DRUHY'): 10,
 ('MILIONŮ', 'LET'): 9,
 ('KONCI', 'KŘÍDY'): 9,
 ('JIŽ', 'V'): 9,
 ('V', 'PRŮBĚHU'): 9,
 ('DINOSAURŮ', 'SE'): 9,
 ('TEROPODNÍCH', 'DINOSAURŮ'): 9,
 ('U', 'NĚKTERÝCH'): 9,
 ('PODOBNĚ', 'JAKO'): 9,
 ('Z', 'ROKU'): 8,
 ('ASI', 'PŘED'): 7,
 ('VYMÍRÁNÍ', 'NA'): 7,
 ('PODLE', 'NĚKTERÝCH'): 7,
 ('VE', 'SKUTEČNOSTI'): 7,
 ('SE', 'TAKÉ'): 7,
 ('BY', 'SE'): 7,
 ('SE', 'O'): 7,
 ('VÍCE', 'NEŽ'): 7,
 ('NEPTAČÍCH', 'DINOSAURŮ'): 7,
 ('DINOSAURŮ', 'BYLY'): 7,
 ('AŽ', 'MILIONY'): 6,
 ('BYLY', 'OBJEVENY'): 6,
 ('A', 'TO'): 6,
 ('V', 'DOBĚ'): 6,
 ('DINOSAUŘI', 'V'): 6,
 ('JAKO'