In [None]:
from typing import List
import cv2
import os
import re
import requests
import numpy as np
import matplotlib.pyplot as plt

## Cleaning text file

Removing all unnecessary lines and characters from the input file (<code>FSG.txt</code>).

The end result is a text file with only lines containing words and no dashes at the end of the line.  
Each line is separated by newline '<code>\n</code>'  
Each word within line is separated by comma '<code>,</code>'  
  
Output is saved to <code>cleaned.txt</code>

In [None]:
# constants

# Character class for one valid transcription character: an uppercase ASCII letter or a digit.
VALID_CHAR_REGEX = "[A-Z0-9]"
# Character class for one natural-language letter: ASCII letters plus Czech diacritics.
VALID_CHAR_REAL_REGEX = "[a-zA-ZáčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ]" # czech lang
# Punctuation character class; not referenced by any cell visible in this notebook.
INVALID_CHAR_REGEX = "[.,()-]"

In [None]:
# open input text
# https://www.ic.unicamp.br/~stolfi/voynich/mirror/reeds/docs/FSG.txt
# Read inside a context manager so the file handle is closed as soon as the
# content is in memory, even if reading raises.
with open('FSG.txt', 'r') as source_file:
    text = source_file.read().split("\n")

In [None]:
# keep only the lines that carry transcription text
# A line is dropped when it is empty, is a lone form feed, starts with '#',
# or contains no valid transcription character at all.
parsed = [
    row
    for row in text
    if row not in ("", "\x0c")
    and not row.startswith("#")
    and re.search(VALID_CHAR_REGEX, row)
]


In [None]:
# remove dashes and equality signs from end of each line
cleaned = []

for line in parsed:
    # Index of the last valid character in the line. Every line in `parsed`
    # contains at least one valid character (lines without any were filtered
    # out earlier); -1 makes a line without one collapse to an empty string.
    endpos = -1
    for i, char in enumerate(line):
        if re.match(VALID_CHAR_REGEX, char):
            endpos = i

    # Cut right after the last valid character. Slicing with the loop index
    # instead would always drop exactly one character, which truncates a word
    # when the line has no trailing marker and leaves junk when it has several.
    cleaned.append(line[:endpos + 1])

In [None]:
# save file
# Lines are joined with '\n' and no trailing newline is written, so reading
# the file back with split("\n") yields exactly the same list. The context
# manager closes the file even if writing fails, and an empty `cleaned`
# produces an empty file instead of raising IndexError.
with open("cleaned.txt", "w") as file:
    file.write("\n".join(cleaned))

## Extracting valid words
This step further facilitates the analysis of the text.

Separate words are now extracted to a single list of words.  
Some words have not been fully transcribed and some characters could not be identified.  
Since it is uncertain what these words actually are, they are omitted. 
  
Output is saved to <code>words.txt</code>, one word per line.

In [None]:
# open input file
# Context manager guarantees the handle is closed once the content is read.
with open('cleaned.txt', 'r') as cleaned_file:
    text = cleaned_file.read().split("\n")

In [None]:
# extract valid words from each line
words = []

# A word is valid when it is non-empty and consists solely of valid characters.
# '+' (rather than '*') keeps empty fields - produced by consecutive commas or
# blank lines - from being counted as words.
word_pattern = re.compile(VALID_CHAR_REGEX + "+")

for line in text:
    # Words within a line are comma-separated.
    for word in line.split(","):
        if word_pattern.fullmatch(word):
            words.append(word)

In [None]:
# save file
# One word per line, without a trailing newline, so split("\n") on the file
# content reproduces `words`. The context manager closes the file on any
# outcome, and an empty `words` list writes an empty file instead of raising
# IndexError.
with open("words.txt", "w") as file:
    file.write("\n".join(words))

## Analyzing words

Each valid word is mapped to its number of occurrences within the text.  

Based on this data, Zipf's law is applied to check whether the text is written in a realistic human language.  
Further analysis includes graphing and visualising the data.

In [None]:
# count how often each word occurs and order the result by count, highest first
def occurance_dict(input: List[str]):
    """Return a dict mapping each word in ``input`` to its occurrence count.

    The returned dict is ordered by count in descending order; words with the
    same count keep the order in which they first appeared in ``input``.
    """
    tally = {}
    for token in input:
        tally[token] = tally.get(token, 0) + 1

    # sorted() is stable, so ties retain first-seen order
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

In [None]:
# compute the Zipf value of each word: relative frequency * rank * 100
def zipf_values(input: List[str]):
    """Return a dict mapping each word to ``frequency * rank * 100``.

    ``frequency`` is the word's count divided by ``len(input)`` and ``rank`` is
    its 1-based position in the descending count order produced by
    ``occurance_dict``. The result keeps that same order.
    """
    total = len(input)
    counts = occurance_dict(input)

    # share of the whole text taken by each word
    shares = {token: hits / total for token, hits in counts.items()}

    return {
        token: share * rank * 100
        for rank, (token, share) in enumerate(shares.items(), start=1)
    }

In [None]:
# open input file
# Context manager guarantees the handle is closed once the content is read.
with open('words.txt', 'r') as words_file:
    text = words_file.read().split("\n")

In [None]:
# map occurrence count for each word
# `text` holds the content of words.txt split on newlines; the resulting dict
# is ordered by descending count and shown as the cell output.
wojnicz_dict_desc = occurance_dict(text)
wojnicz_dict_desc

In [None]:
# calculate zipf value for each word (occurrence percentage * index)
# The index is the word's 1-based rank in the descending count order; the dict
# is shown as the cell output.
zipf = zipf_values(text)
zipf 

## Converting and analyzing Wikipedia page

The text content of a Wikipedia page is converted to a format similar to that of the source text so that it can be analyzed in the same way.  
After that, it is used to calculate the same statistics as with the previous text.

Converted text is saved to <code>words_wiki.txt</code>, one word per line.

In [None]:
# load text from file
# Newlines are replaced by spaces so the whole page becomes one
# space-separated string. The context manager closes the handle after reading.
with open('kralovec.txt', 'r', encoding="utf8") as wiki_file:
    text = wiki_file.read().replace("\n", " ")

In [None]:
# convert text to list of words
is_letter = re.compile(VALID_CHAR_REAL_REGEX).match

# Drop every character that is neither a space nor a valid letter, then break
# the remaining text apart on single spaces. Characters are removed, not
# replaced, so letters on both sides of a removed character end up adjacent.
parsed = "".join(ch for ch in text if ch == ' ' or is_letter(ch)).split(" ")

# Keep the pieces made up entirely of valid letters (this discards the empty
# pieces left by consecutive spaces) and normalise them to upper case.
words = [
    piece.upper()
    for piece in parsed
    if re.match("^"+VALID_CHAR_REAL_REGEX+"+$", piece)
]


In [None]:
# save file
# The words contain Czech diacritics, so the file is written explicitly as
# UTF-8 (matching how the source text was read) instead of relying on the
# platform default encoding. One word per line, no trailing newline; an empty
# `words` list writes an empty file instead of raising IndexError.
with open("words_wiki.txt", "w", encoding="utf8") as file:
    file.write("\n".join(words))

In [None]:
# analyze text
# Computes the Zipf values for the upper-cased Wikipedia words and shows the
# resulting dict as the cell output.
zipf_wiki = zipf_values(words)
zipf_wiki

TODO
- sprawdzić czy to wgl to o co chodzi bo robiłem z pamięci
- jakieś wykresy gość chciał
- potem jeszcze jakiś graf dwudzielny (????????????)