In [None]:
import cv2
import os
import re
import requests
import numpy as np
import matplotlib.pyplot as plt

## Cleaning text file

Removing all unnecessary lines and characters from the input file (<code>FSG.txt</code>).

The end result is a text file with only lines containing words and no dashes at the end of the line.  
Each line is separated by newline '<code>\n</code>'  
Each word within line is separated by comma '<code>,</code>'  
  
Output is saved to <code>cleaned.txt</code>

In [None]:
# constants

# Regex character class matching a single valid character: an ASCII letter or digit.
# The cells below use it to detect lines that contain text, to find the last
# valid character of a line, and to validate whole words.
VALID_CHAR_REGEX = "[a-zA-Z0-9]"

In [None]:
# open input text
# https://www.ic.unicamp.br/~stolfi/voynich/mirror/reeds/docs/FSG.txt
# Read inside a context manager so the file handle is closed immediately
# instead of lingering in the kernel until garbage collection.
with open('FSG.txt', 'r') as source_file:
    text = source_file.read().split("\n")

In [None]:
# remove lines with no text
# A line is kept only when it is not blank, is not a lone form-feed character,
# does not start with '#', and contains at least one valid character.
parsed = [
    row
    for row in text
    if row not in ("", "\x0c")
    and not row.startswith("#")
    and re.search(VALID_CHAR_REGEX, row)
]


In [None]:
# remove dashes and equality signs from end of each line
# Every line is cut right after its last valid character, so any trailing run
# of non-word characters (dashes, equality signs, dangling commas) is dropped.
cleaned = []

for line in parsed:
    # index of the last valid char; -1 means "none found" and yields ""
    endpos = -1
    for i, char in enumerate(line):
        if re.match(VALID_CHAR_REGEX, char):
            endpos = i

    # Slice by endpos, not by the loop index: the loop index always ends at the
    # final character, which would blindly drop exactly one character.
    cleaned.append(line[:endpos + 1])

In [None]:
# save file
# Lines are newline-separated with no trailing newline after the last one.
# The context manager closes the file even if writing fails, and join() also
# handles an empty list, where indexing the last element would raise IndexError.
with open("cleaned.txt", "w") as out_file:
    out_file.write("\n".join(cleaned))

## Extracting valid words
This step further facilitates the analysis of the text.

Separate words are now extracted to a single list of words.  
Some words have not been fully transcribed, and some of their characters could not be identified.  
Since it is uncertain what these words actually are, they are omitted. 
  
Output is saved to <code>words.txt</code>, one word per line.

In [None]:
# open input file
# Read inside a context manager so the file handle is closed immediately.
with open('cleaned.txt', 'r') as cleaned_file:
    text = cleaned_file.read().split("\n")

In [None]:
# extract valid words from each line
# A token is kept only if it is non-empty and consists solely of valid
# characters; tokens containing any other character are dropped.
words = []

for line in text:
    # Not named `list`: that would shadow the builtin for the rest of the kernel session.
    tokens = line.split(",")
    for word in tokens:
        # "+" (not "*") so that empty tokens, e.g. from a trailing or doubled
        # comma, are not counted as words.
        if re.fullmatch(VALID_CHAR_REGEX + "+", word):
            words.append(word)

In [None]:
# save file
# One word per line, with no trailing newline after the last word.
# The context manager closes the file even if writing fails, and join() also
# handles an empty list, where indexing the last element would raise IndexError.
with open("words.txt", "w") as out_file:
    out_file.write("\n".join(words))

## Analyzing words

Each valid word is mapped to the number of times it occurs in the text.  

Based on this data, Zipf's law is applied to check whether the text behaves like a natural human language.  
Further analysis includes graphing and visualising the data.

In [None]:
# open input file
# Read inside a context manager so the file handle is closed immediately.
with open('words.txt', 'r') as words_file:
    text = words_file.read().split("\n")

In [None]:
# map occurrence count for each word
# Keys are inserted in order of first appearance in the text.
wojnicz_dict = {}

for token in text:
    wojnicz_dict[token] = wojnicz_dict.get(token, 0) + 1

In [None]:
# sort words by occurrence count, descending
# The sort is stable, so words with equal counts keep their original relative order.
ranked_pairs = sorted(wojnicz_dict.items(), key=lambda pair: pair[1], reverse=True)
wojnicz_dict_desc = dict(ranked_pairs)
wojnicz_dict_desc

In [None]:
# get occurrence share of each word
# Values are fractions of the total number of entries in `text` (not yet scaled by 100).
word_count = len(text)
occurance_percentage = {
    entry: count / word_count
    for entry, count in wojnicz_dict_desc.items()
}

occurance_percentage

In [None]:
# calculate zipf value for each word (occurrence share * 1-based rank * 100)
# Ranks follow the iteration order of occurance_percentage.
zipf_values = {
    entry: share * rank * 100
    for rank, (entry, share) in enumerate(occurance_percentage.items(), start=1)
}

zipf_values

TODO
- sprawdzić, czy to w ogóle to, o co chodzi, bo robiłem z pamięci
- jakieś wykresy gość chciał
- potem to samo z innym językiem z wiki
  - sformatować do pliku z pojedynczymi słowami
  - zamienić powyższe komórki na funkcje żeby nie pisać tego samego dwa razy
  - przeanalizować podobnie i porównać
- potem jeszcze jakiś graf dwudzielny (????????????)