In [74]:
import re
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
from networkx.algorithms import bipartite

In [75]:
def get_words_list(text):
    return [x.lower() for x in re.findall(r'[a-zA-Z0-9_\(\)\|]+', text)]


def split_text_by_char(fileName, char):
    with open(fileName, 'r') as file:
        data = file.read().replace('\n', ' ')

    return data.split(char)


def get_bigrams_from_file_by_char(fileName, char):
    text_sentences = split_text_by_char(fileName, char)
    words_in_sentences = [get_words_list(sentence) for sentence in text_sentences]
    words_in_sentences = [x for x in words_in_sentences if x]
    bigrams = {}
    for sentence in words_in_sentences:
        for i in range(len(sentence)):
            if i < len(sentence) - 1:
                if sentence[i] + ', ' + sentence[i + 1] in bigrams.keys():
                    bigrams[sentence[i] + ', ' + sentence[i + 1]] += 1
                else:
                    bigrams[sentence[i] + ', ' + sentence[i + 1]] = 1
    bigrams = sorted(bigrams.items(), key=lambda x: x[1], reverse=True)
    return bigrams



# Functions to get stats
def get_top_n_bigrams(bigrams, n):
    return bigrams[:n]

def get_bigrams_with_frequency(bigrams, frequency):
    return [x for x in bigrams if x[1] >= frequency]

def get_bigrams_with_frequency_range(bigrams, min_frequency, max_frequency):
    return [x for x in bigrams if x[1] >= min_frequency and x[1] <= max_frequency]

def get_bigrams_with_frequency_range_and_n(bigrams, min_frequency, max_frequency, n):
    return [x for x in bigrams if x[1] >= min_frequency and x[1] <= max_frequency][:n]

def get_top_n_bigrams_with_frequency(bigrams, n, frequency):
    return [x for x in bigrams if x[1] >= frequency][:n]

def get_top_n_bigrams_with_frequency_range(bigrams, n, min_frequency, max_frequency):
    return [x for x in bigrams if x[1] >= min_frequency and x[1] <= max_frequency][:n]

def get_top_n_bigrams_with_frequency_range_and_n(bigrams, n, min_frequency, max_frequency, n2):
    return [x for x in bigrams if x[1] >= min_frequency and x[1] <= max_frequency][:n2][:n]

In [76]:
bigrams_sample = get_bigrams_from_file_by_char('../../resources/sample.txt', '.')
print(bigrams_sample)

[('dawida, ma', 2), ('ala, ma', 1), ('ma, kota', 1), ('kota, ma', 1), ('ma, ala', 1), ('kot, ma', 1), ('ma, ale', 1), ('ale, ma', 1), ('ma, kot', 1), ('ma, psa', 1), ('psa, ma', 1), ('ma, dawid', 1), ('pies, ma', 1), ('ma, dawida', 1), ('ma, pies', 1)]


In [77]:
bigrams_voynich = get_bigrams_from_file_by_char('../../resources/voynich.txt', '=')
# print(bigrams_voynich)

In [78]:
bigrams_spanish_wiki = get_bigrams_from_file_by_char('../../resources/spanish_wiki.txt', '.')
# print(bigrams_spanish_wiki)

In [81]:
# Top 10 bigrams in sample.txt
top_10_bigrams_sample = get_top_n_bigrams(bigrams_sample, 10)

# Top 10 bigrams in voynich.txt
top_10_bigrams_voynich = get_top_n_bigrams(bigrams_voynich, 10)

# Top 10 bigrams in spanish_wiki.txt
top_10_bigrams_spanish_wiki = get_top_n_bigrams(bigrams_spanish_wiki, 10)

In [82]:
print(top_10_bigrams_sample)
print(top_10_bigrams_voynich)
print(top_10_bigrams_spanish_wiki)

[('dawida, ma', 2), ('ala, ma', 1), ('ma, kota', 1), ('kota, ma', 1), ('ma, ala', 1), ('kot, ma', 1), ('ma, ale', 1), ('ale, ma', 1), ('ma, kot', 1), ('ma, psa', 1)]
[('toe, 8am', 30), ('toe, toe', 19), ('8am, 8am', 12), ('8am, hzg', 11), ('tor, 8am', 10), ('tg, 8am', 9), ('soe, 8am', 9), ('tor, toe', 8), ('toe, soe', 8), ('8am, hzor', 8)]
[('sistema, solar', 46), ('de, la', 43), ('m, s', 37), ('en, el', 34), ('n, de', 32), ('del, sistema', 29), ('los, planetas', 28), ('de, los', 21), ('a, la', 19), ('del, sol', 19)]
