In [1]:
import pandas as pd
import os
import re
from dataclasses import dataclass
import json
from pymongo import MongoClient
import sqlite3
from functions import get_current_project, get_db_name, get_project_config
from vars import CURRENT_PROJECT_FILE, DATA_DIR, JSON_READINGS_FILE, PROJECT_DIR, PROJECT_DIR_AUDIO, PROJECT_DIR_EXPORT, PROJECT_DIR_EXPORT_VERSES, PROJECT_DIR_EXPORT_CHAPTERS, PROJECT_CONFIG_FILE_NAME, PROJECT_DOWNLOADS_DIR, PROJECT_JSON_DIR, PROJECT_TEMP_DOWNLOADS_DIR, PROJECT_TRANSCRIPTS_DIR, PROJECT_TRANSCRIPTS_DIR, PROJECT_DOWNLOADS_DIR, PROJECT_TEMP_DOWNLOADS_DIR, PROJECT_CSV_DIR, CSV_SEGMENTS_FILE, CSV_SOURCES_FILE, CSV_SEARCHES_FILE

# In-memory lookup tables read by the helper functions below. Both start empty
# here and are rebound further down in this cell from the MongoDB collections:
#   searches - keyed by word, holding that search document's "segments" value
#   segments - keyed by segment "id", holding the whole segment document
searches, segments = {}, {}

def search(word):
    """Return the entry stored under ``word`` in the module-level ``searches`` table.

    A new empty list is returned when ``word`` has no entry.
    """
    return searches.get(word, [])

def print_segment_range(s, e):
    """Print the "content" of segments ``s`` through ``e`` (inclusive), space-separated, on one line."""
    contents = []
    for segment_id in range(s, e + 1):
        contents.append(segments[segment_id]["content"])
    print(" ".join(contents))

@dataclass
class Reference:
    """One Bible verse: its location (book, chapter, verse) plus its text."""
    book: str  # book name, e.g. "1 Peter"; the get_esv_* helpers use it as the SQLite table name
    chapter: int
    verse: int
    content: str  # verse text; the cells in this notebook fill it from get_esv_content()

@dataclass
class Reading:
    """A span of transcript segments judged to be a reading of a verse.

    The field notes below describe how the first ``find_readings`` cell in this
    notebook populates the record.
    """
    id: str  # "source" value of the first segment in the span
    start_time: float  # "start" value of the first segment
    end_time: float  # "end" value of the last segment
    start_seg: int  # id of the first segment in the span
    end_seg: int  # id of the last segment in the span (inclusive)
    content: str  # segment "content" values from start_seg..end_seg joined with spaces

# get Bible verses
# Path of the ESV SQLite database opened by the get_esv_* helpers below.
bible_sqlite = "ESV.sqlite"
# references.json is indexed by book name; get_all_verses_and_content() reads
# each book's entry as a per-chapter sequence of verse counts (index 0 skipped).
# The encoding is pinned so the load does not depend on the platform's locale
# default (JSON interchange files are UTF-8).
with open("./references.json", "r", encoding="utf-8") as f:
    references = json.load(f)

def get_esv_content(book: str, chapter: int, verse: int) -> str:
    """Return the text of a single verse from the ESV SQLite database.

    Args:
        book: Book name; it is also the name of the table that is queried.
        chapter: Chapter number to match against the ``chapter`` column.
        verse: Verse number to match against the ``verse`` column.

    Returns:
        The ``content`` column of the first matching row.

    Raises:
        LookupError: If the table has no row for ``chapter``/``verse``.
        sqlite3.Error: If the database or the table cannot be read.
    """
    # Table names cannot be bound as SQL parameters, so the book is quoted as an
    # identifier with embedded double quotes doubled. Chapter and verse are bound.
    table = '"' + book.replace('"', '""') + '"'
    con = sqlite3.connect(bible_sqlite)
    try:
        row = con.execute(
            f"SELECT content FROM {table} WHERE chapter = ? AND verse = ? ORDER BY verse ASC;",
            (chapter, verse),
        ).fetchone()
    finally:
        # One connection is opened per call, so always release it.
        con.close()
    if row is None:
        raise LookupError(f"No content found for {book} {chapter}:{verse}")
    return row[0]

def get_esv_book(book: str) -> dict[str, str]:
    """Return every verse of ``book`` from the ESV SQLite database.

    Args:
        book: Book name; it is also the name of the table that is queried.

    Returns:
        A dict mapping ``"<book> <chapter>:<verse>"`` to the verse text,
        inserted in reading order (chapter first, then verse).

    Raises:
        sqlite3.Error: If the database or the table cannot be read.
    """
    # Table names cannot be bound as SQL parameters, so the book is quoted as an
    # identifier with embedded double quotes doubled.
    table = '"' + book.replace('"', '""') + '"'
    con = sqlite3.connect(bible_sqlite)
    try:
        rows = con.execute(
            f"SELECT chapter, verse, content FROM {table} ORDER BY chapter ASC, verse ASC;"
        ).fetchall()
    finally:
        # One connection is opened per call, so always release it.
        con.close()
    return {f"{book} {chapter}:{verse}": content for chapter, verse, content in rows}

def get_all_verses_and_content(book) -> list[Reference]:
    """Build a ``Reference`` for every verse of ``book``.

    ``references[book]`` is walked with ``enumerate``: the position is used as
    the chapter number (position 0 is skipped) and the value as the number of
    verses in that chapter. Each verse's text is fetched with
    ``get_esv_content``.
    """
    return [
        Reference(book, chapter, verse, get_esv_content(book, chapter, verse))
        for chapter, verse_total in enumerate(references[book])
        if chapter != 0
        for verse in range(1, verse_total + 1)
    ]

# Connect to the MongoDB server on localhost and select the database whose
# name get_db_name() returns, then load the project configuration.
client = MongoClient('mongodb://localhost:27017/')
db = client[get_db_name()]
config = get_project_config()

# Replace the empty placeholder tables with the full contents of the
# "searches" and "segments" collections (both are read entirely into memory).
searches = {s["word"]: s["segments"] for s in db["searches"].find({})}
segments = {s["id"]: s for s in db["segments"].find({})}

# `data` is only filled by the disabled loop below; `book` comes from the config.
data = {}
book = config["book"]

# Disabled: run find_readings over every verse of `book` and collect the results.
# for ref in get_all_verses_and_content(book):
#     full_ref = f"{ref.book} {ref.chapter}:{ref.verse}"
#     print(f"Finding '{full_ref}'", end = "")
#     data[full_ref] = [reading.__dict__ for reading in find_readings(ref)]
#     print(f" - {len(data[full_ref])} results")

project_name = get_current_project()
# Disabled: write the collected readings to the project's JSON readings file.
# file = os.path.join(PROJECT_DIR, project_name, PROJECT_JSON_DIR, JSON_READINGS_FILE)
# with open(file, "w") as f:
#     f.write(json.dumps(data, indent=2))


In [2]:
def find_readings(ref: Reference) -> list[Reading]:
    """Find spans of transcript segments that look like a reading of ``ref``.

    The verse text is normalised to lower-case words. Candidate spans are pairs
    (s, e) where segment ``s`` is an occurrence of a start word, segment ``e``
    an occurrence of an end word, and the span length is within
    ``valid_offset`` of the verse length. A span is accepted when the share of
    segments whose "content" equals the verse word at the same position
    exceeds ``accuracy``.

    The first pass anchors on the first and last verse words. Only if it
    yields nothing is a second pass run, anchored on the second and
    second-to-last words.

    Args:
        ref: The verse to look for; only ``ref.content`` is read.

    Returns:
        At most five ``Reading`` records; an empty list when nothing qualifies.
    """
    # Keep only a-z, digits, whitespace and hyphens. The text is already
    # lower-cased, so the range must be a-z: the former "A-z" range also let
    # the ASCII characters between "Z" and "a" ([ \ ] ^ _ `) through.
    # (A "bondservant" -> "bond servant" replacement used to be applied here.)
    words = re.sub(r"[^a-z0-9\s\-]", "", ref.content.lower()).split()
    word_count = len(words)
    valid_offset = 4
    accuracy = 0.6
    readings: list[Reading] = []
    for i, word in enumerate(words[:2]):
        start_word = word
        end_word = words[-1 - i]
        start_segs = search(start_word)
        end_segs = search(end_word)
        # NOTE(review): on the second pass (i == 1) the anchors sit one word in
        # from each edge, so the expected span is shorter, yet "- (2*i)" makes
        # the expected span longer. Confirm the intended sign.
        segments_index = [
            (s, e)
            for s in start_segs
            for e in end_segs
            if (s < e) and (abs(e - s - word_count - (2*i)) < valid_offset)
        ]
        for s, e in segments_index:
            # Compare position by position, never reading past the verse's last word.
            # NOTE(review): words[j - s] is not shifted by i, so on the second
            # pass segment s is compared with words[0], not words[1] - confirm.
            compared_end = min(e + 1, s + word_count)
            valid_count = sum(
                1
                for j in range(s, compared_end)
                if segments[j]["content"] == words[j - s]
            )
            # NOTE(review): the span holds e - s + 1 segments; dividing by e - s
            # lets the rating exceed 1.0. s < e guarantees a non-zero divisor.
            rating = valid_count / (e - s)
            if rating > accuracy:
                content = " ".join([segments[k]["content"] for k in range(s, e + 1)])
                readings.append(
                    Reading(
                        segments[s]["source"],
                        segments[s]["start"],
                        segments[e]["end"],
                        s,
                        e,
                        content
                    )
                )
            if len(readings) == 5:
                break
        # The second pass is only a fallback for an empty first pass.
        if len(readings) > 0:
            break
    return readings

# Manual check: build the Reference for 1 Peter 1:7 from the ESV database and
# run it through find_readings; the cell displays the returned value.
b = "1 Peter"
c = 1
v = 7
r = Reference(b, c, v, get_esv_content(b, c, v))
find_readings(r)

[]

In [3]:
def find_match_with_offset(start_word, end_word, words, offset, accuracy=0.9):
    """Retry ``find_best_match`` with small non-zero offsets until one matches.

    Offsets are tried in the order -1, +1, -2, +2, -3, +3 and the first match
    wins. The author's notes give the motivation: negative offsets for
    "maybe words are merged", positive ones for "maybe words are joined".

    Args:
        start_word, end_word, words: Passed through to ``find_best_match``.
        offset: Accepted for call compatibility; its value is not used.
        accuracy: Passed through to ``find_best_match``.

    Returns:
        ``(match, offset_used)`` for the first offset that matched, otherwise
        ``(None, 0)``.
    """
    for distance in (1, 2, 3):
        for candidate_offset in (-distance, distance):
            match = find_best_match(start_word, end_word, words, candidate_offset, accuracy)
            if match is not None:
                return match, candidate_offset
    return None, 0

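# TODO: how can this be improved so that it does not pick up coincidental matches?
# Idea: search for the words in the middle of the verse (say 80% of them)
# and check whether each has an occurrence index that falls inside the range.
# I think the `next(...)` lookup should be changed to a list, and the best match picked from it.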


def find_best_match(start_word, end_word, words, offset, accuracy=0.9):
    """Find a start-word occurrence whose span plausibly contains the whole verse.

    A start occurrence ``swid`` is a candidate when an occurrence of
    ``end_word`` sits exactly ``len(words) - 1 + offset`` positions after it.
    The interior words (``words`` minus every copy of the start and end word)
    are then looked up, and every occurrence strictly between the two anchors
    is counted. The candidate is accepted as soon as the running count exceeds
    ``accuracy * <number of interior words>`` or equals that number minus one.

    Args:
        start_word: First word of the verse.
        end_word: Last word of the verse.
        words: All normalised verse words; the caller's list is not modified.
        offset: Adjustment to the expected distance between the anchors.
        accuracy: Share of interior words that must be found inside the span.

    Returns:
        The id of the matching start occurrence, or ``None`` when no candidate
        qualifies.
    """
    # Expected distance from the start anchor to the end anchor.
    span = len(words) - 1 + offset
    interior_words = [w for w in words if w != start_word and w != end_word]
    interior_count = len(interior_words)
    # A set makes the "is there an end occurrence at exactly swid + span" check
    # a constant-time lookup, avoiding a scan of every end occurrence per start.
    end_ids = set(search(end_word))

    # NOTE(review): when there are no interior words (e.g. a two-word verse) the
    # acceptance test below never runs, so the result is always None - confirm
    # that this is intended.
    for swid in search(start_word):
        ewid = swid + span
        if ewid not in end_ids:
            continue
        inside = 0
        for w in interior_words:
            # Every occurrence inside the span counts, so a repeated word can
            # contribute more than once.
            inside += sum(1 for v in search(w) if swid < v < ewid)
            if (inside > (accuracy * interior_count)) or (inside + 1 == interior_count):
                return swid

    return None

def find_readings(ref: Reference) -> "int | None":
    """Locate the segment where a reading of ``ref`` starts.

    The verse is normalised to lower-case words and anchored on its first and
    last word. Matching is attempted in four steps, stopping at the first hit:
    ``find_best_match`` at the default accuracy, ``find_match_with_offset`` at
    the default accuracy, then both again at accuracy 0.8.

    Note: this definition has the same name as the earlier ``find_readings``
    in this notebook; whichever definition was executed last is the one in use.

    Args:
        ref: The verse to look for; only ``ref.content`` is read.

    Returns:
        The value returned by the matching helper (the id of the start-word
        segment), or ``None`` when nothing matched or the verse has no words.
        The previous ``list[Reading]`` annotation did not describe this.
    """
    # Keep only a-z, digits, whitespace and hyphens. The text is already
    # lower-cased, so the range must be a-z: the former "A-z" range also let
    # the ASCII characters between "Z" and "a" ([ \ ] ^ _ `) through.
    # (A "bondservant" -> "bond servant" replacement used to be applied here.)
    words = re.sub(r"[^a-z0-9\s\-]", "", ref.content.lower()).split()
    if not words:
        # Nothing to anchor on; indexing words[0] would raise IndexError.
        return None
    start_word = words[0]
    end_word = words[-1]
    offset = 0
    m = find_best_match(start_word, end_word, words, offset)
    if m is None:
        m, offset = find_match_with_offset(
            start_word, end_word, words, offset)

    # Second round: same two steps with a lower accuracy threshold.
    if m is None:
        m = find_best_match(start_word, end_word, words, offset, 0.8)
    if m is None:
        m, offset = find_match_with_offset(
            start_word, end_word, words, offset, 0.8)
    return m


# Manual check: build the Reference for 1 Peter 1:6 from the ESV database and
# run it through find_readings; the cell displays the returned value.
b = "1 Peter"
c = 1
v = 6
r = Reference(b, c, v, get_esv_content(b, c, v))
find_readings(r)

170213

In [None]:

def find_readings(ref: Reference) -> "int | None":
    """Locate the segment where a reading of ``ref`` starts.

    The verse is normalised to lower-case words and anchored on its first and
    last word. Matching is attempted in four steps, stopping at the first hit:
    ``find_best_match`` at the default accuracy, ``find_match_with_offset`` at
    the default accuracy, then both again at accuracy 0.8.

    Note: ``find_readings`` is also defined in earlier cells of this notebook;
    whichever definition was executed last is the one in use.

    Args:
        ref: The verse to look for; only ``ref.content`` is read.

    Returns:
        The value returned by the matching helper (the id of the start-word
        segment), or ``None`` when nothing matched or the verse has no words.
        The previous ``list[Reading]`` annotation did not describe this.
    """
    # Keep only a-z, digits, whitespace and hyphens. The text is already
    # lower-cased, so the range must be a-z: the former "A-z" range also let
    # the ASCII characters between "Z" and "a" ([ \ ] ^ _ `) through.
    # (A "bondservant" -> "bond servant" replacement used to be applied here.)
    words = re.sub(r"[^a-z0-9\s\-]", "", ref.content.lower()).split()
    if not words:
        # Nothing to anchor on; indexing words[0] would raise IndexError.
        return None
    start_word = words[0]
    end_word = words[-1]
    offset = 0
    m = find_best_match(start_word, end_word, words, offset)
    if m is None:
        m, offset = find_match_with_offset(
            start_word, end_word, words, offset)

    # Second round: same two steps with a lower accuracy threshold.
    if m is None:
        m = find_best_match(start_word, end_word, words, offset, 0.8)
    if m is None:
        m, offset = find_match_with_offset(
            start_word, end_word, words, offset, 0.8)
    return m


# Manual check: build the Reference for 1 Peter 1:6 from the ESV database and
# run it through find_readings; the cell displays the returned value.
b = "1 Peter"
c = 1
v = 6
r = Reference(b, c, v, get_esv_content(b, c, v))
find_readings(r)

In [23]:
# Look up the segments that bracket a time window within one source.
# NOTE(review): `id` shadows the builtin id() for the rest of the kernel
# session; consider a name such as `source_id`.
id = '91921196561217'
start_time = 261
end_time = 269
# next(...) takes the FIRST segment, in `segments` iteration order, that
# belongs to the source and satisfies the time condition; None if there is none.
# NOTE(review): this picks the boundary segments only if `segments` iterates in
# chronological order per source - presumably true of the load order; confirm.
start_seg = next((v for v in segments.values() if v['source'] == id and v['start'] >= start_time), None)
end_seg = next((v for v in segments.values() if v['source'] == id and v['end'] >= end_time), None)
print(start_seg)
print(end_seg)


{'_id': ObjectId('665932acc909a3ffac8e6317'), 'id': 170213, 'source': '91921196561217', 'content': 'in', 'start': 261.278, 'end': 261.379, 'sequence': 559}
{'_id': ObjectId('665932acc909a3ffac8e6329'), 'id': 170231, 'source': '91921196561217', 'content': 'trials.', 'start': 269.224, 'end': 269.784, 'sequence': 577}


In [63]:
sentence = "when you do not join them in the same flood of debauchery and they malign you"
# Normalise the sentence to lower-case words: keep only a-z, digits, whitespace
# and hyphens, then split on whitespace. The text is already lower-cased, so the
# range must be a-z: the former "A-z" range also let the ASCII characters
# between "Z" and "a" ([ \ ] ^ _ `) through.
# (A "bondservant" -> "bond servant" replacement used to be applied here.)
words = re.sub(r"[^a-z0-9\s\-]", "", sentence.lower()).split()
def search_sequence(words: list[str]):
    """Find every place where ``words`` occur as consecutive segments.

    Starting from the occurrences of the first word, each following word keeps
    only those of its occurrences that come directly after (id + 1) a surviving
    occurrence of the previous word.

    Args:
        words: The normalised words to look for, in order.

    Returns:
        A list of ``(first_id, last_id)`` tuples, one per full match, in the
        order the last word's occurrences are returned by ``search``. Empty
        when ``words`` is empty or no full match exists.
    """
    if not words:
        # No words means no windows (previously an IndexError on an empty stack).
        return []

    length = len(words)
    ends = search(words[0])
    for word in words[1:]:
        if not ends:
            # The chain is already broken; later words cannot restore it.
            break
        # Set lookup keeps the "directly follows" check constant-time.
        previous = set(ends)
        ends = [o for o in search(word) if o - 1 in previous]
    return [(end - length + 1, end) for end in ends]


# Find every exact occurrence of `words` and print each one with its source,
# its time range and `context` extra segments on either side.
windows = search_sequence(words)
print(windows)
context = 4
for beg, end in windows:
    print(f"{segments[beg]['source']}: {segments[beg]['start']}-{segments[end]['end']}")
    # NOTE(review): the context range is not clamped, so an id outside
    # `segments` raises KeyError, and the extra segments are not checked to
    # belong to the same source - confirm this is acceptable.
    print(" ".join([segments[i]["content"] for i in range(beg - context, end+1 + context)]))
    print()
    # print([segments[i]["content"] for i in range(beg, end+1)])
# next(s for s in segments if s["id"] == 117286)

[]
