In [4]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from uuid import uuid4
import json
import os

# Open King James Bible.
data_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')
kjv_df = pd.read_csv(os.path.join(data_dir, 'en_kjv.csv'), index_col='index')
kjv_df.head()

Unnamed: 0_level_0,language,translation,book,chapter,verse,text
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,en,kjv,Gen,1,1,In the beginning God created the heaven and th...
1,en,kjv,Gen,1,2,"And the earth was without form, and void; and ..."
2,en,kjv,Gen,1,3,"And God said, Let there be light: and there wa..."
3,en,kjv,Gen,1,4,"And God saw the light, that it was good: and G..."
4,en,kjv,Gen,1,5,"And God called the light Day, and the darkness..."


In [5]:
# Open metadata table.
book_metadata = pd.read_csv(os.path.join(data_dir, 'metadata', 'books.csv'))
book_metadata.loc[book_metadata['book'].isin(['Numbers', 'Jonah', 'Ruth', 'Mark', 'Titus', 'Revelation'])]

Unnamed: 0,book,chapters,verses,avg verse per chapter,testament,category,author,abbreviation
3,Numbers,36,1288,36,old,law,Moses,Num
7,Ruth,4,85,21,old,writing,Samuel,Ruth
31,Jonah,4,48,12,old,prophet,Jonah,Jonah
40,Mark,16,678,42,new,gospel,Mark,Mark
55,Titus,3,46,15,new,epistle,Paul,Titus
65,Revelation,22,404,18,new,revelation,John,Rev


In [6]:
# Combine all verses into books.
kjv_books = {}
book_title_list = book_metadata['abbreviation'].values
tween = ' ' # What goes between each verse
verse_total = book_metadata['verses'].sum() # Index of the final verse
print("Begin forming books...")
for book in book_title_list:
    book_df = kjv_df.loc[kjv_df['book'] == book]
    full_title = book_metadata.loc[book_metadata['abbreviation'] == book]['book'].values[0]
    full_book_text = ''
    for index, row in book_df.iterrows():
        full_book_text += row['text'] + tween
    kjv_books[full_title] = full_book_text
print("Finished forming books!")
print(kjv_books.keys())
# Store in file
out_path = os.path.join(data_dir, 'en_kjv.json')
if not os.path.isfile(out_path):
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(kjv_books, f, indent=4, ensure_ascii=False)

Begin forming books...
Finished forming books!
dict_keys(['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy', 'Joshua', 'Judges', 'Ruth', '1 Samuel', '2 Samuel', '1 Kings', '2 Kings', '1 Chronicles', '2 Chronicles', 'Ezra', 'Nehemiah', 'Esther', 'Job', 'Psalms', 'Proverbs', 'Ecclesiastes', 'Song of Songs', 'Isaiah', 'Jeremiah', 'Lamentations', 'Ezekiel', 'Daniel', 'Hosea', 'Joel', 'Amos', 'Obadiah', 'Jonah', 'Micah', 'Nahum', 'Habakkuk', 'Zephaniah', 'Haggai', 'Zechariah', 'Malachi', 'Matthew', 'Mark', 'Luke', 'John', 'Acts', 'Romans', '1 Corinthians', '2 Corinthians', 'Galatians', 'Ephesians', 'Philippians', 'Colossians', '1 Thessalonians', '2 Thessalonians', '1 Timothy', '2 Timothy', 'Titus', 'Philemon', 'Hebrews', 'James', '1 Peter', '2 Peter', '1 John', '2 John', '3 John', 'Jude', 'Revelation'])


In [29]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import JSONLoader
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_ollama import OllamaEmbeddings, ChatOllama
import h5py

local_llm = 'llama3'
token_size=500
dimensions=300

def document_verses(doc_dict):
    documents = []
    count = 0 # Count batches for sanity's sake.
    for doc_index, doc in enumerate(doc_dict):
        count += 1
        doc = Document(
            page_content=doc_dict[doc],
            metadata={
                'title': doc,
                'author': book_metadata.loc[book_metadata['book'] == doc]['author'].values[0],
                'book_index': doc_index
            },
            id=count
        )
        documents.append(doc)
    uuids = [str(uuid4()) for _ in range(len(documents))]
    return documents, uuids

kjv_docs, kjv_ids = document_verses(kjv_books)
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=token_size, chunk_overlap=100
)
split_docs = text_splitter.split_documents(kjv_docs)
print("Documents split.")
embedding = OllamaEmbeddings(
    model='local_llm'
)

Documents split.


In [31]:
from langchain.text_procesing.tokenize import Basic

with open(os.path.join(data_dir, 'bible_embeddings.h5'), 'w') as hdf5_file:
    for sd_index, split_doc in enumerate(split_docs):
        tokens = word_tokenize(split_doc)
        input_matrix = np.zeros((token_size, dimensions))
        for token_index, token in enumerate(tokens):
            input_matrix[token_index] = embedding.get_word_vector(token)
        split_doc_embedding = embedding.get_embedding(input_matrix)
        hdf5_file.create_dataset('book_{:d}_chunk_{:d}'.format(split_doc.metadata['book_index'], sd_index))
        hdf5_file.attrs['book_title'] = split_doc.metadata['title']
        hdf5_file.attrs['author'] = split_doc.metadata['author']
        hdf5_file.attrs['chunk_index'] = split_doc_index
print("Documents stored.")

NameError: name 'word_tokenize' is not defined