# ETL Process for Project: WORDS OF WISDOM SCRIPTUM

## Imports

In [107]:
# - General Purpose
import os
import sys
import pickle
import json
import numpy as np

# - Extraction Libraries
from unstructured.partition.pdf import partition_pdf

# - Transform Libraries
from unstructured.chunking.title import chunk_by_title
from unstructured.embed.openai import OpenAIEmbeddingEncoder, OpenAIEmbeddingConfig
from langchain.embeddings import OpenAIEmbeddings
import pandas as pd

# - Loading Libraries
import pinecone
from pinecone_text.sparse import BM25Encoder
from tqdm.auto import tqdm

## Initialization

In [108]:
# Credentials are read from the environment so that no secret is stored in the
# notebook. The keys that were previously hardcoded in this cell must be treated
# as leaked and revoked/rotated.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', '')
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', '')
# Not a secret: the original value is kept as the default.
PINECONE_ENV = os.environ.get('PINECONE_ENV', 'eu-west-1')

# Warn right away instead of failing later with an opaque authentication error.
for _name, _value in (
    ('OPENAI_API_KEY', OPENAI_API_KEY),
    ('PINECONE_API_KEY', PINECONE_API_KEY),
):
    if not _value:
        print(f'WARNING: environment variable {_name} is not set.', file=sys.stderr)

## Extraction

### Data

In [109]:
# Location of the source PDF, relative to the notebook's working directory.
data_folder = 'data/'
file_name = 'Words Of Wisdom Scriptum - Irgendetwas stimmt nicht.pdf'
# data_folder already ends with '/', so the two parts are simply concatenated.
file_path = f'{data_folder}{file_name}'

### Text Partition

In [110]:
# Partition the PDF into a list of typed elements (e.g. Title, NarrativeText).
# The document text is German, so the "deu" language code is passed.
elements = partition_pdf(filename=file_path, languages=["deu"])

### Data Details

In [111]:
# Sanity check: show the text of the first extracted element.
elements[0].text

'BERND KOLB WORDS OF WISDOM SCRIPTUM'

In [112]:
# Number of elements produced by the partitioning step.
len(elements)

100

## Transform

### Table Creation

In [113]:
# One record per extracted element: its type, the file metadata of interest and
# the element text.
data = [
    {
        'Element Type': type(element).__name__,
        'Filename': element.metadata.filename,
        'Date Modified': element.metadata.last_modified,
        'Filetype': element.metadata.filetype,
        'Page Number': element.metadata.page_number,
        'text': element.text,
    }
    for element in elements
]

# Build the frame once from the list of records and preview the first rows.
df = pd.DataFrame(data)
df.head(10)

Unnamed: 0,Element Type,Filename,Date Modified,Filetype,Page Number,text
0,Title,Words Of Wisdom Scriptum - Irgendetwas stimmt ...,2024-07-15T12:01:16,application/pdf,1,BERND KOLB WORDS OF WISDOM SCRIPTUM
1,Title,Words Of Wisdom Scriptum - Irgendetwas stimmt ...,2024-07-15T12:01:16,application/pdf,1,IRGENDETWAS STIMMT NICHT
2,Title,Words Of Wisdom Scriptum - Irgendetwas stimmt ...,2024-07-15T12:01:16,application/pdf,2,VORWORT
3,NarrativeText,Words Of Wisdom Scriptum - Irgendetwas stimmt ...,2024-07-15T12:01:16,application/pdf,2,In dieser neuen Folge der WORTE DER WEISHEIT e...
4,NarrativeText,Words Of Wisdom Scriptum - Irgendetwas stimmt ...,2024-07-15T12:01:16,application/pdf,2,"Es geht also darum, wie wir erkennen können, d..."
5,NarrativeText,Words Of Wisdom Scriptum - Irgendetwas stimmt ...,2024-07-15T12:01:16,application/pdf,2,Im Kern handeln die östlichen Weisheitslehren ...
6,Title,Words Of Wisdom Scriptum - Irgendetwas stimmt ...,2024-07-15T12:01:16,application/pdf,2,Bernd Kolb im April 2024 auf der Insel Java
7,Title,Words Of Wisdom Scriptum - Irgendetwas stimmt ...,2024-07-15T12:01:16,application/pdf,3,BERND KOLB WORDS OF WISDOM SCRIPTUM
8,Title,Words Of Wisdom Scriptum - Irgendetwas stimmt ...,2024-07-15T12:01:16,application/pdf,3,IRGENDETWAS STIMMT NICHT
9,NarrativeText,Words Of Wisdom Scriptum - Irgendetwas stimmt ...,2024-07-15T12:01:16,application/pdf,4,In meinem letzten Podcast über die drei Fragen...


### Cleaning

In [114]:
# Relabel every element categorised as 'Footer' as 'PageNumber'.
for footer in [el for el in elements if el.category == 'Footer']:
    footer.category = 'PageNumber'

# Manual correction of a single element's text.
# NOTE(review): index 13 is hard-coded for this specific PDF — confirm it still
# points at the intended element if the file or the partitioning changes.
elements[13].text = '1'

### Chunking

In [115]:
# Group the cleaned elements into title-based chunks.
# NOTE(review): multipage_sections=True presumably lets a section continue
# across page breaks — confirm against the unstructured chunking docs.
chunks = chunk_by_title(
    elements,
    multipage_sections=True,
)
# print() is used so that the line breaks inside the chunk text are rendered.
print(chunks[1].text)

VORWORT

In dieser neuen Folge der WORTE DER WEISHEIT erzähle ich über das Gefühl, das mich im Laufe meines Lebens beschlich, dass irgendetwas mit dieser Welt nicht stimmt. Dieses Gefühl hatte mich letztendlich zum Aufbruch bewogen hat, um das für mich selbst herauszufinden.

Es geht also darum, wie wir erkennen können, dass wir uns vielleicht in vielem täuschen, was wir für wahr und richtig halten. Und sich daraus ein Weltbild entwickelt, das im Grunde auf Illusionen beruht.


### Embeddings

In [116]:
# Dense encoder from `unstructured`, configured for OpenAI's ada-002 model.
openai_config = OpenAIEmbeddingConfig(
    api_key=OPENAI_API_KEY,
    model_name='text-embedding-ada-002',
)
embedding_encoder = OpenAIEmbeddingEncoder(config=openai_config)

# Sparse (BM25) encoder; it is fitted on the chunk texts in a later cell.
bm25 = BM25Encoder()

# LangChain wrapper around the same OpenAI embedding model.
embed_model = OpenAIEmbeddings(model='text-embedding-ada-002', api_key=OPENAI_API_KEY)

In [117]:
# Embed the chunks with the `unstructured` OpenAI encoder.
# NOTE(review): `dense_embeds` is reassigned inside the upsert loop in the
# "Load to Vector DB" section, so this value is not what gets uploaded —
# confirm whether this (billable) call is still needed.
dense_embeds = embedding_encoder.embed_documents(chunks)

In [118]:
# Fit the BM25 encoder on the plain text of every chunk.
chunks_text = [chunk.text for chunk in chunks]
bm25.fit(chunks_text)

100%|██████████| 44/44 [00:00<00:00, 872.15it/s]


<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x7f23d940bd60>

### Chunk Table Creation

In [119]:
# Inspect which metadata fields are available on a raw element.
elements[0].metadata.fields

mappingproxy({'coordinates': CoordinatesMetadata(points=((158.4, 489.6061), (158.4, 562.4612), (434.8179999999999, 562.4612), (434.8179999999999, 489.6061)), system=<unstructured.documents.coordinates.PixelSpace object at 0x7f23d94b1ba0>),
              'file_directory': 'data',
              'filename': 'Words Of Wisdom Scriptum - Irgendetwas stimmt nicht.pdf',
              'languages': ['deu'],
              'last_modified': '2024-07-15T12:01:16',
              'links': [],
              'page_number': 1,
              'filetype': 'application/pdf'})

In [120]:
# Inspect which metadata fields are available on a chunk (for comparison with
# the raw element above).
chunks[1].metadata.fields

mappingproxy({'file_directory': 'data',
              'filename': 'Words Of Wisdom Scriptum - Irgendetwas stimmt nicht.pdf',
              'filetype': 'application/pdf',
              'languages': ['deu'],
              'last_modified': '2024-07-15T12:01:16',
              'page_number': 2,
              'orig_elements': [<unstructured.documents.elements.Title at 0x7f23d94b1330>,
               <unstructured.documents.elements.NarrativeText at 0x7f23d94b1030>,
               <unstructured.documents.elements.NarrativeText at 0x7f23d94b0fa0>]})

In [134]:
# One record per chunk: file metadata, the categories of the elements the chunk
# was built from, and the chunk text.
# Note: this rebinds `data` and `df`, replacing the per-element table built in
# the "Table Creation" section.
data = [
    {
        'Filename': chunk.metadata.filename,
        'Date Modified': chunk.metadata.last_modified,
        'Filetype': chunk.metadata.filetype,
        'Language': chunk.metadata.languages,
        'Page Number': chunk.metadata.page_number,
        'Elements': [part.category for part in chunk.metadata.orig_elements],
        'text': chunk.text,
    }
    for chunk in chunks
]

# Build the frame once from the list of records and preview the first rows.
df = pd.DataFrame(data)
df.head(5)

Unnamed: 0,Filename,Date Modified,Filetype,Language,Page Number,Elements,text
0,Words Of Wisdom Scriptum - Irgendetwas stimmt ...,2024-07-15T12:01:16,application/pdf,[deu],1,"[Title, Title]",BERND KOLB WORDS OF WISDOM SCRIPTUM\n\nIRGENDE...
1,Words Of Wisdom Scriptum - Irgendetwas stimmt ...,2024-07-15T12:01:16,application/pdf,[deu],2,"[Title, NarrativeText, NarrativeText]",VORWORT\n\nIn dieser neuen Folge der WORTE DER...
2,Words Of Wisdom Scriptum - Irgendetwas stimmt ...,2024-07-15T12:01:16,application/pdf,[deu],2,"[NarrativeText, Title, Title]",Im Kern handeln die östlichen Weisheitslehren ...
3,Words Of Wisdom Scriptum - Irgendetwas stimmt ...,2024-07-15T12:01:16,application/pdf,[deu],3,"[Title, NarrativeText]",IRGENDETWAS STIMMT NICHT\n\nIn meinem letzten ...
4,Words Of Wisdom Scriptum - Irgendetwas stimmt ...,2024-07-15T12:01:16,application/pdf,[deu],4,[NarrativeText],"Zitat: „Einem teils bewunderten, und oft benei..."


In [122]:
# (rows, columns) of the chunk table; one row per chunk.
df.shape

(44, 7)

## Load

### Pinecone Initialization

In [123]:
# Create the Pinecone client and list the indexes visible to this API key.
pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)
pc.list_indexes()

{'indexes': [{'dimension': 1536,
              'host': 'wordsofwisdom-j4bdkgt.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'wordsofwisdom',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [124]:
# Show the configuration (dimension, metric, spec, status) of the target index.
pc.describe_index('wordsofwisdom')

{'dimension': 1536,
 'host': 'wordsofwisdom-j4bdkgt.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'wordsofwisdom',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}

In [125]:
# Open a handle to the target index and show its current statistics
# (e.g. total vector count) before loading.
index = pc.Index(name='wordsofwisdom')
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

### Load to Vector DB

In [126]:
# Number of chunks embedded and upserted per request in the load loop below.
batch_size = 16

In [135]:
# Embed the chunk table batch by batch and upsert hybrid (sparse + dense)
# vectors into the Pinecone index.
# NOTE(review): the index reports metric 'cosine'; Pinecone's hybrid-search
# documentation describes sparse-dense vectors as requiring 'dotproduct' —
# confirm that the sparse values are actually usable on this index.
for i in tqdm(range(0, len(df), batch_size)):

    # Find the end of the current batch
    i_end = min(i + batch_size, len(df))

    # Extract the current batch from the dataframe
    df_batch = df.iloc[i:i_end]

    # Convert the batch to a list of dictionaries (stored as vector metadata)
    df_dict = df_batch.to_dict(orient="records")

    # Create unique IDs for the batch (the row position in the dataframe)
    ids = [str(x) for x in range(i, i_end)]

    # Create a batch of metadata strings: every column except the file type and
    # the modification date, joined into one text per row.
    # The column is named 'Filetype'; the previous spelling 'FileType' never
    # matched, so the MIME type leaked into the text fed to BM25.
    sparse_columns = ~df_batch.columns.isin(['Filetype', 'Date Modified'])
    meta_batch = [
        " ".join(map(str, row))
        for row in df_batch.loc[:, sparse_columns].values.tolist()
    ]

    # Convert text column of the batch to a list
    text_batch = df_batch['text'].tolist()

    # Encode the metadata batch into sparse BM25 vectors
    sparse_embeds = bm25.encode_documents(meta_batch)

    # Create dense embeddings of the text batch
    dense_embeds = embed_model.embed_documents(text_batch)

    # Assemble one upsert record per row of the batch
    upserts = [
        {
            'id': _id,
            'sparse_values': sparse,
            'values': dense,
            'metadata': meta,
        }
        for _id, sparse, dense, meta in zip(ids, sparse_embeds, dense_embeds, df_dict)
    ]

    # Upsert batch data to the Pinecone index
    index.upsert(upserts)

  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:04<00:00,  1.48s/it]
