In [None]:
import pandas as pd
from llama_index.core import Document 
from llama_index.core.node_parser import LangchainNodeParser
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.core.ingestion import IngestionPipeline

## Preprocess

In [None]:
# Read the raw podcast transcript table from the local CSV file.
transcripts_csv = 'podcastdata_dataset.csv'
df = pd.read_csv(transcripts_csv)
def preprocess_tolower(text):
    """Return ``text`` converted to lowercase.

    Args:
        text: The string to normalize; it must provide a ``lower()`` method.

    Returns:
        The lowercased copy produced by ``text.lower()``.
    """
    return text.lower()

# Lowercase every transcript and write the result back into the 'text' column.
lowered_text = df['text'].apply(preprocess_tolower)
df['text'] = lowered_text

## Splitter

In [None]:
def _row_to_document(record):
    """Wrap one DataFrame row in a Document, keeping title/guest/id as metadata."""
    episode_metadata = {
        'Title': record['title'],
        'Guest': record['guest'],
        'id': record['id'],
    }
    return Document(text=record['text'], metadata=episode_metadata)


# One Document per row of the transcript table.
docs = [_row_to_document(record) for _index, record in df.iterrows()]

# Character-based recursive splitter: chunks of up to 1000 characters with a
# 200-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Adapt the LangChain splitter so it can be used as a LlamaIndex node parser.
parser = LangchainNodeParser(text_splitter)


In [None]:
# Preview only the first few documents: echoing the whole list would write
# every transcript into the cell output and bloat the notebook.
docs[:3]

## Embedding model

In [None]:
# Register the BAAI/bge-small-en-v1.5 HuggingFace model as the embedding
# model on the global LlamaIndex Settings object.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.embed_model = embed_model

## Vector DB

In [None]:
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb

# Transformations applied to each document, in order: split it into chunks
# with the node parser, then embed every chunk.
transformations = [parser, Settings.embed_model]

# Chroma client (EphemeralClient) and a new collection whose metadata asks
# for cosine distance ("hnsw:space": "cosine").
chroma_client = chromadb.EphemeralClient()
collection_metadata = {"hnsw:space": "cosine"}
chroma_collection = chroma_client.create_collection(
    name="transcripts_db",
    metadata=collection_metadata,
)
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

pipeline = IngestionPipeline(
    transformations=transformations,
    vector_store=vector_store,
)

# Run the pipeline; the resulting nodes are written directly to the vector store.
pipeline.run(documents=docs)

In [None]:
# Write the pipeline's state to a local directory so that another pipeline
# instance can restore it later.
persist_dir = "./pipeline_storage"
pipeline.persist(persist_dir)

## Load the persisted pipeline

In [None]:
# Build a second pipeline with the same transformation chain, then restore
# the state that was saved under ./pipeline_storage.
transformations = [parser, Settings.embed_model]
new_pipeline = IngestionPipeline(transformations=transformations)

storage_dir = "./pipeline_storage"
new_pipeline.load(storage_dir)

In [None]:
# Re-run ingestion through the restored pipeline; this is expected to be
# fast because the transformation results were cached by the earlier run.
nodes = new_pipeline.run(documents=docs)

# Preview only a few nodes: each node carries its chunk text and embedding,
# so echoing the full list would flood the cell output.
nodes[:3]

In [None]:
# Query passages. The surrounding whitespace inside the literals is kept
# exactly as it was, so the embedded text is unchanged.
query_texts = [
        '''
        Lex Fridman (/ˈfriːdmən/; born 15 August 1983)[2] is a Russian-American computer scientist and podcaster. Since 2018 he has hosted the Lex Fridman Podcast, where he interviews notable figures from various fields such as science, technology, sports, and politics.
        Fridman rose to prominence in 2019 after Elon Musk praised his study which concluded that drivers remained focused while using Tesla's semi-autonomous driving system. The study was criticized by AI experts and was not peer-reviewed.
        ''',
        '''
        Fridman was born in the Soviet Union and grew up in Moscow.[3] He is of Jewish descent.[5] His father Alexander Fridman is a plasma physicist and professor at Drexel University. His brother Gregory was also a professor at Drexel.
        '''
]

# Embed the queries with the SAME model that produced the stored vectors.
# The collection was created without an embedding function, so passing
# `query_texts=` would make Chroma embed them with its own default model;
# those vectors live in a different embedding space than the
# BAAI/bge-small-en-v1.5 vectors written by the ingestion pipeline, which
# makes the returned distances meaningless even though no error is raised.
query_embeddings = [
    Settings.embed_model.get_query_embedding(query_text)
    for query_text in query_texts
]

# Return the 10 nearest stored chunks for each query embedding.
chroma_collection.query(
    query_embeddings=query_embeddings,
    n_results=10,
)