## Modules Installation

In [None]:
!pip install langchain

In [6]:
from langchain.llms import LlamaCpp
from langchain import PromptTemplate, LLMChain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import (StreamingStdOutCallbackHandler)
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter

In [18]:
company_names = "ByteBloom \n EcoElement \n SolarFlare Studios \n AquaPurity \n NanoNest \n ZenithZone \n PixelPond \n UrbanUtopia \n CloudCraft \n TerraTrek \n QuantumQuest \n VitalVista \n FusionFront \n MysticMode \n OrbitOcean \n AlphaAvenue \n BioBreeze \n CosmicCrest \n DigitalDawn \n EmberEdge \n FluxField \n GravityGuild \n HorizonHive \n InfinityInk \n JoltJourney \n KineticKey \n LuminousLink \n MotionMeadow \n NebulaNook \n OmegaOrbit \n PrismPath \n QuarkQuest \n RadiantRidge \n SpectrumSphere \n ThriveThrottle \n UltraUnity \n VertexVoyage \n WaveWhisper \n ZenithZone \n AzureAim \n BloomBridge \n CoreCraft \n DriftDream \n EchoElement \n FlareField \n GlowGuild \n HavenHue \n InsightIsle \n JewelJunction \n KiteKey \n LeafLift \n MeadowMind \n NectarNest \n OpalOrbit \n PulsePath \n QuillQuest \n RippleRidge \n SparkSphere \n TideThrive \n UmbraUnity \n VerveVista \n WhisperWave \n ZenZone \n ArcAvenue \n BlazeBridge \n CrestCore \n DuskDrift \n EssenceEcho \n FloraField \n GleamGuild \n HorizonHaven \n IrisIsle \n JoltJewel \n KindleKey \n LumenLeaf \n MysticMeadow \n NovaNectar \n OasisOrbit \n PeakPulse \n QuestQuill \n RadiantRipple \n SurgeSphere \n TidalThrive \n UmbraUnity \n VortexVista \n WhisperWave \n ZenithZone \n AzureArc \n BloomBridge \n CoreCraft \n DriftDream \n EchoElement \n FlareField \n GlowGuild \n HavenHue \n InsightIsle \n JewelJunction \n KiteKey \n LeafLift \n MeadowMind \n NectarNest \n OpalOrbit \n PulsePath \n QuillQuest \n RippleRidge \n SparkSphere \n TideThrive \n UmbraUnity \n VerveVista \n WhisperWave \n ZenZone \n ArcAvenue \n BlazeBridge \n CrestCore \n DuskDrift \n EssenceEcho \n FloraField \n GleamGuild \n HorizonHaven \n IrisIsle \n JoltJewel \n KindleKey \n LumenLeaf \n MysticMeadow \n NovaNectar \n OasisOrbit \n PeakPulse \n QuestQuill \n RadiantRipple \n SurgeSphere \n TidalThrive \n UmbraUnity \n VortexVista \n WhisperWave \n ZenithZone \n AzureArc \n BloomBridge \n CoreCraft \n DriftDream \n EchoElement \n FlareField \n GlowGuild \n HavenHue \n InsightIsle \n JewelJunction \n KiteKey \n LeafLift \n MeadowMind \n NectarNest \n OpalOrbit \n PulsePath \n QuillQuest \n RippleRidge \n SparkSphere \n TideThrive \n UmbraUnity \n VerveVista \n WhisperWave \n ZenZone \n ArcAvenue \n BlazeBridge \n CrestCore \n DuskDrift \n EssenceEcho \n FloraField \n GleamGuild \n HorizonHaven \n IrisIsle \n JoltJewel \n KindleKey \n LumenLeaf \n MysticMeadow \n NovaNectar \n OasisOrbit \n PeakPulse \n QuestQuill \n RadiantRipple \n SurgeSphere \n TidalThrive \n UmbraUnity \n VortexVista \n WhisperWave"

In [39]:
### We will reduce chunk size to get smaller sentence
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=20,
    chunk_overlap=0,
    length_function=len
)
chunks = text_splitter.split_text(company_names)


In [40]:
chunks[0]

'ByteBloom'

In [41]:
chunks[1]

'EcoElement'

### Embeddings

In [None]:
!pip install -U sentence-transformers

In [None]:
!pip install faiss-cpu

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings,HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS

model_name = "sentence-transformers/all-mpnet-base-v2"
#model_name = 'hkunlp/instructor-xl'
model_kwargs = {"device": "cpu"}

#embeddings = HuggingFaceInstructEmbeddings(model_name=model_name, model_kwargs=model_kwargs)
embeddings = HuggingFaceEmbeddings(model_name = model_name, model_kwargs=model_kwargs)

In [42]:
# storing embeddings in the vector store
vectorstore = FAISS.from_texts(chunks, embeddings)

In [63]:
### Search for complete words
query = "Mystic"
search = vectorstore.similarity_search(query, k=5)

# Using list comprehension
page_contents = [doc.page_content for doc in search]
for  index,item in enumerate(page_contents):
  print(index, "  ", item)

0    MysticMode
1    MysticMeadow
2    MysticMeadow
3    MysticMeadow
4    QuantumQuest


Let's check the performance.


Note that the returned distance score is L2 distance. Therefore, a lower score is better.



### Evaluation 1 : Comfort Zone

In [65]:
### Search for complete words
query = "Mystic"
search = vectorstore.similarity_search_with_score(query, k=5)
# Using list comprehension
for index, doc in enumerate(search):
  print(index , " ",doc[0].page_content," ",doc[1])

0   MysticMode   0.61371815
1   MysticMeadow   0.67042565
2   MysticMeadow   0.67042565
3   MysticMeadow   0.67042565
4   QuantumQuest   1.1682106


In [71]:
### Search for complete words
query = "Pond"
search = vectorstore.similarity_search_with_score(query, k=5)
# Using list comprehension
for index, doc in enumerate(search):
  print(index , " ",doc[0].page_content," ",doc[1])

0   RippleRidge   0.9219454
1   RippleRidge   0.9219454
2   RippleRidge   0.9219454
3   MeadowMind   1.0694668
4   MeadowMind   1.0694668


In [69]:
### Search for complete words
query = "Quantum"
search = vectorstore.similarity_search_with_score(query, k=5)
# Using list comprehension
for index, doc in enumerate(search):
  print(index , " ",doc[0].page_content," ",doc[1])

0   QuantumQuest   0.4465906
1   QuarkQuest   0.98394775
2   UltraUnity   1.1959038
3   QuillQuest   1.2041839
4   QuillQuest   1.2041839


In [70]:
### Search for complete words
query = "Quest"
search = vectorstore.similarity_search_with_score(query, k=5)
# Using list comprehension
for index, doc in enumerate(search):
  print(index , " ",doc[0].page_content," ",doc[1])

0   QuestQuill   0.5457902
1   QuestQuill   0.5457902
2   QuestQuill   0.5457902
3   QuillQuest   0.9417101
4   QuillQuest   0.9417101


In [72]:
### Search for complete words
query = "Meadow"
search = vectorstore.similarity_search_with_score(query, k=5)
# Using list comprehension
for index, doc in enumerate(search):
  print(index , " ",doc[0].page_content," ",doc[1])

0   MeadowMind   0.42403907
1   MeadowMind   0.42403907
2   MeadowMind   0.42403907
3   FloraField   0.88558674
4   FloraField   0.88558674


Seems like score upto 0.7 is reasonable for L2

### Evaluation 2 :  Partial Words

In [61]:
### Let's evaluate the performance for partial words
query = "stic"
search = vectorstore.similarity_search_with_score(query, k=5)

# Using list comprehension
for index, doc in enumerate(search):
  print(index , " ",doc[0].page_content," ",doc[1])

0   OmegaOrbit   1.3203022
1   JoltJewel   1.4259626
2   JoltJewel   1.4259626
3   JoltJewel   1.4259626
4   PixelPond   1.4325418


In [66]:
### Let's evaluate the performance for partial words but few words from beginning
query = "Myst"
search = vectorstore.similarity_search_with_score(query, k=5)

# Using list comprehension
for index, doc in enumerate(search):
  print(index , " ",doc[0].page_content," ",doc[1])

0   DuskDrift   1.3157439
1   DuskDrift   1.3157439
2   DuskDrift   1.3157439
3   DriftDream   1.3394732
4   DriftDream   1.3394732


We are getting high scores but not able to find the right words.

Let's evaluate the case with bert-base-uncased model.

In [None]:

model_name = "bert-base-uncased"
#model_name = 'hkunlp/instructor-xl'
model_kwargs = {"device": "cpu"}

#embeddings = HuggingFaceInstructEmbeddings(model_name=model_name, model_kwargs=model_kwargs)
embeddings = HuggingFaceEmbeddings(model_name = model_name, model_kwargs=model_kwargs)

In [74]:
# storing embeddings in the vector store
vectorstore2 = FAISS.from_texts(chunks, embeddings)

In [75]:
### Let's evaluate the performance for partial words but few words from beginning
query = "Myst"
search = vectorstore2.similarity_search_with_score(query, k=5)

# Using list comprehension
for index, doc in enumerate(search):
  print(index , " ",doc[0].page_content," ",doc[1])

0   CloudCraft   52.811188
1   HavenHue   59.372635
2   HavenHue   59.372635
3   HavenHue   59.372635
4   HorizonHaven   59.396706


In [76]:
### Let's evaluate the performance for partial words
query = "stic"
search = vectorstore.similarity_search_with_score(query, k=5)

# Using list comprehension
for index, doc in enumerate(search):
  print(index , " ",doc[0].page_content," ",doc[1])

0   OmegaOrbit   1.3203022
1   JoltJewel   1.4259626
2   JoltJewel   1.4259626
3   JoltJewel   1.4259626
4   PixelPond   1.4325418
