## Step 0

## - Click ⋮ on the menu to the right 
## - "Insert project token" 
## Use the ⬇ on the top menu to move the cell down and begin running the notebook

# Load Data into Milvus for RAG


 


<a class="anchor" id="setup"></a>
## 1. Set up the environment

### Install Libraries

We need to install the pymilvus package to the watsonx.ai Python environment.

In [None]:
!pip install grpcio==1.60.0 
!pip install pymilvus

## !!RESTART THE KERNEL AFTER pymilvus install!!

Certain dependencies need to be persisted. Restarting the kernel allows this to occur. 

In [None]:
!pip install ipython-sql==0.4.1
!pip install sqlalchemy==1.4.46
!pip install sqlalchemy==1.4.46 "pyhive[presto]"
!pip install python-dotenv
!pip install wikipedia
!pip install sentence_transformers

## Wikipedia Exploration

In [None]:
# Topic used for the exploratory Wikipedia search in the next cell.
wikipedia_search_term = "Climate Change"

In [None]:
import wikipedia

# Search Wikipedia for page titles matching the chosen term.
search_results = wikipedia.search(wikipedia_search_term)
print(search_results)

# View the summary of the top hit, if the search returned anything.
if search_results:
    # auto_suggest=False: the title comes straight from search(), so look it up
    # as-is instead of letting the library substitute a suggested title.
    article_summary = wikipedia.summary(search_results[0], auto_suggest=False)
    print(article_summary)
else:
    # Avoid an IndexError on search_results[0] when nothing matched.
    article_summary = None
    print(f"No Wikipedia results found for {wikipedia_search_term!r}")

In [None]:
import wikipedia

# fetch wikipedia articles
# Keys are exact Wikipedia page titles; each value is replaced by the page text below.
articles = {
    #'ADDITIONAL TITLES': None,
    'Climate change': None
}

for k in articles:
    # auto_suggest=False: fetch the page with exactly this title instead of letting
    # the library swap in a suggested (possibly different) title.
    article = wikipedia.page(k, auto_suggest=False)
    articles[k] = article.content
    print(f"Successfully fetched {k}")

print(f"Successfully fetched {len(articles)} articles ")


### Split Wikipedia Data into Chunks

In [None]:
# Chunk data
def split_into_chunks(text, chunk_size):
    """Split ``text`` on whitespace and regroup the words into chunks.

    Args:
        text: String to split.
        chunk_size: Maximum number of words per chunk.

    Returns:
        A list of strings, each holding up to ``chunk_size`` words joined by a
        single space; the last chunk holds whatever words remain.
    """
    tokens = text.split()
    chunks = []
    for start in range(0, len(tokens), chunk_size):
        chunks.append(' '.join(tokens[start:start + chunk_size]))
    return chunks

# Chunk every fetched article into passages of up to 225 words, keyed by title.
split_articles = {
    title: split_into_chunks(body, 225)
    for title, body in articles.items()
}


In [None]:
# Parallel lists: position i in each refers to the same article.
article_titles = [*split_articles]
article_chunks = [*split_articles.values()]

In [None]:
## create the titles list for the associated chunks to be loaded into milvus

# One title entry per chunk, so titles and chunks line up row by row on insert.
# Built from split_articles rather than by overwriting article_titles[i] in place,
# so re-running this cell produces the same result instead of nesting lists.
article_titles = [
    [title] * len(chunks)
    for title, chunks in split_articles.items()
]
    


## Insert Chunks with Embeddings into Milvus

In [None]:
# Display the connections reported by wslib, to check the Milvus connection's name.
# NOTE(review): `wslib` is not defined in any cell of this notebook; presumably it is
# created by the project-token cell inserted in Step 0 — confirm.
wslib.list_connections()

In [None]:
# NOTE: if you named your Milvus connection something other than 'Milvus Connection',
# replace the name below.
# The returned object is indexed later with the keys 'host', 'port' and 'password'.

milvus_credentials = wslib.get_connection('Milvus Connection')

In [None]:
#milvus_credentials

In [None]:
from pymilvus import(
    Milvus,
    IndexType,
    Status,
    connections,
    FieldSchema,
    DataType,
    Collection,
    CollectionSchema,
)


# Milvus endpoint and credentials, read from the stored connection.
apiuser = 'ibmlhapikey'
url = milvus_credentials['host']
port = milvus_credentials['port']
apikey = milvus_credentials['password']

# Register the connection under the "default" alias.
connections.connect(
    alias="default",
    host=url, port=port,
    user=apiuser, password=apikey,
    secure=True,
)



In [None]:
# Name and description for the collection created below; change either as you like.
collection_name = "wiki_articles"
collection_description = "collection description"

In [None]:
# Create collection - define fields + schema
id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True)  # Primary key
text_field = FieldSchema(name="article_text", dtype=DataType.VARCHAR, max_length=2500)
title_field = FieldSchema(name="article_title", dtype=DataType.VARCHAR, max_length=200)
# dim=384 has to match the size of the embeddings inserted into this field.
vector_field = FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=384)

fields = [id_field, text_field, title_field, vector_field]
schema = CollectionSchema(fields, collection_description)
wiki_collection = Collection(collection_name, schema)

# Create index on the embedding field
index_params = dict(
    metric_type='L2',
    index_type="IVF_FLAT",
    params=dict(nlist=2048),
)
wiki_collection.create_index(field_name="vector", index_params=index_params)


In [None]:
# Check which collections exist in the Milvus instance; the collection created
# above should appear in the returned list.

from pymilvus import utility
utility.list_collections()

In [None]:
# load data into Milvus
import pandas as pd
from sentence_transformers import SentenceTransformer
from pymilvus import Collection, connections
import warnings
warnings.filterwarnings('ignore')


# Load the embedding model and open the collection once, before the loop: both are
# loop-invariant, and re-creating the model per article only repeats the load cost.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2') # 384 dim
basic_collection = Collection(collection_name)

for titles, chunks in zip(article_titles, article_chunks):
    if not chunks:
        # Nothing to embed or insert; also avoids indexing titles[0] on an empty list.
        continue

    # Create vector embeddings + data
    passage_embeddings = model.encode(chunks)

    # One column per inserted field, in order: chunk text, chunk title, embedding.
    data = [
        chunks,
        titles,
        passage_embeddings
    ]

    out = basic_collection.insert(data)
    basic_collection.flush()  # Ensures data persistence

    print("Wikipedia Article: \'" + titles[0] + "\' has been loaded.")


In [None]:
## Check that entities have been loaded into the collection:
## the last expression displays the collection's entity count.

basic_collection = Collection(collection_name) 

basic_collection.num_entities 

### Prompt LLM with Query Results


In [None]:
from sentence_transformers import SentenceTransformer
from pymilvus import(
    Milvus,
    IndexType,
    Status,
    connections,
    FieldSchema,
    DataType,
    Collection,
    CollectionSchema,
)

# Same connection settings as the earlier connection cell, repeated so this
# section can be run on its own once `milvus_credentials` is available.
apiuser = 'ibmlhapikey'
url = milvus_credentials['host']
port = milvus_credentials['port']
apikey = milvus_credentials['password']

connections.connect(
    alias="default",
    host=url, port=port,
    user=apiuser, password=apikey,
    secure=True,
)

# Load collection
basic_collection = Collection(collection_name)
basic_collection.load()

from functools import lru_cache


@lru_cache(maxsize=None)
def _load_embedding_model(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Create the SentenceTransformer for ``model_name`` once and reuse it afterwards."""
    return SentenceTransformer(model_name)


# Query function
def query_milvus(query, num_results):
    """Embed ``query`` and search the module-level ``basic_collection`` for it.

    Args:
        query: Question text to embed.
        num_results: Passed to the search as ``limit``.

    Returns:
        The object returned by ``basic_collection.search``; only the
        ``article_text`` field is requested for each hit.
    """
    # Vectorize query (the model is cached, so it is not reloaded on every call)
    model = _load_embedding_model() # 384 dim
    query_embeddings = model.encode([query])

    # Search
    search_params = {
        "metric_type": "L2", 
        "params": {"nprobe": 5}
    }
    results = basic_collection.search(
        data=query_embeddings, 
        anns_field="vector", 
        param=search_params,
        limit=num_results,
        expr=None, 
        output_fields=['article_text'],
    )
    return results

In [None]:
## Consider some questions to ask regarding the topic you have chosen.
## Swap in your own question by editing the assignment below, e.g.:
#question_text = "OTHER QUESTION TEXT"

question_text = 'What is climate change?'

In [None]:
# Query Milvus

num_results = 3
results = query_milvus(question_text, num_results)

# results[0] holds the hits for the single query that was sent. Iterate over the hits
# actually returned rather than over range(num_results), so a search that yields
# fewer than num_results hits does not raise an IndexError.
# (For debugging, results[0].ids and results[0].distances can be printed.)
relevant_chunks = [hit.entity.get('article_text') for hit in results[0]]

In [None]:
def make_prompt(context, question_text):
    """Build the LLM prompt: context, then the instruction, then the question.

    Args:
        context: Text the model should answer from.
        question_text: The user's question.

    Returns:
        A single string with the three parts separated by blank lines.
    """
    instruction = (
        'Please answer a question using this text. '
        'If the question is unanswerable, say "unanswerable".'
    )
    return f"{context}\n\n{instruction}\n\nQuestion: {question_text}"


# Build prompt w/ Milvus results
# Embed the retrieved passages (context) and the user question into the prompt text.
# The passages are joined with a blank line between them.

context = "\n\n".join(relevant_chunks)
prompt = make_prompt(context, question_text)

In [None]:
# Show the assembled prompt (retrieved context + instruction + question) for inspection.
print(prompt)

In [None]:
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams

# Service credentials.
# Please note: if you are using a cloud account in a different geography, the cloud URL will be different.
# Refer to this list:
#    Dallas - https://us-south.ml.cloud.ibm.com
#    London - https://eu-gb.ml.cloud.ibm.com
#    Frankfurt - https://eu-de.ml.cloud.ibm.com
#    Tokyo - https://jp-tok.ml.cloud.ibm.com
# NOTE(review): the API key is taken from the Milvus connection's password field;
# presumably both services share the same IBM Cloud API key — confirm.
creds = dict(
    url='https://us-south.ml.cloud.ibm.com',
    apikey=milvus_credentials['password'],
)

# Model Parameters
params = {
    GenParams.DECODING_METHOD: "greedy",
    GenParams.MIN_NEW_TOKENS: 1,
    GenParams.MAX_NEW_TOKENS: 500,
    GenParams.TEMPERATURE: 0,
}

# Alternative model: model_id='meta-llama/llama-2-70b-chat'
model = Model(
    model_id='ibm/granite-13b-chat-v2',
    params=params,
    credentials=creds,
    project_id=wslib.here.get_ID(),
)

# Prompt LLM and print the question immediately followed by the generated text
response = model.generate_text(prompt)
print(f"Question: {question_text}{response}")