In [1]:
import os

from dotenv import load_dotenv

load_dotenv()

True

In [2]:
import torch
import transformers
from transformers import AutoTokenizer

model = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model)
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16,
    device_map="auto"
)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 2 files: 100%|██████████| 2/2 [01:58<00:00, 59.04s/it] 
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.37s/it]
Device set to use cuda:0


In [3]:
import time

session_start_time = time.time()

In [4]:
import pandas as pd
from datasets import load_dataset


dataset = load_dataset("sciq", split='train')

In [5]:
dataset[0]

{'question': 'What type of organism is commonly used in preparation of foods such as cheese and yogurt?',
 'distractor3': 'viruses',
 'distractor1': 'protozoa',
 'distractor2': 'gymnosperms',
 'correct_answer': 'mesophilic organisms',
 'support': 'Mesophiles grow best in moderate temperature, typically between 25°C and 40°C (77°F and 104°F). Mesophiles are often found living in or on the bodies of humans or other animals. The optimal growth temperature of many pathogenic mesophiles is 37°C (98°F), the normal human body temperature. Mesophilic organisms have important uses in food preparation, including cheese, yogurt, beer and wine.'}

In [6]:
len(dataset)

11679

In [7]:
filtered_dataset = dataset.filter(lambda x: x["support"] != "" and  x["correct_answer"] != "")

Filter: 100%|██████████| 11679/11679 [00:00<00:00, 303588.86 examples/s]


In [8]:
len(filtered_dataset)

10481

In [9]:
df = pd.DataFrame(filtered_dataset)

In [10]:
df.head(5)

Unnamed: 0,question,distractor3,distractor1,distractor2,correct_answer,support
0,What type of organism is commonly used in prep...,viruses,protozoa,gymnosperms,mesophilic organisms,"Mesophiles grow best in moderate temperature, ..."
1,What phenomenon makes global winds blow northe...,tropical effect,muon effect,centrifugal effect,coriolis effect,Without Coriolis Effect the global winds would...
2,Changes from a less-ordered state to a more-or...,endothermic,unbalanced,reactive,exothermic,Summary Changes of state are examples of phase...
3,What is the least dangerous radioactive decay?,zeta decay,beta decay,gamma decay,alpha decay,All radioactive decay is dangerous to living t...
4,Kilauea in hawaii is the world’s most continuo...,magma,greenhouse gases,carbon and smog,smoke and ash,Example 3.5 Calculating Projectile Motion: Hot...


In [11]:
df.drop(columns=['distractor3', 'distractor2', 'distractor1'], inplace=True)

In [12]:
df.head(3)

Unnamed: 0,question,correct_answer,support
0,What type of organism is commonly used in prep...,mesophilic organisms,"Mesophiles grow best in moderate temperature, ..."
1,What phenomenon makes global winds blow northe...,coriolis effect,Without Coriolis Effect the global winds would...
2,Changes from a less-ordered state to a more-or...,exothermic,Summary Changes of state are examples of phase...


In [13]:
df['completion'] = df['correct_answer'] + " because " + df["support"]
df.dropna(subset=["completion"], inplace=True)

In [14]:
df.head(3)

Unnamed: 0,question,correct_answer,support,completion
0,What type of organism is commonly used in prep...,mesophilic organisms,"Mesophiles grow best in moderate temperature, ...",mesophilic organisms because Mesophiles grow b...
1,What phenomenon makes global winds blow northe...,coriolis effect,Without Coriolis Effect the global winds would...,coriolis effect because Without Coriolis Effec...
2,Changes from a less-ordered state to a more-or...,exothermic,Summary Changes of state are examples of phase...,exothermic because Summary Changes of state ar...


In [15]:
df.shape

(10481, 4)

In [16]:
print(df.columns)

Index(['question', 'correct_answer', 'support', 'completion'], dtype='object')


### Creating the Chroma collection

In [17]:
import chromadb

client = chromadb.Client()

In [18]:
collection_name = "sciq_supports6"

In [19]:
collections = client.list_collections()
collections

[]

In [20]:
collection_exists = any(collection.name == collection_name for collection in collections)
print("Collection exists:", collection_exists)

Collection exists: False


In [21]:
if not collection_exists:
    collection = client.create_collection(collection_name)
else:
    print("Collection ", collection_name, " exists:", collection_exists)

In [22]:
results = collection.get()
for result in results:
    print(result)

ids
embeddings
documents
uris
included
data
metadatas


In [23]:
model = "all-MiniLM-L6-v2"

In [24]:
ldf = len(df)
ldf

10481

In [26]:
import time

nb = ldf
max_batch_size = 5461  # ChromaDB의 최대 배치 크기

start_time = time.time()

completion_list = df["completion"][:nb].astype(str).to_list()

if not collection_exists:
    # 데이터를 배치로 나누기
    for i in range(0, nb, max_batch_size):
        batch_end = min(i + max_batch_size, nb)  # 배치 끝 인덱스 계산
        batch_ids = [str(j) for j in range(i, batch_end)]
        batch_documents = completion_list[i:batch_end]
        batch_metadatas = [{"type": "completion"} for _ in range(len(batch_ids))]

        collection.add(
            ids=batch_ids,
            documents=batch_documents,
            metadatas=batch_metadatas
        )

response_time = time.time() - start_time
print(f"Response Time: {response_time:.2f} seconds")

Response Time: 140.31 seconds


In [27]:
result = collection.get(include=['embeddings'])

first_embedding = result['embeddings'][0]

embedding_length = len(first_embedding)

print("First embedding:", first_embedding)
print("Embedding length:", embedding_length)

First embedding: [ 3.68907079e-02 -5.88156618e-02 -4.81813326e-02  6.92331642e-02
  1.66964978e-02 -4.07537222e-02  1.88399665e-02  1.81023628e-02
  1.78051423e-02  7.78705478e-02  2.52816640e-02 -1.57923087e-01
 -2.36181635e-02  9.52994600e-02 -5.83179388e-03 -9.35172942e-03
  8.79396722e-02 -2.97825877e-02 -3.17596346e-02  3.58476944e-04
  4.81602177e-02  3.59455980e-02 -6.36885539e-02 -3.58013026e-02
  8.47947598e-03 -4.70491946e-02 -1.44115845e-02  1.53261637e-02
 -1.74492616e-02  3.77150923e-02 -5.39003126e-02  1.29380950e-03
  1.40758231e-01 -1.21125570e-02  1.60011258e-02  2.58895960e-02
  9.29332245e-03 -1.31458566e-01  4.73491177e-02  5.54820485e-02
 -2.50272304e-02  4.49109487e-02  6.07553348e-02 -1.31188298e-03
 -2.81656906e-02  1.87065490e-02 -5.63845932e-02  7.59200156e-02
 -7.12970924e-03 -6.82346597e-02 -9.04978346e-03  5.66561222e-02
 -1.45302843e-02  5.78948557e-02 -6.67471290e-02  2.99725756e-02
 -5.11366464e-02 -2.36395728e-02 -6.88513648e-03 -9.38077550e-03
  5.5031

In [28]:
result = collection.get(include=['documents'])

first_doc = result['documents'][0]

print("First document:", first_doc)

First document: mesophilic organisms because Mesophiles grow best in moderate temperature, typically between 25°C and 40°C (77°F and 104°F). Mesophiles are often found living in or on the bodies of humans or other animals. The optimal growth temperature of many pathogenic mesophiles is 37°C (98°F), the normal human body temperature. Mesophilic organisms have important uses in food preparation, including cheese, yogurt, beer and wine.


In [34]:
import time

start_time = time.time()

results = collection.query(
    query_texts=df['question'][:nb].to_list(),
    n_results=1
)

response_time = time.time() - start_time
print(f"Response Time: {response_time:.2f} seconds")

Response Time: 135.29 seconds


In [46]:
import spacy
import numpy as np

nlp = spacy.load('en_core_web_md')

def simple_text_similarity(text1, text2):
    # Convert the texts into spaCy document objects
    doc1 = nlp(text1)
    doc2 = nlp(text2)

    # Get the vectors for each document
    vector1 = doc1.vector
    vector2 = doc2.vector

    # Compute the cosine similarity between the two vectors
    # Check for zero vectors to avoid division by zero
    if np.linalg.norm(vector1) == 0 or np.linalg.norm(vector2) == 0:
        return 0.0  # Return zero if one of the texts does not have a vector representation
    else:
        similarity = np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))
        return similarity

In [50]:
nbqd = 100

acc_counter = 0
display_counter = 0

for i, q in enumerate(df['question'][:nb]):
    original_completion = df['completion'][i]
    retrieved_document = results['documents'][i][0]
    similarity_score = simple_text_similarity(original_completion, retrieved_document)
    if similarity_score > 0.7:
        acc_counter += 1
    display_counter += 1
    if display_counter <= nbqd or display_counter > nb - nbqd:
        print(i," ", f"Question: {q}")
        print(f"Retrieved document: {retrieved_document}")
        print(f"Original completion: {original_completion}")
        print(f"Similarity Score: {similarity_score:.2f}")
        print()  # Blank line for better readability between entries

if nb > 0:
    acc = acc_counter / nb
    print(f"Number of documents: {nb:.2f}")
    print(f"Overall similarity score: {acc:.2f}")

0   Question: What type of organism is commonly used in preparation of foods such as cheese and yogurt?
Retrieved document: lactic acid because Bacteria can be used to make cheese from milk. The bacteria turn the milk sugars into lactic acid. The acid is what causes the milk to curdle to form cheese. Bacteria are also involved in producing other foods. Yogurt is made by using bacteria to ferment milk ( Figure below ). Fermenting cabbage with bacteria produces sauerkraut.
Original completion: mesophilic organisms because Mesophiles grow best in moderate temperature, typically between 25°C and 40°C (77°F and 104°F). Mesophiles are often found living in or on the bodies of humans or other animals. The optimal growth temperature of many pathogenic mesophiles is 37°C (98°F), the normal human body temperature. Mesophilic organisms have important uses in food preparation, including cheese, yogurt, beer and wine.
Similarity Score: 0.88

1   Question: What phenomenon makes global winds blow nor

### Prompt and retrieval

In [51]:
# initial question
prompt = "Millions of years ago, plants used energy from the sun to form what?"
# variant 1 similar
#prompt = "Eons ago, plants used energy from the sun to form what?"
# variant 2 divergent
#prompt = "Eons ago, plants used sun energy to form what?"

In [54]:
import time
import textwrap


start_time = time.time()

results = collection.query(
    query_texts=[prompt],
    n_results=1
)

response_time = time.time() - start_time
print(f"Response Time: {response_time:.2f} seconds\n")

if results['documents'] and len(results['documents'][0]) > 0:
    wrapped_question = textwrap.fill(prompt, width=70)
    wrapped_document = textwrap.fill(results['documents'][0][0], width=70)
    print(f"Question\n: {wrapped_question}")
    print("\n")
    print(f"Retrieved document\n: {wrapped_document}")
    print()
else:
    print("No documents retrieved.")


Response Time: 0.11 seconds

Question
: Millions of years ago, plants used energy from the sun to form what?


Retrieved document
: chloroplasts because When ancient plants underwent photosynthesis,
they changed energy in sunlight to stored chemical energy in food. The
plants used the food and so did the organisms that ate the plants.
After the plants and other organisms died, their remains gradually
changed to fossil fuels as they were covered and compressed by layers
of sediments. Petroleum and natural gas formed from ocean organisms
and are found together. Coal formed from giant tree ferns and other
swamp plants.



### RAG with Hugging Face

In [58]:
def LLaMA2(promt):
    sequences = pipeline(
        prompt,
        do_sample=True,                         # 모델이 확률 분포에 따라 무작위로 토큰을 선택
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,    # 토큰화된 시퀀스의 끝을 표시하는 토큰
        max_new_tokens=100,
        temperature=0.5,
        repetition_penalty=2.0,                 # 모델이 같은 토큰을 반복하지 않도록 제한하는 역할
        truncation=True
    )
    return sequences

In [59]:
iprompt='Read the following input and write a summary for beginners.'
lprompt=iprompt + " " + results['documents'][0][0]

In [62]:
import time
start_time = time.time()  # Start timing before the request

response=LLaMA2(lprompt)
for seq in response:
    generated_part = seq['generated_text'].replace(iprompt, '')  # Remove the input part from the output

response_time = time.time() - start_time  # Measure response time
print(f"Response Time: {response_time:.2f} seconds")  # Print response time

Response Time: 0.87 seconds


In [64]:
response

[{'generated_text': 'Millions of years ago, plants used energy from the sun to form what?\nA) Carbon dioxide and water vapor. B ) Oxygen gas C). Nitrogen-based compounds D.) Fossil fuels E)) None Of The Above'}]

In [63]:
generated_part

'Millions of years ago, plants used energy from the sun to form what?\nA) Carbon dioxide and water vapor. B ) Oxygen gas C). Nitrogen-based compounds D.) Fossil fuels E)) None Of The Above'

In [65]:
wrapped_response = textwrap.fill(generated_part.strip(), width=70)
print(wrapped_response)

Millions of years ago, plants used energy from the sun to form what?
A) Carbon dioxide and water vapor. B ) Oxygen gas C). Nitrogen-based
compounds D.) Fossil fuels E)) None Of The Above


In [66]:
delete_collection = False
if delete_collection:
    client.delete_collection(collection_name)

In [None]:
# List all collections
collections = client.list_collections()

# Check if the specific collection exists
collection_exists = any(collection.name == collection_name for collection in collections)
print("Collection exists:", collection_exists)