## Load data and preprocess

In [1]:
import pandas as pd

# data = pd.read_csv('../data/reddit.csv')
# Parse the Reddit export from JSON with pandas.
data_json = pd.read_json("../AwareData/reddit.json")
# `data` is the frame every later cell reads posts and comments from.
data=pd.DataFrame(data_json)
# Store each row's index label in a regular 'index' column so it is still
# available after subsets of `data` are re-indexed.
data.loc[:, 'index'] = data.index




In [2]:
# Keep only submission rows and renumber them 0..n-1; each row's original
# label in `data` is still available through its 'index' column.
is_submission = data['aware_post_type'] == 'submission'
subs = data.loc[is_submission].reset_index(drop=True)

# Stringify the text fields once so both derived columns share them.
titles = [str(title) for title in subs['reddit_title']]
bodies = [str(body) for body in subs['reddit_text']]
subreddits = [str(name) for name in subs['reddit_subreddit']]

# "title [SEP] body" and "title body #subreddit" renderings of each submission.
subs['text_title'] = [t + ' [SEP] ' + b for t, b in zip(titles, bodies)]
subs['text_title_subreddit'] = [
    t + ' ' + b + ' #' + s for t, b, s in zip(titles, bodies, subreddits)
]

# For every submission, scan the rows of `data` that sit between it and the
# next submission and collect those whose parent id is this submission's name.
submission_count = len(subs)
comment_lists = []
for pos in range(submission_count):
    first_row = subs.loc[pos, 'index'] + 1
    if pos + 1 < submission_count:
        stop_row = subs.loc[pos + 1, 'index']
    else:
        stop_row = len(data)
    submission_name = subs.loc[pos, 'reddit_name']
    comment_lists.append([
        data.loc[row, 'index']
        for row in range(first_row, stop_row)
        if data.loc[row, 'reddit_parent_id'] == submission_name
    ])
subs['comment_indices'] = pd.Series(comment_lists, index=subs.index, dtype=object)

# Submissions without any matching comment are dropped.
empty_rows = subs.index[subs['comment_indices'].map(len) == 0]
subs = subs.drop(index=empty_rows)

## Train test split

In [3]:
from sklearn.model_selection import train_test_split

# Fixed seed so that Restart & Run All always yields the same held-out set;
# without it the test questions, retrieved contexts and every metric below
# change from run to run.
RANDOM_SEED = 42

# Hold out exactly 100 submissions as test questions; the rest are indexed
# for retrieval.
subs_train, subs_test = train_test_split(
    subs,
    test_size=100,
    random_state=RANDOM_SEED,
)

# Renumber both splits 0..n-1 so later cells can address rows by position.
subs_train = subs_train.reset_index(drop=True)
subs_test = subs_test.reset_index(drop=True)

## Generate test questions and ground truth

In [4]:
# The test questions are the held-out submissions' 'text_title' strings
# (title and body joined by ' [SEP] ' in the preprocessing cell).
question = subs_test['text_title']

In [5]:
# The reference answer for a test submission is the text of the first comment
# recorded in its 'comment_indices' list.
first_comment_rows = [comment_ids[0] for comment_ids in subs_test['comment_indices']]
ground_truth = [data.loc[row, 'reddit_text'] for row in first_comment_rows]

## Generate contexts

In [6]:
import torch

# Pick the compute device in order of preference: CUDA, then Apple MPS,
# then plain CPU. The MPS check only runs when CUDA is unavailable.
device = (
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)

In [7]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model on the device chosen above.
# `device` must be passed by keyword: the second positional parameter of
# SentenceTransformer is `modules`, so a positional device string never
# reaches the device setting and the library falls back to its own default.
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# Save a local copy; the evaluation section loads the embeddings from this path.
model.save(path='../emb/all-MiniLM-L6-v2', model_name='all-MiniLM-L6-v2')

In [8]:
# Embed the "title body #subreddit" text of every training submission.
# Embeddings are L2-normalised and returned as a NumPy array.
train_texts = subs_train['text_title_subreddit'].tolist()
vectors = model.encode(
    train_texts,
    convert_to_numpy=True,
    normalize_embeddings=True,
)

# Store one embedding (as a plain Python list) per row in a 'vector' column.
subs_train['vector'] = vectors.tolist()

In [9]:
import lancedb

# Open the local LanceDB database and (re)create the submissions table from
# the training split; "overwrite" replaces any table left by an earlier run.
db = lancedb.connect(uri="../.lancedb")
table = db.create_table(
    name="reddit_submissions",
    data=subs_train,
    mode="overwrite",
)
# table = db.create_table("reddit_submissions", subs, exist_ok=True)

In [10]:
# Number of retrieved submissions (and therefore contexts) per test question.
context_num = 4
contexts = []

for test_pos in range(len(subs_test)):
    # Embed the question with the same normalisation used for the table.
    query = model.encode(
        question[test_pos],
        convert_to_numpy=True,
        normalize_embeddings=True,
    ).tolist()

    # Fetch the `context_num` nearest training submissions.
    response = table.search(query).limit(context_num).to_pandas()
    comment_indices = [
        response.loc[rank, 'comment_indices'] for rank in range(context_num)
    ]

    # The context taken from each hit is the text of its first comment.
    contexts.append([
        data.loc[hit_comments[0], 'reddit_text'] for hit_comments in comment_indices
    ])

## Generate answers using Llama 2 (llama.cpp)

In [11]:
from llama_cpp import Llama

n_gpu_layers = 1  # Metal set to 1 is enough.
n_batch = 512  # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip.

# Settings for the answer-generating model, gathered in one place.
# Make sure the model path is correct for your system!
# Alternative checkpoint: "../llm/llama-3-8b.Q5_K_M.gguf"
generator_settings = dict(
    model_path="../AwareData/llama/llama-2-7b/llama-2-7b.Q5_K_M.gguf",
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    n_ctx=800,
    f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
    verbose=True,
)

generating_llm = Llama(**generator_settings)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ../AwareData/llama/llama-2-7b/llama-2-7b.Q5_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_

In [12]:
# One generated answer per test question, in the same order as `subs_test`.
answer = []

# Character budget applied to both the retrieved context and the question
# before they are placed in the prompt.
cutoff = 800

for i in range(len(subs_test)):
    # Convert to str BEFORE slicing: a non-string value (e.g. a missing
    # comment body) cannot be sliced and would raise TypeError.
    context = str(contexts[i][0])[:cutoff]
    # Use the shared `cutoff` rather than a second hard-coded 800.
    question_text = str(question[i])[:cutoff]

    # Put each part on its own line so the context does not run straight
    # into the instruction that follows it.
    prompt = (
        "Summarize the following, " + context
        + "\nThen answer question: " + question_text
        + "\nAnswer: "
    )
    # prompt = 'Q:' + str(question[i]) + "A: "

    output = generating_llm(
        prompt,
        max_tokens=32,  # Generate up to 32 tokens, set to None to generate up to the end of the context window
        stop=["\n"],  # Stop at the first newline so only a single-line answer is kept
        echo=False,  # Do not echo the prompt back in the output
    )
    answer.append(output['choices'][0]['text'])


llama_print_timings:        load time =    1937.43 ms
llama_print_timings:      sample time =       2.33 ms /    32 runs   (    0.07 ms per token, 13728.01 tokens per second)
llama_print_timings: prompt eval time =    1937.33 ms /   148 tokens (   13.09 ms per token,    76.39 tokens per second)
llama_print_timings:        eval time =    1442.45 ms /    31 runs   (   46.53 ms per token,    21.49 tokens per second)
llama_print_timings:       total time =    3418.72 ms /   179 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =    1937.43 ms
llama_print_timings:      sample time =       2.20 ms /    32 runs   (    0.07 ms per token, 14565.32 tokens per second)
llama_print_timings: prompt eval time =    1394.42 ms /    73 tokens (   19.10 ms per token,    52.35 tokens per second)
llama_print_timings:        eval time =    1390.98 ms /    31 runs   (   44.87 ms per token,    22.29 tokens per second)
llama_print_timings:       total time =    2822.40 ms /   104 

llama_print_timings:       total time =    3625.42 ms /   244 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =    1937.43 ms
llama_print_timings:      sample time =       2.29 ms /    32 runs   (    0.07 ms per token, 13992.13 tokens per second)
llama_print_timings: prompt eval time =     876.20 ms /    28 tokens (   31.29 ms per token,    31.96 tokens per second)
llama_print_timings:        eval time =    1346.39 ms /    31 runs   (   43.43 ms per token,    23.02 tokens per second)
llama_print_timings:       total time =    2259.78 ms /    59 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =    1937.43 ms
llama_print_timings:      sample time =       2.26 ms /    32 runs   (    0.07 ms per token, 14165.56 tokens per second)
llama_print_timings: prompt eval time =    1367.31 ms /    78 tokens (   17.53 ms per token,    57.05 tokens per second)
llama_print_timings:        eval time =    1378.60 ms /    31 runs   (   44.47 ms

llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =    2298.01 ms /   233 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =    1937.43 ms
llama_print_timings:      sample time =       0.08 ms /     1 runs   (    0.08 ms per token, 12987.01 tokens per second)
llama_print_timings: prompt eval time =    1913.91 ms /   181 tokens (   10.57 ms per token,    94.57 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =    1915.25 ms /   182 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =    1937.43 ms
llama_print_timings:      sample time =       0.07 ms /     1 runs   (    0.07 ms per token, 13698.63 tokens per second)
llama_print_timings: prompt eval time =    1701.36 ms /   136 tokens (   12.51 ms

llama_print_timings: prompt eval time =    2070.30 ms /   196 tokens (   10.56 ms per token,    94.67 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =    2071.48 ms /   197 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =    1937.43 ms
llama_print_timings:      sample time =       0.07 ms /     1 runs   (    0.07 ms per token, 13698.63 tokens per second)
llama_print_timings: prompt eval time =    2683.33 ms /   309 tokens (    8.68 ms per token,   115.16 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =    2684.73 ms /   310 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =    1937.43 ms
llama_print_timings:      sample time =       2.19 ms /    32 runs   (    0.07 ms

llama_print_timings:      sample time =       2.15 ms /    32 runs   (    0.07 ms per token, 14856.08 tokens per second)
llama_print_timings: prompt eval time =    2290.05 ms /   228 tokens (   10.04 ms per token,    99.56 tokens per second)
llama_print_timings:        eval time =    1415.07 ms /    31 runs   (   45.65 ms per token,    21.91 tokens per second)
llama_print_timings:       total time =    3743.07 ms /   259 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =    1937.43 ms
llama_print_timings:      sample time =       0.07 ms /     1 runs   (    0.07 ms per token, 13513.51 tokens per second)
llama_print_timings: prompt eval time =    1938.05 ms /   192 tokens (   10.09 ms per token,    99.07 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =    1938.70 ms /   193 tokens
Llama.generate: prefix-match hit

llama_print_ti

Llama.generate: prefix-match hit

llama_print_timings:        load time =    1937.43 ms
llama_print_timings:      sample time =       2.28 ms /    32 runs   (    0.07 ms per token, 14035.09 tokens per second)
llama_print_timings: prompt eval time =    1728.36 ms /   159 tokens (   10.87 ms per token,    91.99 tokens per second)
llama_print_timings:        eval time =    1392.93 ms /    31 runs   (   44.93 ms per token,    22.26 tokens per second)
llama_print_timings:       total time =    3159.95 ms /   190 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =    1937.43 ms
llama_print_timings:      sample time =       2.33 ms /    32 runs   (    0.07 ms per token, 13757.52 tokens per second)
llama_print_timings: prompt eval time =    1194.86 ms /    45 tokens (   26.55 ms per token,    37.66 tokens per second)
llama_print_timings:        eval time =    1367.59 ms /    31 runs   (   44.12 ms per token,    22.67 tokens per second)
llama_print_timings:       to

llama_print_timings:       total time =    2599.79 ms /    74 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =    1937.43 ms
llama_print_timings:      sample time =       0.29 ms /     4 runs   (    0.07 ms per token, 14035.09 tokens per second)
llama_print_timings: prompt eval time =    1204.58 ms /    38 tokens (   31.70 ms per token,    31.55 tokens per second)
llama_print_timings:        eval time =     131.05 ms /     3 runs   (   43.68 ms per token,    22.89 tokens per second)
llama_print_timings:       total time =    1339.42 ms /    41 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =    1937.43 ms
llama_print_timings:      sample time =       1.30 ms /    18 runs   (    0.07 ms per token, 13846.15 tokens per second)
llama_print_timings: prompt eval time =    1204.43 ms /    47 tokens (   25.63 ms per token,    39.02 tokens per second)
llama_print_timings:        eval time =     743.37 ms /    17 runs   (   43.73 ms

## Evaluation

In [13]:
from langchain_community.llms import LlamaCpp
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

n_gpu_layers = 1  # Metal set to 1 is enough.
n_batch = 512  # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip.

# Context window for the judge model. The evaluation prompts bundle
# instructions together with the question, the answer and up to four retrieved
# contexts of up to `cutoff` characters each, which does not fit in the 800
# tokens used for generation. 4096 is the context length the loader reports
# for this checkpoint (llama.context_length).
EVAL_N_CTX = 4096

# Make sure the model path is correct for your system!
# Alternative checkpoint: "../llm/llama-3-8b.Q5_K_M.gguf"
evaluating_llm = LlamaCpp(
    model_path="../AwareData/llama/llama-2-7b/llama-2-7b.Q5_K_M.gguf",
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    n_ctx=EVAL_N_CTX,
    f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
    verbose=True,
)

# Wrap the LangChain model in the adapter that ragas' evaluate() accepts.
evaluating_llm = LangchainLLMWrapper(evaluating_llm)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ../AwareData/llama/llama-2-7b/llama-2-7b.Q5_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_

In [14]:
from langchain.embeddings import HuggingFaceEmbeddings

# Load the locally saved MiniLM embeddings on the selected device, with the
# same normalisation used when the retrieval index was built.
hf_embeddings = HuggingFaceEmbeddings(
    model_name="../emb/all-MiniLM-L6-v2",
    model_kwargs=dict(device=device),
    encode_kwargs=dict(normalize_embeddings=True),
)

# Adapter expected by ragas' evaluate().
evaluating_embeddings = LangchainEmbeddingsWrapper(hf_embeddings)

In [15]:
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)

In [16]:
from datasets import Dataset

# Clip every text field to `cutoff` characters before handing it to ragas.
clipped_questions = [text[:cutoff] for text in question]
clipped_contexts = [[passage[:cutoff] for passage in passages] for passages in contexts]
clipped_truths = [text[:cutoff] for text in ground_truth]
clipped_answers = [text[:cutoff] for text in answer]

# Column names are the ones the ragas metrics look up.
data_samples = {
    'question': clipped_questions,
    'contexts': clipped_contexts,
    'ground_truth': clipped_truths,
    'answer': clipped_answers,
}
dataset = Dataset.from_dict(data_samples)

In [None]:
# Metrics to compute for every sample in `dataset`.
ragas_metrics = [
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
]

# Score the generated answers with the local judge model and embeddings.
result = evaluate(
    dataset,
    metrics=ragas_metrics,
    llm=evaluating_llm,
    embeddings=evaluating_embeddings,
)

Evaluating:   0%|          | 0/400 [00:00<?, ?it/s]