In [1]:
#installation and import statements

# ! pip install lancedb
# ! pip install sentence_transformers
# ! pip install llama-cpp-python

from __future__ import annotations
import numpy as np
import time
import lancedb # our vector database
from lancedb.embeddings import get_registry #embedding registry
from lancedb.pydantic import LanceModel, Vector, pydantic_to_schema
from lancedb.embeddings import TextEmbeddingFunction
import pandas as pd
import pyarrow as pa
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, pipeline
import transformers
import torch
from llama_cpp import Llama
import json




## Get Dataset

In [2]:
data_json = pd.read_json("../AwareData/reddit.json")
data=pd.DataFrame(data_json)
data.head()

Unnamed: 0,aware_post_type,aware_created_ts,reddit_id,reddit_name,reddit_created_utc,reddit_author,reddit_text,reddit_permalink,reddit_title,reddit_url,reddit_subreddit,reddit_link_id,reddit_parent_id,reddit_submission
0,submission,2023-04-02T13:58:03,129sqka,t3_129sqka,1680458283,MoodyStarGirl,That's it.,/r/starbucks/comments/129sqka/hot_chai_lattes_...,Hot chai lattes shouldn't have water,https://www.reddit.com/r/starbucks/comments/12...,starbucks,,,
1,comment,2023-04-02T14:32:57,jeounwc,t1_jeounwc,1680460377,Lost_Treat_6296,We should make the chai tea latte with the sam...,/r/starbucks/comments/129sqka/hot_chai_lattes_...,,,starbucks,t3_129sqka,t3_129sqka,129sqka
2,comment,2023-04-02T14:48:18,jeowus2,t1_jeowus2,1680461298,MoodyStarGirl,Oh like using the chai tea bags?,/r/starbucks/comments/129sqka/hot_chai_lattes_...,,,starbucks,t3_129sqka,t1_jeounwc,129sqka
3,comment,2023-04-02T14:48:49,jeowxe5,t1_jeowxe5,1680461329,Lost_Treat_6296,"No, the whole half water and half milk thing",/r/starbucks/comments/129sqka/hot_chai_lattes_...,,,starbucks,t3_129sqka,t1_jeowus2,129sqka
4,comment,2023-04-02T21:59:22,jeqiuw3,t1_jeqiuw3,1680487162,MoodyStarGirl,That's a lot of water :(,/r/starbucks/comments/129sqka/hot_chai_lattes_...,,,starbucks,t3_129sqka,t1_jeowxe5,129sqka


# Preprocess

Add a column of reddit subreddit labels 


In [3]:
subreddits = data['reddit_subreddit'].unique()
label_dict = dict(zip(subreddits, range(len(subreddits))))

# label_dict = {'starbucks': 0,
#             'starbucksbaristas': 1,
#             'UPSers': 2,
#             'Lowes': 3,
#             'DollarTree': 4,
#             'WalmartEmployees': 5,
#             'McLounge': 6,
#             'Fedexers': 7,
#             'TjMaxx': 8,
#             'RiteAid': 9,
#             'walmart': 10,
#             'TalesFromYourBank': 11,
#             'KrakenSupport': 12,
#             'BestBuyWorkers': 13,
#             'Bestbuy': 14,
#             'CVS': 15,
#             'Target': 16,
#             'disney': 17,
#             'Disneyland': 17,
#             'DisneyWorld': 17,
#             'WaltDisneyWorld': 17}

data['reddit_subreddit_label'] = data['reddit_subreddit'].apply(lambda x: label_dict[x])



Add a column of indices.


In [4]:
data.loc[:, 'index'] = data.index

Remove all line breaks in reddit_text.


In [6]:
# for i in range(len(data)):
#      data.loc[i, 'reddit_text'] = data.loc[i, 'reddit_text'].replace('\n', ' ')


In [7]:
data.to_csv('data/reddit.csv')




Make a new dataframe of submissions only.


In [8]:
subs = data.loc[data['aware_post_type']=='submission'].reset_index(drop=True)



Combine submission title and text.


In [9]:
subs.loc[:, 'text_and_title'] = [str(subs['reddit_title'].iloc[i]) + ' ' + str(subs['reddit_text'].iloc[i]) + '#' + str(subs['reddit_subreddit'].iloc[i]) for i in range(len(subs))]




Add a column of comment indices


In [10]:
subs['comment_indices'] = [[] for _ in range(len(subs))]
for i in range(len(subs)):
    sub_index = subs.loc[i, 'index']
    next_sub_index = len(data) if i == len(subs)-1 else subs.loc[i+1,'index']
    for j in range(sub_index+1, next_sub_index):
        if data.loc[j, 'reddit_parent_id'] == subs.loc[i, 'reddit_name']:
            subs['comment_indices'].iloc[i].append(data.loc[j,'index'])




Drop submissons with no comments


In [11]:
subs = subs.drop(subs[subs['comment_indices'].str.len()==0].index)

In [12]:
subs[0:5]

Unnamed: 0,aware_post_type,aware_created_ts,reddit_id,reddit_name,reddit_created_utc,reddit_author,reddit_text,reddit_permalink,reddit_title,reddit_url,reddit_subreddit,reddit_link_id,reddit_parent_id,reddit_submission,reddit_subreddit_label,index,text_and_title,comment_indices
0,submission,2023-04-02T13:58:03,129sqka,t3_129sqka,1680458283,MoodyStarGirl,That's it.,/r/starbucks/comments/129sqka/hot_chai_lattes_...,Hot chai lattes shouldn't have water,https://www.reddit.com/r/starbucks/comments/12...,starbucks,,,,0,0,Hot chai lattes shouldn't have water That's it...,"[1, 5, 12, 13]"
1,submission,2023-04-02T14:07:43,129t0vd,t3_129t0vd,1680458863,kotakehoe,i had the urge to make a cute baby frapp in a ...,/r/starbucks/comments/129t0vd/my_cute_little_f...,my cute little frappucino :),https://i.redd.it/0gd8tk2zyjra1.jpg,starbucks,,,,0,18,my cute little frappucino :) i had the urge to...,"[19, 20]"
2,submission,2023-04-02T14:20:39,129te77,t3_129te77,1680459639,Extension_Cricket_30,Hey guys! I was wondering what benefits people...,/r/starbucks/comments/129te77/partner_benefits...,Partner Benefits at your Starbucks?,https://www.reddit.com/r/starbucks/comments/12...,starbucks,,,,0,26,Partner Benefits at your Starbucks? Hey guys! ...,"[27, 28, 29]"
3,submission,2023-04-02T14:39:11,129txfg,t3_129txfg,1680460751,ateez-lvr,did anyone notice (or is it the same for you a...,/r/starbucks/comments/129txfg/not_really_a_ran...,"not really a rant, just an observation",https://www.reddit.com/r/starbucks/comments/12...,starbucks,,,,0,30,"not really a rant, just an observation did any...","[31, 37]"
4,submission,2023-04-02T14:43:36,129u231,t3_129u231,1680461016,chuchae,"hi friends, so i recently got back from a we...",/r/starbucks/comments/129u231/accidental_no_show/,accidental no show,https://www.reddit.com/r/starbucks/comments/12...,starbucks,,,,0,38,"accidental no show hi friends, so i recently...",[39]


# Embedding

Define the embedding function

Generate the vector embeddings for each submission text and title and store them in the vector database

In [13]:
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

model = SentenceTransformer("all-MiniLM-L6-v2", device)

vectors = model.encode(subs['text_and_title'].values.tolist(),
                       convert_to_numpy=True,
                       normalize_embeddings=True)

subs['vector'] = vectors.tolist()



# Open LanceDB and store embeddings

In [14]:

db = lancedb.connect("~/.lancedb")
table = db.create_table('reddit', subs, mode='overwrite')


# Build RAG pipeline

We search through submission database and pull out comment information from full database

In [15]:
#Query
query_string = "What is chai made of?" 
print("Query: ",query_string)
print()
query = model.encode(query_string,
                       convert_to_numpy=True,
                       normalize_embeddings=True).tolist()

#retrieval
response = table.search(query).limit(1).to_pandas()
response

response_str=""
#build context (submission + top comment for each retrieved post)
for i in range(len(response)):
    sub_index = response.loc[i, 'index']
    comment_indices = response.loc[i, 'comment_indices']
    print('Submission title: ', data.loc[sub_index, 'reddit_title'])
    print('Submission text:', data.loc[sub_index, 'reddit_text'])
    print('Comment:\n', data.loc[comment_indices[0], 'reddit_text'])
    
    response_str = response_str + ' Submission title: ' +  data.loc[sub_index, 'reddit_title'] + ' Submission text: ' + data.loc[sub_index, 'reddit_text'] + ' Comment: ' + data.loc[comment_indices[0], 'reddit_text']
    print()
    
# print(response_str)

Query:  What is chai made of?

Submission title:  Chai latte
Submission text: Newbie here. How is a hot Chai latte made? Thanks!
Comment:
 1. Steam 2% milk 2. Pump chai: 2/3/4/5 for short/tall/grande/venti  3. Fill cup halfway with hot water 4. Fill remaining half with steamed milk  (https://sbuxdates.com is a great site for checking recipes)



# Initialize LLM

I'm using the smallish llama 2 with 7 billion parameters.

This little llama seems okayish at convesation, but a few weeks ago it told me that comets were made of 90% ice and 10% peanut butter.

In [16]:
llm = Llama(model_path="../AwareData/llama/llama-2-7b/llama-2-7b.Q5_K_M.gguf")

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ../AwareData/llama/llama-2-7b/llama-2-7b.Q5_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_

# Gather retrieval for LLM

In [17]:
# Define prompt template
ret_query = "You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use two sentences maximum and keep the answer concise. Question: '" + query_string + "' Context: " + response_str + " Answer: "
print(ret_query)


You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use two sentences maximum and keep the answer concise. Question: 'What is chai made of?' Context:  Submission title: Chai latte Submission text: Newbie here. How is a hot Chai latte made? Thanks! Comment: 1. Steam 2% milk 2. Pump chai: 2/3/4/5 for short/tall/grande/venti  3. Fill cup halfway with hot water 4. Fill remaining half with steamed milk  (https://sbuxdates.com is a great site for checking recipes) Answer: 


In [18]:
output = llm(
      "Q: " + ret_query + " A: ", # Prompt
      max_tokens=None, # Generate up to 32 tokens, set to None to generate up to the end of the context window
      stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
      echo=False # Echo the prompt back in the output
) # Generate a completion, can also call create_completion
print(output['choices'])


llama_print_timings:        load time =    2196.75 ms
llama_print_timings:      sample time =       6.50 ms /    76 runs   (    0.09 ms per token, 11692.31 tokens per second)
llama_print_timings: prompt eval time =    2196.63 ms /   180 tokens (   12.20 ms per token,    81.94 tokens per second)
llama_print_timings:        eval time =    3314.73 ms /    75 runs   (   44.20 ms per token,    22.63 tokens per second)
llama_print_timings:       total time =    5602.64 ms /   255 tokens


[{'text': '1. Steam 2% milk 2. Pump chai: 2/3/4/5 for short/tall/grande/venti  3. Fill cup halfway with hot water 4. Fill remaining half with steamed milk  (https://sbuxdates.com is a great site for checking recipes)', 'index': 0, 'logprobs': None, 'finish_reason': 'stop'}]


# Build Simple Query Function

In [24]:
def llamaRAGQuery(query_str, sub_data, full_data):
    
    #vector database
    db = lancedb.connect("~/.lancedb")
    table = db.create_table('reddit_sb', sub_data, mode='overwrite')
    
    #query
    query = model.encode(query_str,
                       convert_to_numpy=True,
                       normalize_embeddings=True).tolist()
    #retrieval
    response = table.search(query).limit(4).to_pandas()
    
    #build context (submission + top comment for each retrieved post)
    response_str = ""
    for i in range(len(response)):
        sub_index = response.loc[i, 'index']
        comment_indices = response.loc[i, 'comment_indices']
        response_str = response_str + ' Submission title: ' +  full_data.loc[sub_index, 'reddit_title'] + ' Submission text: ' + full_data.loc[sub_index, 'reddit_text'] + ' Comment: ' + full_data.loc[comment_indices[0], 'reddit_text']
    
    
    #feed query and context to llm
    ret_query = "You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use two sentences maximum and keep the answer concise. Question: '" + query_str + "' Context: " + response_str + " Answer: "
    print(ret_query)
    print("----------------------------------------------------")
    
    output = llm(
          "Q: " + ret_query[0:480] + " A: ", # Prompt
          max_tokens=None, # Generate up to 32 tokens, set to None to generate up to the end of the context window
          stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
          echo=False # Echo the prompt back in the output
    ) # Generate a completion, can also call create_completion
    print(output['choices'])
    

In [26]:
llamaRAGQuery("Do starbucks employees get paid time off?", subs, data)


You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use two sentences maximum and keep the answer concise. Question: 'Do starbucks employees get paid time off?' Context:  Submission title: Sick hours Submission text: If I quit my job at Starbucks do they pay out our sick time? I am finally putting in my two week notice and I’m excited for something else. Comment: Only vacation Submission title: do vacation hours count toward benefit hours? Submission text:  Comment: Yes. Submission title: I know it varies by location/management…but does Starbucks typically honor scheduling restrictions? Submission text: I currently work a M-Th/T-F rotation and was thinking of applying to Starbucks for my Mondays, Fridays, Saturdays and Sundays off. If I can absolutely only work those days, will they typically stick to the hours you’re available, or do they try to throw you in

Llama.generate: prefix-match hit

llama_print_timings:        load time =    2196.75 ms
llama_print_timings:      sample time =      25.20 ms /   393 runs   (    0.06 ms per token, 15594.00 tokens per second)
llama_print_timings: prompt eval time =    1293.35 ms /    61 tokens (   21.20 ms per token,    47.16 tokens per second)
llama_print_timings:        eval time =   17471.35 ms /   392 runs   (   44.57 ms per token,    22.44 tokens per second)
llama_print_timings:       total time =   19256.00 ms /   453 tokens


[{'text': " It's not clear what your job at Starbucks was in terms of hours per week or how often you worked. I worked as a barista in a Starbucks  A:  I worked at a Starbucks in Texas. I only worked there for 3 months but I did receive my sick days pay out at the end of  A:  I worked at Starbucks in North Carolina in 2015. I was a barista and I was paid a salary.  A:  Yes I worked at Starbucks for 5 months. I quit my job about 2 months ago. I do not remember if I was paid out my sick time or not but I did receive my last 2 pay checks  A:  I worked at Starbucks in Texas for a little under a year. I quit in August 2014. I was paid my sick time out when I quit  A:  I worked at a Starbucks in Georgia. I quit in January 2015. I did receive my sick time pay out at the end of my last month  A:  I worked at Starbucks from January 2015 - October 2015. I am not sure if I got paid out my sick time but I was paid out my vacation time at the end of 2015  A:  I worked at Starbuck from October 2015 