# Amazon Reviews 2023 Dataset
 > Download from [here](https://amazon-reviews-2023.github.io/)

In [12]:
import os
import pandas as pd

# Load dataset
To keep things simple, we'll just download reviews for next-gen consoles

In [13]:
data_dir = "../data/raw"

meta = pd.read_json(os.path.join(data_dir, "meta_Video_Games.jsonl"), lines=True).query(
    "main_category == 'Video Games' & title in ['Xbox Series S', 'Xbox Series X', 'PlayStation 5 Console']"
)
meta.head()

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
6426,Video Games,Xbox Series S,4.8,27684,[Go all digital with Xbox Series S and experie...,"[Introducing the Xbox Series S, the smallest, ...",279.0,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Xbox Series S Holiday Console Watc...,Xbox,[],"{'Release date': 'November 10, 2020', 'Pricing...",B0C37RBK2R,,,
10506,Video Games,Xbox Series X,4.8,21872,"[XBOX SERIES X: The fastest, most powerful Xbo...","[Xbox Series X, the fastest, most powerful Xbo...",499.99,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'What you NEED to know about the Xb...,Xbox,"[Video Games, Legacy Systems, Xbox Systems, Xb...","{'Release date': 'November 10, 2020', 'Best Se...",B0C1K1R6HK,,,
14128,Video Games,PlayStation 5 Console,4.9,27776,[Lightning Speed - Harness the power of a cust...,[The PS5 console unleashes new gaming possibil...,565.87,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Worth it if you can get one', 'url...",PlayStation,"[Video Games, PlayStation 5, Consoles]","{'Release date': 'November 12, 2020', 'Pricing...",B09B6DL81D,,,
20254,Video Games,PlayStation 5 Console,4.9,27774,[Lightning Speed - Harness the power of a cust...,[The PS5 console unleashes new gaming possibil...,448.19,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Worth it if you can get one', 'url...",PlayStation,"[Video Games, PlayStation 5, Consoles]","{'Release date': 'November 12, 2020', 'Pricing...",B08FC5L3RG,,,
99678,Video Games,PlayStation 5 Console,4.9,27748,[Lightning Speed - Harness the power of a cust...,[The PS5 console unleashes new gaming possibil...,522.2,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Please ps5', 'url': 'https://www.a...",PlayStation,"[Video Games, PlayStation 5, Consoles]","{'Release date': 'November 12, 2020', 'Pricing...",B08NWJZK77,,,


In [14]:
product_list = meta.parent_asin.tolist()

In [22]:
df = pd.read_json(os.path.join(data_dir, "Video_Games.jsonl"), lines=True).query(
    "parent_asin in @product_list"
)

# add product information
df = df.merge(meta[["parent_asin", "title"]], on="parent_asin", how="left")
df = df.rename(columns={"title_x": "title", "title_y": "product"})

# convert timestamp to date string
df["date"] = df["timestamp"].dt.strftime("%Y-%m-%d")

print(len(df))
df.head()

6196


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,product,date
0,5,Recommended,Great product. Would recommend. As described. ...,[],B08FC5L3RG,B08FC5L3RG,AG4B7E7QN3XUKWQNFKBECEFDKH5A,2021-05-05 23:15:45.109,0,True,PlayStation 5 Console,2021-05-05
1,3,Bugging Out,I had to return due to technical issues. Bugs ...,[],B08FC5L3RG,B08FC5L3RG,AHK66GBPGOUIHTO4QS5HXLL6FJRQ,2021-02-02 20:41:06.824,3,True,PlayStation 5 Console,2021-02-02
2,5,Great,Fantastic graphics and game play,[],B08FC5L3RG,B08FC5L3RG,AH2M45HGNMZUZOPEDAKCX26YYZIQ,2020-12-30 21:02:15.111,0,True,PlayStation 5 Console,2020-12-30
3,5,Fun,This was a great buy for my 6 year old. He lov...,[],B08G9J44ZN,B0C37RBK2R,AHLTW7IFMGYRDK522YW3AOO5DWIQ,2022-12-25 22:54:05.200,0,True,Xbox Series S,2022-12-25
4,5,Xbox Series S,"Logan, my son, got this to give to my daughter...",[],B08G9J44ZN,B0C37RBK2R,AHX2ASVMXSHSQDBYGTZFDMCGVCQA,2021-02-20 04:13:41.223,0,True,Xbox Series S,2021-02-20


In [23]:
df = df[["date", "title", "text", "rating", "helpful_vote", "product"]]
df.head()

Unnamed: 0,date,title,text,rating,helpful_vote,product
0,2021-05-05,Recommended,Great product. Would recommend. As described. ...,5,0,PlayStation 5 Console
1,2021-02-02,Bugging Out,I had to return due to technical issues. Bugs ...,3,3,PlayStation 5 Console
2,2020-12-30,Great,Fantastic graphics and game play,5,0,PlayStation 5 Console
3,2022-12-25,Fun,This was a great buy for my 6 year old. He lov...,5,0,Xbox Series S
4,2021-02-20,Xbox Series S,"Logan, my son, got this to give to my daughter...",5,0,Xbox Series S


# Create Chroma DB index

In [24]:
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import DataFrameLoader
from langchain_chroma import Chroma
from tqdm.auto import tqdm

In [25]:
load_dotenv()

True

In [26]:
dimensions = 1024

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

vector_store = Chroma(
    collection_name="amazon_reviews",
    embedding_function=embeddings,
    persist_directory="../data/chroma",
)

In [27]:
loader = DataFrameLoader(df, page_content_column="text")
docs = loader.load()

# add documents in batches
batch_size = 1000
for i in tqdm(range(0, len(docs), batch_size)):
    vector_store.add_documents(documents=docs[i : i + batch_size])

  0%|          | 0/7 [00:00<?, ?it/s]

In [36]:
def get_results_df(results):
    data = []
    for doc, score in results:
        row = doc.metadata
        row["page_content"] = doc.page_content
        row["score"] = score
        data.append(row)

    # Create dataframe
    df = pd.DataFrame(data)

    return df

In [37]:
results = vector_store.similarity_search_with_score("I like the controller", k=5)

get_results_df(results)

Unnamed: 0,date,helpful_vote,product,rating,title,page_content,score
0,2021-02-25,0,PlayStation 5 Console,5,Like everything except 1 thing,I like everything about it. Except the control...,0.45935
1,2022-09-23,0,Xbox Series S,5,Great product,The way the controller feels.,0.69067
2,2021-06-25,0,Xbox Series S,5,controller,good but the controller was used,0.709363
3,2021-04-30,0,PlayStation 5 Console,5,Best console,"Its great so far, they nailed it with the new ...",0.72913
4,2021-02-16,1,PlayStation 5 Console,5,Best Gaming System,Finally got it!!!! This thing is awesome. The ...,0.749155
