https://cseweb.ucsd.edu/~jmcauley/datasets.html#amazon_reviews

https://amazon-reviews-2023.github.io/

# Citation
Bridging Language and Items for Retrieval and Recommendation
Yupeng Hou, Jiacheng Li, Zhankui He, An Yan, Xiusi Chen, Julian McAuley
arXiv
[pdf](https://arxiv.org/pdf/2403.03952.pdf)

https://scikit-learn.org/0.15/modules/scaling_strategies.html

In [None]:
!pip install pymongo
!pip install pandas
!pip install bs4
!pip install spacy
!python -m spacy download en_core_web_sm

In [None]:
from MongoDB import MongoDB
import pandas as pd
from multiprocessing import Process


def load_amazon_data(mongo_db:MongoDB, amazon_reviews_db:MongoDB, category:str, url:str):
    """Download one category file and store it in MongoDB in the background.

    Nothing happens when ``category`` is already a collection of ``mongo_db``.
    Otherwise the gzip-compressed JSON-lines file at ``url`` is read into a
    DataFrame and handed to ``mongo_db.append`` in a separate process, which
    is started but not joined.

    Args:
        mongo_db: Database wrapper that receives the data.
        amazon_reviews_db: The reviews database wrapper. When ``mongo_db`` is
            this database, ``{"timeField": "timestamp"}`` is passed as the
            time-series option; otherwise no option is passed.
        category: Name of the collection to fill.
        url: Location of the ``.jsonl.gz`` file, passed to ``pandas.read_json``.
    """
    if category in mongo_db:
        return

    print(category, mongo_db)
    df = pd.read_json(path_or_buf=url, compression="gzip", lines=True)
    timeseries = {"timeField": "timestamp"} if mongo_db == amazon_reviews_db else {}
    # Pass the arguments by name: ``MongoDB.append`` takes the DataFrame first
    # and the collection name second, so the positional order used before
    # (category, df, ...) swapped the two.
    Process(
        target=mongo_db.append,
        kwargs={
            "df": df,
            "collection_name": category,
            "timeseries": timeseries,
        },
    ).start()

In [None]:
import requests
from bs4 import BeautifulSoup
import re
from MongoDB import MongoDB


# Handles for the two target databases.
amazon_reviews_db = MongoDB("Amazon_Reviews")
amazon_items_db = MongoDB("Amazon_Items")
# Fetch the dataset landing page and collect every <a> tag whose href matches
# the raw-data URL pattern below.
response = requests.get("https://amazon-reviews-2023.github.io/")
soup  = BeautifulSoup(response.text, "html.parser")
file_links = soup.find_all(
    name = "a",
    attrs = {
        "href" : re.compile(r'.*https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/.*')
    }
)
for file_link in file_links:
    # File name without ".jsonl.gz", partitioned on "meta_":
    #   "X"      -> ("X", "", "")
    #   "meta_X" -> ("", "meta_", "X")
    categories = file_link.attrs["href"].split("/")[-1].split(".jsonl.gz")[0].partition("meta_")
    if categories[0] == "Video_Games":# or categories[-1] == "Video_Games":
        print(categories)
        # An empty last element means the name had no "meta_" prefix: such
        # files are routed to the reviews database, all others to the items
        # database, and the category is the part of the name without "meta_".
        mongo_db = amazon_reviews_db if categories[-1] == "" else amazon_items_db
        category = categories[0] if categories[-1] == "" else categories[-1]
        # NOTE(review): mongo_db and category are computed but not used in this
        # cell — presumably a load_amazon_data(...) call belongs here; confirm.

In [None]:
# Number of scraped links whose URL does not contain "meta".
sum(
    1
    for link in file_links
    if "meta" not in link.attrs["href"]
)

In [None]:
import os

ROOT = "data/"


# Read every entry of ROOT as a Feather file and stack the frames into one.
df = pd.concat(
    pd.read_feather(ROOT + entry_name)
    for entry_name in os.listdir(ROOT)
)

In [None]:
import random
from spacy.lang.en.stop_words import STOP_WORDS as stopwords_list

import string

import nltk

# This cell originally mixed two frames: it wrote df["text_tokenized"] but then
# read reviews[...], and it relied on `nltk`, `exclude_lists` and `vocabulary`
# that no earlier cell provides. It now works on `df` throughout and sets up
# what it needs itself.
exclude_lists = list(stopwords_list) + list(string.punctuation)
excluded_tokens = set(exclude_lists)  # set => O(1) membership test per token
vocabulary = set()

# Tokenize each review and drop stop words / punctuation; the kept tokens are
# stored re-joined with single spaces.
df["text_tokenized"] = df["text"].apply(
    lambda text: " ".join(
        token
        for token in nltk.word_tokenize(text)
        if token not in excluded_tokens
    )
)
for text in df["text_tokenized"].values:
    vocabulary.update(text.split(" "))

# 1 = rating above 3, 0 = rating below 3, -1 = rating of exactly 3.
df["sentiment"] = df["rating"].apply(
    lambda rating: 1 if rating > 3 else 0 if rating < 3 else -1
)

In [None]:
# Handle for the reviews database. (The previous chained form
# `x = MongoDB(...) = MongoDB(...)` is a SyntaxError: a call cannot be an
# assignment target.)
amazon_reviews_db = MongoDB("Amazon_Reviews")

In [None]:
# Total number of rows in the "Video_Games" collection, summed over the batches
# yielded by `query`. The batch is deliberately not bound to the name `df`, so
# the notebook-level `df` DataFrame is not overwritten by running this cell.
count = sum(
    len(batch)
    for batch in amazon_reviews_db.query("Video_Games", {})
)

In [None]:
# Display the current value of `count`.
count

In [None]:
import multiprocessing

# Display the CPU count reported by the multiprocessing module.
multiprocessing.cpu_count()

In [None]:
!pip install wordcloud

In [None]:
import pandas as pd
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import spacy
import string
from spacy.lang.en.stop_words import STOP_WORDS as stopwords_list
import os


nlp = spacy.load('en_core_web_sm')
# Tokens to ignore: spaCy's English stop words followed by ASCII punctuation.
exclude_lists = [*stopwords_list, *string.punctuation]
ROOT = "data/"
# Stack every file found in ROOT (each read as Feather) into one frame.
df = pd.concat(
    pd.read_feather(ROOT + entry_name)
    for entry_name in os.listdir(ROOT)
)
# 1 = rating above 3, 0 = rating below 3, -1 = rating of exactly 3.
df["sentiment"] = df["rating"].apply(
    lambda rating: 1 if rating > 3 else 0 if rating < 3 else -1
)
# Drop the -1 rows, keep the first 100,000 remaining rows, renumber from 0.
df = (
    df.query("sentiment != -1")
    .iloc[:100_000]
    .reset_index(drop=True)
)

In [None]:
# Reload the pipeline without the components this notebook does not need.
# The tagger stays enabled: in spaCy v3 the rule-based English lemmatizer
# relies on part-of-speech tags (tagger + attribute_ruler), so disabling
# 'tagger' leaves `token.lemma_` effectively un-lemmatized (spaCy warns W108).
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
stops = spacy.lang.en.stop_words.STOP_WORDS
from tqdm import tqdm


def normalize(col, lowercase, remove_stopwords):
    """Return the lemmas of one row's text as a single space-joined string.

    Args:
        col: A row-like object; its ``name`` is printed and its ``"text"``
            entry is processed.
        lowercase: Lower-case the text before running the spaCy pipeline.
        remove_stopwords: Drop lemmas found in the module-level ``stops``.

    Returns:
        The non-empty, whitespace-stripped lemmas joined by single spaces.
    """
    print(col.name)
    raw_text = col["text"].lower() if lowercase else col["text"]
    # Strip each lemma once, then filter out blanks and (optionally) stop words.
    stripped_lemmas = (token.lemma_.strip() for token in nlp(raw_text))
    kept = [
        lemma
        for lemma in stripped_lemmas
        if lemma and not (remove_stopwords and lemma in stops)
    ]
    return " ".join(kept)


# Run `normalize` on every row (axis=1) with lower-casing and stop-word removal
# enabled, and store the result in a new 'cleaned_text' column.
df['cleaned_text'] = df.apply(normalize, lowercase=True, remove_stopwords=True, axis = 1)

In [None]:
from sklearn.model_selection import train_test_split


# Hold out 30% of the rows for testing, with a fixed seed and stratification
# on the label. The features are the "text" column of `df`.
X_train, X_test, y_train, y_test = train_test_split(
     df["text"], 
     df["sentiment"], 
     test_size = 0.3, 
     random_state = 34,
     stratify = df["sentiment"]
)

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer


# Bag-of-words vectorizer that ignores the tokens listed in `exclude_lists`.
count_vect = CountVectorizer(
    stop_words = exclude_lists
)
# Keep the document-term matrices sparse: BernoulliNB accepts sparse input and
# produces the same model, whereas `.toarray()` materializes a dense
# (documents x vocabulary) array that can exhaust memory on a large corpus.
clf = BernoulliNB().fit(count_vect.fit_transform(X_train.values), y_train)
clf.score(count_vect.transform(X_test.values), y_test)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt


import numpy as np

# One row of per-feature log-probabilities per class of the fitted classifier,
# and the feature (token) names in matching column order.
log_probabilities = clf.feature_log_prob_
feature_names = count_vect.get_feature_names_out()

for sentiment, color in zip([0, 1], ["Reds", "Greens"]):
    # WordCloud treats the values as weights where larger means more
    # prominent. Log-probabilities are all <= 0, so they are converted back
    # to probabilities before being used as frequencies.
    probabilities = np.exp(log_probabilities[sentiment, :])
    frequencies = dict(zip(feature_names, probabilities.tolist()))

    # Only the 100 highest-weighted words are drawn (max_words).
    wordcloud = WordCloud(
        width = 800,
        height = 400,
        background_color = "white",
        max_words = 100,
        colormap = color
    ).generate_from_frequencies(frequencies)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off") # Turn off the axis labels and ticks
    plt.show()
    

In [None]:
from typing import Generator
import pandas as pd
import pymongo


class MongoDB:
    """Thin convenience wrapper around one MongoDB database.

    Every method opens its own ``pymongo.MongoClient`` for the duration of the
    call. Rows are passed in and returned as pandas DataFrames.
    """

    def __init__(self, database_name:str, url:str = "mongodb://localhost:27017/"):
        """Remember the database name and server URL; no connection is opened."""
        self.__database_name = database_name
        self.__url = url

    def append(self, df:pd.DataFrame, collection_name:str, timeseries:dict = None) -> None:
        """Insert the rows of ``df`` into ``collection_name``.

        The collection is created first when it does not exist yet. A
        non-empty ``timeseries`` dict is forwarded as the ``timeseries`` option
        of ``create_collection``. An empty ``df`` inserts nothing, because
        ``insert_many`` rejects an empty document list.
        """
        create_options = {"timeseries": timeseries} if timeseries else {}
        with pymongo.MongoClient(self.__url) as client:
            db = client[self.__database_name]
            if collection_name not in db.list_collection_names():
                db.create_collection(collection_name, **create_options)

            records = df.to_dict(orient="records")
            if records:
                db[collection_name].insert_many(records)

    def count(self, collection_name:str, query:dict = None) -> int:
        """Return the number of documents in ``collection_name`` matching ``query``.

        ``query=None`` (the default) counts every document.
        """
        with pymongo.MongoClient(self.__url) as client:
            db = client[self.__database_name]
            return db[collection_name].count_documents(query or {})

    def list_collection_names(self):
        """Return the names of all collections in the database."""
        with pymongo.MongoClient(self.__url) as client:
            db = client[self.__database_name]
            return db.list_collection_names()

    def query(self, collection_name:str, query:dict = None) -> Generator[pd.DataFrame, None, None]:
        """Yield the documents matching ``query`` as a sequence of DataFrames.

        A new DataFrame is started each time the cursor's ``retrieved`` counter
        changes. ``query=None`` (the default) matches every document. When
        nothing matches, a single empty DataFrame is yielded.
        """
        with pymongo.MongoClient(self.__url) as client:
            db = client[self.__database_name]
            cursor = db[collection_name].find(filter=query or {})
            yield from self._frames_per_batch(cursor)

    @staticmethod
    def _frames_per_batch(cursor) -> Generator[pd.DataFrame, None, None]:
        """Group the rows of ``cursor`` by changes of ``cursor.retrieved``."""
        rows = []
        previous_retrieved = 0
        for row in cursor:
            if cursor.retrieved != previous_retrieved:
                previous_retrieved = cursor.retrieved
                # The counter also changes on the very first row, when nothing
                # has been collected yet; without this guard the first frame
                # yielded would be empty and have no columns.
                if rows:
                    yield pd.DataFrame(rows)
                    rows = []

            rows.append(row)

        # Remaining rows of the last group (or one empty frame for no results).
        yield pd.DataFrame(rows)

    def aggregate(self, collection_name:str, pipeline:list) -> pd.DataFrame:
        """Run an aggregation ``pipeline`` and return all result rows as one DataFrame."""
        with pymongo.MongoClient(self.__url) as client:
            db = client[self.__database_name]
            return pd.DataFrame(list(db[collection_name].aggregate(pipeline)))

    def __contains__(self, collection_name):
        """Support ``name in db``: True when the collection exists."""
        with pymongo.MongoClient(self.__url) as client:
            db = client[self.__database_name]
            return collection_name in db.list_collection_names()

    def __repr__(self):
        """Return the database name."""
        return self.__database_name

In [None]:
from MongoDB import MongoDB
import pandas as pd


# Handles for the two databases, "Amazon_Reviews" and "Amazon_Items".
amazon_reviews_db = MongoDB("Amazon_Reviews")
amazon_items_db = MongoDB("Amazon_Items")

In [None]:
# for collection_name in amazon_reviews_db.list_collection_names():
#     print(collection_name, amazon_reviews_db.count(collection_name))

In [None]:
# Hard-coded listing, one "<name> <number>" entry per line — presumably the
# saved output of a per-collection count over the reviews database (confirm).
# NOTE(review): the last entry reads "system.buckets.Digitcal_Musi", which looks
# like a paste typo for "system.buckets.Digital_Music".
# The text is split into lines and the first and last pieces (the empty strings
# around the leading and trailing newline) are dropped.
results = """
Pet_Supplies 360000
system.buckets.Pet_Supplies 201892
Cell_Phones_and_Accessories 260000
system.buckets.Cell_Phones_and_Accessories 159883
Electronics 360000
system.buckets.Electronics 206652
CDs_and_Vinyl 400000
system.buckets.CDs_and_Vinyl 251001
Baby_Products 320000
system.buckets.Baby_Products 189706
Amazon_Fashion 2500939
system.buckets.Amazon_Fashion 884152
Toys_and_Games 360000
system.buckets.Toys_and_Games 198077
Magazine_Subscriptions 71497
system.buckets.Magazine_Subscriptions 58141
Grocery_and_Gourmet_Food 360000
system.buckets.Grocery_and_Gourmet_Food 199580
Industrial_and_Scientific 290000
system.buckets.Industrial_and_Scientific 173895
Movies_and_TV 410000
system.buckets.Movies_and_TV 222404
Patio_Lawn_and_Garden 250000
system.buckets.Patio_Lawn_and_Garden 150257
Software 370000
system.buckets.Software 212219
Gift_Cards 152410
system.buckets.Gift_Cards 101028
Books 270000
system.buckets.Books 170023
Clothing_Shoes_and_Jewelry 350000
system.buckets.Clothing_Shoes_and_Jewelry 187206
Health_and_Personal_Care 350000
system.buckets.Health_and_Personal_Care 228253
Musical_Instruments 310000
system.buckets.Musical_Instruments 188044
Kindle_Store 280000
system.buckets.Kindle_Store 168478
Handmade_Products 370000
system.buckets.Handmade_Products 220170
system.views 34
Home_and_Kitchen 340000
system.buckets.Home_and_Kitchen 185938
Automotive 290000
system.buckets.Automotive 169511
Subscription_Boxes 16216
system.buckets.Subscription_Boxes 13346
Health_and_Household 260000
system.buckets.Health_and_Household 149124
Tools_and_Home_Improvement 270000
system.buckets.Tools_and_Home_Improvement 159367
Sports_and_Outdoors 280000
system.buckets.Sports_and_Outdoors 166159
Office_Products 270000
system.buckets.Office_Products 161496
Video_Games 290000
system.buckets.Video_Games 184937
Beauty_and_Personal_Care 270000
system.buckets.Beauty_and_Personal_Care 152860
Unknown 330000
system.buckets.Unknown 190146
Arts_Crafts_and_Sewing 370000
system.buckets.Arts_Crafts_and_Sewing 202562
Appliances 340000
system.buckets.Appliances 210106
All_Beauty 701528
system.buckets.All_Beauty 278351
Digital_Music 130434
system.buckets.Digitcal_Musi 99288
""".split("\n")[1:-1]

In [None]:
# Two-column frame (name, number as float) built from the entries of
# `results`, skipping every entry that starts with "system".
cols = pd.DataFrame(
    [
        [fields[0], float(fields[1])]
        for fields in (entry.split(" ") for entry in results)
        if not fields[0].startswith("system")
    ]
)

In [None]:
# Earliest "timestamp" over the whole "Magazine_Subscriptions" collection
# (`_id: None` puts every document into a single group).
# The result field was labelled "Max Date" although it is computed with `$min`;
# the label now matches the operator. If the latest date was intended instead,
# switch the operator to `$max` and rename the field accordingly.
amazon_reviews_db.aggregate(
    collection_name = "Magazine_Subscriptions",
    pipeline = [
        {
            "$group" : {
                "_id": None,
                "Min Date": {
                    "$min": "$timestamp"
                }
            }
        }
    ]
)

In [None]:
# Display the first value of column 0 of `cols`.
cols[0].iloc[0]

https://medium.com/@aleksej.gudkov/llama-cpp-python-examples-a-guide-to-using-llama-models-with-python-1df9ba7a5fcd

In [None]:
import random
from spacy.lang.en.stop_words import STOP_WORDS as stopwords_list

def text_filter(a_dict:pd.Series, label:int, exclude_lists:list) -> list:
    """Turn tokenized documents into ``(text, label)`` pairs.

    Args:
        a_dict: Mapping-like object (supports ``keys()`` and ``get()``) from a
            document id to an iterable of tokens exposing a ``text`` attribute.
        label: Value attached to every produced pair.
        exclude_lists: Token texts to drop.

    Returns:
        One ``(joined_text, label)`` tuple per key of ``a_dict``, in key
        order; the kept token texts are joined with single spaces.
    """
    return [
        (
            " ".join(
                token.text
                for token in a_dict.get(rev_id)
                if token.text not in exclude_lists
            ),
            label,
        )
        for rev_id in a_dict.keys()
    ]

def prepare_data(pos_docs, neg_docs, exclude_lists):
    """Build shuffled, parallel lists of texts and labels.

    Documents in ``pos_docs`` are labelled 1 and those in ``neg_docs`` -1
    (both filtered through ``text_filter``). The combined list is shuffled
    after seeding the global ``random`` generator with 42.

    Returns:
        A ``(texts, labels)`` tuple of two lists of equal length.
    """
    labelled = (
        text_filter(pos_docs, 1, exclude_lists)
        + text_filter(neg_docs, -1, exclude_lists)
    )
    random.seed(42)
    random.shuffle(labelled)
    texts = [pair[0] for pair in labelled]
    labels = [pair[1] for pair in labelled]
    return texts, labels


In [None]:
import nltk
import string
from spacy.lang.en.stop_words import STOP_WORDS as stopwords_list


class TextPreprocessor:
    """Tokenize text with NLTK and drop tokens from an exclusion list."""

    def __init__(self, exclude_lists:list = None):
        """Store the tokens to exclude.

        Args:
            exclude_lists: Tokens to drop. ``None`` (the default) selects
                spaCy's English stop words plus ASCII punctuation. An empty
                list is honoured and disables filtering.
        """
        # Compare against None rather than testing truthiness, so that an
        # explicitly passed empty list is not silently replaced by the default.
        if exclude_lists is None:
            exclude_lists = list(stopwords_list) + list(string.punctuation)

        self.exclude_lists = exclude_lists

    def filter(self, text:str) -> str:
        """Return ``text`` re-joined from its NLTK tokens that are not excluded."""
        return " ".join(
            token
            for token in nltk.word_tokenize(text)
            if token not in self.exclude_lists
        )

In [None]:
from MongoDB import MongoDB
import pandas as pd
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import string
from spacy.lang.en.stop_words import STOP_WORDS as stopwords_list


vocabulary = set()
collection_name = "Amazon_Fashion"
amazon_reviews_db = MongoDB("Amazon_Reviews")
exclude_lists = list(stopwords_list) + list(string.punctuation)
excluded_tokens = set(exclude_lists)  # set => O(1) membership test per token
all_reviews = []
# Process the reviews dated before 2015-01-01 batch by batch.
for reviews in amazon_reviews_db.query(collection_name, query = {"timestamp" : {"$lt" : datetime(2015,1,1)}}):
    if reviews.empty:
        # A batch yielded by `query` may be empty; such a frame has no "text"
        # or "rating" column, so indexing it below would raise KeyError.
        continue

    # Tokenize, drop stop words / punctuation, re-join with single spaces.
    reviews["text_tokenized"] = reviews["text"].apply(
        lambda text: " ".join(
            token
            for token in nltk.word_tokenize(text)
            if token not in excluded_tokens
        )
    )
    for text in reviews["text_tokenized"].values:
        vocabulary.update(text.split(" "))

    # 1 = rating above 3, 0 = rating below 3, -1 = rating of exactly 3.
    reviews["sentiment"] = reviews["rating"].apply(
        lambda rating: 1 if rating > 3 else 0 if rating < 3 else -1
    )
    all_reviews.append(reviews)

# Combine the processed batches. The earlier `all_reviews_df = df` that
# followed this line discarded the frame just built and was removed.
if all_reviews:
    all_reviews_df = pd.concat(all_reviews, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty frame that
    # still has the columns added above.
    all_reviews_df = pd.DataFrame(columns=["text", "rating", "text_tokenized", "sentiment"])

In [None]:
# Vectorizer constructed with the hand-built `vocabulary`; `transform` is
# called directly, without a preceding `fit`.
count_vect_1 = CountVectorizer(vocabulary=vocabulary)
count_vect_1.transform(all_reviews_df["text_tokenized"])

In [None]:
# Vectorizer with default settings, fitted on the tokenized text itself.
count_vect_2 = CountVectorizer()
count_vect_2.fit_transform(all_reviews_df["text_tokenized"])

In [None]:
# Transform the tokenized text with the fixed-vocabulary vectorizer.
count_vect_1.transform(all_reviews_df["text_tokenized"])

In [None]:
from sklearn.model_selection import train_test_split


# Hold out 30% of the rows for testing with a fixed seed. The features are the
# "text_tokenized" column.
# NOTE(review): no `stratify=` is passed here — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
     all_reviews_df["text_tokenized"], 
     all_reviews_df["sentiment"], 
     test_size = 0.3, 
     random_state = 34
)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer


# Fit a bag-of-words vectorizer and a multinomial naive Bayes classifier on the
# training split, then score the classifier on the test split.
count_vect = CountVectorizer()
clf = MultinomialNB().fit(count_vect.fit_transform(X_train.values), y_train) 
clf.score(count_vect.transform(X_test.values), y_test)

In [None]:
from MongoDB import MongoDB
import pandas as pd
import en_core_web_sm

from itertools import islice

collection_name = "Amazon_Fashion"
nlp = en_core_web_sm.load()
amazon_reviews_db = MongoDB("Amazon_Reviews")
total_reviews = amazon_reviews_db.count(collection_name)

# Trial run on the first four non-empty batches only. `islice` replaces the
# manual counter + `break`; empty batches are skipped because such a frame has
# no "rating" / "text" column and indexing it would raise KeyError.
non_empty_batches = (
    batch
    for batch in amazon_reviews_db.query(collection_name)
    if not batch.empty
)
for reviews in islice(non_empty_batches, 4):
    print(len(reviews))
    # 1 = rating above 3, 0 = rating below 3, -1 = rating of exactly 3.
    reviews["sentiment"] = reviews["rating"].apply(
        lambda rating: 1 if rating > 3 else 0 if rating < 3 else -1
    )
    # Replace each raw text by its spaCy Doc, with the "ner" component disabled.
    reviews["text"] = reviews["text"].apply(
        lambda text: nlp(
            text,
            disable = ["ner"]
        )
    )

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit a default bag-of-words vectorizer on `train_data` and keep the counts.
# NOTE(review): `train_data` is not defined anywhere in the visible notebook —
# confirm where it comes from before running this cell.
count_vect = CountVectorizer()
train_counts = count_vect.fit_transform(train_data) 

In [None]:
import string


# The characters of string.punctuation, one list element per character.
punctuation_list = list(string.punctuation)

In [None]:
# Build the shuffled texts/labels with only punctuation excluded, then print
# the sizes of both lists and the first text.
# NOTE(review): `pos_docs` and `neg_docs` are not defined anywhere in the
# visible notebook — confirm where they come from before running this cell.
texts, labels = prepare_data(pos_docs, neg_docs, punctuation_list)
print(len(texts), len(labels))
print(texts[0]) 

In [None]:
from MongoDB import MongoDB
import pandas as pd


amazon_reviews_db = MongoDB("Amazon_Reviews")
amazon_items_db = MongoDB("Amazon_Items")


# Load the whole "All_Beauty" collection of the reviews database into one frame.
reviews = pd.concat(objs = list(amazon_reviews_db.query("All_Beauty")))
# 1 = rating above 3, 0 = rating below 3, -1 = rating of exactly 3.
reviews["sentiment"] = reviews["rating"].apply(
    lambda rating: 1 if rating > 3 else 0 if rating < 3 else -1
)
# Same for the "All_Beauty" collection of the items database.
items = pd.concat(objs = list(amazon_items_db.query("All_Beauty")))
# Left join: every review row is kept, with the columns of the item(s) sharing
# its "parent_asin" attached where a match exists.
reviews.merge(
    right = items,
    on = "parent_asin",
    how = "left"
)