# Read reviews and metadata into dataframe

In [1]:
import pandas as pd

# Number of JSONL records parsed per chunk; adjust based on your system's memory capacity
chunk_size = 10000


def load_jsonl(path, chunk_size):
    """Read a JSON Lines file chunk by chunk and return it as a single DataFrame.

    Parameters
    ----------
    path : str
        Path to the ``.jsonl`` file (one JSON record per line).
    chunk_size : int
        Number of lines parsed per chunk.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated, with a fresh 0..n-1 index.
    """
    # Using the reader as a context manager releases the file handle even if
    # parsing one of the chunks fails part-way through.
    with pd.read_json(path, lines=True, chunksize=chunk_size) as reader:
        return pd.concat(list(reader), ignore_index=True)


# Reviews
df = load_jsonl('Pet_Supplies.jsonl', chunk_size)
print("Reviews Dataset")
print(df.head())

# Product metadata
meta_df = load_jsonl('meta_Pet_Supplies.jsonl', chunk_size)
print("Metadata Dataset")
print(meta_df.head())

Reviews Dataset
   rating                                              title  \
0       4  Sticky stair riser tread thingies are utterly ...   
1       1  Dangerous bc metal not properly coated! Rough ...   
2       3                      Arrived damaged/dented/rusted   
3       5                                My pups love these!   
4       3                        My pups refuse to eat them.   

                                                text  \
0  Tried to load photos, but none of my photos or...   
1  Where to begin?  I’ve been trying to get the 2...   
2  Unfortunately mine arrived damaged/dented whic...   
3  My pups love these!  It’s one of their favorit...   
4  Idk why, but my pups will not eat either flavo...   

                                              images        asin parent_asin  \
0                                                 []  B084SXF9Y8  B0BHTBS5RM   
1  [{'small_image_url': 'https://m.media-amazon.c...  B000QFWCJ6  B0BJ16KKML   
2  [{'small_image_url'

# Check Columns

In [2]:
# Lift the column cap so wide frames are printed in full
pd.set_option('display.max_columns', None)

# List the column names of the reviews frame, then of the metadata frame
for frame in (df, meta_df):
    print(frame.columns.values)

['rating' 'title' 'text' 'images' 'asin' 'parent_asin' 'user_id'
 'timestamp' 'helpful_vote' 'verified_purchase']
['main_category' 'title' 'average_rating' 'rating_number' 'features'
 'description' 'price' 'images' 'videos' 'store' 'categories' 'details'
 'parent_asin' 'bought_together' 'subtitle' 'author']


# Grab Sample

In [3]:
# Upper bounds on the sample sizes; adjust based on your memory capacity
reviews_sample_size = 5000
metadata_sample_size = 5000

# Draw reproducible random samples (fixed random_state). The requested size is
# capped at the frame length because DataFrame.sample raises ValueError when n
# exceeds the number of rows and sampling is done without replacement.
df_sample = df.sample(n=min(reviews_sample_size, len(df)), random_state=42)
meta_df_sample = meta_df.sample(n=min(metadata_sample_size, len(meta_df)), random_state=42)

In [4]:
# Row counts of the reviews sample and the metadata sample, in that order
for sample in (df_sample, meta_df_sample):
    print(len(sample))

5000
5000


# Save Samples to CSV

In [5]:
# Persist both samples without the index column so they can be reloaded later
sample_files = (
    (df_sample, "Reviews_Sample.csv"),
    (meta_df_sample, "Metadata_Sample.csv"),
)
for sample, csv_path in sample_files:
    sample.to_csv(csv_path, index=False)

# Load Sample

In [67]:
import pandas as pd

# Reload the samples that were written to disk
df_sample, meta_df_sample = (
    pd.read_csv(csv_path)
    for csv_path in ('Reviews_Sample.csv', 'Metadata_Sample.csv')
)

# Preview the reviews sample: first rows, then the column index
for view in (df_sample.head(), df_sample.columns):
    print(view)

   rating                                              title  \
0       1                                   Wouldn’t charge.   
1       1                                Not worth the money   
2       3                        Poor Value.  Too Expensive.   
3       5  Cans guide themselves into the correct slot;no...   
4       5                    Promotes healthy teeth and gums   

                                                text  \
0         Waste of money. They wouldn’t even charge.   
1  It was not even close to what it looked online...   
2  It’s nice and seems like it will last.  It’s s...   
3  We have 13 oz. cans for our indoor cat/outdoor...   
4  I give my dog one of these treats right before...   

                                              images        asin parent_asin  \
0                                                 []  B08HX33Y53  B0932M1STZ   
1                                                 []  B07989JZGP  B07989JZGP   
2  [{'small_image_url': 'https://m.med

In [68]:
# Preview the metadata sample: first rows, then the column index
for view in (meta_df_sample.head(), meta_df_sample.columns):
    print(view)

              main_category  \
0  Tools & Home Improvement   
1              Pet Supplies   
2              Pet Supplies   
3              Pet Supplies   
4              Pet Supplies   

                                               title  average_rating  \
0  Zoo Med Nightlight Red Reptile Bulb 100 Watts ...             4.3   
1  Gimars Thicker PVC Sturdy Not Collapsing Quick...             4.1   
2  Natural Balance L.I.D. Limited Ingredient Diet...             4.1   
3  teemerryca Detachable Bow Dog Collar with a Fr...             4.4   
4  PetSafe Happy Ride Dog Safety Harness for Cars...             4.1   

   rating_number                                           features  \
0             13  ['Zoo Med Nightlight Red Reptile Bulb 100 Watt...   
1             71  ['【Durable Thicker PVC Material & Deep Texture...   
2             14  ['Contains 1 - 12 Pound Bag Of Dry Dog Food', ...   
3              5                                                 []   
4            226  ['CRASH

In [66]:
# Number of rows in the reloaded reviews sample
print(df_sample.shape[0])

5000


# Data Cleaning

In [73]:
import ast

# Concatenate the review title and body into one text field.
# Missing values are replaced with '' BEFORE concatenating: string addition
# propagates NaN, so a review lacking only its title (or only its body) would
# otherwise end up with no text at all.
df_sample['text_feature'] = (
    df_sample['title'].fillna('').astype(str)
    + ' '
    + df_sample['text'].fillna('').astype(str)
).str.strip()

def safe_eval(val):
    """Parse a string holding a Python literal; return the input unchanged on failure.

    Parameters
    ----------
    val : object
        Typically the string form of a list or dict (e.g. ``"['a', 'b']"``).
        Non-string values are returned as-is.

    Returns
    -------
    object
        The evaluated literal, or ``val`` itself when it is not a string or
        cannot be parsed as a literal.
    """
    # Only strings can hold a literal to parse; anything else passes through untouched.
    if not isinstance(val, str):
        return val
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval may raise any of these on malformed input, e.g. TypeError
        # for an unhashable dict key such as "{[1]: 2}". Keep the raw text so a
        # single bad cell does not abort the whole column conversion.
        return val

# Turn the stringified list/dict columns back into real Python objects
for column in ('features', 'description', 'details'):
    meta_df_sample[column] = meta_df_sample[column].apply(safe_eval)

# Function to convert list to string
def list_to_string(lst):
    """Join the items of a list into one space-separated string.

    Parameters
    ----------
    lst : object
        Expected to be a list; any other type yields an empty string.

    Returns
    -------
    str
        Items joined by single spaces. Non-string items are converted with
        ``str()`` and ``None`` items are skipped, so a mixed list no longer
        raises ``TypeError``.
    """
    if not isinstance(lst, list):
        return ''
    return ' '.join(str(item) for item in lst if item is not None)

# Render a dictionary as space-separated "key: value" pairs
def dict_to_string(d):
    """Format a dict as ``"key: value"`` pairs separated by single spaces.

    Returns an empty string when ``d`` is not a dict.
    """
    if not isinstance(d, dict):
        return ''
    pairs = []
    for key, value in d.items():
        pairs.append(f"{key}: {value}")
    return ' '.join(pairs)

# Flatten each structured column into a plain-text '<column>_str' companion column
text_converters = {
    'features': list_to_string,
    'description': list_to_string,
    'details': dict_to_string,
}
for column, to_text in text_converters.items():
    meta_df_sample[f'{column}_str'] = meta_df_sample[column].apply(to_text)

# Build 'text_feature' as features, description and details separated by single spaces
features_text = meta_df_sample['features_str']
description_text = meta_df_sample['description_str']
details_text = meta_df_sample['details_str']
meta_df_sample['text_feature'] = features_text + ' ' + description_text + ' ' + details_text


In [75]:
# First rows of the reviews sample, then of the metadata sample
for sample in (df_sample, meta_df_sample):
    print(sample.head())

   rating                                              title  \
0       1                                   Wouldn’t charge.   
1       1                                Not worth the money   
2       3                        Poor Value.  Too Expensive.   
3       5  Cans guide themselves into the correct slot;no...   
4       5                    Promotes healthy teeth and gums   

                                                text  \
0         Waste of money. They wouldn’t even charge.   
1  It was not even close to what it looked online...   
2  It’s nice and seems like it will last.  It’s s...   
3  We have 13 oz. cans for our indoor cat/outdoor...   
4  I give my dog one of these treats right before...   

                                              images        asin parent_asin  \
0                                                 []  B08HX33Y53  B0932M1STZ   
1                                                 []  B07989JZGP  B07989JZGP   
2  [{'small_image_url': 'https://m.med

# Combine Data

In [76]:
# Stack the review rows on top of the metadata rows, keeping only the columns
# both frames share; the index is renumbered from 0.
shared_columns = ['parent_asin', 'text_feature']
combined_df = pd.concat(
    [sample[shared_columns] for sample in (df_sample, meta_df_sample)],
    ignore_index=True,
)

In [77]:
# Inspect the combined corpus (review rows come first, metadata rows after them)
print(combined_df)

     parent_asin                                       text_feature
0     B0932M1STZ  Wouldn’t charge. Waste of money. They wouldn’t...
1     B07989JZGP  Not worth the money It was not even close to w...
2     B07BYSHK6R  Poor Value.  Too Expensive. It’s nice and seem...
3     B07C9GYLYM  Cans guide themselves into the correct slot;no...
4     B08KG7R846  Promotes healthy teeth and gums I give my dog ...
...          ...                                                ...
9995  B07VP64XPS  2 TOYS IN ONE: Christmas treat-hiding, inside ...
9996  B08WX4RLS6  Bundle includes: Carrot and Dill, Bell Pepper,...
9997  B01MY5HJGQ  Wet-type cough symptoms Wet noises in the ches...
9998  B08BC1K82X    Package Dimensions: 15.86 x 12.24 x 2.72 inc...
9999  B01289JAO2    Is Discontinued By Manufacturer: No Package ...

[10000 rows x 2 columns]


# Handle Missing Values

In [79]:
# Replace missing text with an empty string and force every entry to str
combined_df['text_feature'] = (
    combined_df['text_feature']
    .fillna('')
    .astype(str)
)


In [80]:
# Inspect the combined corpus again after missing text was filled in
print(combined_df)

     parent_asin                                       text_feature
0     B0932M1STZ  Wouldn’t charge. Waste of money. They wouldn’t...
1     B07989JZGP  Not worth the money It was not even close to w...
2     B07BYSHK6R  Poor Value.  Too Expensive. It’s nice and seem...
3     B07C9GYLYM  Cans guide themselves into the correct slot;no...
4     B08KG7R846  Promotes healthy teeth and gums I give my dog ...
...          ...                                                ...
9995  B07VP64XPS  2 TOYS IN ONE: Christmas treat-hiding, inside ...
9996  B08WX4RLS6  Bundle includes: Carrot and Dill, Bell Pepper,...
9997  B01MY5HJGQ  Wet-type cough symptoms Wet noises in the ches...
9998  B08BC1K82X    Package Dimensions: 15.86 x 12.24 x 2.72 inc...
9999  B01289JAO2    Is Discontinued By Manufacturer: No Package ...

[10000 rows x 2 columns]


# Model

In [81]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the vectorizer, dropping scikit-learn's built-in English stop words
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the combined corpus and encode each row as a TF-IDF vector
corpus = combined_df['text_feature']
tfidf_matrix = tfidf.fit_transform(corpus)

In [82]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_products(user_query, tfidf_matrix, meta_df_sample, top_n=5, vectorizer=None):
    """Return the ``parent_asin`` values of the rows most similar to a text query.

    Parameters
    ----------
    user_query : str
        Free-text query to match against the corpus.
    tfidf_matrix : matrix
        TF-IDF matrix with one row per document.
    meta_df_sample : pandas.DataFrame
        Frame with a 'parent_asin' column whose rows are positionally aligned
        with the rows of ``tfidf_matrix``. Despite the parameter name this must
        be the frame the matrix was built from (the example cells pass
        ``combined_df``).
    top_n : int, default 5
        Number of results to return; values <= 0 yield an empty result.
    vectorizer : fitted vectorizer, optional
        Object whose ``transform`` encodes the query. Defaults to the
        notebook-level ``tfidf`` so existing calls keep working; pass a
        reloaded vectorizer to use it instead.

    Returns
    -------
    numpy.ndarray
        ``parent_asin`` values ordered from most to least similar.

    Raises
    ------
    ValueError
        If ``meta_df_sample`` and ``tfidf_matrix`` have different row counts.
    """
    if vectorizer is None:
        vectorizer = tfidf

    # Positional lookup below is only meaningful when row i of the frame is
    # the document encoded in row i of the matrix.
    n_documents = tfidf_matrix.shape[0]
    if len(meta_df_sample) != n_documents:
        raise ValueError(
            f"meta_df_sample has {len(meta_df_sample)} rows but tfidf_matrix has "
            f"{n_documents}; pass the frame the matrix was built from."
        )

    # Guard: with top_n == 0 the slice [-0:] would select EVERY row, not none.
    if top_n <= 0:
        return meta_df_sample['parent_asin'].values[:0]

    # Encode the query in the same vector space as the corpus
    user_tfidf = vectorizer.transform([user_query])

    # Similarity of the query to every document, as a flat 1-D array
    cosine_similarities = cosine_similarity(user_tfidf, tfidf_matrix).flatten()

    # argsort is ascending: take the last top_n positions and reverse them
    top_indices = cosine_similarities.argsort()[-top_n:][::-1]

    return meta_df_sample.iloc[top_indices]['parent_asin'].values


In [113]:
def retrieve_product_names(parent_asins, meta_df_sample):
    """Map each given ``parent_asin`` to its title in ``meta_df_sample``.

    ASINs that do not occur in ``meta_df_sample['parent_asin']`` are mapped to
    the placeholder ``"Title Not Found"``.
    """
    title_by_asin = meta_df_sample.set_index('parent_asin')['title'].to_dict()
    names = {}
    for asin in parent_asins:
        names[asin] = title_by_asin.get(asin, "Title Not Found")
    return names

# Example Usage

In [115]:
user_query = "My dog has itchy skin"

# combined_df is passed as the lookup frame because its rows line up with tfidf_matrix.
# NOTE(review): hits coming from review rows may carry a parent_asin that is absent
# from meta_df_sample, which would explain 'Title Not Found' entries - confirm.
recommended_asins = recommend_products(
    user_query,
    tfidf_matrix,
    combined_df,
    top_n=5,
)
product_titles = retrieve_product_names(recommended_asins, meta_df_sample)

print("Recommended Products (parent_asin): ", product_titles)


Recommended Products (parent_asin):  {'B0C6HXVHC5': 'Title Not Found', 'B0C4G5G69X': 'Title Not Found', 'B0BS5RC54N': 'Omega 3 for Dogs - Dog Skin and Coat Supplement - Fish Oil for Dogs Chews - Allergy and Dog Itch Relief - Dog Anti Shedding Supplement - Dog Dry Skin Treatment - Salmon Oil - Made in USA - 120 Treats', 'B0BV3F9HN9': 'Title Not Found', 'B00R1XGOQS': 'Bark2Basics Skin Remedy Dog Shampoo, 16 oz - Red Algae Based CTAB, Alleviates Skin Irritations Naturally, Promotes Healing'}


# Save Model

In [116]:
import pickle

# Pickle the fitted vectorizer and the TF-IDF matrix to separate files
artifacts = (
    ("tfidf_vectorizer.pkl", tfidf),
    ("tfidf_matrix.pkl", tfidf_matrix),
)
for pickle_path, artifact in artifacts:
    with open(pickle_path, "wb") as f:
        pickle.dump(artifact, f)

# Load Model

In [117]:
# Restore the pickled vectorizer and TF-IDF matrix.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
with open("tfidf_vectorizer.pkl", "rb") as vectorizer_file:
    loaded_tfidf = pickle.load(vectorizer_file)

with open("tfidf_matrix.pkl", "rb") as matrix_file:
    loaded_tfidf_matrix = pickle.load(matrix_file)

In [125]:
user_query = "Dog treats"

# Query against the matrix restored from disk.
# NOTE(review): recommend_products encodes the query with the notebook-level `tfidf`,
# so `loaded_tfidf` is not exercised by this call - confirm whether that is intended.
recommended_asins = recommend_products(
    user_query,
    loaded_tfidf_matrix,
    combined_df,
    top_n=5,
)
product_titles = retrieve_product_names(recommended_asins, meta_df_sample)

print("Recommended Products (parent_asin): ", product_titles)


Recommended Products (parent_asin):  {'B006FRYOMC': 'Title Not Found', 'B0C3W7S2RS': 'Title Not Found', 'B085RFM4RZ': 'Amazing Dog Treats - Lamb Trachea (18-20 Count - 10 oz) - Trachea Dog Chews - Natural Source of Glucosamine & Chondroitin for Dogs - Lamb Dog Treats', 'B0CFYP3QB1': 'Title Not Found', 'B000RZPSX2': 'Title Not Found'}
