# Read metadata into dataframe

In [1]:
import pandas as pd

chunk_size = 15000

meta_chunks = []

# Read the JSONL file in chunks
for meta_chunk in pd.read_json('meta_Pet_Supplies.jsonl', lines=True, chunksize=chunk_size):
    meta_chunks.append(meta_chunk)  # Collect each chunk

# Concatenate all chunks into a single DataFrame
meta_df = pd.concat(meta_chunks, ignore_index=True)

# Display the first few rows
print("Metadata Dataset")
print(meta_df.head())

Metadata Dataset
  main_category                                              title  \
0  Pet Supplies  Hurtta Pet Collection 14-Inch Padded Y-Harness...   
1  Pet Supplies  Raised Dog Bowls,6 Inch Ceramic Dog Bowl Dish,...   
2  Pet Supplies  4 Pack - 4 Inch Ring Filter Socks 200 Micron -...   
3  Pet Supplies  SlowTon Dog Vest Harness, Mesh Breathable Pet ...   
4  Pet Supplies  Cat Window Perch Durable Cat Hammock Seat for ...   

   average_rating  rating_number  \
0             4.4            166   
1             4.6            100   
2             4.4             84   
3             4.5            348   
4             4.4            130   

                                            features  \
0  [Made from highly durable Neoprene, Fitted wit...   
1  [【Two Bowls+Metal Stand+Dog food mat】Really pr...   
2  [Micron filter bags provide excellent mechanic...   
3  [New Match and Well Made --- The chest part of...   
4  [【Ideal for Use Year-around】Cat window perch c...   

        

# Check Columns

In [15]:
pd.set_option('display.max_columns', None)
print(meta_df.columns.values)

['main_category' 'title' 'average_rating' 'rating_number' 'features'
 'description' 'price' 'images' 'videos' 'store' 'categories' 'details'
 'parent_asin' 'bought_together' 'subtitle' 'author']


# Grab Sample

In [2]:
metadata_sample_size = 50000  # Adjust as needed

# Extract random samples
meta_df_sample = meta_df.sample(n=metadata_sample_size, random_state=42)

In [54]:
print(len(meta_df_sample))

50000


Save samples in csv

In [3]:
meta_df_sample.to_csv("Metadata_Sample.csv", index=False)

# Load Sample

In [1]:
import pandas as pd

# df_sample = pd.read_csv('Reviews_Sample.csv')
meta_df_sample = pd.read_csv('Metadata_Sample.csv')

# print(df_sample.head())
# print(df_sample.columns)

In [56]:
print(meta_df_sample.head())
print(meta_df_sample.columns)

                   main_category  \
452451  Tools & Home Improvement   
320740              Pet Supplies   
251412              Pet Supplies   
308031              Pet Supplies   
172850              Pet Supplies   

                                                    title  average_rating  \
452451  Zoo Med Nightlight Red Reptile Bulb 100 Watts ...             4.3   
320740  Gimars Thicker PVC Sturdy Not Collapsing Quick...             4.1   
251412  Natural Balance L.I.D. Limited Ingredient Diet...             4.1   
308031  teemerryca Detachable Bow Dog Collar with a Fr...             4.4   
172850  PetSafe Happy Ride Dog Safety Harness for Cars...             4.1   

        rating_number                                           features  \
452451             13  [Zoo Med Nightlight Red Reptile Bulb 100 Watts...   
320740             71  [【Durable Thicker PVC Material & Deep Textured...   
251412             14  [Contains 1 - 12 Pound Bag Of Dry Dog Food, Gr...   
308031          

In [57]:
print(len(meta_df_sample))

50000


# Data Cleaning

In [4]:
import ast

def safe_eval(val):
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError):
        return val

# Apply safe_eval to convert string representations to actual lists/dicts
meta_df_sample['features'] = meta_df_sample['features'].apply(safe_eval)
meta_df_sample['description'] = meta_df_sample['description'].apply(safe_eval)
meta_df_sample['details'] = meta_df_sample['details'].apply(safe_eval)

# Function to convert list to string
def list_to_string(lst):
    if isinstance(lst, list):
        return ' '.join(lst)
    return ''

# Function to convert dictionary to formatted string
def dict_to_string(d):
    if isinstance(d, dict):
        return ' '.join([f"{k}: {v}" for k, v in d.items()])
    return ''

# Convert lists and dicts to strings
meta_df_sample['features_str'] = meta_df_sample['features'].apply(list_to_string)
meta_df_sample['description_str'] = meta_df_sample['description'].apply(list_to_string)
meta_df_sample['details_str'] = meta_df_sample['details'].apply(dict_to_string)

# Concatenate the string columns into 'text_feature'
meta_df_sample['text_feature'] = (
    meta_df_sample['features_str'] + ' ' +
    meta_df_sample['description_str'] + ' ' +
    meta_df_sample['details_str']
)

meta_df_sample['text_feature'] = meta_df_sample['text_feature'].fillna('')
meta_df_sample['text_feature'] = meta_df_sample['text_feature'].astype(str)

In [5]:
import re

def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

meta_df_sample['text_feature'] = meta_df_sample['text_feature'].apply(clean_text)

In [6]:
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in stop_words])

meta_df_sample['text_feature'] = meta_df_sample['text_feature'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to C:\Users\Geoff
[nltk_data]     Patag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

meta_df_sample['text_feature'] = meta_df_sample['text_feature'].apply(lemmatize_text)


[nltk_data] Downloading package wordnet to C:\Users\Geoff
[nltk_data]     Patag\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# print(df_sample.head())
print(meta_df_sample.head())

                   main_category  \
452451  Tools & Home Improvement   
320740              Pet Supplies   
251412              Pet Supplies   
308031              Pet Supplies   
172850              Pet Supplies   

                                                    title  average_rating  \
452451  Zoo Med Nightlight Red Reptile Bulb 100 Watts ...             4.3   
320740  Gimars Thicker PVC Sturdy Not Collapsing Quick...             4.1   
251412  Natural Balance L.I.D. Limited Ingredient Diet...             4.1   
308031  teemerryca Detachable Bow Dog Collar with a Fr...             4.4   
172850  PetSafe Happy Ride Dog Safety Harness for Cars...             4.1   

        rating_number                                           features  \
452451             13  [Zoo Med Nightlight Red Reptile Bulb 100 Watts...   
320740             71  [【Durable Thicker PVC Material & Deep Textured...   
251412             14  [Contains 1 - 12 Pound Bag Of Dry Dog Food, Gr...   
308031          

# Model

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the 'text_feature' column
tfidf_matrix = tfidf.fit_transform(meta_df_sample['text_feature'])

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_products(user_query, tfidf_matrix, meta_df_sample, top_n=5):
    # Transform the user query using the same TF-IDF vectorizer
    user_tfidf = tfidf.transform([user_query])
    
    # Compute cosine similarities between the user query and all product features
    cosine_similarities = cosine_similarity(user_tfidf, tfidf_matrix).flatten()
    
    # Get indices of the top_n products with the highest similarity scores
    top_indices = cosine_similarities.argsort()[-top_n:][::-1]
    
    # Retrieve the corresponding 'parent_asin's
    recommended_asins = meta_df_sample.iloc[top_indices]['parent_asin'].values
    
    return recommended_asins


In [11]:
def retrieve_product_names(parent_asins, meta_df_sample):
    product_dict = meta_df_sample.set_index('parent_asin')['title'].to_dict()
    return {asin: product_dict.get(asin, "Title Not Found") for asin in parent_asins}

Preprocessor

In [12]:
def preprocess(text):
    clean_text(text)
    remove_stopwords(text)
    lemmatize_text(text)
    return text

Example Usage

In [15]:
user_query = "plushie"
user_query = preprocess(user_query)
print(user_query)
recommended_asins = recommend_products(user_query, tfidf_matrix, meta_df_sample, top_n=10)
product_titles = retrieve_product_names(recommended_asins, meta_df_sample)

print("Recommended Products (parent_asin): ", product_titles)


plushie
Recommended Products (parent_asin):  {'B09P49BPM7': "Cheeki Studios 'Pawsent Plushie' Dog Toys - Birthday Surprise Interactive Present Plush Gift (Mint)", 'B09SZ9N737': 'Carrot Plushie Dog Toy, Carrot Dog Toy, Vegetable Dog Toy, Dog Toy Plush Vegetable, Carrot Toy', 'B083PWDNCD': 'ZippyPaws - NomNomz Plush Squeaker Dog Toy for The Foodie Pup - Chips', 'B07FCVDNLQ': 'ZippyPaws Hedgehog Small', 'B0080PMW1W': 'STAR WARS Dog Toys - Officially Licensed Pet Squeaker Toys- Set Of 3 Interactive Plush Dog & Puppy Squeaker Chew Toys - The Mandalorian Mando, Grogu The Child & Sorgan Frog Squeakers For Light Chewers', 'B093GPP5TK': 'IML Pusheen PUH050APPT-ONE Donut Cat Toy Hanging Scratcher', 'B096BF9LRD': 'AmazinglyCat Dancing Fish Cat Toy + The Flopping Lobster (Bundle)', 'B0055UCFI0': 'Trapping Coyote Tail with Ball Chain', 'B09BC9GCX4': 'FzzPetDC Catnip Toys for Indoor Cats,6 Pcs Natural Silvervine Sticks Cat Toys for Indoor Cats Interactive,Cleaning Teeth Molar Tools Matatabi Cat Chew

# Save Model

In [77]:
import pickle

# Save the TF-IDF Vectorizer and matrix
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)

with open("tfidf_matrix.pkl", "wb") as f:
    pickle.dump(tfidf_matrix, f)

# Load Model

In [117]:
# Load the saved TF-IDF Vectorizer and matrix
with open("tfidf_vectorizer.pkl", "rb") as f:
    loaded_tfidf = pickle.load(f)

with open("tfidf_matrix.pkl", "rb") as f:
    loaded_tfidf_matrix = pickle.load(f)

In [None]:
user_query = "Dog treats"
recommended_asins = recommend_products(user_query, loaded_tfidf_matrix , meta_df_sample, top_n=5)
product_titles = retrieve_product_names(recommended_asins, meta_df_sample)

print("Recommended Products (parent_asin): ", product_titles)
# retrieve_product_names(recommended_asins)


Recommended Products (parent_asin):  {'B006FRYOMC': 'Title Not Found', 'B0C3W7S2RS': 'Title Not Found', 'B085RFM4RZ': 'Amazing Dog Treats - Lamb Trachea (18-20 Count - 10 oz) - Trachea Dog Chews - Natural Source of Glucosamine & Chondroitin for Dogs - Lamb Dog Treats', 'B0CFYP3QB1': 'Title Not Found', 'B000RZPSX2': 'Title Not Found'}


: 