In [1]:
!pip install h5py



In [27]:
import pickle
import h5py
import pandas as pd
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import re
data_dir = '/home/sagemaker-user/Data/'
model_dir = '/home/sagemaker-user/Models/'
log_dir = '/home/sagemaker-user/Logs/'

In [3]:
# 1. Load the existing user ID map from a pickle file
def load_user_id_map(file_path):
    """
    Load a user ID mapping dictionary from a pickle file.

    :param file_path: Path to the pickle file.
    :return: User ID mapping dictionary or an empty dictionary if not found.
    """
    try:
        with open(file_path, 'rb') as file:
            user_id_map = pickle.load(file)
        print(f"Successfully loaded {file_path}")
        return user_id_map
    except FileNotFoundError:
        print(f"File {file_path} not found. Please check the file path.")
        return {}
    except Exception as e:
        print(f"Error loading file: {e}")
        return {}

# 2. Create a user-password mapping
def create_user_password_map(user_ids, password='password123'):
    """
    Create a mapping of user IDs to a default password.

    :param user_ids: List of user IDs.
    :param password: Default password assigned to all users.
    :return: Dictionary mapping user IDs to passwords.
    """
    return {user_id: password for user_id in user_ids}

# 3. Save the user-password mapping to a pickle file
def save_user_password_map(user_password_map, file_path):
    """
    Save the user-password mapping to a pickle file.

    :param user_password_map: Dictionary containing user-password mapping.
    :param file_path: Path to save the pickle file.
    """
    try:
        with open(file_path, 'wb') as file:
            pickle.dump(user_password_map, file)
        print(f"Successfully saved user-password mapping to {file_path}")
    except Exception as e:
        print(f"Error saving file: {e}")

# 4. Verify the saved user-password mapping
def verify_saved_password_map(file_path):
    """
    Load and display the first few entries from the saved user-password mapping.

    :param file_path: Path to the pickle file.
    """
    try:
        with open(file_path, 'rb') as file:
            user_password_map = pickle.load(file)
        print(f"Loaded user-password mapping (first 5 entries):")
        for idx, (user_id, password) in enumerate(user_password_map.items()):
            print(f"{user_id}: {password}")
            if idx >= 4:  # Limit to first 5 entries for readability
                break
    except FileNotFoundError:
        print(f"File {file_path} not found.")
    except Exception as e:
        print(f"Error loading file: {e}")

# Main program
if __name__ == "__main__":
    # Define file paths
    input_file = './user_id_map.pkl'       # Input file: User ID mapping
    output_file = 'user_passwords.pkl'  # Output file: User-password mapping

    # Step 1: Load the user ID map
    user_id_map = load_user_id_map(input_file)

    if user_id_map:
        # Step 2: Extract user IDs from the map
        user_ids = list(user_id_map.keys())
        print(f"Loaded {len(user_ids)} user IDs.")

        # Step 3: Create a mapping of user IDs to passwords
        user_password_map = create_user_password_map(user_ids, password='password123')

        # Step 4: Save the user-password mapping to a new file
        save_user_password_map(user_password_map, output_file)

        # Step 5: Verify the saved user-password mapping
        verify_saved_password_map(output_file)

Successfully loaded ./user_id_map.pkl
Loaded 290475 user IDs.
Successfully saved user-password mapping to user_passwords.pkl
Loaded user-password mapping (first 5 entries):
AE22236AFRRSMQIKGG7TPTB75QEA: password123
AE222MW56PH6JXPIB6XSAMCBTLNQ: password123
AE222N3VUKMF3GO6D4LHTELE7UWA: password123
AE2244ILMBLRPTIN7VW7YDKRI2YA: password123
AE226BJM6RTWIVV6UJKZAVQPBKXA: password123


In [4]:
# Attempt to read the specific user group "AE22236AFRRSMQIKGG7TPTB75QEA" in recommendations.h5
recommendations_content = {}

try:
    with h5py.File(data_dir+'recommendations.h5', 'r') as f:
        # Extract the content for the specific user ID
        user_id = 'AE22236AFRRSMQIKGG7TPTB75QEA'
        if user_id in f:
            recommendations_content[user_id] = f[user_id][:]
        else:
            recommendations_content = f"User ID {user_id} not found in recommendations.h5."
except FileNotFoundError:
    recommendations_content = "The file recommendations.h5 was not found."

recommendations_content

{'AE22236AFRRSMQIKGG7TPTB75QEA': array([b'B007NFLN1K', b'B005HSPOVU', b'B01KZTB3HE', b'B0BS72KD59',
        b'B00E3UKVVQ', b'B07FLSK36C', b'B07S3RFL7V', b'B01FZIWFLS',
        b'B09FPR3626', b'B003194PBC', b'B09PRVBLTL', b'B000W7IR10',
        b'B071489L22', b'B01K4KU5HI', b'B07DN8GCLG', b'B00T62YNQK',
        b'B0002568EK', b'B081RM74FQ', b'B07KF5G7XP', b'B00O4DKHRK'],
       dtype='|S10')}

In [6]:
# Replace 'meta_Pet_Supplies.jsonl' with the actual path to your JSONL file
file_path = data_dir+'raw_data/meta_Pet_Supplies.jsonl'

# Load the JSONL file into a pandas DataFrame
try:
    data = pd.read_json(file_path, lines=True)
    data.head()  # Display the first few rows of the DataFrame
except FileNotFoundError:
    "The file meta_Pet_Supplies.jsonl was not found."

In [7]:
data.head()

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,Pet Supplies,Hurtta Pet Collection 14-Inch Padded Y-Harness...,4.4,166,"[Made from highly durable Neoprene, Fitted wit...",[Hurtta harnesses are suitable for active walk...,24.95,[{'thumb': 'https://m.media-amazon.com/images/...,[],Hurtta,"[Pet Supplies, Dogs, Collars, Harnesses & Leas...","{'Size': '14 Inch', 'Color': 'Pink', 'Pattern'...",B00XJG2SLG,,,
1,Pet Supplies,"Raised Dog Bowls,6 Inch Ceramic Dog Bowl Dish,...",4.6,100,[【Two Bowls+Metal Stand+Dog food mat】Really pr...,[],32.88,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Ihoming Ceramic Bowls for Dogs and...,FIVEAGE,"[Pet Supplies, Dogs, Feeding & Watering Suppli...","{'Material': 'Ceramic', 'Target Species': 'Cat...",B0BD6TXL2G,,,
2,Pet Supplies,4 Pack - 4 Inch Ring Filter Socks 200 Micron -...,4.4,84,[Micron filter bags provide excellent mechanic...,[Micron filter bags provide excellent mechanic...,15.0,[{'thumb': 'https://m.media-amazon.com/images/...,[],Encompass All,[],"{'Is Discontinued By Manufacturer': 'No', 'Pro...",B01MQTWB5H,,,
3,Pet Supplies,"SlowTon Dog Vest Harness, Mesh Breathable Pet ...",4.5,348,[New Match and Well Made --- The chest part of...,[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],SlowTon,"[Pet Supplies, Dogs, Collars, Harnesses & Leas...","{'Size': 'Large', 'Color': 'Purple', 'Pattern'...",B07DYM6LXD,,,
4,Pet Supplies,Cat Window Perch Durable Cat Hammock Seat for ...,4.4,130,[【Ideal for Use Year-around】Cat window perch c...,[],23.99,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Cat Window Perch Beige Assembly Vi...,Mewoo,"[Pet Supplies, Cats, Beds & Furniture, Hammocks]","{'Brand': 'Mewoo', 'Color': 'Blue,Grey,Beige',...",B09ZTMTS6N,,,


In [8]:
filtered_data = data[data['main_category'] == 'Pet Supplies']

In [9]:
filtered_data=filtered_data[['parent_asin','description','details','categories','average_rating','rating_number']]

In [10]:
filtered_data.head()

Unnamed: 0,parent_asin,description,details,categories,average_rating,rating_number
0,B00XJG2SLG,[Hurtta harnesses are suitable for active walk...,"{'Size': '14 Inch', 'Color': 'Pink', 'Pattern'...","[Pet Supplies, Dogs, Collars, Harnesses & Leas...",4.4,166
1,B0BD6TXL2G,[],"{'Material': 'Ceramic', 'Target Species': 'Cat...","[Pet Supplies, Dogs, Feeding & Watering Suppli...",4.6,100
2,B01MQTWB5H,[Micron filter bags provide excellent mechanic...,"{'Is Discontinued By Manufacturer': 'No', 'Pro...",[],4.4,84
3,B07DYM6LXD,[],"{'Size': 'Large', 'Color': 'Purple', 'Pattern'...","[Pet Supplies, Dogs, Collars, Harnesses & Leas...",4.5,348
4,B09ZTMTS6N,[],"{'Brand': 'Mewoo', 'Color': 'Blue,Grey,Beige',...","[Pet Supplies, Cats, Beds & Furniture, Hammocks]",4.4,130


In [11]:
filtered_data.isnull().sum()

parent_asin       0
description       0
details           0
categories        0
average_rating    0
rating_number     0
dtype: int64

In [13]:
alpha = 0.7  # Weight for average rating
beta = 0.3   # Weight for log of rating count

filtered_data['popularity_score'] = alpha * filtered_data['average_rating'] + beta * np.log(filtered_data['rating_number'])

In [14]:
# Remove rows where 'description' is an empty string
filtered_data = filtered_data[filtered_data['description'].str.len() > 0]

# Drop duplicates based on 'parent_asin'
filtered_data_unique = filtered_data.drop_duplicates(subset=['parent_asin'])

In [15]:
filtered_data.head()

Unnamed: 0,parent_asin,description,details,categories,average_rating,rating_number,popularity_score
0,B00XJG2SLG,[Hurtta harnesses are suitable for active walk...,"{'Size': '14 Inch', 'Color': 'Pink', 'Pattern'...","[Pet Supplies, Dogs, Collars, Harnesses & Leas...",4.4,166,4.613596
2,B01MQTWB5H,[Micron filter bags provide excellent mechanic...,"{'Is Discontinued By Manufacturer': 'No', 'Pro...",[],4.4,84,4.409245
8,B00F3JRLYQ,[For the past 35 years the Wysong goal and com...,"{'Is Discontinued By Manufacturer': 'No', 'Pro...","[Pet Supplies, Dogs, Food, Dry]",4.4,75,4.375246
9,B0006L145S,[Vitakraft's Happy Frutti treats for Guinea Pi...,"{'Brand': 'Vitakraft', 'Flavor': 'Banana', 'Ag...","[Pet Supplies, Small Animals, Treats]",4.1,20,3.76872
11,B074SVVSK7,[The Alpha Series is our flagship line of prof...,"{'Color': 'Black', 'Brand': 'K9PROLINE', 'Mate...","[Pet Supplies, Dogs, Collars, Harnesses & Leas...",4.6,9,3.879167


In [16]:
filtered_data.shape

(230710, 7)

In [17]:
filtered_data.to_pickle(data_dir+'filtered_data_unique_asin.pkl')

In [19]:
filtered_data_new=filtered_data[['parent_asin','description','details']]

In [20]:
filtered_data_unique.head()

Unnamed: 0,parent_asin,description,details,categories,average_rating,rating_number,popularity_score
0,B00XJG2SLG,[Hurtta harnesses are suitable for active walk...,"{'Size': '14 Inch', 'Color': 'Pink', 'Pattern'...","[Pet Supplies, Dogs, Collars, Harnesses & Leas...",4.4,166,4.613596
2,B01MQTWB5H,[Micron filter bags provide excellent mechanic...,"{'Is Discontinued By Manufacturer': 'No', 'Pro...",[],4.4,84,4.409245
8,B00F3JRLYQ,[For the past 35 years the Wysong goal and com...,"{'Is Discontinued By Manufacturer': 'No', 'Pro...","[Pet Supplies, Dogs, Food, Dry]",4.4,75,4.375246
9,B0006L145S,[Vitakraft's Happy Frutti treats for Guinea Pi...,"{'Brand': 'Vitakraft', 'Flavor': 'Banana', 'Ag...","[Pet Supplies, Small Animals, Treats]",4.1,20,3.76872
11,B074SVVSK7,[The Alpha Series is our flagship line of prof...,"{'Color': 'Black', 'Brand': 'K9PROLINE', 'Mate...","[Pet Supplies, Dogs, Collars, Harnesses & Leas...",4.6,9,3.879167


In [21]:
df_sorted = filtered_data.sort_values(by='popularity_score', ascending=False)

# Select the top 5 most popular products
top_5 = df_sorted.head(5)

In [22]:
top_5.to_csv('top_5.csv')

In [33]:
top_10 = df_sorted.head(10)
top_10.to_csv('top_10.csv')

In [23]:
top_5

Unnamed: 0,parent_asin,description,details,categories,average_rating,rating_number,popularity_score
36920,B00QCVQR3A,"[Product Description, An Amazon Brand., From t...","{'Brand': 'Amazon Basics', 'Unit Count': '600 ...","[Pet Supplies, Dogs, Litter & Housebreaking, P...",4.8,175000,6.981762
41450,B0C3J5D1HJ,[The Chuckit! Ultra Ball Dog Toy is a must-hav...,"{'Is Discontinued By Manufacturer': 'No', 'Pro...","[Pet Supplies, Dogs, Toys, Balls]",4.8,132311,6.897873
38305,B0BJ18G8Y3,[The Chuckit! Ultra Ball Dog Toy is a must-hav...,"{'Is Discontinued By Manufacturer': 'No', 'Pro...","[Pet Supplies, Dogs, Toys, Balls]",4.8,132257,6.897751
19073,B0C3HYS7DQ,"[Product Description, An Amazon Brand., From t...","{'Brand': 'Amazon Basics', 'Material': 'Plasti...","[Pet Supplies, Dogs, Litter & Housebreaking, T...",4.5,178175,6.777156
112321,B0BMHXP7XX,[Offer your discerning cat the gourmet tastes ...,"{'Brand': 'PURINA Fancy Feast', 'Flavor': 'Pou...","[Pet Supplies, Cats, Food, Wet, Canned]",4.7,102229,6.750491


In [34]:
top_10

Unnamed: 0,parent_asin,description,details,categories,average_rating,rating_number,popularity_score
36920,B00QCVQR3A,"[Product Description, An Amazon Brand., From t...","{'Brand': 'Amazon Basics', 'Unit Count': '600 ...","[Pet Supplies, Dogs, Litter & Housebreaking, P...",4.8,175000,6.981762
41450,B0C3J5D1HJ,[The Chuckit! Ultra Ball Dog Toy is a must-hav...,"{'Is Discontinued By Manufacturer': 'No', 'Pro...","[Pet Supplies, Dogs, Toys, Balls]",4.8,132311,6.897873
38305,B0BJ18G8Y3,[The Chuckit! Ultra Ball Dog Toy is a must-hav...,"{'Is Discontinued By Manufacturer': 'No', 'Pro...","[Pet Supplies, Dogs, Toys, Balls]",4.8,132257,6.897751
19073,B0C3HYS7DQ,"[Product Description, An Amazon Brand., From t...","{'Brand': 'Amazon Basics', 'Material': 'Plasti...","[Pet Supplies, Dogs, Litter & Housebreaking, T...",4.5,178175,6.777156
112321,B0BMHXP7XX,[Offer your discerning cat the gourmet tastes ...,"{'Brand': 'PURINA Fancy Feast', 'Flavor': 'Pou...","[Pet Supplies, Cats, Food, Wet, Canned]",4.7,102229,6.750491
70666,B0C4V1C3FD,[Cosequin Maximum Strength Plus MSM & Omega-3'...,"{'Brand': 'Nutramax Laboratories', 'Flavor': '...","[Pet Supplies, Dogs, Health Supplies, Hip & Jo...",4.7,90679,6.714524
28589,B079D3NTNF,"[Product Description, An Amazon Brand., From t...","{'Brand': 'Amazon Basics', 'Unit Count': '270 ...","[Pet Supplies, Dogs, Litter & Housebreaking, P...",4.8,71369,6.712686
50754,B0BFYZY2DZ,[While your furry friend no longer needs to fi...,"{'Material': 'Silicone', 'Target Species': 'Do...","[Pet Supplies, Dogs, Feeding & Watering Suppli...",4.6,112055,6.708024
109813,B09CJVFHBD,[Did you know that storing your pet's food in ...,"{'Brand': 'Gamma2', 'Color': 'Granite Stone', ...","[Pet Supplies, Cats, Feeding & Watering Suppli...",4.7,85012,6.695164
70706,B0BMHX5L9G,[Bring home Purina Fancy Feast Classic Poultry...,"{'Brand': 'PURINA Fancy Feast', 'Flavor': 'Pou...","[Pet Supplies, Cats, Food, Wet, Canned]",4.7,76361,6.662968


In [28]:
def clean_text(text):
    if not isinstance(text, str):
        text = ''  # Convert non-string values to empty strings
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    text = text.lower()  # Convert to lowercase
    return text

# Clean 'description' column
filtered_data['clean_description'] = filtered_data['description'].apply(clean_text)

# Clean 'categories' column
filtered_data['clean_categories'] = filtered_data['categories'].apply(
    lambda x: ' '.join(x) if isinstance(x, list) else ''  # Convert lists to strings
).apply(clean_text)

# Combine cleaned descriptions and categories into a single text field
filtered_data['text'] = filtered_data['clean_description'] + ' ' + filtered_data['clean_categories']

# Initialize and fit the TF-IDF vectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(filtered_data['text'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['clean_description'] = filtered_data['description'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['clean_categories'] = filtered_data['categories'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['text'] = filtered_data['clean_descr

In [29]:
def preprocess_user_keywords(keywords):
    return clean_text(keywords)

user_keywords = "dog harness, durable, pink"

clean_keywords = preprocess_user_keywords(user_keywords)

# Convert the cleaned keywords into a TF-IDF vector
user_tfidf = tfidf.transform([clean_keywords])

In [30]:
from sklearn.metrics.pairwise import linear_kernel

cosine_similarities = linear_kernel(user_tfidf, tfidf_matrix).flatten()
filtered_data['similarity'] = cosine_similarities

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['similarity'] = cosine_similarities


In [31]:
def recommend_products(df, top_n=5):
    # Sort by similarity and return top N products
    recommended_df = df.sort_values(by='similarity', ascending=False).head(top_n)
    return recommended_df

top_recommendations = recommend_products(filtered_data, top_n=5)

print("\nRecommended Products:")
print(top_recommendations[['parent_asin', 'description', 'similarity']])


Recommended Products:
       parent_asin                                        description  \
65425   B071RCM8KH  [cuteNfuzzy's Jute dog toys are an ideal choic...   
25808   B00QLLBEQQ  [Covers are removable for cleaning. Fabric is ...   
459369  B07CBSNKX5            [Specification: left hand + right hand]   
1833    B00DHNR2F8  [Hang your Paw Wipe Towel up in a convenient s...   
45648   B00WGXP45U                                               [SA]   

        similarity  
65425     0.910407  
25808     0.910407  
459369    0.910407  
1833      0.910407  
45648     0.910407  


In [32]:
# Save the vectorizer to a file
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)
print("TfidfVectorizer saved to 'tfidf_vectorizer.pkl'")

from scipy import sparse
# Save the TF-IDF matrix as a compressed file
sparse.save_npz('tfidf_matrix.npz', tfidf_matrix)
print("TF-IDF matrix saved to 'tfidf_matrix.npz'")

TfidfVectorizer saved to 'tfidf_vectorizer.pkl'
TF-IDF matrix saved to 'tfidf_matrix.npz'
