## Helper Functions

In [1]:
import numpy as np
import pandas as pd
import gc
from pandas.api.types import is_numeric_dtype

def get_weighted_summary(data: pd.DataFrame, weight: np.ndarray = None) -> pd.DataFrame:
    """
    Generate a weighted summary of numerical columns in a DataFrame.
    
    Args:
        data (pd.DataFrame): The input data.
        weight (np.ndarray): Array of weights for the data. Defaults to equal weights.
    
    Returns:
        pd.DataFrame: A summary DataFrame with statistical details of the numerical columns.
    """
    gc.collect()

    if weight is None:
        weight = np.ones(data.shape[0])  # Assign equal weights if not provided

    # Separate numeric and non-numeric columns
    numeric_cols = [col for col in data.columns if is_numeric_dtype(data[col])]
    non_numeric_cols = [col for col in data.columns if col not in numeric_cols]

    if non_numeric_cols:
        for col in non_numeric_cols:
            print(f"Non-numeric column: {col}, dtype: {data[col].dtype}")

    data_numeric = data[numeric_cols]
    data_array = data_numeric.to_numpy()
    
    # Clipping to ensure non-negative values (if needed)
    clipped_data = np.clip(data_array, 0, None)
    
    # Weighted statistics
    from statsmodels.stats.weightstats import DescrStatsW
    wt = DescrStatsW(clipped_data, weights=weight)
    mean = wt.mean
    std = wt.std_mean

    quantiles = wt.quantile([0.01, 0.025, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.975, 0.99], return_pandas=False).T

    # Additional statistics
    min_values = np.nanmin(data_array, axis=0)
    max_values = np.nanmax(data_array, axis=0)
    missing_counts = np.isnan(data_array).sum(axis=0)
    total_counts = data_array.shape[0]
    non_missing_counts = total_counts - missing_counts
    missing_percentage = (missing_counts / total_counts * 100).round(2)
    unique_counts = data_numeric.nunique()
    modes = data_numeric.mode().iloc[0]

    # Compile summary
    summary = pd.DataFrame(
        data=np.column_stack([
            total_counts, unique_counts, non_missing_counts, missing_counts,
            missing_percentage, mean, std, modes, min_values, quantiles, max_values
        ]),
        index=data_numeric.columns,
        columns=[
            "count", "unique", "non_missing", "missing", "missing_percent",
            "mean", "std", "mode", "min", "1%", "2.5%", "5%", "10%",
            "25%", "50%", "75%", "90%", "95%", "97.5%", "99%", "max"
        ]
    )
    gc.collect()
    return summary


def optimize_data_types(data: pd.DataFrame, exclude: list = []) -> pd.DataFrame:
    """
    Optimize numeric columns to the smallest appropriate integer type to save memory.
    
    Args:
        data (pd.DataFrame): Input DataFrame.
        exclude (list): List of columns to exclude from optimization.
    
    Returns:
        pd.DataFrame: Optimized DataFrame.
    """
    gc.collect()
    original_size = data.memory_usage(index=True).sum() / 1024**3
    print(f"Original dataset size: {original_size:.4f} GB.")

    numeric_cols = [col for col in data.columns if is_numeric_dtype(data[col]) and col not in exclude]
    non_numeric_cols = [col for col in data.columns if not is_numeric_dtype(data[col])]

    # Determine minimum and maximum values for numeric columns
    min_max = data[numeric_cols].agg(['min', 'max']).T
    min_max['optimal_type'] = np.nan

    # Assign optimal integer types
    for dtype, bounds in {
        'int8': (-128, 127),
        'int16': (-32_768, 32_767),
        'int32': (-2_147_483_648, 2_147_483_647)
    }.items():
        min_max.loc[
            (min_max['min'] >= bounds[0]) & (min_max['max'] <= bounds[1]) & min_max['optimal_type'].isna(),
            'optimal_type'
        ] = dtype

    # Convert columns to optimal types
    for col, dtype in min_max['optimal_type'].dropna().items():
        data[col] = data[col].astype(dtype)

    # Report non-numeric and excluded columns
    if non_numeric_cols:
        print("Non-numeric columns:", non_numeric_cols)
    if exclude:
        print("Excluded columns:", exclude)

    reduced_size = data.memory_usage(index=True).sum() / 1024**3
    print(f"Reduced dataset size: {reduced_size:.4f} GB.")
    print(f"Memory reduction: {100 * (original_size - reduced_size) / original_size:.2f}%.")

    gc.collect()
    return data


def check_missing_values(data: pd.DataFrame) -> pd.DataFrame:
    """
    Identify columns with missing values and compute missing statistics.
    
    Args:
        data (pd.DataFrame): Input DataFrame.
    
    Returns:
        pd.DataFrame: A summary DataFrame with missing value counts and percentages.
    """
    gc.collect()
    missing = data.isnull().sum()
    missing_percentage = (missing / len(data) * 100).round(2)

    missing_summary = pd.DataFrame({
        "missing_count": missing,
        "missing_percent": missing_percentage
    }).query("missing_count > 0")

    gc.collect()
    return missing_summary.sort_values(by="missing_percent", ascending=False)


## Import Data

In [2]:
import pandas as pd
data_dir = '/home/sagemaker-user/Data/'
model_dir = '/home/sagemaker-user/Models/'
log_dir = '/home/sagemaker-user/Logs/'

In [2]:
chunk_size = 10000  # Adjust based on your system's capacity

# Initialize an empty list to store data
data_chunks = []

# Read the JSONL file in chunks
for chunk in pd.read_json(data_dir + 'raw_data/Pet_Supplies.jsonl', lines=True, chunksize=chunk_size):
    data_chunks.append(chunk)

# Combine all chunks into a single DataFrame
user_reviews = pd.concat(data_chunks, ignore_index=True)


In [3]:
user_reviews.head()

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,4,Sticky stair riser tread thingies are utterly ...,"Tried to load photos, but none of my photos or...",[],B084SXF9Y8,B0BHTBS5RM,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2023-02-04 16:48:49.744,0,True
1,1,Dangerous bc metal not properly coated! Rough ...,Where to begin? I’ve been trying to get the 2...,[{'small_image_url': 'https://m.media-amazon.c...,B000QFWCJ6,B0BJ16KKML,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2022-10-11 18:41:13.369,4,True
2,3,Arrived damaged/dented/rusted,Unfortunately mine arrived damaged/dented whic...,[{'small_image_url': 'https://m.media-amazon.c...,B08B875X4H,B0BX76YVP9,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2022-06-25 00:55:16.820,4,True
3,5,My pups love these!,My pups love these! It’s one of their favorit...,[],B01MFG9AG7,B0BM6V2SH8,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2020-05-20 00:17:18.188,0,True
4,3,My pups refuse to eat them.,"Idk why, but my pups will not eat either flavo...",[],B00KRMMJV4,B0986BSRB1,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2020-04-25 22:21:43.681,0,True


In [4]:
# Check missing
user_reviews.isnull().sum()

rating               0
title                0
text                 0
images               0
asin                 0
parent_asin          0
user_id              0
timestamp            0
helpful_vote         0
verified_purchase    0
dtype: int64

In [5]:
user_reviews=user_reviews[['rating', 'title', 'text', 'asin', 'parent_asin', 'user_id',
       'timestamp', 'helpful_vote', 'verified_purchase']]

In [6]:
user_reviews['text'][0]

'Tried to load photos, but none of my photos or videos ever want to load on amazon. Weird huh? Bought this as “like new” from amazon. It’s not. Whoever returned it got dog hair all over the sticky stair riser treads & they will not stick to the plastic stairs so neither of my service dogs will use them although it’s identical to the last stairs we owned.  Mine is garbage bc it was previously used.'

In [7]:
user_reviews['title'][0]

'Sticky stair riser tread thingies are utterly useless.'

In [8]:
# Check data distribution
rating_counts = user_reviews['rating'].value_counts()
asin_counts = user_reviews['asin'].value_counts()
parent_asin_counts = user_reviews['parent_asin'].value_counts()
user_id_counts = user_reviews['user_id'].value_counts()
helpful_vote_counts = user_reviews['helpful_vote'].value_counts()
verified_purchase_counts = user_reviews['verified_purchase'].value_counts()

# Display the result
value_counts_dict = {
    'rating': rating_counts,
    'asin': asin_counts,
    'parent_asin': parent_asin_counts,
    'user_id': user_id_counts,
    'helpful_vote': helpful_vote_counts,
    'verified_purchase': verified_purchase_counts
}

value_counts_dict


{'rating': rating
 5    10792183
 1     2034198
 4     1801732
 3     1242552
 2      957197
 Name: count, dtype: int64,
 'asin': asin
 B00BAGTNAQ    23457
 B00ZGPI3OY    18237
 B0009X29WK    17569
 B00CKFL93K    16663
 B00B8CG602    14280
               ...  
 B09ZSGQTNX        1
 B0B5ZN83W4        1
 B0B593RW98        1
 B0BX31VR8J        1
 B0C23N8TPG        1
 Name: count, Length: 773058, dtype: int64,
 'parent_asin': parent_asin
 B0BJ16KKML    38740
 B017UQCB3A    32502
 B082L3QQCB    23893
 B0BNVLH8BF    23457
 B0C2JN4KLD    21980
               ...  
 B0CD4DV88Z        1
 B0BHY1GBYR        1
 B07H4ZS1K2        1
 B0047MH4J2        1
 B00L9JI6PM        1
 Name: count, Length: 492726, dtype: int64,
 'user_id': user_id
 AG73BVBKUOH22USSFJA5ZWL7AKXA    1149
 AHPOHKN4PU4W3V5PGFL7AGTAD2AA     965
 AH665SQ6SQF6DXAGYIQFCX76LALA     678
 AEYVPPWR4CIKWX4BGYKCBCDL2CZQ     598
 AHEMJ62SUJPUYNWGROPI6MUAYQ5A     596
                                 ... 
 AEOBL54VTR5WLK75Y4SIYNADR46Q       1
 

In [9]:
# read the JSON Lines file

chunk_size = 10000

data_chunks = []

for chunk in pd.read_json(data_dir + 'raw_data//meta_Pet_Supplies.jsonl', lines=True, chunksize=chunk_size):
    data_chunks.append(chunk)

# Combine all chunks into a single DataFrame
df_meta_pet_supplies = pd.concat(data_chunks, ignore_index=True)

In [10]:
df_meta_pet_supplies.head(1)

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,Pet Supplies,Hurtta Pet Collection 14-Inch Padded Y-Harness...,4.4,166,"[Made from highly durable Neoprene, Fitted wit...",[Hurtta harnesses are suitable for active walk...,24.95,[{'thumb': 'https://m.media-amazon.com/images/...,[],Hurtta,"[Pet Supplies, Dogs, Collars, Harnesses & Leas...","{'Size': '14 Inch', 'Color': 'Pink', 'Pattern'...",B00XJG2SLG,,,


In [11]:
df_meta_pet_supplies['description'][1]

[]

In [12]:
len(df_meta_pet_supplies['description'][1])

0

In [13]:
df_meta_pet_supplies.isnull().sum()

main_category       30830
title                   0
average_rating          0
rating_number           0
features                0
description             0
price              331254
images                  0
videos                  0
store                1659
categories              0
details                 0
parent_asin             0
bought_together    492798
subtitle           492710
author             492758
dtype: int64

## Text data cleaning

In [14]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

# Download necessary NLTK resources
nltk.download('punkt')  # Tokenizer data
nltk.download('stopwords')  # List of common stopwords
nltk.download('wordnet')  # WordNet data for lemmatization

# Initialize Lemmatizer and stopwords set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text):
    # Convert lists to strings if necessary
    if isinstance(text, list):
        text = ' '.join(text)
    
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    processed_text = ' '.join(tokens)
    return processed_text


[nltk_data] Downloading package punkt to /home/sagemaker-
[nltk_data]     user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/sagemaker-
[nltk_data]     user/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/sagemaker-
[nltk_data]     user/nltk_data...


In [15]:
nltk.download('punkt_tab')

# Apply the preprocessing function to the specified columns
df_meta_pet_supplies['description_processed'] = df_meta_pet_supplies['description'].apply(preprocess_text)
df_meta_pet_supplies['features_processed'] = df_meta_pet_supplies['features'].apply(preprocess_text)
df_meta_pet_supplies['categories_processed'] = df_meta_pet_supplies['categories'].apply(preprocess_text)

# Display the original and processed descriptions for verification
df_meta_pet_supplies[['description', 'description_processed']].head()

[nltk_data] Downloading package punkt_tab to /home/sagemaker-
[nltk_data]     user/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,description,description_processed
0,[Hurtta harnesses are suitable for active walk...,hurtta harness suitable active walk dog especi...
1,[],
2,[Micron filter bags provide excellent mechanic...,micron filter bag provide excellent mechanical...
3,[],
4,[],


In [16]:
df_meta_pet_supplies.columns

Index(['main_category', 'title', 'average_rating', 'rating_number', 'features',
       'description', 'price', 'images', 'videos', 'store', 'categories',
       'details', 'parent_asin', 'bought_together', 'subtitle', 'author',
       'description_processed', 'features_processed', 'categories_processed'],
      dtype='object')

In [17]:
df_meta_pet_supplies.dropna(subset=['description_processed'],inplace=True)

In [18]:
# filter the data with main_category as Pet Supplies
df_meta_pet_supplies = df_meta_pet_supplies[df_meta_pet_supplies['main_category'] == 'Pet Supplies']

# check data
df_meta_pet_supplies.head()

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author,description_processed,features_processed,categories_processed
0,Pet Supplies,Hurtta Pet Collection 14-Inch Padded Y-Harness...,4.4,166,"[Made from highly durable Neoprene, Fitted wit...",[Hurtta harnesses are suitable for active walk...,24.95,[{'thumb': 'https://m.media-amazon.com/images/...,[],Hurtta,"[Pet Supplies, Dogs, Collars, Harnesses & Leas...","{'Size': '14 Inch', 'Color': 'Pink', 'Pattern'...",B00XJG2SLG,,,,hurtta harness suitable active walk dog especi...,made highly durable neoprene fitted efficient ...,pet supply dog collar harness leash harness ve...
1,Pet Supplies,"Raised Dog Bowls,6 Inch Ceramic Dog Bowl Dish,...",4.6,100,[【Two Bowls+Metal Stand+Dog food mat】Really pr...,[],32.88,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Ihoming Ceramic Bowls for Dogs and...,FIVEAGE,"[Pet Supplies, Dogs, Feeding & Watering Suppli...","{'Material': 'Ceramic', 'Target Species': 'Cat...",B0BD6TXL2G,,,,,two bowlsmetal standdog food matreally practic...,pet supply dog feeding watering supply bowl di...
2,Pet Supplies,4 Pack - 4 Inch Ring Filter Socks 200 Micron -...,4.4,84,[Micron filter bags provide excellent mechanic...,[Micron filter bags provide excellent mechanic...,15.0,[{'thumb': 'https://m.media-amazon.com/images/...,[],Encompass All,[],"{'Is Discontinued By Manufacturer': 'No', 'Pro...",B01MQTWB5H,,,,micron filter bag provide excellent mechanical...,micron filter bag provide excellent mechanical...,
3,Pet Supplies,"SlowTon Dog Vest Harness, Mesh Breathable Pet ...",4.5,348,[New Match and Well Made --- The chest part of...,[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],SlowTon,"[Pet Supplies, Dogs, Collars, Harnesses & Leas...","{'Size': 'Large', 'Color': 'Purple', 'Pattern'...",B07DYM6LXD,,,,,new match well made chest part harness adopted...,pet supply dog collar harness leash harness ve...
4,Pet Supplies,Cat Window Perch Durable Cat Hammock Seat for ...,4.4,130,[【Ideal for Use Year-around】Cat window perch c...,[],23.99,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Cat Window Perch Beige Assembly Vi...,Mewoo,"[Pet Supplies, Cats, Beds & Furniture, Hammocks]","{'Brand': 'Mewoo', 'Color': 'Blue,Grey,Beige',...",B09ZTMTS6N,,,,,ideal use yeararoundcat window perch cover mad...,pet supply cat bed furniture hammock


In [19]:
# delete rows without desc
df_meta_pet_supplies = df_meta_pet_supplies[df_meta_pet_supplies['description'].apply(lambda x: len(x) > 0)]

In [20]:
df_meta_pet_supplies_new=df_meta_pet_supplies[['parent_asin','description_processed','features_processed','categories']]

In [21]:
df_meta_pet_supplies_new['text']=df_meta_pet_supplies_new['description_processed']+df_meta_pet_supplies_new['features_processed']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_meta_pet_supplies_new['text']=df_meta_pet_supplies_new['description_processed']+df_meta_pet_supplies_new['features_processed']


In [22]:
df_meta_pet_supplies.head(10)

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author,description_processed,features_processed,categories_processed
0,Pet Supplies,Hurtta Pet Collection 14-Inch Padded Y-Harness...,4.4,166,"[Made from highly durable Neoprene, Fitted wit...",[Hurtta harnesses are suitable for active walk...,24.95,[{'thumb': 'https://m.media-amazon.com/images/...,[],Hurtta,"[Pet Supplies, Dogs, Collars, Harnesses & Leas...","{'Size': '14 Inch', 'Color': 'Pink', 'Pattern'...",B00XJG2SLG,,,,hurtta harness suitable active walk dog especi...,made highly durable neoprene fitted efficient ...,pet supply dog collar harness leash harness ve...
2,Pet Supplies,4 Pack - 4 Inch Ring Filter Socks 200 Micron -...,4.4,84,[Micron filter bags provide excellent mechanic...,[Micron filter bags provide excellent mechanic...,15.0,[{'thumb': 'https://m.media-amazon.com/images/...,[],Encompass All,[],"{'Is Discontinued By Manufacturer': 'No', 'Pro...",B01MQTWB5H,,,,micron filter bag provide excellent mechanical...,micron filter bag provide excellent mechanical...,
8,Pet Supplies,Wysong Optimal Adult Canine Formula Dry Dog Fo...,4.4,75,[42% Protein With High Levels Of Fresh/Frozen ...,[For the past 35 years the Wysong goal and com...,17.46,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Helped Our Cat That Has a Sensitiv...,Wysong,"[Pet Supplies, Dogs, Food, Dry]","{'Is Discontinued By Manufacturer': 'No', 'Pro...",B00F3JRLYQ,,,,past year wysong goal commitment reconcile mod...,protein high level freshfrozen dried meat natu...,pet supply dog food dry
9,Pet Supplies,"Vitakraft Guinea Pig Happy Frutti Treat, 6 Oun...",4.1,20,[A Colorful Mix Of Tropical Fruits Including F...,[Vitakraft's Happy Frutti treats for Guinea Pi...,,[{'thumb': 'https://m.media-amazon.com/images/...,[],Vitakraft,"[Pet Supplies, Small Animals, Treats]","{'Brand': 'Vitakraft', 'Flavor': 'Banana', 'Ag...",B0006L145S,,,,vitakrafts happy frutti treat guinea pig made ...,colorful mix tropical fruit including fig apri...,pet supply small animal treat
11,Pet Supplies,K9PROLINE [Waterproof] Professional Dog Leash ...,4.6,9,[Waterproof All Terrain Leash- This leash is c...,[The Alpha Series is our flagship line of prof...,,[{'thumb': 'https://m.media-amazon.com/images/...,[],K9PROLINE,"[Pet Supplies, Dogs, Collars, Harnesses & Leas...","{'Color': 'Black', 'Brand': 'K9PROLINE', 'Mate...",B074SVVSK7,,,,alpha series flagship line professional grade ...,waterproof terrain leash leash constructed gen...,pet supply dog collar harness leash leash basi...
13,Pet Supplies,Shed Pal Pet Hair Remover Dog Cat Grooming Vac...,2.5,48,[Shed Pal Pet Hair Remover Dog Cat Grooming Va...,[Shed Pal As Seen On TV Pet Hair Remover Dog C...,26.99,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Does the hair vacuum work on all d...,Unknown,"[Pet Supplies, Cats, Grooming, Hair Removal Mi...","{'Is Discontinued By Manufacturer': 'No', 'Pac...",B01LLXU360,,,,shed pal seen tv pet hair remover dog cat groo...,shed pal pet hair remover dog cat grooming vac...,pet supply cat grooming hair removal mitt roller
16,Pet Supplies,"Kaytee 3 Pack of Carrot Carousel Chew Toys, La...",4.4,93,"[3 large Carrot Carousel Chew Toys, For rabbit...",[Provide your pet with hours of fun activity. ...,10.57,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Wood Chews Combo', 'url': 'https:/...",Kaytee,"[Pet Supplies, Small Animals, Toys]","{'Is Discontinued By Manufacturer': 'No', 'Pro...",B01LY4VFTR,,,,provide pet hour fun activity carousel chew to...,large carrot carousel chew toy rabbit guinea p...,pet supply small animal toy
19,Pet Supplies,Slow Feeder Dog Bowls Insert - Slow Feeder Ins...,3.7,21,[TURNING DOG BOWLS INTO SLOW FEEDERS WITH ONE ...,[SOME TIPS TO KNOW 1)Top rack dishwasher safe ...,14.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Dog Slow Feeder Bowl', 'url': 'htt...",Oborlo,"[Pet Supplies, Dogs, Feeding & Watering Suppli...","{'Material': 'Silicone', 'Target Species': 'Ca...",B09WYP5TMM,,,,tip know top rack dishwasher safe best cleanin...,turning dog bowl slow feeder one click want ma...,pet supply dog feeding watering supply bowl di...
20,Pet Supplies,Cichlid Gold Fish Food Large Pellet Floating T...,4.7,716,"[Hikari Sales Usa Inc, LARGE, 8.8 OUNCE]",[Cichlid Gold fish food large pellet floating ...,9.62,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Hikari Cichlid Gold Fish Food', 'u...",Hikari,"[Pet Supplies, Fish & Aquatic Pets, Food]","{'Is Discontinued By Manufacturer': 'No', 'Pro...",B008NSGB5E,,,,cichlid gold fish food large pellet floating t...,hikari sale usa inc large ounce,pet supply fish aquatic pet food
23,Pet Supplies,Jxinrong Mushrooms Spiked Rivet Studded Adjust...,4.4,299,"[Material: PU Leather + Alloy, 3 Rows reto riv...",[Please check the size before you confirm the ...,12.98,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Black collar with gold spikes', 'u...",Jxinrong,"[Pet Supplies, Dogs, Collars, Harnesses & Leas...","{'Pattern': 'Solid', 'Color': 'Coffee', 'Mater...",B089QQHKVD,,,,please check size confirm order always feature...,material pu leather alloy row reto rivet solid...,pet supply dog collar harness leash collar bas...


In [23]:
df_meta_pet_supplies_new=df_meta_pet_supplies_new[['parent_asin','text','categories']]

In [24]:
df_meta_pet_supplies_new.head()

Unnamed: 0,parent_asin,text,categories
0,B00XJG2SLG,hurtta harness suitable active walk dog especi...,"[Pet Supplies, Dogs, Collars, Harnesses & Leas..."
2,B01MQTWB5H,micron filter bag provide excellent mechanical...,[]
8,B00F3JRLYQ,past year wysong goal commitment reconcile mod...,"[Pet Supplies, Dogs, Food, Dry]"
9,B0006L145S,vitakrafts happy frutti treat guinea pig made ...,"[Pet Supplies, Small Animals, Treats]"
11,B074SVVSK7,alpha series flagship line professional grade ...,"[Pet Supplies, Dogs, Collars, Harnesses & Leas..."


In [25]:
user_reviews.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,4,Sticky stair riser tread thingies are utterly ...,"Tried to load photos, but none of my photos or...",B084SXF9Y8,B0BHTBS5RM,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2023-02-04 16:48:49.744,0,True
1,1,Dangerous bc metal not properly coated! Rough ...,Where to begin? I’ve been trying to get the 2...,B000QFWCJ6,B0BJ16KKML,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2022-10-11 18:41:13.369,4,True
2,3,Arrived damaged/dented/rusted,Unfortunately mine arrived damaged/dented whic...,B08B875X4H,B0BX76YVP9,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2022-06-25 00:55:16.820,4,True
3,5,My pups love these!,My pups love these! It’s one of their favorit...,B01MFG9AG7,B0BM6V2SH8,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2020-05-20 00:17:18.188,0,True
4,3,My pups refuse to eat them.,"Idk why, but my pups will not eat either flavo...",B00KRMMJV4,B0986BSRB1,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2020-04-25 22:21:43.681,0,True


In [26]:
user_reviews_new=user_reviews[['rating','asin','parent_asin','user_id','timestamp']]

In [27]:
# Merging based on 'parent_asin'
merged_data = pd.merge(user_reviews_new, df_meta_pet_supplies_new, on='parent_asin', how='inner')

In [28]:
merged_data.isnull().sum()

rating         0
asin           0
parent_asin    0
user_id        0
timestamp      0
text           0
categories     0
dtype: int64

In [29]:
merged_data.shape

(8825276, 7)

In [30]:
merged_data.head()

Unnamed: 0,rating,asin,parent_asin,user_id,timestamp,text,categories
0,4,B084SXF9Y8,B0BHTBS5RM,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2023-02-04 16:48:49.744,let cuddle time continue best friend help pets...,"[Pet Supplies, Dogs, Beds & Furniture, Stairs ..."
1,1,B000QFWCJ6,B0BJ16KKML,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2022-10-11 18:41:13.369,life stage folding metal dog crate midwest hom...,"[Pet Supplies, Dogs, Crates, Houses & Pens, Cr..."
2,3,B08B875X4H,B0BX76YVP9,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2022-06-25 00:55:16.820,breeze touch sit haunch holder grooming restra...,"[Pet Supplies, Dogs, Grooming, Shower & Bath S..."
3,5,B01MFG9AG7,B0BM6V2SH8,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2020-05-20 00:17:18.188,bring boost unexpected flavor dog day purina b...,"[Pet Supplies, Dogs, Treats, Bones]"
4,3,B00KRMMJV4,B0986BSRB1,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2020-04-25 22:21:43.681,perfect complement wilderness food blue wild b...,"[Pet Supplies, Dogs, Treats, Cookies, Biscuits..."


In [31]:
merged_data.to_pickle(data_dir+"clean_data/merged_data.pkl")

## Data Filtering

In [36]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25ldone
[?25h  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=593614 sha256=a9cb1f7b534073d972eb1726eb3ec96dbba1b0faffc2074a51bc98a470df0b9e
  Stored in directory: /home/sagemaker-user/.cache/pip/wheels/2a/8f/6e/7e2899163e2d85d8266daab4aa1cdabec7a6c56f83c015b5af
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4


In [3]:
import pandas as pd

# Load the data
merged_data = pd.read_pickle(data_dir+"clean_data/merged_data.pkl")

In [4]:
merged_data.isnull().sum()

rating         0
asin           0
parent_asin    0
user_id        0
timestamp      0
text           0
categories     0
dtype: int64

In [5]:
# dedupe
filtered_data = merged_data.drop_duplicates(subset=['user_id', 'parent_asin', 'timestamp'])
filtered_data.shape

(8748707, 7)

In [6]:
import pandas as pd
import numpy as np
import random
from surprise import Dataset, Reader, SVD, NMF, SlopeOne, CoClustering, SVDpp
from surprise import accuracy
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Pool, cpu_count
from functools import partial
import faiss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
import pickle
from scipy.sparse import csr_matrix, issparse

seed = 42
random.seed(seed)
np.random.seed(seed)

In [7]:
# 1. Filtering low-frequency users and products
user_counts = filtered_data['user_id'].value_counts()
item_counts = filtered_data['parent_asin'].value_counts()

min_user_interactions = 5
min_item_interactions = 5

filtered_users = user_counts[user_counts >= min_user_interactions].index
filtered_items = item_counts[item_counts >= min_item_interactions].index

filtered_data = filtered_data[
    (filtered_data['user_id'].isin(filtered_users)) & 
    (filtered_data['parent_asin'].isin(filtered_items))
].copy()

print(f"Volume after filtering: {filtered_data.shape}")
print(f"Number of users: {filtered_data['user_id'].nunique()}")
print(f"Number of products: {filtered_data['parent_asin'].nunique()}")

# 2. Time-based division
filtered_data['timestamp'] = pd.to_datetime(filtered_data['timestamp'])
filtered_data = filtered_data.sort_values(['user_id', 'timestamp']).reset_index(drop=True)
print(f"Timeframe for dataset: {filtered_data['timestamp'].min()} to {filtered_data['timestamp'].max()}")

Volume after filtering: (2412690, 7)
Number of users: 290493
Number of products: 90965
Timeframe for dataset: 2001-09-27 04:57:21 to 2023-09-07 23:10:59.670000


In [8]:
filtered_data.rating.value_counts()

rating
5    1594539
4     304448
3     201515
1     188520
2     123668
Name: count, dtype: int64

In [41]:
filtered_data.to_pickle(data_dir+'clean_data/filtered_data.pkl')

## Generate train and test sets

In [3]:
filtered_data = pd.read_pickle(data_dir+'clean_data/filtered_data.pkl')

In [4]:
filtered_data = optimize_data_types(filtered_data)

Original dataset size: 0.1185 GB.
Non-numeric columns: ['asin', 'parent_asin', 'user_id', 'timestamp', 'text', 'categories']
Reduced dataset size: 0.1037 GB.
Memory reduction: 12.50%.


  min_max.loc[


In [None]:
import numpy as np
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Pool, cpu_count
from functools import partial
from tqdm.auto import tqdm
import pandas as pd
import gc  # For garbage collection

# Function to generate train and test samples for a given user (unchanged)
def generate_samples(user, user_positive_items, all_items, user_item_rating, user_item_timestamp, split_ratio=0.2, min_test_samples=10, seed=None, negative_ratio=1):
    if seed is not None:
        np.random.seed(seed)
    
    train_samples = []
    test_samples = []
    
    positive_items = list(user_positive_items.get(user, set()))
    num_positive = len(positive_items)
    
    if num_positive == 0:
        return train_samples, test_samples
    
    num_test = max(min_test_samples, int(num_positive * split_ratio))
    num_test = min(num_test, num_positive - 1)
    
    test_positive = positive_items[-num_test:]
    train_positive = positive_items[:-num_test]
    
    for item in train_positive:
        train_samples.append({
            'user_id': user,
            'parent_asin': item,
            'label': 1,
            'rating': user_item_rating.get((user, item), 0),
            'timestamp': user_item_timestamp.get((user, item))
        })
    
    if train_positive:
        negative_pool_train = all_items - set(train_positive) - set(test_positive)
        num_train_neg = len(train_positive) * negative_ratio
        negative_items_train = np.random.choice(
            list(negative_pool_train), size=num_train_neg, replace=(len(negative_pool_train) < num_train_neg)
        )
        for idx, item in enumerate(negative_items_train):
            train_samples.append({
                'user_id': user,
                'parent_asin': item,
                'label': 0,
                'rating': 0,
                'timestamp': user_item_timestamp.get((user, train_positive[idx % len(train_positive)]))
            })
    
    for item in test_positive:
        test_samples.append({
            'user_id': user,
            'parent_asin': item,
            'label': 1,
            'rating': user_item_rating.get((user, item), 0),
            'timestamp': user_item_timestamp.get((user, item))
        })
    
    if test_positive:
        negative_pool_test = all_items - set(train_positive) - set(test_positive)
        num_test_neg = len(test_positive) * negative_ratio
        negative_items_test = np.random.choice(
            list(negative_pool_test), size=num_test_neg, replace=(len(negative_pool_test) < num_test_neg)
        )
        for idx, item in enumerate(negative_items_test):
            test_samples.append({
                'user_id': user,
                'parent_asin': item,
                'label': 0,
                'rating': 0,
                'timestamp': user_item_timestamp.get((user, test_positive[idx % len(test_positive)]))
            })
    
    return train_samples, test_samples

# Function to process a batch of users using multithreading
def process_batch_generate_samples(batch_users, user_positive_items, all_items, user_item_rating, user_item_timestamp, split_ratio=0.2, min_test_samples=10, seed=None):
    train_samples = []
    test_samples = []
    with ThreadPoolExecutor(max_workers=16) as executor:
        futures = [
            executor.submit(
                generate_samples, user, user_positive_items, all_items, user_item_rating, user_item_timestamp, split_ratio, min_test_samples, seed
            ) for user in batch_users
        ]
        for future in tqdm(futures, desc="Processing users in batch"):
            train_batch, test_batch = future.result()
            train_samples.extend(train_batch)
            test_samples.extend(test_batch)
    return train_samples, test_samples

# Function to split the dataset into train and test sets using multiprocessing and save them incrementally
def parallel_generate_split_set(filtered_data, split_ratio=0.2, min_test_samples=10, num_processes=None, batch_size=10000, output_dir="./"):
    if num_processes is None:
        num_processes = cpu_count()
    
    all_users = filtered_data['user_id'].unique()
    all_items = set(filtered_data['parent_asin'].unique())
    
    user_positive_items = filtered_data.groupby('user_id')['parent_asin'].apply(list).to_dict()
    user_item_rating = filtered_data.set_index(['user_id', 'parent_asin'])['rating'].to_dict()
    user_item_timestamp = filtered_data.set_index(['user_id', 'parent_asin'])['timestamp'].to_dict()
    
    batches = [all_users[i:i + batch_size] for i in range(0, len(all_users), batch_size)]
    
    partial_generate = partial(
        process_batch_generate_samples,
        user_positive_items=user_positive_items,
        all_items=all_items,
        user_item_rating=user_item_rating,
        user_item_timestamp=user_item_timestamp,
        split_ratio=split_ratio,
        min_test_samples=min_test_samples,
        seed=None
    )
    
    print("Generating train and test sets using multiprocessing and multithreading...")
    
    # Use multiprocessing to process batches in parallel
    with Pool(processes=num_processes) as pool:
        results = tqdm(pool.imap(partial_generate, batches), total=len(batches), desc="Processing batches")
        
        # Save results incrementally
        train_file = f"{output_dir}/train_data.csv"
        test_file = f"{output_dir}/test_data.csv"
        
        for i, (train_batch, test_batch) in enumerate(results):
            train_df = pd.DataFrame(train_batch)
            test_df = pd.DataFrame(test_batch)
            
            if i == 0:
                train_df.to_csv(train_file, mode='w', index=False, header=True)
                test_df.to_csv(test_file, mode='w', index=False, header=True)
            else:
                train_df.to_csv(train_file, mode='a', index=False, header=False)
                test_df.to_csv(test_file, mode='a', index=False, header=False)
            
            # Clean up memory
            del train_df, test_df, train_batch, test_batch
            gc.collect()
    
    print(f"Train and test sets saved incrementally to {output_dir}")

# Generate train and test sets
parallel_generate_split_set(filtered_data, split_ratio=0.05, min_test_samples=10, batch_size=10000, output_dir="./output")



Generating train and test sets using multiprocessing and multithreading...


Processing batches:   0%|          | 0/28 [00:00<?, ?it/s]

In [None]:
# save all unique users
all_users = test_df['user_id'].unique()

# save the actual positive sample of user-product pairs
user_actual = test_df[test_df['label'] == 1].groupby('user_id')['parent_asin'].apply(set).to_dict()


In [None]:
import pickle

with open(data_dir+"clean_data/all_users.pkl", "wb") as f:
    pickle.dump(all_users, f)

with open(data_dir+"clean_data/user_actual.pkl", "wb") as f:
    pickle.dump(user_actual, f)
