In [None]:
# !pip install datasets

## Download dataset

In [None]:
from datasets import load_dataset
import pandas as pd

# Import and load dataset
category = 'Books'
## reviews
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", f"raw_review_{category}", trust_remote_code=True)
reviews = pd.DataFrame.from_dict(dataset["full"])
reviews.drop(['title', 'text', 'images', 'timestamp', 'helpful_vote'], axis=1, inplace=True)

## metadata
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", f"raw_meta_{category}", split="full", trust_remote_code=True)
metadata = pd.DataFrame.from_dict(dataset)
metadata.drop(['title', 'average_rating', 'rating_number', 'features', 'description', 'images', 'videos', 'store', 'details', 'bought_together', 'subtitle', 'author'], axis=1, inplace=True)

## Filter items and users

### Remove items that don't have have subcategories

In [None]:
metadata_processed = metadata[metadata['categories'].map(lambda d: len(d)) > 0]
metadata_processed.dropna()
metadata_processed

### Convert price to numeric

In [None]:
metadata_processed = metadata_processed[pd.to_numeric(metadata_processed['price'], errors='coerce').notnull()]
metadata_processed['price'] = metadata_processed['price'].astype(float) 
metadata_processed

### Remove items that don't correspond to the main catgory

In [None]:
metadata_processed = metadata_processed[metadata_processed['main_category'].isin([category])]
metadata_processed


In [None]:
ids_to_keep = metadata_processed['parent_asin'].unique()
reviews_processed = reviews[reviews['parent_asin'].isin(ids_to_keep)]

In [None]:
reviews_processed.to_csv('./data/reviews_processed.csv')
metadata_processed.to_csv('./data/metadata_processed.csv')

In [1]:
import pandas as pd

reviews_processed =  pd.read_csv('./data/reviews_processed.csv',sep=",", index_col=0, )
metadata_processed = pd.read_csv('./data/metadata_processed.csv', sep=",", index_col=0, converters={'categories': pd.eval})

In [4]:
len(reviews_processed['parent_asin'].unique()), len(reviews_processed['user_id'].unique())

(2878115, 7797481)

In [2]:
# reviews_processed_copy = reviews_processed.copy() 
# metadata_processed_copy = metadata_processed.copy()

In [3]:
reviews_processed = reviews_processed_copy.copy() 
metadata_processed = metadata_processed_copy.copy()

## Features of items
### Stats

In [4]:
def extract_features(metadata, reviews):
    features = metadata.copy()
    reviews = reviews.copy()
    name = 'item_'

    # Stats ratings 
    stats = {}
    aggregates = ['mean', 'median', 'min', 'max', 'std', 'size']

    for agg in  aggregates:
        name_agg = 'number' if agg == 'size' else agg
        stats[agg] = reviews.groupby('parent_asin')['rating'].aggregate(agg)
        stats[agg] = pd.DataFrame({'parent_asin': stats[agg].index, f'{name}{name_agg}_ratings': stats[agg].values})

    # Merging all dataframes
    for rating_stat in stats.values():
        features = pd.merge(features, rating_stat, on='parent_asin')
    
    # Replace NaN with 0
    features.fillna(0, inplace=True)

    # Add column index
    features[f'{name}index'] =  features.index

    return features

### Process metadata

In [5]:
features = extract_features(metadata_processed, reviews_processed)
features

Unnamed: 0,main_category,price,categories,parent_asin,item_mean_ratings,item_median_ratings,item_min_ratings,item_max_ratings,item_std_ratings,item_number_ratings,item_index
0,Books,8.23,"[Books, Literature & Fiction, History & Critic...",0701169850,4.000000,4.0,4.0,4.0,0.000000,1,0
1,Books,3.52,"[Books, Reference, Words, Language & Grammar]",0435088688,5.000000,5.0,5.0,5.0,0.000000,1,1
2,Books,17.17,"[Books, Biographies & Memoirs, Leaders & Notab...",0316185361,4.760274,5.0,2.0,5.0,0.578998,146,2
3,Books,7.43,"[Books, Children's Books, Science Fiction & Fa...",0545425573,4.111111,5.0,1.0,5.0,1.364225,9,3
4,Books,4.05,"[Books, Arts & Photography, History & Criticism]",B09PHG4FQ8,5.000000,5.0,5.0,5.0,0.000000,2,4
...,...,...,...,...,...,...,...,...,...,...,...
2878110,Books,36.06,"[Books, Biographies & Memoirs, Community & Cul...",1594483574,3.000000,3.0,3.0,3.0,0.000000,1,2878110
2878111,Books,75.00,"[Books, Cookbooks, Food & Wine, Regional & Int...",9719317051,5.000000,5.0,5.0,5.0,0.000000,1,2878111
2878112,Books,18.39,"[Books, Politics & Social Sciences, Philosophy]",0029051509,5.000000,5.0,5.0,5.0,0.000000,2,2878112
2878113,Books,4.99,"[Books, Reference, Atlases & Maps]",0925873039,5.000000,5.0,5.0,5.0,0.000000,1,2878113


In [6]:
features.columns

Index(['main_category', 'price', 'categories', 'parent_asin',
       'item_mean_ratings', 'item_median_ratings', 'item_min_ratings',
       'item_max_ratings', 'item_std_ratings', 'item_number_ratings',
       'item_index'],
      dtype='object')

### Filter items with given number minimum of ratings

In [8]:
limit = 1
features = features[features['item_number_ratings'] > limit]
features

Unnamed: 0,price,categories,parent_asin,item_mean_ratings,item_median_ratings,item_min_ratings,item_max_ratings,item_std_ratings,item_number_ratings
2,17.17,"[Books, Biographies & Memoirs, Leaders & Notab...",0316185361,4.760274,5.0,2.0,5.0,0.578998,146
3,7.43,"[Books, Children's Books, Science Fiction & Fa...",0545425573,4.111111,5.0,1.0,5.0,1.364225,9
4,4.05,"[Books, Arts & Photography, History & Criticism]",B09PHG4FQ8,5.000000,5.0,5.0,5.0,0.000000,2
6,13.43,"[Books, Engineering & Transportation, Engineer...",1680450263,4.742105,5.0,1.0,5.0,0.652075,190
7,14.00,"[Books, Literature & Fiction, Genre Fiction]",1694621731,5.000000,5.0,5.0,5.0,0.000000,3
...,...,...,...,...,...,...,...,...,...
2878097,4.33,"[Books, Arts & Photography, Photography & Video]",0578815923,5.000000,5.0,5.0,5.0,0.000000,2
2878098,6.49,"[Books, Education & Teaching, Schools & Teaching]",1934338745,4.500000,4.5,4.0,5.0,0.707107,2
2878104,19.00,"[Books, Literature & Fiction, Genre Fiction]",1496716388,4.500000,4.5,4.0,5.0,0.707107,2
2878107,11.74,"[Books, Business & Money, Business Culture]",1717788157,4.571429,5.0,2.0,5.0,1.133893,7


In [9]:
ids_to_keep = features['parent_asin'].unique()
reviews_processed = reviews_processed[reviews_processed['parent_asin'].isin(ids_to_keep)]

In [10]:
reviews_processed

Unnamed: 0,rating,asin,parent_asin,user_id,verified_purchase
1,5.0,0593235657,0593235657,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,True
2,5.0,1782490671,1782490671,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,True
4,5.0,0823098079,0823098079,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,True
6,4.0,1640210148,1640210148,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,False
7,5.0,1784881953,1784881953,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,False
...,...,...,...,...,...
29475446,5.0,0972842136,0972842136,AFXLUB6LNBFJYKEJSVCYE7FD7K4A,True
29475447,5.0,0805087761,0805087761,AFXLUB6LNBFJYKEJSVCYE7FD7K4A,True
29475448,5.0,0805087761,0805087761,AFJJ2SIZBI4VCHQAMFAFMX7K6MEQ,True
29475449,5.0,0736427163,0736427163,AETEIROGXZZZPVDNB6BWU5JDVKMA,True


#### Keep only users that reviewed more than a certain limit of items

In [11]:
number_reviews = reviews_processed.groupby('user_id')['rating'].aggregate('size')
number_reviews = pd.DataFrame({'userId': number_reviews.index, f'number_reviews_user': number_reviews.values})
number_reviews

Unnamed: 0,userId,number_reviews_user
0,AE2222HXKLMAEK4SG56OW23V3LTA,1
1,AE22236AFRRSMQIKGG7TPTB75QEA,5
2,AE2223EZBHNIO5LHPKDLXBL3IMQQ,1
3,AE22247VI5Y625HSC52JQQBGXCAQ,1
4,AE2224D3S4GTKVFJ5V7ZRQJ7P4FQ,2
...,...,...
7482311,AHZZZYCUOTRYW4ZQFIFAEZGBOY4A,1
7482312,AHZZZYE2256FFHPFB54DUDOQL3IA,4
7482313,AHZZZZ76NI5YF4RP5TKCQRGRQAGA,1
7482314,AHZZZZUMHA57YCBU4WFINWREUBKQ,1


In [12]:
## number maximum and minimum of items reviewed by a user

max(number_reviews['number_reviews_user']), min(number_reviews['number_reviews_user'])

(3404, 1)

In [13]:
nb_min = 350

number_reviews = number_reviews[(number_reviews['number_reviews_user'] > nb_min)]
number_reviews = number_reviews.drop(columns='number_reviews_user')
number_reviews

Unnamed: 0,userId
10159,AE27NHMO2VWDSCCEP4A53GTX7YJA
14818,AE2C6ZDFCI2EW5EDM53RYKP7WHNQ
20775,AE2FGO5WED4FEC53K57X4DHCEPIQ
43213,AE2RLSWPDFQRKKED246OZZX3NQCA
51393,AE2W3CFIRW42PHPYRNRUCIR4X6BA
...,...
7390168,AHYHOC5AU2TSVZLWIFVNMUVB5YBA
7434573,AHZ7XZQVCIWUY6ONMGNMADRXW3WA
7457925,AHZMSBFDOWI6UVLSWC5K7VTEEMCQ
7476506,AHZWVIBN5WVIZP5IULWPKEFJN2AA


In [14]:
ids_users_to_keep = number_reviews['userId'].unique()
reviews_processed = reviews_processed[reviews_processed['user_id'].isin(ids_users_to_keep)]

In [15]:
features.to_csv(f'./data/features_nb-ratings-item-min={limit}.csv')
reviews_processed.to_csv(f'./data/reviews_processed_nb-ratings-item-min={limit}_nb-ratings-user-min={nb_min}.csv')

In [None]:
# import pandas as pd

# limit = 1
# nb_min = 250
# features = pd.read_csv(f'./data/features_nb-ratings-item-min={limit}.csv', sep=",", index_col=0,)
# reviews_processed = pd.read_csv(f'./data/reviews_processed_nb-ratings-item-min={limit}_nb-ratings-user-min={nb_min}.csv', sep=",", index_col=0,)


### Encoding categorical attributes

In [16]:
# Number of unique categories
len(features['categories'].explode().unique())

711

In [18]:
from sklearn.preprocessing import MultiLabelBinarizer


# Encode boolean to int
reviews_processed['verified_purchase'] = reviews_processed['verified_purchase'].astype(int)

# Encode categories
# Drop unused features
features.drop(['main_category', 'item_index'], axis=1, inplace=True)

## Categories To "one hot vectors"
mlb = MultiLabelBinarizer()
features = features.join(pd.DataFrame(mlb.fit_transform(features.pop('categories')),
                                            columns=mlb.classes_,
                                            index=features.index))
features

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_processed['verified_purchase'] = reviews_processed['verified_purchase'].astype(int)


Unnamed: 0,price,parent_asin,item_mean_ratings,item_median_ratings,item_min_ratings,item_max_ratings,item_std_ratings,item_number_ratings,2011 Best Books of the Year,2023 Calendars,...,Women's Health,Women's Studies,"Words, Language & Grammar",World,World Literature,Worship & Devotion,Writing Stationery,"Writing, Research & Publishing Guides",eDocs,tech.book(store)
2,17.17,0316185361,4.760274,5.0,2.0,5.0,0.578998,146,0,0,...,0,0,0,0,0,0,0,0,0,0
3,7.43,0545425573,4.111111,5.0,1.0,5.0,1.364225,9,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4.05,B09PHG4FQ8,5.000000,5.0,5.0,5.0,0.000000,2,0,0,...,0,0,0,0,0,0,0,0,0,0
6,13.43,1680450263,4.742105,5.0,1.0,5.0,0.652075,190,0,0,...,0,0,0,0,0,0,0,0,0,0
7,14.00,1694621731,5.000000,5.0,5.0,5.0,0.000000,3,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2878097,4.33,0578815923,5.000000,5.0,5.0,5.0,0.000000,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2878098,6.49,1934338745,4.500000,4.5,4.0,5.0,0.707107,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2878104,19.00,1496716388,4.500000,4.5,4.0,5.0,0.707107,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2878107,11.74,1717788157,4.571429,5.0,2.0,5.0,1.133893,7,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
for column in features.columns:
    print(column)

price
parent_asin
item_mean_ratings
item_median_ratings
item_min_ratings
item_max_ratings
item_std_ratings
item_number_ratings
2011 Best Books of the Year
2023 Calendars
7 Day Free Trial Textbooks
ASINs for HQP
Abuse
Accounting
Acrobat 9
Acrobat X
Action & Adventure
Activities, Crafts & Games
Addiction & Recovery
Address Books
Administration & Medicine Economics
Administrative Law
Adobe Store
Adoption
Advanced
Africa
Age 5
Ages Baby-2
Aging
Aging Parents
Agnosticism
Agricultural Sciences
All Deals
All-American
Allied Health Professions
Almanacs & Yearbooks
Alternative Medicine
Amazon Charts: Most Sold Fiction
Amazon Publishing
Americas
Ancient & Medieval Literature
Ancient Civilizations
Android
Anger Management
Animals
Anthologies
Anthropology
Antiques & Collectibles
Anxieties & Phobias
Arabic
Archaeology
Architectural Engineering
Architecture
Arctic & Antarctica
Art Therapy & Relaxation
Art, Music & Photography
Arts
Arts & Literature
Arts & Photography
Arts, Music & Photography
Asia
A

### Merge features with reviews
(to create one hot vectors of users)

In [20]:
reviews_processed.drop(columns=['asin'], axis=1, inplace=True)

reviews_processed

Unnamed: 0,rating,asin,parent_asin,user_id,verified_purchase
779,5.0,1422631753,1422631753,AFW2PDT3AMT4X3PYQG7FJZH5FXFA,0
782,5.0,1602804303,1602804303,AFW2PDT3AMT4X3PYQG7FJZH5FXFA,1
783,5.0,0785238999,0785238999,AFW2PDT3AMT4X3PYQG7FJZH5FXFA,0
784,5.0,1422627837,1422627837,AFW2PDT3AMT4X3PYQG7FJZH5FXFA,0
785,5.0,145493753X,145493753X,AFW2PDT3AMT4X3PYQG7FJZH5FXFA,0
...,...,...,...,...,...
24876269,4.0,1592495532,1592495532,AGIS3QL3LUEQWKF3YKHLLWDD5BJA,0
24876270,3.0,0525458646,0525458646,AGIS3QL3LUEQWKF3YKHLLWDD5BJA,1
24876271,5.0,079451006X,079451006X,AGIS3QL3LUEQWKF3YKHLLWDD5BJA,1
24876272,4.0,0439083702,0439083702,AGIS3QL3LUEQWKF3YKHLLWDD5BJA,0


In [21]:
features

Unnamed: 0,price,parent_asin,item_mean_ratings,item_median_ratings,item_min_ratings,item_max_ratings,item_std_ratings,item_number_ratings,2011 Best Books of the Year,2023 Calendars,...,Women's Health,Women's Studies,"Words, Language & Grammar",World,World Literature,Worship & Devotion,Writing Stationery,"Writing, Research & Publishing Guides",eDocs,tech.book(store)
2,17.17,0316185361,4.760274,5.0,2.0,5.0,0.578998,146,0,0,...,0,0,0,0,0,0,0,0,0,0
3,7.43,0545425573,4.111111,5.0,1.0,5.0,1.364225,9,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4.05,B09PHG4FQ8,5.000000,5.0,5.0,5.0,0.000000,2,0,0,...,0,0,0,0,0,0,0,0,0,0
6,13.43,1680450263,4.742105,5.0,1.0,5.0,0.652075,190,0,0,...,0,0,0,0,0,0,0,0,0,0
7,14.00,1694621731,5.000000,5.0,5.0,5.0,0.000000,3,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2878097,4.33,0578815923,5.000000,5.0,5.0,5.0,0.000000,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2878098,6.49,1934338745,4.500000,4.5,4.0,5.0,0.707107,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2878104,19.00,1496716388,4.500000,4.5,4.0,5.0,0.707107,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2878107,11.74,1717788157,4.571429,5.0,2.0,5.0,1.133893,7,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
reviews_processed_x_features = pd.merge(reviews_processed, features, on='parent_asin')
reviews_processed_x_features

Unnamed: 0,rating,asin,parent_asin,user_id,verified_purchase,price,item_mean_ratings,item_median_ratings,item_min_ratings,item_max_ratings,...,Women's Health,Women's Studies,"Words, Language & Grammar",World,World Literature,Worship & Devotion,Writing Stationery,"Writing, Research & Publishing Guides",eDocs,tech.book(store)
0,5.0,1422631753,1422631753,AFW2PDT3AMT4X3PYQG7FJZH5FXFA,0,30.65,5.000000,5.0,5.0,5.0,...,0,0,0,0,0,0,0,0,0,0
1,5.0,1602804303,1602804303,AFW2PDT3AMT4X3PYQG7FJZH5FXFA,1,40.77,4.562500,5.0,2.0,5.0,...,0,0,0,0,0,0,0,0,0,0
2,5.0,0785238999,0785238999,AFW2PDT3AMT4X3PYQG7FJZH5FXFA,0,17.65,4.794521,5.0,1.0,5.0,...,0,0,0,0,0,0,0,0,0,0
3,4.0,0785238999,0785238999,AGV4UFVIVZ3AFNYTJZW4EQKEMVMQ,0,17.65,4.794521,5.0,1.0,5.0,...,0,0,0,0,0,0,0,0,0,0
4,5.0,0785238999,0785238999,AF2LZKZEDQKGQMIFLQQAOIVL5BHQ,0,17.65,4.794521,5.0,1.0,5.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371107,5.0,1608681831,1608681831,AEZV764RVEC2CRHM7F6S23KUCFIQ,0,9.99,4.454545,5.0,3.0,5.0,...,0,0,0,0,0,0,0,0,0,0
371108,5.0,1482712156,1482712156,AEZV764RVEC2CRHM7F6S23KUCFIQ,0,14.99,5.000000,5.0,5.0,5.0,...,0,0,0,0,0,0,0,0,0,0
371109,5.0,1482712156,1482712156,AEZV764RVEC2CRHM7F6S23KUCFIQ,0,14.99,5.000000,5.0,5.0,5.0,...,0,0,0,0,0,0,0,0,0,0
371110,5.0,1482712156,1482712156,AEZV764RVEC2CRHM7F6S23KUCFIQ,0,14.99,5.000000,5.0,5.0,5.0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# distinct items kept
len(reviews_processed_x_features['parent_asin'].unique())

205743

In [24]:
reviews_processed_x_features

Unnamed: 0,rating,parent_asin,user_id,verified_purchase,price,item_mean_ratings,item_median_ratings,item_min_ratings,item_max_ratings,item_std_ratings,...,Women's Health,Women's Studies,"Words, Language & Grammar",World,World Literature,Worship & Devotion,Writing Stationery,"Writing, Research & Publishing Guides",eDocs,tech.book(store)
0,5.0,1422631753,AFW2PDT3AMT4X3PYQG7FJZH5FXFA,0,30.65,5.000000,5.0,5.0,5.0,0.000000,...,0,0,0,0,0,0,0,0,0,0
1,5.0,1602804303,AFW2PDT3AMT4X3PYQG7FJZH5FXFA,1,40.77,4.562500,5.0,2.0,5.0,0.963933,...,0,0,0,0,0,0,0,0,0,0
2,5.0,0785238999,AFW2PDT3AMT4X3PYQG7FJZH5FXFA,0,17.65,4.794521,5.0,1.0,5.0,0.574370,...,0,0,0,0,0,0,0,0,0,0
3,4.0,0785238999,AGV4UFVIVZ3AFNYTJZW4EQKEMVMQ,0,17.65,4.794521,5.0,1.0,5.0,0.574370,...,0,0,0,0,0,0,0,0,0,0
4,5.0,0785238999,AF2LZKZEDQKGQMIFLQQAOIVL5BHQ,0,17.65,4.794521,5.0,1.0,5.0,0.574370,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371107,5.0,1608681831,AEZV764RVEC2CRHM7F6S23KUCFIQ,0,9.99,4.454545,5.0,3.0,5.0,0.820200,...,0,0,0,0,0,0,0,0,0,0
371108,5.0,1482712156,AEZV764RVEC2CRHM7F6S23KUCFIQ,0,14.99,5.000000,5.0,5.0,5.0,0.000000,...,0,0,0,0,0,0,0,0,0,0
371109,5.0,1482712156,AEZV764RVEC2CRHM7F6S23KUCFIQ,0,14.99,5.000000,5.0,5.0,5.0,0.000000,...,0,0,0,0,0,0,0,0,0,0
371110,5.0,1482712156,AEZV764RVEC2CRHM7F6S23KUCFIQ,0,14.99,5.000000,5.0,5.0,5.0,0.000000,...,0,0,0,0,0,0,0,0,0,0


### One hot encoding of users

In [25]:
## Create tuples of combined tags (we use tuples because lists can't be hashed when droping duplicates)
users_lists = reviews_processed_x_features.groupby('parent_asin')['user_id'].apply(list)

## Merge dataframes
reviews_processed_x_features = pd.merge(reviews_processed_x_features, users_lists, on='parent_asin')
reviews_processed_x_features

Unnamed: 0,rating,parent_asin,user_id_x,verified_purchase,price,item_mean_ratings,item_median_ratings,item_min_ratings,item_max_ratings,item_std_ratings,...,Women's Studies,"Words, Language & Grammar",World,World Literature,Worship & Devotion,Writing Stationery,"Writing, Research & Publishing Guides",eDocs,tech.book(store),user_id_y
0,5.0,1422631753,AFW2PDT3AMT4X3PYQG7FJZH5FXFA,0,30.65,5.000000,5.0,5.0,5.0,0.000000,...,0,0,0,0,0,0,0,0,0,[AFW2PDT3AMT4X3PYQG7FJZH5FXFA]
1,5.0,1602804303,AFW2PDT3AMT4X3PYQG7FJZH5FXFA,1,40.77,4.562500,5.0,2.0,5.0,0.963933,...,0,0,0,0,0,0,0,0,0,[AFW2PDT3AMT4X3PYQG7FJZH5FXFA]
2,5.0,0785238999,AFW2PDT3AMT4X3PYQG7FJZH5FXFA,0,17.65,4.794521,5.0,1.0,5.0,0.574370,...,0,0,0,0,0,0,0,0,0,"[AFW2PDT3AMT4X3PYQG7FJZH5FXFA, AGV4UFVIVZ3AFNY..."
3,4.0,0785238999,AGV4UFVIVZ3AFNYTJZW4EQKEMVMQ,0,17.65,4.794521,5.0,1.0,5.0,0.574370,...,0,0,0,0,0,0,0,0,0,"[AFW2PDT3AMT4X3PYQG7FJZH5FXFA, AGV4UFVIVZ3AFNY..."
4,5.0,0785238999,AF2LZKZEDQKGQMIFLQQAOIVL5BHQ,0,17.65,4.794521,5.0,1.0,5.0,0.574370,...,0,0,0,0,0,0,0,0,0,"[AFW2PDT3AMT4X3PYQG7FJZH5FXFA, AGV4UFVIVZ3AFNY..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371107,5.0,1608681831,AEZV764RVEC2CRHM7F6S23KUCFIQ,0,9.99,4.454545,5.0,3.0,5.0,0.820200,...,0,0,0,0,0,0,0,0,0,"[AEZV764RVEC2CRHM7F6S23KUCFIQ, AEZV764RVEC2CRH..."
371108,5.0,1482712156,AEZV764RVEC2CRHM7F6S23KUCFIQ,0,14.99,5.000000,5.0,5.0,5.0,0.000000,...,0,0,0,0,0,0,0,0,0,"[AEZV764RVEC2CRHM7F6S23KUCFIQ, AEZV764RVEC2CRH..."
371109,5.0,1482712156,AEZV764RVEC2CRHM7F6S23KUCFIQ,0,14.99,5.000000,5.0,5.0,5.0,0.000000,...,0,0,0,0,0,0,0,0,0,"[AEZV764RVEC2CRHM7F6S23KUCFIQ, AEZV764RVEC2CRH..."
371110,5.0,1482712156,AEZV764RVEC2CRHM7F6S23KUCFIQ,0,14.99,5.000000,5.0,5.0,5.0,0.000000,...,0,0,0,0,0,0,0,0,0,"[AEZV764RVEC2CRHM7F6S23KUCFIQ, AEZV764RVEC2CRH..."


In [26]:
reviews_processed_x_features.columns

Index(['rating', 'parent_asin', 'user_id_x', 'verified_purchase', 'price',
       'item_mean_ratings', 'item_median_ratings', 'item_min_ratings',
       'item_max_ratings', 'item_std_ratings',
       ...
       'Women's Studies', 'Words, Language & Grammar', 'World',
       'World Literature', 'Worship & Devotion', 'Writing Stationery',
       'Writing, Research & Publishing Guides', 'eDocs', 'tech.book(store)',
       'user_id_y'],
      dtype='object', length=723)

In [27]:
## Users to One hot vectors 
mlb = MultiLabelBinarizer()
reviews_processed_x_features = reviews_processed_x_features.join(pd.DataFrame(mlb.fit_transform(reviews_processed_x_features.pop('user_id_y')),
                                            columns=mlb.classes_,
                                            index=reviews_processed_x_features.index))
reviews_processed_x_features.rename(columns={'user_id_x':'user_id'}, inplace=True)
reviews_processed_x_features

Unnamed: 0,rating,parent_asin,user_id,verified_purchase,price,item_mean_ratings,item_median_ratings,item_min_ratings,item_max_ratings,item_std_ratings,...,AHXWUCTMVBQXDVMDFPA3NO43QF2Q,AHY4NLD5U6WGJ2DRDOLCVOKUGB4Q,AHY52ST2HDGRLL5HZTHBLPPDKPXA,AHY5662OAFQ7XPLZQ6AXRRVZPUMQ,AHYCJGF2DYH4CUEZFE4VVNG2DWBA,AHYHOC5AU2TSVZLWIFVNMUVB5YBA,AHZ7XZQVCIWUY6ONMGNMADRXW3WA,AHZMSBFDOWI6UVLSWC5K7VTEEMCQ,AHZWVIBN5WVIZP5IULWPKEFJN2AA,AHZZL323ORE6AHMIB2OCLYNCRXJQ
0,5.0,1422631753,AFW2PDT3AMT4X3PYQG7FJZH5FXFA,0,30.65,5.000000,5.0,5.0,5.0,0.000000,...,0,0,0,0,0,0,0,0,0,0
1,5.0,1602804303,AFW2PDT3AMT4X3PYQG7FJZH5FXFA,1,40.77,4.562500,5.0,2.0,5.0,0.963933,...,0,0,0,0,0,0,0,0,0,0
2,5.0,0785238999,AFW2PDT3AMT4X3PYQG7FJZH5FXFA,0,17.65,4.794521,5.0,1.0,5.0,0.574370,...,0,0,0,0,0,0,0,0,0,0
3,4.0,0785238999,AGV4UFVIVZ3AFNYTJZW4EQKEMVMQ,0,17.65,4.794521,5.0,1.0,5.0,0.574370,...,0,0,0,0,0,0,0,0,0,0
4,5.0,0785238999,AF2LZKZEDQKGQMIFLQQAOIVL5BHQ,0,17.65,4.794521,5.0,1.0,5.0,0.574370,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371107,5.0,1608681831,AEZV764RVEC2CRHM7F6S23KUCFIQ,0,9.99,4.454545,5.0,3.0,5.0,0.820200,...,0,0,0,0,0,0,0,0,0,0
371108,5.0,1482712156,AEZV764RVEC2CRHM7F6S23KUCFIQ,0,14.99,5.000000,5.0,5.0,5.0,0.000000,...,0,0,0,0,0,0,0,0,0,0
371109,5.0,1482712156,AEZV764RVEC2CRHM7F6S23KUCFIQ,0,14.99,5.000000,5.0,5.0,5.0,0.000000,...,0,0,0,0,0,0,0,0,0,0
371110,5.0,1482712156,AEZV764RVEC2CRHM7F6S23KUCFIQ,0,14.99,5.000000,5.0,5.0,5.0,0.000000,...,0,0,0,0,0,0,0,0,0,0


In [28]:
for column in reviews_processed_x_features.columns:
    print(column)

rating
parent_asin
user_id
verified_purchase
price
item_mean_ratings
item_median_ratings
item_min_ratings
item_max_ratings
item_std_ratings
item_number_ratings
2011 Best Books of the Year
2023 Calendars
7 Day Free Trial Textbooks
ASINs for HQP
Abuse
Accounting
Acrobat 9
Acrobat X
Action & Adventure
Activities, Crafts & Games
Addiction & Recovery
Address Books
Administration & Medicine Economics
Administrative Law
Adobe Store
Adoption
Advanced
Africa
Age 5
Ages Baby-2
Aging
Aging Parents
Agnosticism
Agricultural Sciences
All Deals
All-American
Allied Health Professions
Almanacs & Yearbooks
Alternative Medicine
Amazon Charts: Most Sold Fiction
Amazon Publishing
Americas
Ancient & Medieval Literature
Ancient Civilizations
Android
Anger Management
Animals
Anthologies
Anthropology
Antiques & Collectibles
Anxieties & Phobias
Arabic
Archaeology
Architectural Engineering
Architecture
Arctic & Antarctica
Art Therapy & Relaxation
Art, Music & Photography
Arts
Arts & Literature
Arts & Photography

In [30]:
nan_rows = sum(reviews_processed_x_features.isna().any(axis=1))
print(nan_rows)

0


In [31]:
reviews_processed_x_features.sort_values(by='rating', ascending=False, inplace=True)
reviews_processed_x_features.to_csv('./data/reviews_x_features.csv')

In [2]:
import pandas as pd

df = pd.read_csv('./data/reviews_x_features.csv', sep=",", index_col=0)
df

Unnamed: 0,rating,parent_asin,user_id,verified_purchase,price,item_mean_ratings,item_median_ratings,item_min_ratings,item_max_ratings,item_std_ratings,...,AHXWUCTMVBQXDVMDFPA3NO43QF2Q,AHY4NLD5U6WGJ2DRDOLCVOKUGB4Q,AHY52ST2HDGRLL5HZTHBLPPDKPXA,AHY5662OAFQ7XPLZQ6AXRRVZPUMQ,AHYCJGF2DYH4CUEZFE4VVNG2DWBA,AHYHOC5AU2TSVZLWIFVNMUVB5YBA,AHZ7XZQVCIWUY6ONMGNMADRXW3WA,AHZMSBFDOWI6UVLSWC5K7VTEEMCQ,AHZWVIBN5WVIZP5IULWPKEFJN2AA,AHZZL323ORE6AHMIB2OCLYNCRXJQ
0,5.0,1422631753,AFW2PDT3AMT4X3PYQG7FJZH5FXFA,0,30.65,5.000000,5.0,5.0,5.0,0.000000,...,0,0,0,0,0,0,0,0,0,0
209623,5.0,1598534653,AENBSJOBQIUYW5UK3ESTAF7ECSGA,0,38.06,4.625000,5.0,3.0,5.0,0.744024,...,0,0,0,0,0,0,0,0,0,0
209627,5.0,0691151032,AENBSJOBQIUYW5UK3ESTAF7ECSGA,0,16.97,3.666667,4.0,2.0,5.0,1.527525,...,0,0,0,0,0,0,0,0,0,0
209630,5.0,9652297100,AENBSJOBQIUYW5UK3ESTAF7ECSGA,0,16.22,5.000000,5.0,5.0,5.0,0.000000,...,0,0,0,0,0,0,0,0,0,0
209633,5.0,0312576463,AGKFRC3Q6AIE6FOJDDP5AXTUZQ5A,0,12.46,3.440415,4.0,1.0,5.0,1.547116,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326843,1.0,1566252717,AHFLOPS7G4YTM3KZVQSTF2H2AOVA,1,38.99,4.347826,5.0,1.0,5.0,1.152416,...,0,0,0,0,0,0,0,0,0,0
199240,1.0,0310276357,AHCBLEYD2E3XUXQEE3EOJ5SLCESQ,0,16.53,3.361991,4.0,1.0,5.0,1.782543,...,0,0,0,0,0,0,0,0,0,0
44572,1.0,1101973757,AGGFYIQ464XNB3QDRLQNAD5H7XTQ,0,17.80,4.080808,4.0,1.0,5.0,1.157731,...,0,0,0,0,0,0,0,0,0,0
280497,1.0,0593182650,AGICZ7OZM4A4ODOWQRHDXY4M5WEQ,0,14.50,4.153846,5.0,1.0,5.0,1.214232,...,0,0,0,0,0,0,0,0,0,0


In [4]:
items_vectors = df.drop(['rating', 'user_id', 'verified_purchase'], axis=1, inplace=False)
items_vectors.drop_duplicates(inplace=True)
items_vectors

Unnamed: 0,parent_asin,price,item_mean_ratings,item_median_ratings,item_min_ratings,item_max_ratings,item_std_ratings,item_number_ratings,2011 Best Books of the Year,2023 Calendars,...,AHXWUCTMVBQXDVMDFPA3NO43QF2Q,AHY4NLD5U6WGJ2DRDOLCVOKUGB4Q,AHY52ST2HDGRLL5HZTHBLPPDKPXA,AHY5662OAFQ7XPLZQ6AXRRVZPUMQ,AHYCJGF2DYH4CUEZFE4VVNG2DWBA,AHYHOC5AU2TSVZLWIFVNMUVB5YBA,AHZ7XZQVCIWUY6ONMGNMADRXW3WA,AHZMSBFDOWI6UVLSWC5K7VTEEMCQ,AHZWVIBN5WVIZP5IULWPKEFJN2AA,AHZZL323ORE6AHMIB2OCLYNCRXJQ
0,1422631753,30.65,5.000000,5.0,5.0,5.0,0.000000,2,0,0,...,0,0,0,0,0,0,0,0,0,0
209623,1598534653,38.06,4.625000,5.0,3.0,5.0,0.744024,8,0,0,...,0,0,0,0,0,0,0,0,0,0
209627,0691151032,16.97,3.666667,4.0,2.0,5.0,1.527525,3,0,0,...,0,0,0,0,0,0,0,0,0,0
209630,9652297100,16.22,5.000000,5.0,5.0,5.0,0.000000,4,0,0,...,0,0,0,0,0,0,0,0,0,0
209633,0312576463,12.46,3.440415,4.0,1.0,5.0,1.547116,193,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280410,1453806024,42.84,3.736842,4.0,1.0,5.0,1.329178,38,0,0,...,0,0,0,0,0,0,0,0,0,0
156941,0385360185,23.51,1.666667,2.0,1.0,2.0,0.577350,3,0,0,...,0,0,0,0,0,0,0,0,0,0
326846,1400079381,14.95,3.454545,4.0,1.0,5.0,1.694912,11,0,0,...,0,0,0,0,0,0,0,0,0,0
326843,1566252717,38.99,4.347826,5.0,1.0,5.0,1.152416,23,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
len(items_vectors['parent_asin'].unique())

205743

In [9]:
import numpy as np
seed = 1

In [14]:
### Get users
rng = np.random.RandomState(seed)
users = df['user_id'].unique()

### Choose user
user_id = rng.choice(users)

### Get items already  reviewed by user
items_rated = df[df['user_id'] == user_id]
items_rated_ids = items_rated['parent_asin'].values
items_ratings = items_rated['rating'].values
items_rated = items_rated[items_rated['parent_asin'].isin(items_rated_ids)]
items_rated = items_rated.drop(['parent_asin', 'rating', 'user_id', 'verified_purchase'], axis=1, inplace=False)
items_rated
# items_rated /= np.linalg.norm(items_rated, 2, axis=1)[:, None]

Unnamed: 0,price,item_mean_ratings,item_median_ratings,item_min_ratings,item_max_ratings,item_std_ratings,item_number_ratings,2011 Best Books of the Year,2023 Calendars,7 Day Free Trial Textbooks,...,AHXWUCTMVBQXDVMDFPA3NO43QF2Q,AHY4NLD5U6WGJ2DRDOLCVOKUGB4Q,AHY52ST2HDGRLL5HZTHBLPPDKPXA,AHY5662OAFQ7XPLZQ6AXRRVZPUMQ,AHYCJGF2DYH4CUEZFE4VVNG2DWBA,AHYHOC5AU2TSVZLWIFVNMUVB5YBA,AHZ7XZQVCIWUY6ONMGNMADRXW3WA,AHZMSBFDOWI6UVLSWC5K7VTEEMCQ,AHZWVIBN5WVIZP5IULWPKEFJN2AA,AHZZL323ORE6AHMIB2OCLYNCRXJQ
209502,12.99,4.531250,5.0,1.0,5.0,0.882021,96,0,0,0,...,0,0,0,0,0,0,0,0,0,0
211299,10.59,4.555556,5.0,2.0,5.0,0.800641,27,0,0,0,...,0,0,0,0,0,0,0,0,0,0
211305,14.41,4.209677,5.0,1.0,5.0,1.218250,124,0,0,0,...,0,0,0,0,0,0,0,0,0,0
211314,31.50,4.733333,5.0,4.0,5.0,0.457738,15,0,0,0,...,0,0,0,0,0,0,0,0,0,0
211315,15.29,3.904762,4.0,1.0,5.0,1.375917,42,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98220,11.97,3.414201,4.0,1.0,5.0,1.461828,169,0,0,0,...,0,0,0,1,0,0,0,0,0,1
140571,25.48,3.623188,4.0,1.0,5.0,1.466302,69,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30235,13.91,4.226626,5.0,1.0,5.0,1.349259,1968,0,0,0,...,0,0,0,0,0,0,0,0,0,0
211026,10.33,2.666667,2.0,1.0,5.0,1.851640,36,0,0,0,...,0,0,0,0,0,0,0,0,0,0


751

In [23]:
### Create catalog of X items not played by given user using most rated games
items_catalog = items_vectors[~items_vectors['parent_asin'].isin(items_rated_ids)]
items_catalog.sort_values(by='item_number_ratings', ascending=False, inplace=True)
items_catalog = items_catalog[:2000].drop('parent_asin', axis=1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  items_catalog.sort_values(by='item_number_ratings', ascending=False, inplace=True)


In [25]:
items_catalog

Unnamed: 0,price,item_mean_ratings,item_median_ratings,item_min_ratings,item_max_ratings,item_std_ratings,item_number_ratings,2011 Best Books of the Year,2023 Calendars,7 Day Free Trial Textbooks,...,AHXWUCTMVBQXDVMDFPA3NO43QF2Q,AHY4NLD5U6WGJ2DRDOLCVOKUGB4Q,AHY52ST2HDGRLL5HZTHBLPPDKPXA,AHY5662OAFQ7XPLZQ6AXRRVZPUMQ,AHYCJGF2DYH4CUEZFE4VVNG2DWBA,AHYHOC5AU2TSVZLWIFVNMUVB5YBA,AHZ7XZQVCIWUY6ONMGNMADRXW3WA,AHZMSBFDOWI6UVLSWC5K7VTEEMCQ,AHZWVIBN5WVIZP5IULWPKEFJN2AA,AHZZL323ORE6AHMIB2OCLYNCRXJQ
23833,12.91,4.673781,5.0,1.0,5.0,0.905922,10073,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4412,5.28,4.641946,5.0,1.0,5.0,1.025131,7996,0,0,0,...,0,0,0,0,0,0,0,0,0,0
185099,4.78,4.583699,5.0,1.0,5.0,0.882051,7975,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44414,17.69,4.609206,5.0,1.0,5.0,1.110904,7756,0,0,0,...,0,0,0,0,0,0,1,0,0,0
29460,8.89,4.362448,5.0,1.0,5.0,1.142761,6045,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144879,15.86,3.627566,4.0,1.0,5.0,1.506555,341,0,0,0,...,0,0,0,0,0,0,0,0,0,0
180074,9.90,4.290323,5.0,1.0,5.0,1.163486,341,0,0,0,...,0,0,0,1,0,0,0,0,0,0
83476,5.98,4.817647,5.0,1.0,5.0,0.606489,340,0,0,0,...,0,0,0,0,0,0,0,0,0,0
141594,16.99,4.902941,5.0,1.0,5.0,0.433811,340,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [1]:
import numpy as np 
from sklearn.linear_model import LinearRegression

def load_amazon(reviews_x_items, size_catalog=1500, seed=12):
    items_vectors = reviews_x_items.drop(['rating', 'user_id', 'verified_purchase'], axis=1, inplace=False)
    items_vectors.drop_duplicates(inplace=True)

    ### Get users
    rng = np.random.RandomState(seed)
    users = reviews_x_items['user_id'].unique()

    ### Choose user
    user_id = rng.choice(users)

    ### Get items already  reviewed by user
    items_rated = reviews_x_items[reviews_x_items['user_id'] == user_id]
    items_rated_ids = items_rated['parent_asin'].values
    items_ratings = items_rated['rating'].values
    items_rated = items_vectors[items_vectors['parent_asin'].isin(items_rated_ids)]
    items_rated = np.array(items_rated.drop('parent_asin', axis=1, inplace=False))
    items_rated /= np.linalg.norm(items_rated, 2, axis=1)[:, None]

    ### Compute linear regression to estimate theta 
    reg = LinearRegression(fit_intercept=False).fit(items_rated, items_ratings)
    theta_user = reg.coef_

    ### Create catalog of X items not rated by given user using most rated games
    items_catalog = items_vectors[~items_vectors['parent_asin'].isin(items_rated_ids)]
    items_catalog = items_catalog.sort_values(by='item_number_ratings', ascending=False, inplace=False)
    items_catalog = np.array(items_catalog[:size_catalog].drop('parent_asin', axis=1))


    # theta_user /= np.linalg.norm(theta_user, 2)
    items_catalog /= np.linalg.norm(items_catalog, 2, axis=1)[:, None]

    return theta_user, items_catalog, size_catalog, theta_user.shape[0]