<a href="https://colab.research.google.com/github/GeorgeSakketos/Data_Mining_2025/blob/main/Data_Mining_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Mounting Google Drive in Colab**

In [None]:
# Attach Google Drive to the Colab runtime, then record the Drive folder
# that later cells use when writing and reading the CSV exports.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

csv_folder_path = '/content/drive/My Drive/Data_Mining_CSV'

Mounted at /content/drive


# **Part 1**

In [None]:
!pip install datasets

Requirements

In [None]:
from itertools import islice
import pandas as pd
import re
from datasets import load_dataset

# Gift Cards

In [None]:
# ------------------- Load and Sample Data -------------------

# Open the Gift Cards review and metadata configs with streaming=True.
Gift_Cards_review_ds = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_Gift_Cards",
    split="full",
    streaming=True,
    trust_remote_code=True,
)
Gift_Cards_meta_ds = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_meta_Gift_Cards",
    split="full",
    streaming=True,
    trust_remote_code=True,
)

# Take the first `gc_rows_to_get` records from each stream.
# NOTE(review): the two samples are drawn independently, so a sampled review
# is not guaranteed to have its product in the metadata sample.
gc_rows_to_get = 3000
gc_review_sample = [record for record in islice(Gift_Cards_review_ds, gc_rows_to_get)]
gc_meta_sample = [record for record in islice(Gift_Cards_meta_ds, gc_rows_to_get)]

# One DataFrame per sample: reviews and product metadata.
gc_reviews_df = pd.DataFrame(gc_review_sample)
gc_product_df = pd.DataFrame(gc_meta_sample)

# Display settings: allow up to `gc_rows_to_get` rows and never truncate cell text.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', gc_rows_to_get)

# ------------------- Cleaning Functions -------------------

def normalize_price(price):
    """Convert a raw price value to a float.

    Args:
        price: Raw price. Strings may carry currency symbols, surrounding
            text, and thousands separators (e.g. "$1,234.56"); the first
            number found is used. ints and floats are accepted as-is.

    Returns:
        The price as a float, or None when no number can be extracted
        (non-numeric text, NaN, None, or any other type).
    """
    # Already-numeric prices pass straight through. bool is excluded because it
    # is an int subclass, and NaN (the only value unequal to itself) is "missing".
    if isinstance(price, (int, float)) and not isinstance(price, bool):
        return float(price) if price == price else None

    if isinstance(price, str):
        # Require at least one digit so stray punctuation ("," or "...") is not
        # mistaken for a number, which would make float() raise ValueError.
        match = re.search(r'\d[\d,]*(?:\.\d+)?|\.\d+', price)
        if match:
            return float(match.group(0).replace(',', ''))
    return None

def clean_text(text):
    """Normalize free text to lowercase alphanumerics and whitespace.

    Args:
        text: Value to clean. Anything that is not a str is treated as empty.

    Returns:
        The lowercased text with HTML tags and every character other than
        a-z, 0-9 and whitespace removed, and with leading/trailing whitespace
        stripped. Returns '' for non-string input.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Swap HTML tags (e.g. "<br />") for a space before stripping punctuation;
    # otherwise only the brackets go and "tin.<br />your" becomes "tinbryour".
    text = re.sub(r'</?[a-z][^<>]*>', ' ', text)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text.strip()

# ------------------- Clean Meta Dataset -------------------

# Work on a copy so the sampled product frame stays untouched and this cell can be re-run.
clean_gc_product_df = gc_product_df.copy()

# Drop columns that are not needed
clean_gc_product_df.drop(columns=['bought_together', 'subtitle', 'author', 'images', 'videos'], inplace=True)

# Drop rows missing essential fields
clean_gc_product_df.dropna(subset=['title', 'main_category', 'price'], inplace=True)

# Fill or normalize other fields.
# clean_text() returns '' for anything that is not a str, so list-valued
# descriptions are joined into one string first; otherwise they are wiped.
clean_gc_product_df.loc[:, 'description'] = (
    clean_gc_product_df['description']
    .apply(lambda d: ' '.join(map(str, d)) if isinstance(d, (list, tuple)) else d)
    .fillna('No description available')
    .apply(clean_text)
)
clean_gc_product_df.loc[:, 'features'] = clean_gc_product_df['features'].fillna('').astype(str).apply(clean_text)
clean_gc_product_df.loc[:, 'details'] = clean_gc_product_df['details'].fillna('').astype(str).apply(clean_text)
clean_gc_product_df.loc[:, 'average_rating'] = clean_gc_product_df['average_rating'].fillna(0).astype(float)
clean_gc_product_df.loc[:, 'rating_number'] = clean_gc_product_df['rating_number'].fillna(0).astype(int)

# Parse prices to floats, then discard products whose price could not be parsed.
clean_gc_product_df.loc[:, 'price'] = clean_gc_product_df['price'].apply(normalize_price)
clean_gc_product_df.dropna(subset=['price'], inplace=True)

# Anything that is not a list becomes an empty category list.
clean_gc_product_df.loc[:, 'categories'] = clean_gc_product_df['categories'].apply(lambda x: x if isinstance(x, list) else [])
clean_gc_product_df.loc[:, 'store'] = clean_gc_product_df['store'].fillna('Unknown')
clean_gc_product_df.loc[:, 'title'] = clean_gc_product_df['title'].apply(clean_text)

clean_gc_product_df.reset_index(drop=True, inplace=True)

# ------------------- Clean Reviews Dataset -------------------

clean_gc_reviews_df = gc_reviews_df.copy()

# Drop columns that are not needed
clean_gc_reviews_df.drop(columns=['images'], inplace=True)

# Drop rows missing essential fields
clean_gc_reviews_df.dropna(subset=['rating', 'text', 'asin', 'helpful_vote'], inplace=True)

# Clean and fill optional fields
clean_gc_reviews_df.loc[:, 'title'] = clean_gc_reviews_df['title'].fillna('').apply(clean_text)
clean_gc_reviews_df.loc[:, 'text'] = clean_gc_reviews_df['text'].apply(clean_text)

# Keep verified purchases only. The explicit .copy() makes the filtered frame
# independent, so the assignments below never write into a view of the old frame.
clean_gc_reviews_df = clean_gc_reviews_df[clean_gc_reviews_df['verified_purchase'] == True].copy()
clean_gc_reviews_df.loc[:, 'helpful_vote'] = clean_gc_reviews_df['helpful_vote'].fillna(0).astype(int)

# Convert timestamp from milliseconds to whole seconds (sub-second part dropped),
# then split it into separate date and time columns. Unparseable values become NaT.
review_moments = pd.to_datetime(
    clean_gc_reviews_df['timestamp'].astype('int64') // 1000,
    unit='s',
    errors='coerce',
)
clean_gc_reviews_df['date'] = review_moments.dt.date
clean_gc_reviews_df['time'] = review_moments.dt.strftime('%H:%M:%S')

# Drop original timestamp column
clean_gc_reviews_df = clean_gc_reviews_df.drop(columns=['timestamp'])

clean_gc_reviews_df.reset_index(drop=True, inplace=True)

# ------------------- Merge Datasets -------------------

# Rename columns for clarity (both frames have a 'title' column)
clean_gc_reviews_df.rename(columns={'title': 'review_title'}, inplace=True)
clean_gc_product_df.rename(columns={'title': 'product_title'}, inplace=True)

# Ensure types match
clean_gc_reviews_df['parent_asin'] = clean_gc_reviews_df['parent_asin'].astype(str)
clean_gc_product_df['parent_asin'] = clean_gc_product_df['parent_asin'].astype(str)

# Merge on parent_asin
clean_gc_merged_df = pd.merge(clean_gc_reviews_df, clean_gc_product_df, on='parent_asin', how='left')

# dropna() with no subset removes every row holding a NaN in ANY column. After the
# left merge that includes all reviews whose product was not found in the metadata.
clean_gc_merged_df.dropna(inplace=True)
clean_gc_merged_df.reset_index(drop=True, inplace=True)


In [None]:
# Export the merged Gift Cards table to the Drive folder, leaving out the row index.
clean_gc_merged_df.to_csv(
    path_or_buf=f"{csv_folder_path}/Gift_Cards.csv",
    index=False,
)

Opening CSV file

In [None]:
# Read the exported Gift Cards CSV back from the Drive folder.
gift_cards_df = pd.read_csv(filepath_or_buffer=f"{csv_folder_path}/Gift_Cards.csv")

# Preview the first five rows.
gift_cards_df.head(5)

Unnamed: 0,rating,review_title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase,date,time,main_category,product_title,average_rating,rating_number,features,description,price,store,categories,details
0,5.0,perfect gift,when you have a person who is hard to shop for an amazon gift card is p e r f e c t man or woman no matter what their hobby lifestyle or age all you have to do is pick the dont forget to mention that it is a gift when you check out you will have some gift card options ive ordered many of these over years they are always received with glee woo hoo if youre looking for a great fit for me this is just my size best to all,B01K8RIM5Y,B005S28ZES,AFZUK3MTBIBEDQOPAK3OATUOUKLA,27,True,2018-09-03,01:58:49,Gift Cards,amazoncom gift card in a gift box various thank you designs,4.9,4918.0,gift card is nested inside a specialty gift box gift card has no fees and no expiration date no returns and no refunds on gift cards gift amount may not be printed on gift cards gift card is redeemable towards millions of items storewide at amazoncom scan and redeem any gift card with a mobile or tablet device via the amazon app free oneday shipping where available customized gift message if chosen at checkout only appears on packing slip and not on the actual gift card or carrier,,25.0,Amazon,"['Gift Cards', 'Gift Card Recipients', 'For Him']",product dimensions 1 x 44 x 48 inches 113 ounces item model number variabledenomination date first available november 29 2016 manufacturer amazon domestic shipping currently item can be shipped only within the us and to apofpo addresses for apofpo shipments please check with the manufacturer regarding warranty and support issues international shipping this item is not eligible for international shipping learn more
1,1.0,not 10 gift cards,i bought this pack of starbucks gift cards in 2019 ive given them to friends and i gave 2 to my daughterbr my daughter used one recently and it had 652 on the card not 1000 she had the cashier check the balance of the other card and it had 532 on it she had forgotten that she had these gift cards so yes 2 years later decided to use the when she found them do they decline in value and then both had random amounts on them im embarrassed now to have given them as gifts friends receiving the gift card arent going to tell you that werent able to cover their order with the card you gave them,B00FTGTM5E,B00FTGTIOE,AH5L7ILVA6HYLZOUZIQAWNHVVK3A,2,True,2021-11-28,03:06:48,Gift Cards,starbucks 10 gift cards 4pack,4.9,13066.0,this item contains 4 separate 10 plastic gift cards starbucks cards redeemable at most sb locations its a great way to treat a friend its a convenient way to prepay for your own regular purchases no returns and no refunds on gift cards,,40.0,Starbucks,"['Gift Cards', 'Gift Card Categories', 'Grocery, Gourmet & Floral']",package dimensions 516 x 411 x 02 inches 063 ounces date first available july 1 2013 domestic shipping currently item can be shipped only within the us and to apofpo addresses for apofpo shipments please check with the manufacturer regarding warranty and support issues international shipping this item is not eligible for international shipping learn more
2,5.0,cute,that snowman tin is adorable,B072L7GTF5,B00ADR2LV6,AECABX3OO3GK7FCPZLFM3LT2E6UA,0,True,2020-12-22,18:17:43,Gift Cards,amazoncom gift card in a holiday gift box various designs,4.9,185606.0,gift card is affixed inside a box gift amount may not be printed on gift cards you can customize the gift amount as desired for example 36 54 72 etc gift card has no fees and no expiration date gift card is redeemable towards millions of items storewide at amazoncom scan and redeem any gift card with a mobile or tablet device via the amazon app free oneday shipping where available customized gift message if chosen at checkout only appears on packing slip and not on the actual gift card or carrier,,25.0,Amazon,"['Gift Cards', 'Occasions', 'Chanukah']",package dimensions 819 x 441 x 13 inches 145 ounces item model number variabledenomination date first available september 16 2020 manufacturer amazon
3,5.0,great gift,super cute nice quality tinbr your choose amount,B072P5VV4D,B00ADR2LV6,AF4XAUOI5XPGWHCPOTORKBIJSRDA,0,True,2019-02-27,06:08:02,Gift Cards,amazoncom gift card in a holiday gift box various designs,4.9,185606.0,gift card is affixed inside a box gift amount may not be printed on gift cards you can customize the gift amount as desired for example 36 54 72 etc gift card has no fees and no expiration date gift card is redeemable towards millions of items storewide at amazoncom scan and redeem any gift card with a mobile or tablet device via the amazon app free oneday shipping where available customized gift message if chosen at checkout only appears on packing slip and not on the actual gift card or carrier,,25.0,Amazon,"['Gift Cards', 'Occasions', 'Chanukah']",package dimensions 819 x 441 x 13 inches 145 ounces item model number variabledenomination date first available september 16 2020 manufacturer amazon
4,5.0,gifts for my two granddaughters,they love it,B072P5VV4D,B00ADR2LV6,AGJXTLEOLLTIX5AAGFPBZ7CNNVOQ,0,True,2023-02-03,01:31:40,Gift Cards,amazoncom gift card in a holiday gift box various designs,4.9,185606.0,gift card is affixed inside a box gift amount may not be printed on gift cards you can customize the gift amount as desired for example 36 54 72 etc gift card has no fees and no expiration date gift card is redeemable towards millions of items storewide at amazoncom scan and redeem any gift card with a mobile or tablet device via the amazon app free oneday shipping where available customized gift message if chosen at checkout only appears on packing slip and not on the actual gift card or carrier,,25.0,Amazon,"['Gift Cards', 'Occasions', 'Chanukah']",package dimensions 819 x 441 x 13 inches 145 ounces item model number variabledenomination date first available september 16 2020 manufacturer amazon


# Digital Music

In [None]:
# Open the Digital Music review and metadata configs with streaming=True.
Digital_Music_review_ds = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_Digital_Music",
    split="full",
    streaming=True,
    trust_remote_code=True,
)
Digital_Music_meta_ds = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_meta_Digital_Music",
    split="full",
    streaming=True,
    trust_remote_code=True,
)

# Take the first `dm_rows_to_get` records from each stream.
dm_rows_to_get = 10000
dm_review_sample = [record for record in islice(Digital_Music_review_ds, dm_rows_to_get)]
dm_meta_sample = [record for record in islice(Digital_Music_meta_ds, dm_rows_to_get)]

# One DataFrame per sample: reviews and product metadata.
dm_reviews_df = pd.DataFrame(dm_review_sample)
dm_product_df = pd.DataFrame(dm_meta_sample)

# Display settings: allow up to `dm_rows_to_get` rows and never truncate cell text.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', dm_rows_to_get)

# Preview the product metadata (use dm_reviews_df.head() to inspect the reviews instead).
dm_product_df.head()

# Magazine Subscriptions

In [None]:
# Open the Magazine Subscriptions review and metadata configs with streaming=True.
Magazine_Subscriptions_review_ds = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_Magazine_Subscriptions",
    split="full",
    streaming=True,
    trust_remote_code=True,
)
Magazine_Subscriptions_meta_ds = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_meta_Magazine_Subscriptions",
    split="full",
    streaming=True,
    trust_remote_code=True,
)

# Take the first `ms_rows_to_get` records from each stream.
ms_rows_to_get = 10000
ms_review_sample = [record for record in islice(Magazine_Subscriptions_review_ds, ms_rows_to_get)]
ms_meta_sample = [record for record in islice(Magazine_Subscriptions_meta_ds, ms_rows_to_get)]

# One DataFrame per sample: reviews and product metadata.
ms_reviews_df = pd.DataFrame(ms_review_sample)
ms_product_df = pd.DataFrame(ms_meta_sample)

# Display settings: allow up to `ms_rows_to_get` rows and never truncate cell text.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', ms_rows_to_get)

# Preview the product metadata (use ms_reviews_df.head() to inspect the reviews instead).
ms_product_df.head()

# Software

In [None]:
# Open the Software review and metadata configs with streaming=True.
Software_review_ds = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_Software",
    split="full",
    streaming=True,
    trust_remote_code=True,
)
Software_meta_ds = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_meta_Software",
    split="full",
    streaming=True,
    trust_remote_code=True,
)

# Take the first `s_rows_to_get` records from each stream.
s_rows_to_get = 10000
s_review_sample = [record for record in islice(Software_review_ds, s_rows_to_get)]
s_meta_sample = [record for record in islice(Software_meta_ds, s_rows_to_get)]

# One DataFrame per sample: reviews and product metadata.
s_reviews_df = pd.DataFrame(s_review_sample)
s_product_df = pd.DataFrame(s_meta_sample)

# Display settings: allow up to `s_rows_to_get` rows and never truncate cell text.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', s_rows_to_get)

# Preview the product metadata (use s_reviews_df.head() to inspect the reviews instead).
s_product_df.head()

# All Beauty

In [None]:
# Load in streaming mode
All_Beauty_review_ds = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_All_Beauty", trust_remote_code=True, split="full", streaming=True)
All_Beauty_meta_ds = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_All_Beauty", trust_remote_code=True, split="full", streaming=True)

# Sample the first `ab_rows_to_get` rows from each dataset
ab_rows_to_get = 10000
ab_review_sample = list(islice(All_Beauty_review_ds, ab_rows_to_get))
ab_meta_sample = list(islice(All_Beauty_meta_ds, ab_rows_to_get))

# Make the DataFrames
ab_reviews_df = pd.DataFrame(ab_review_sample)
ab_product_df = pd.DataFrame(ab_meta_sample)

# The reviews frame was originally bound to the misspelled name `ab_reviewab_df`.
# The alias keeps any code that still uses that name working.
ab_reviewab_df = ab_reviews_df

# Set pandas display options: allow up to `ab_rows_to_get` rows
pd.set_option('display.max_rows', ab_rows_to_get)        # show all sampled rows
pd.set_option('display.max_colwidth', None)           # don't truncate column content

# ab_reviews_df.head()
ab_product_df.head()