In [None]:
from utils.bigdata_a3_utils import *
import numpy as np
import pandas as pd
import os
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd
# from utils.preprocessing import *
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Make sure the folder that receives the generated CSV files exists;
# exist_ok=True makes re-running this cell harmless.
os.makedirs("custom_dataset", exist_ok=True)

In [None]:
# Upper bound on the number of records streamed per category; the download loop
# applies it separately to the review stream and to the metadata stream.
MAX_ROWS = 100000

In [None]:
# Destination files: records from every category are accumulated in these two CSVs.
reviews_csv_path = "custom_dataset/reviews.csv"
metadata_csv_path = "custom_dataset/metadata.csv"

In [None]:
# Flags kept from the original cell: None until the first batch has been written
# to the corresponding CSV, True afterwards.
first_review_df = None
first_meta_df = None

BATCH_SIZE = 1000  # records buffered in memory before each CSV write

# CSV paths that already received their header row during this run of the cell.
_csv_started = set()


def _write_batch(rows, csv_path):
    """Write `rows` (a list of dicts) to `csv_path`.

    The first write to a given path in this run creates/overwrites the file and
    emits the header; every later write appends without a header.
    """
    is_first_write = csv_path not in _csv_started
    pd.DataFrame(rows).to_csv(
        csv_path,
        mode='w' if is_first_write else 'a',
        header=is_first_write,
        index=False,
    )
    _csv_started.add(csv_path)


def _stream_config_to_csv(config_name, category, csv_path, max_rows=MAX_ROWS, batch_size=BATCH_SIZE):
    """Stream up to `max_rows` records of one dataset config into `csv_path`.

    Args:
        config_name: dataset configuration to load, e.g. ``raw_review_<category>``.
        category: value stored in the extra ``category`` column of every record.
        csv_path: CSV file the records are written/appended to.
        max_rows: maximum number of records taken from the stream.
        batch_size: number of records buffered before each write.

    Returns:
        The number of records written to `csv_path`.
    """
    dataset = load_dataset(
        "McAuley-Lab/Amazon-Reviews-2023",
        config_name,
        trust_remote_code=True,
        split="full",
        streaming=True,
    )

    written = 0
    batch = []
    for item in dataset:
        if written + len(batch) >= max_rows:
            break

        # Copy the record instead of modifying the streamed item in place.
        record = dict(item)
        record['category'] = category
        batch.append(record)

        if len(batch) >= batch_size:
            _write_batch(batch, csv_path)
            written += len(batch)
            batch = []

    # Flush the remainder. Without this, the last partial batch is silently lost
    # whenever the stream ends before max_rows on a count that is not a multiple
    # of batch_size.
    if batch:
        _write_batch(batch, csv_path)
        written += len(batch)

    return written


# Process each category
for category in tqdm(VALID_CATEGORIES, desc="Processing categories"):
    print(f"\nProcessing category: {category}")

    # Process review data; a failure is reported and the loop carries on (best effort).
    try:
        count = _stream_config_to_csv(f"raw_review_{category}", category, reviews_csv_path)
        print(f"  ✓ Appended {count} reviews from {category} to {reviews_csv_path}")
    except Exception as e:
        print(f"  ✗ Error processing reviews for {category}: {str(e)}")

    # Process metadata with the same routine.
    try:
        count = _stream_config_to_csv(f"raw_meta_{category}", category, metadata_csv_path)
        print(f"  ✓ Appended {count} metadata records from {category} to {metadata_csv_path}")
    except Exception as e:
        print(f"  ✗ Error processing metadata for {category}: {str(e)}")

# Keep the original flag variables in sync for any later cell that inspects them.
if reviews_csv_path in _csv_started:
    first_review_df = True
if metadata_csv_path in _csv_started:
    first_meta_df = True

print(f"\nAll categories processed!")
print(f"Review data saved to: {reviews_csv_path}")
print(f"Metadata saved to: {metadata_csv_path}")

In [None]:
# delete_cache_directory is not defined in this notebook; presumably it is provided
# by the star import from utils.bigdata_a3_utils.
# NOTE(review): looks like it clears the dataset download cache — confirm in the helper.
delete_cache_directory()

In [None]:
def mergedatasets(reviews_path="custom_dataset/reviews.csv", metadata_path="custom_dataset/metadata.csv"):
    """Load the review and metadata CSVs and inner-join them on ``parent_asin``.

    Args:
        reviews_path: CSV file with the review records (defaults to the file
            this notebook writes).
        metadata_path: CSV file with the product metadata records (defaults to
            the file this notebook writes).

    Returns:
        A DataFrame with one row per review whose ``parent_asin`` also occurs in
        the metadata. Columns present in both inputs (other than the join key)
        get the suffixes ``_review`` and ``_metadata``.
    """
    reviews = pd.read_csv(reviews_path)
    metadata = pd.read_csv(metadata_path)

    return pd.merge(
        reviews,
        metadata,
        on='parent_asin',
        how='inner',
        suffixes=('_review', '_metadata'),
    )

In [None]:
# Build the joined review + metadata frame. The previous empty-DataFrame
# placeholder was overwritten immediately and has been dropped.
merged = mergedatasets()

In [None]:
# (rows, columns) of the merged frame before any filtering.
merged.shape

In [None]:
# Column names after the merge; columns present in both CSVs carry the
# _review / _metadata suffixes set in mergedatasets.
merged.columns

In [None]:
# Keep only rows whose review text is a non-empty string after stripping.
# Missing text (NaN) is mapped to '' first: NaN is truthy, so a bare
# .astype(bool) on the stripped column would have kept those rows.
has_text = merged['text'].fillna('').astype(str).str.strip().ne('')
merged = merged[has_text]

In [None]:
# Shape after the empty-text filter.
merged.shape

In [None]:
# Two rows count as the same review when user, product and text all match;
# the first occurrence is the one that is kept.
dedup_keys = ['user_id', 'asin', 'text']
merged = merged.drop_duplicates(subset=dedup_keys, keep='first')

In [None]:
# Shape after removing duplicate reviews.
merged.shape

In [None]:
def extract_brand(row):
    """Return a brand name for one row of the merged data.

    Preference order:
      1. the stripped ``store`` value, when it is a non-empty string;
      2. the text captured after a "Brand" label in ``details``, when
         ``details`` is a string and the pattern matches non-blank text;
      3. the literal ``'Unknown'``.

    Args:
        row: any object with a dict-style ``.get`` (a DataFrame row or a dict)
            that may provide ``store`` and ``details``.

    Returns:
        The brand as a string; never empty.
    """
    store_value = row.get('store')
    if isinstance(store_value, str) and store_value.strip():
        return store_value.strip()

    details = row.get('details')
    # Missing values come back from a CSV as NaN, which is a truthy float; feeding
    # it to re.search raises TypeError, so only real strings are searched.
    if isinstance(details, str):
        match = re.search(r"Brand[:\s\-]*([A-Za-z0-9&\s]+)", details, re.IGNORECASE)
        if match:
            brand = match.group(1).strip()
            # The capture group can match whitespace only; treat that as "no brand".
            if brand:
                return brand

    # Default fallback
    return 'Unknown'

In [None]:
# Add a 'brand' column by running extract_brand on every row
# (axis=1 hands each row to the function).
merged.loc[:, 'brand'] = merged.apply(extract_brand, axis=1)

In [None]:
def _count_words(value):
    """Number of whitespace-separated words in `value`; 0 for anything that is not a string."""
    if not isinstance(value, str):
        return 0
    return len(value.split())


# Word count of every review text.
merged.loc[:, 'review_length'] = merged['text'].apply(_count_words)

In [None]:
# Show one row to check the newly added brand and review_length columns.
merged.head(1)

In [None]:
# File that receives the trimmed and renamed analysis table.
analysis_path = "custom_dataset/analysis.csv"

In [None]:
# Maps each column kept from the merged frame to the name it gets in the
# analysis table; every column not listed here is dropped.
needed_columns = {
    'main_category': 'category',
    'brand': 'brand',
    'title_metadata': 'item',
    'rating': 'rating',
    'text': 'reviewText',
    'review_length': 'text_length',
    'timestamp': 'timestamp',
    'rating_number': 'numRating',
    'average_rating': 'average_rating',
    'helpful_vote': 'total_votes',
}

# Select and rename in one chained expression.
merged = merged[list(needed_columns)].rename(columns=needed_columns)
merged.shape

In [None]:
# Persist the analysis table; index=False keeps the row index out of the file.
merged.to_csv(analysis_path, index=False)

In [None]:
# Reload the analysis table from the CSV written in the previous step.
df = pd.read_csv(analysis_path)

In [None]:
# Size of the table and number of missing values per column, before cleaning.
print(df.shape)
print(df.isnull().sum()) 

In [None]:
# Rows missing any of these columns are discarded.
required_columns = ['category', 'item', 'reviewText', 'numRating']
df = df.dropna(subset=required_columns)

# Re-check the size and the remaining missing values per column.
print(df.shape)
print(df.isnull().sum())

In [None]:
# The raw review text column that is fed to the cleaning pipeline below.
reviews_to_clean = df["reviewText"]

In [None]:
def clean_html(text):
    """Strip URLs and HTML tags from `text`.

    Each removed URL or tag is replaced by a single space so that the words on
    either side of it stay separate (e.g. ``"great<br />product"``).

    Args:
        text: the string to clean.

    Returns:
        The text without URLs and tags; surrounding whitespace is not normalised.
    """
    # Remove http(s) URLs wherever they occur. The earlier pattern was anchored to
    # the start of a line and also deleted everything after the URL on that line.
    text = re.sub(r'https?://[^\s<>]+', ' ', text)
    # Remove HTML tags
    text = re.sub(r'<.*?>+', ' ', text)
    return text

In [None]:
def tokenize_df(text):
    """Tokenize one cell value.

    Returns:
        * for a string: the list of tokens from ``word_tokenize`` that consist
          of alphabetic characters only;
        * for an int or float: the value itself, unchanged;
        * for anything else: an empty list.
    """
    # Numbers are handed back untouched.
    if isinstance(text, (int, float)):
        return text
    # Anything that is neither a number nor a string yields no tokens.
    if not isinstance(text, str):
        return []
    return [word for word in word_tokenize(text) if word.isalpha()]

In [None]:
# English stopwords, loaded on first use and then reused by every call.
_ENGLISH_STOPWORDS = None


def remove_stopwords(tokens):
    """Return the tokens that are not English stopwords.

    The comparison is done on the lower-cased token; the returned tokens keep
    their original casing.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A list with the non-stopword tokens, in their original order.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Load once and keep as a set: the list was previously re-read on every
        # call and searched linearly for every token.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return [token for token in tokens if token.lower() not in _ENGLISH_STOPWORDS]

In [None]:
def lemmatize_text(tokens):
    """Lemmatize every token with ``WordNetLemmatizer``.

    Args:
        tokens: iterable of tokens, or None.

    Returns:
        A list with the lemma of each token; None entries are skipped, and a
        None input gives an empty list.
    """
    if tokens is None:
        return []

    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in tokens:
        if word is None:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

In [None]:
def clean_text(text):
    """Run the full cleaning pipeline on one text value.

    Steps, in order: strip URLs/HTML (``clean_html``), tokenize keeping
    alphabetic tokens (``tokenize_df``), lower-case, drop English stopwords
    (``remove_stopwords``), strip punctuation characters, lemmatize
    (``lemmatize_text``), and join the tokens with single spaces.

    Args:
        text: the value to clean.

    Returns:
        The cleaned text; an empty string when `text` is not a string.
    """
    if not isinstance(text, str):
        return ''

    text = clean_html(text)
    tokens = tokenize_df(text)
    tokens = [token.lower() for token in tokens]
    tokens = remove_stopwords(tokens)

    # The translation table does not depend on the token, so build it once per
    # call instead of once per token.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    tokens = [token.translate(strip_punctuation) for token in tokens]

    tokens = lemmatize_text(tokens)
    return ' '.join(tokens)

In [None]:
# Run the cleaning pipeline on every review; the result is a new Series and
# reviews_to_clean itself is left as it was.
cleaned_reviews = reviews_to_clean.apply(clean_text)

In [None]:
def _collapse_whitespace(value):
    """Collapse whitespace runs to one space and trim; values that are not str/float/int pass through."""
    if not isinstance(value, (str, float, int)):
        return value
    return re.sub(r"\s+", " ", str(value)).strip()


# Normalise the whitespace of every cleaned review.
cleaned_reviews = cleaned_reviews.apply(_collapse_whitespace)

In [None]:
# Rename the review text column in place; at this point it still holds the raw text.
df.rename(columns={'reviewText': 'cleanedText'}, inplace=True)

In [None]:
# Check the column names after the rename.
df.columns

In [None]:
# Replace the raw text with its cleaned version. cleaned_reviews was derived from
# this same column of df, so it carries the same row index.
df["cleanedText"] = cleaned_reviews

In [None]:
# Run the same text-cleaning pipeline over the brand names.
# NOTE(review): clean_text keeps only alphabetic, non-stopword tokens, so a brand
# containing digits (e.g. "3M") comes out as an empty string — confirm this is intended.
df["brand"] = df["brand"].apply(clean_text)

In [None]:
# Inspect the cleaned brand column.
df["brand"]

In [None]:
# None of the cells shown here creates the "csv's" folder, and to_csv does not
# create missing directories, so make sure the folder exists before writing.
final_csv_path = "csv's/AnalysisDataset.csv"
os.makedirs(os.path.dirname(final_csv_path), exist_ok=True)

# NOTE(review): index=False is not passed here (unlike the analysis.csv export), so
# the row index is written as the first column — confirm downstream readers expect it.
df.to_csv(final_csv_path)