# Preprocess the scraped Amazon data and save the file


In [4]:
import pandas as pd
import re
import numpy as np
from datetime import datetime

def preprocess_amazon_data(df):
    """
    Preprocess Amazon scraped data.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw scraped rows. The columns 'title', 'price', 'rating', 'reviews'
        and 'availability' are read. The input frame is not modified; all
        work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        De-duplicated rows restricted to the columns listed in
        ``columns_order`` below: the cleaned numeric fields, the fields
        parsed out of the title, the brand, and the two derived categories
        'price_segment' and 'rating_category'.

    Raises
    ------
    KeyError
        If one of the required input columns is missing.
    ValueError
        Propagated from ``pd.qcut`` when the prices cannot be split into
        four distinct quartile bins (e.g. too few distinct prices).
    """
    # Compile the patterns once instead of once per row.
    # Whitespace between the number and the unit is optional ("6GB" / "6 GB"),
    # and storage may be given in GB or TB.
    ram_storage_re = re.compile(r'(\d+)\s*GB\s*RAM,\s*(\d+)\s*(GB|TB)\s*Storage')
    color_re = re.compile(r'\((.*?)\,')
    rating_re = re.compile(r'(\d+\.?\d*)')
    # A digit followed by any mix of digits and commas, so counts with several
    # separators ("1,234,567" or "1,23,456") are captured in full.
    reviews_re = re.compile(r'\d[\d,]*')

    info_columns = ['product_name', 'specifications', 'ram', 'storage', 'color']

    # Create a copy of the dataframe
    df_cleaned = df.copy()

    # Remove duplicates
    initial_rows = len(df_cleaned)
    df_cleaned = df_cleaned.drop_duplicates()
    duplicates_removed = initial_rows - len(df_cleaned)
    print(f"Removed {duplicates_removed} duplicate rows")

    # Function to extract product info
    def extract_product_info(title):
        """Split a title into name/specifications and parse RAM, storage, color."""
        if not isinstance(title, str):
            # Missing title (NaN/None): nothing to parse.
            return {
                'product_name': None,
                'specifications': '',
                'ram': None,
                'storage': None,
                'color': None,
            }

        # Split the title at the first '|'
        name_part, _, spec_part = title.partition('|')
        product_name = name_part.strip()
        specifications = spec_part.strip()

        # Extract RAM and Storage
        ram = storage = None
        ram_storage_match = ram_storage_re.search(product_name)
        if ram_storage_match:
            ram = ram_storage_match.group(1) + 'GB'
            storage = ram_storage_match.group(2) + ram_storage_match.group(3)

        # Extract color: the text between the first '(' and the next ','
        color_match = color_re.search(product_name)
        color = color_match.group(1) if color_match else None

        return {
            'product_name': product_name,
            'specifications': specifications,
            'ram': ram,
            'storage': storage,
            'color': color,
        }

    # Apply product info extraction
    print("Extracting product information...")
    # Build the frame once from plain dicts; this also yields the expected
    # columns when there are no rows at all.
    product_info = pd.DataFrame(
        [extract_product_info(title) for title in df_cleaned['title']],
        index=df_cleaned.index,
        columns=info_columns,
    )
    df_cleaned = pd.concat([df_cleaned, product_info], axis=1)

    # Clean price
    def clean_price(price):
        """Return the price as a float, or None when it cannot be parsed."""
        if pd.isna(price):
            return None
        if isinstance(price, (int, float, np.integer, np.floating)):
            # Already numeric (e.g. read_csv parsed the column as numbers).
            return float(price)
        try:
            return float(str(price).replace('₹', '').replace(',', ''))
        except ValueError:
            return None

    print("Cleaning price data...")
    df_cleaned['price'] = df_cleaned['price'].apply(clean_price)

    # Clean rating
    def clean_rating(rating):
        """Return the first number found in the rating text, or None."""
        # 'Previous page' is a placeholder string that carries no rating.
        if pd.isna(rating) or rating == 'Previous page':
            return None
        rating_match = rating_re.search(str(rating))
        return float(rating_match.group(1)) if rating_match else None

    print("Cleaning rating data...")
    df_cleaned['rating'] = df_cleaned['rating'].apply(clean_rating)

    # Clean reviews
    def clean_reviews(reviews):
        """Return the review count as an int; 0 when absent or unparseable."""
        if pd.isna(reviews) or reviews == 'No reviews found':
            return 0
        reviews_match = reviews_re.search(str(reviews))
        if not reviews_match:
            return 0
        return int(reviews_match.group(0).replace(',', ''))

    print("Cleaning review data...")
    df_cleaned['reviews'] = df_cleaned['reviews'].apply(clean_reviews)

    # Clean availability
    def clean_availability(availability):
        """Return the trimmed availability text; missing means 'Out of stock'."""
        if pd.isna(availability):
            return 'Out of stock'
        return str(availability).strip()

    print("Cleaning availability data...")
    df_cleaned['availability'] = df_cleaned['availability'].apply(clean_availability)

    # Extract brand names
    def extract_brand(product_name):
        """Return the first listed brand contained in the name, else 'Other'."""
        if not isinstance(product_name, str):
            return 'Other'
        # Order matters: the first entry found wins, so a name containing both
        # 'Samsung' and 'Galaxy' is labelled 'Galaxy'.
        brands = ['Galaxy', 'NARZO', 'realme', 'Samsung', 'Xiaomi', 'Redmi', 'OnePlus', 'OPPO', 'vivo']
        lowered = product_name.lower()
        for brand in brands:
            if brand.lower() in lowered:
                return brand
        return 'Other'

    print("Extracting brand information...")
    df_cleaned['brand'] = df_cleaned['product_name'].apply(extract_brand)

    # Add additional features
    print("Adding additional features...")
    # Quartile-based segments: each label covers roughly a quarter of the rows.
    df_cleaned['price_segment'] = pd.qcut(df_cleaned['price'],
                                        q=4,
                                        labels=['Budget', 'Mid-range', 'Premium', 'Ultra-premium'])

    # Fixed right-inclusive rating bands; missing ratings stay missing.
    df_cleaned['rating_category'] = pd.cut(df_cleaned['rating'],
                                         bins=[0, 3, 3.5, 4, 4.5, 5],
                                         labels=['Poor', 'Average', 'Good', 'Very Good', 'Excellent'])

    # Reorder columns
    columns_order = [
        'product_name', 'brand', 'price', 'rating', 'reviews',
        'availability', 'ram', 'storage', 'color', 'specifications',
        'price_segment', 'rating_category'
    ]

    df_cleaned = df_cleaned[columns_order]

    return df_cleaned

def save_preprocessed_data(df_processed, output_filename=None):
    """
    Write the preprocessed frame to CSV and print a short summary.

    Parameters
    ----------
    df_processed : pandas.DataFrame
        Frame to write (without its index).
    output_filename : str, optional
        Destination path. When None, a name of the form
        ``amazon_data_processed_<YYYYmmdd_HHMMSS>.csv`` is generated from
        the current local time.

    Returns
    -------
    bool
        True when the file was written and the summary printed; False when
        any exception occurred (the error message is printed, not raised).
    """
    try:
        # Fall back to a timestamped name when the caller gave none
        destination = output_filename
        if destination is None:
            stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            destination = f"amazon_data_processed_{stamp}.csv"

        # Write the CSV
        df_processed.to_csv(destination, index=False)
        print(f"\nPreprocessed data successfully saved to: {destination}")

        # Shape and column overview
        n_rows = len(df_processed)
        n_cols = len(df_processed.columns)
        print("\nDataset Summary:")
        print(f"Total rows: {n_rows}")
        print(f"Total columns: {n_cols}")
        print("\nColumns in the processed dataset:")
        for column_name in df_processed.columns:
            print(f"- {column_name}")

        # Only columns that actually contain nulls are listed
        null_counts = df_processed.isnull().sum()
        if null_counts.any():
            print("\nMissing values summary:")
            print(null_counts[null_counts > 0])
    except Exception as exc:
        print(f"Error saving the preprocessed data: {exc}")
        return False
    else:
        return True

# Main execution
try:
    # Load the data
    print("Loading the dataset...")
    input_file = 'amazon_data.csv'  # Change this to your input file name
    df = pd.read_csv(input_file)

    # Apply preprocessing
    print("\nStarting preprocessing...")
    df_processed = preprocess_amazon_data(df)

    # Save the preprocessed data
    output_file = 'amazon_data_processed.csv'  # Change this to your desired output file name
    # save_preprocessed_data reports failure through its return value instead
    # of raising, so keep the result to avoid announcing success after a
    # failed save.
    saved = save_preprocessed_data(df_processed, output_file)

    # Print additional statistics
    print("\nValue counts for categorical variables:")
    categorical_cols = ['brand', 'price_segment', 'rating_category', 'availability']
    for col in categorical_cols:
        print(f"\n{col} distribution:")
        print(df_processed[col].value_counts())

    if saved:
        print("\nPreprocessing completed successfully!")
    else:
        print("\nPreprocessing finished, but the processed data could not be saved.")

except Exception as e:
    # Best-effort notebook cell: report the problem instead of a traceback.
    print(f"An error occurred during processing: {str(e)}")

Loading the dataset...

Starting preprocessing...
Removed 30 duplicate rows
Extracting product information...
Cleaning price data...
Cleaning rating data...
Cleaning review data...
Cleaning availability data...
Extracting brand information...
Adding additional features...

Preprocessed data successfully saved to: amazon_data_processed.csv

Dataset Summary:
Total rows: 30
Total columns: 12

Columns in the processed dataset:
- product_name
- brand
- price
- rating
- reviews
- availability
- ram
- storage
- color
- specifications
- price_segment
- rating_category

Missing values summary:
rating              6
ram                22
storage            22
color              14
rating_category     6
dtype: int64

Value counts for categorical variables:

brand distribution:
brand
Other      18
NARZO       8
OnePlus     3
Galaxy      1
Name: count, dtype: int64

price_segment distribution:
price_segment
Budget           8
Mid-range        8
Ultra-premium    8
Premium          6
Name: count, dtype: int64

In [5]:
# Two alternative settings for the output filename.
# NOTE: both assignments execute when this cell is run as written, so the
# second one wins and output_file ends up as None. Keep only the option
# you want.

# To use a custom output filename:
output_file = 'my_processed_amazon_data.csv'

# Or to use automatic timestamp-based filename:
output_file = None