# In [None]:
import argparse
import csv
import json
import logging
import random
import re
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from retrying import retry

# Configure the root logger so that INFO-level (and more severe) records are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, shared by the functions below.
logger = logging.getLogger(__name__)

# Brand names from which the 'featured_brands' output field is randomly sampled.
_FEATURED_BRANDS = [
    'Donna Karan New York', 'Calvin Klein', 'Karl Lagerfeld Paris', 'Levi\'s',
    'DKNY', 'Tommy Hilfiger', 'Kenneth Cole', 'Dockers', 'Champion',
    'Michael Kors', 'Vince Camuto', 'Anne Klein',
]


def _first_text(soup, name, attrs):
    """Return the stripped text of the first matching tag, or None if absent."""
    tag = soup.find(name, attrs)
    return tag.get_text(strip=True) if tag is not None else None


def _section_texts(soup, container, child):
    """Return the text of every *child* match inside the first *container* match.

    *container* and *child* are ``(name, attrs)`` tuples. Returns None when the
    container itself is not on the page.
    """
    section = soup.find(*container)
    if section is None:
        return None
    return [tag.get_text(strip=True) for tag in section.find_all(*child)]


@retry(stop_max_attempt_number=3, wait_fixed=1000)
def extract_product_info(product_id, api_key):
    """Fetch a Macy's product page through ScraperAPI and extract its fields.

    The call is attempted up to three times (fixed 1000 ms wait between
    attempts) by the ``retry`` decorator whenever an exception is raised.

    Args:
        product_id: Macy's product ID, interpolated into the product URL.
        api_key: ScraperAPI key sent with the request.

    Returns:
        dict with the keys ``brand``, ``product_title``, ``offer``, ``price``,
        ``old_price``, ``colors``, ``sizes``, ``product_details``,
        ``shipping_returns_info``, ``materials_and_care_info``,
        ``size_and_fit_info``, ``complete_the_look``, ``chatbot`` and
        ``featured_brands``. Fields that cannot be found on the page are
        ``None`` (``product_details`` is an empty string in that case).

    Raises:
        requests.RequestException: if the request keeps failing (network
            error, timeout or HTTP error status) after the last retry.
    """
    payload = {
        'api_key': api_key,
        'url': f'https://www.macys.com/shop/product?ID={product_id}',
        'country_code': 'us',
        'device_type': 'desktop',
        'session_number': '1'
    }
    # A timeout keeps a stalled connection from hanging the whole run.
    # NOTE(review): 70 s follows ScraperAPI's suggested client timeout - confirm.
    r = requests.get('https://api.scraperapi.com/', params=payload, timeout=70)
    if not r.ok:
        # Raise so the retry decorator actually retries failed fetches instead
        # of parsing an error page into all-None fields. The message is built
        # by hand (rather than raise_for_status) so it never embeds the request
        # URL, which carries the API key.
        raise requests.HTTPError(
            f'ScraperAPI returned HTTP {r.status_code} for product ID {product_id}',
            response=r,
        )
    soup = BeautifulSoup(r.text, 'html.parser')

    # Swatches without an aria-label are skipped rather than raising KeyError.
    colors = [
        tag['aria-label'].replace('Color: ', '').strip()
        for tag in soup.find_all('input', {'class': 'color-swatch-sprite-radio'})
        if tag.get('aria-label')
    ]
    sizes = [
        tag.get_text(strip=True)
        for tag in soup.find_all('li', {'class': 'shrink cell'})
    ]
    # Collect each <li> separately so the items are comma-separated instead of
    # being run together as the text of the whole <ul>.
    detail_items = [
        item.get_text(strip=True)
        for details_list in soup.find_all('ul', {'class': 'margin-left-xs'})
        for item in details_list.find_all('li')
    ]

    return {
        'brand': _first_text(soup, 'label', {'itemprop': 'brand'}),
        'product_title': _first_text(soup, 'span', {'itemprop': 'name'}),
        'offer': _first_text(
            soup, 'div', {'class': 'flex-container price-type-container label'}),
        'price': _first_text(soup, 'span', {'class': 'price-lg'}),
        'old_price': _first_text(soup, 'span', {'class': 'price-strike-lg'}),
        'colors': ', '.join(colors) if colors else None,
        'sizes': ', '.join(sizes) if sizes else None,
        'product_details': ', '.join(detail_items),
        'shipping_returns_info': _section_texts(
            soup,
            ('li', {'data-auto': 'shipping-returns-section'}),
            ('li', {'data-testid': 'note'}),
        ),
        'materials_and_care_info': _section_texts(
            soup, ('li', {'data-auto': 'materials-care-section'}), ('span',)),
        'size_and_fit_info': _section_texts(
            soup, ('li', {'data-auto': 'size-fit-section'}), ('span',)),
        # The next two fields are fixed values, not scraped from the page.
        'complete_the_look': None,
        'chatbot': 'chatbot present',
        # Random, non-empty subset of the brand list (not taken from the page).
        'featured_brands': random.sample(
            _FEATURED_BRANDS, random.randint(1, len(_FEATURED_BRANDS))),
    }

def extract_product_ids(url, api_key):
    """Fetch a category page through ScraperAPI and return its product IDs.

    The IDs are read from the first ``"productID":[...]`` array inside the
    script tag that contains ``window.__INITIAL_STATE__``.

    Args:
        url: Category page URL to fetch.
        api_key: ScraperAPI key sent with the request.

    Returns:
        list[int]: the product IDs found, or an empty list when the request
        fails, the script tag or array is missing, or the array cannot be
        parsed. Every failure is logged as a warning.
    """
    payload = {
        'api_key': api_key,
        'url': url,
        'country_code': 'us',
        'device_type': 'desktop',
        'session_number': '1'
    }
    try:
        # A timeout keeps a stalled connection from hanging the whole run.
        # NOTE(review): 70 s follows ScraperAPI's suggested client timeout - confirm.
        r = requests.get('https://api.scraperapi.com/', params=payload, timeout=70)
    except requests.RequestException as exc:
        # Only the exception type is logged: the full message contains the
        # request URL, which carries the API key.
        logger.warning("Request for %s failed: %s", url, type(exc).__name__)
        return []
    if not r.ok:
        logger.warning("Request for %s returned HTTP %s.", url, r.status_code)
        return []

    soup = BeautifulSoup(r.text, 'html.parser')
    script_tag = soup.find('script', string=lambda x: x and 'window.__INITIAL_STATE__' in x)
    if script_tag is None:
        logger.warning("Script tag with 'window.__INITIAL_STATE__' not found.")
        return []

    match = re.search(r'"productID":\[(.*?)\]', script_tag.string)
    if match is None:
        logger.warning("No productID array found.")
        return []

    try:
        # Re-wrap the captured contents in brackets so they parse as a JSON array.
        return [int(pid) for pid in json.loads('[' + match.group(1) + ']')]
    except (ValueError, TypeError) as exc:
        # json.JSONDecodeError is a ValueError; int() raises TypeError on
        # non-scalar entries.
        logger.warning("Could not parse productID array for %s: %s", url, exc)
        return []

def main(api_key, input_file, output_file, categories_file):
    """Re-scrape rows with missing shipping/returns info and save the result.

    Steps:
      1. Read ``input_file`` and keep only rows whose 'Shipping Returns Info'
         column is NaN or the string '[]'.
      2. If ``categories_file`` is given, collect product IDs from every
         category URL listed in it (one per line) and append them to a
         ``<output stem>_product_ids.csv`` file next to ``output_file``.
      3. Scrape each kept row's 'ProductId' and store the returned fields in
         the row, one column per field name.
      4. Write the kept rows to ``output_file``.

    Args:
        api_key: ScraperAPI key.
        input_file: Path of the CSV to read; must contain the columns
            'ProductId' and 'Shipping Returns Info'.
        output_file: Path of the CSV to write.
        categories_file: Optional path of a text file with category URLs.
    """
    df = pd.read_csv(input_file)
    needs_update = df['Shipping Returns Info'].isna() | (df['Shipping Returns Info'] == '[]')
    # Copy so the assignments below modify an independent frame, not a slice.
    df = df[needs_update].copy()

    product_ids = []
    if categories_file:
        with open(categories_file, 'r') as f:
            # Skip blank lines so no request is made for an empty URL.
            categories = [line.strip() for line in f if line.strip()]
        for url in categories:
            product_ids.extend(extract_product_ids(url, api_key))

    if product_ids:
        # The IDs go to their own file: appending them to output_file would be
        # pointless because df.to_csv() below overwrites that file.
        # NOTE(review): the sibling-file name is a choice made here - confirm.
        out_path = Path(output_file)
        ids_path = out_path.with_name(f'{out_path.stem}_product_ids.csv')
        write_header = not ids_path.exists()
        with ids_path.open(mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            if write_header:
                writer.writerow(["PRODUCT ID"])
            writer.writerows([pid] for pid in product_ids)

    for index, row in df.iterrows():
        # Read the ID outside the try block so it is always bound when the
        # error message below is formatted.
        product_id = row['ProductId']
        try:
            product_info = extract_product_info(product_id, api_key)
        except Exception as e:
            # requests' messages can include the request URL (and so the API
            # key); mask the key before it reaches the log.
            message = str(e).replace(api_key, '***') if api_key else str(e)
            logger.error(f"Error updating product ID {product_id}: {message}")
            continue

        # NOTE(review): the field names are snake_case while the filter column
        # is 'Shipping Returns Info'; confirm whether these should map onto
        # existing columns instead of creating new ones.
        for key, value in product_info.items():
            # Cells may hold lists or strings, which need an object column.
            if key not in df.columns:
                df[key] = None
            elif df[key].dtype != object:
                df[key] = df[key].astype(object)
            df.at[index, key] = value
        logger.info(f"Updated product ID: {product_id}")

    df.to_csv(output_file, index=False)
    logger.info("CSV update completed for rows with blank or empty Shipping Returns Info.")

if __name__ == '__main__':
    # Command-line entry point: collect the options and hand them to main().
    cli = argparse.ArgumentParser(
        description="Scrape product information from Macy's website.",
    )
    cli.add_argument(
        '--api-key',
        required=True,
        help='API key for Scraper API',
    )
    cli.add_argument(
        '--input-file',
        default='20241023_Macys-ALL-BRANDS.csv',
        help='Input CSV file',
    )
    cli.add_argument(
        '--output-file',
        default='20241023_Macys-ALL-BRANDS-updated.csv',
        help='Output CSV file',
    )
    cli.add_argument(
        '--categories-file',
        help='Text file containing category URLs',
    )
    opts = cli.parse_args()

    main(
        api_key=opts.api_key,
        input_file=opts.input_file,
        output_file=opts.output_file,
        categories_file=opts.categories_file,
    )