In [2]:
import csv
import re
import time
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional

import requests

@dataclass
class Offer:
    """Pricing figures for one carrier offer, one attribute per CSV column.

    Fields that could not be determined carry a placeholder string such as
    ``'N/A'`` instead of a number.
    """

    # Total price once the gift card has been subtracted.
    price_after_gc: str
    # Gift card amount bundled with the offer.
    gift_card: str
    # Monthly instalments over the contract plus the down payment.
    total_price: str
    # Monthly instalment of the "keep-it" plan.
    monthly_price: str
    # Down payment of the "keep-it" plan.
    down_payment: str
    # Difference between the "keep-it" total and the "return-it" total.
    bib_premium: str
    # Monthly instalment of the "return-it" plan.
    bib_monthly: str
    # Down payment of the "return-it" plan.
    down_return: str

@dataclass
class Carrier:
    """One carrier's listing for a phone: display name, page link and offers."""

    # Human-readable carrier name (e.g. 'Telus'); used as the CSV lookup key.
    name: str
    # Product-page URL the offers were scraped from.
    link: str
    # Offers found for this carrier.
    offers: List[Offer]

@dataclass
class Phone:
    """A phone model together with the carrier listings scraped for it."""

    # Phone name as written to the 'Phone' CSV column.
    name: str
    # One entry per carrier URL that was scraped for this phone.
    carriers: List[Carrier]

class BestBuyScraper:
    """Scrape carrier financing offers for phones listed in an XML catalogue.

    Each child of the XML root is a phone (its tag, with underscores turned
    into spaces, is the phone name) and each grandchild holds one product-page
    URL.  For every URL the scraper finds the carrier-plans API endpoint
    embedded in the product page, queries it, and condenses the "keep-it" and
    "return-it" plans into a single ``Offer``.
    """

    # Seconds before an HTTP request is abandoned; without a timeout a stalled
    # connection would block its worker thread forever.
    REQUEST_TIMEOUT = 30
    # Attempts made per product page before giving up.
    MAX_RETRIES = 3
    # Seconds to wait after a failed attempt before retrying.
    RETRY_DELAY = 2
    # Number of monthly instalments used to compute plan totals.
    CONTRACT_MONTHS = 24

    # URL fragment -> display name.  Order matters: the first fragment found
    # in the URL wins.
    CARRIER_KEYWORDS = {
        'telus': 'Telus',
        'koodo': 'Koodo',
        'rogers': 'Rogers',
        'fido': 'Fido',
        'freedom-mobile': 'Freedom Mobile',
        'bell': 'Bell',
        'virgin-plus': 'Virgin Plus',
    }

    # Captures the value of the "cellPhonesCarrierPlansUrl" key embedded in
    # the product page.
    _API_URL_RE = re.compile(r'cellPhonesCarrierPlansUrl":"(.*?)"')

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)

    def scrape(self, xml_file_path: str) -> "List[Phone]":
        """Scrape every phone listed in the XML catalogue.

        Args:
            xml_file_path: Local path or ``http(s)://`` URL of the catalogue.

        Returns:
            One ``Phone`` per child of the XML root, in document order.  The
            carriers inside each phone are in completion order.

        Raises:
            requests.RequestException: The remote catalogue could not be fetched.
            xml.etree.ElementTree.ParseError: The catalogue is not valid XML.
        """
        if xml_file_path.startswith(("http://", "https://")):
            response = self.session.get(xml_file_path, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            root = ET.fromstring(response.content)
        else:
            root = ET.parse(xml_file_path).getroot()

        phones = []
        for phone_node in root:
            phone_name = phone_node.tag.replace('_', ' ')
            # Element text may be None (empty element) or padded with the
            # whitespace used to indent the XML; skip / strip accordingly.
            urls = [
                node.text.strip()
                for node in phone_node
                if node.text and node.text.strip()
            ]

            carriers = []
            with ThreadPoolExecutor() as executor:
                futures = [
                    executor.submit(self.scrape_carrier_data, url, phone_name)
                    for url in urls
                ]
                for future in as_completed(futures):
                    try:
                        carriers.append(future.result())
                    except Exception as e:
                        # Best effort: one broken page must not abort the run.
                        print(f"Error scraping carrier data: {e}")

            phones.append(Phone(name=phone_name, carriers=carriers))

        return phones

    def extract_carrier_name(self, url: str) -> str:
        """Return the carrier display name found in ``url``, or ``'Unknown'``."""
        lowered = url.lower()
        for fragment, display_name in self.CARRIER_KEYWORDS.items():
            if fragment in lowered:
                return display_name
        return 'Unknown'

    def extract_sku_id(self, url: str) -> str:
        """Return the last path segment of ``url`` (the SKU id).

        A query string, fragment or trailing slash is ignored so that
        ``.../15727171/?ref=x`` still yields ``'15727171'``.
        """
        path = url.split('#', 1)[0].split('?', 1)[0]
        return path.rstrip('/').split('/')[-1]

    def extract_cellphone_api_url(self, response: "requests.Response") -> Optional[str]:
        """Find the carrier-plans API URL template embedded in a product page.

        Args:
            response: Product-page response; only ``iter_lines()`` is used.

        Returns:
            The value of the first ``cellPhonesCarrierPlansUrl`` entry found,
            or ``None`` when the page does not contain one.
        """
        for raw_line in response.iter_lines():
            if isinstance(raw_line, bytes):
                # A stray undecodable byte should not abort the search.
                line = raw_line.decode('utf-8', errors='replace')
            else:
                line = raw_line
            match = self._API_URL_RE.search(line)
            if match is not None:
                return match.group(1)
        return None

    @staticmethod
    def _to_float(value) -> Optional[float]:
        """Convert an API value to a finite, non-negative float, else ``None``."""
        if value is None or isinstance(value, bool):
            return None
        try:
            number = float(value)
        except (TypeError, ValueError):
            return None
        # Reject NaN (never equal to itself), infinities and negative amounts.
        if number != number or number in (float('inf'), float('-inf')) or number < 0:
            return None
        return number

    @staticmethod
    def _money(value: Optional[float]) -> str:
        """Format an amount with two decimals, or ``'N/A'`` when missing."""
        return 'N/A' if value is None else f"{value:.2f}"

    @classmethod
    def _build_offer_fields(cls, plans) -> Optional[dict]:
        """Condense the API plan list into keyword arguments for ``Offer``.

        Args:
            plans: Iterable of plan dicts as returned by the carrier-plans
                API; the keys read are ``type``, ``monthly``, ``downPayment``
                and ``giftCard``.

        Returns:
            A dict with one string per ``Offer`` field, or ``None`` when no
            plan whose type contains ``'keep-it'`` has a usable monthly price.
        """
        keep_monthly = keep_down = None
        return_monthly = return_down = None
        gift_card_raw = None
        for plan in plans:
            plan_type = str(plan.get('type') or '')
            if 'return-it' in plan_type:
                return_monthly = plan.get('monthly')
                return_down = plan.get('downPayment')
            if 'keep-it' in plan_type:
                keep_monthly = plan.get('monthly')
                keep_down = plan.get('downPayment')
            # The gift card of the last plan listed is the one reported.
            gift_card_raw = plan.get('giftCard')

        monthly = cls._to_float(keep_monthly)
        if monthly is None:
            return None

        # A missing down payment or gift card counts as zero.
        down = cls._to_float(keep_down) or 0.0
        gift = cls._to_float(gift_card_raw)
        total = monthly * cls.CONTRACT_MONTHS + down

        bib_monthly = cls._to_float(return_monthly)
        bib_down = cls._to_float(return_down)
        if bib_monthly is None:
            bib_premium = None
        else:
            bib_premium = total - bib_monthly * cls.CONTRACT_MONTHS - (bib_down or 0.0)

        return {
            'price_after_gc': cls._money(total - (gift or 0.0)),
            'gift_card': str(gift_card_raw) if gift is not None else '0',
            'total_price': cls._money(total),
            'monthly_price': cls._money(monthly),
            'down_payment': cls._money(down),
            'bib_premium': cls._money(bib_premium),
            'bib_monthly': cls._money(bib_monthly),
            'down_return': cls._money(bib_down),
        }

    def _fetch_offer_fields(self, url: str, sku_id: str) -> Optional[dict]:
        """Download the product page and plan API, return ``Offer`` kwargs."""
        page = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
        page.raise_for_status()

        api_template = self.extract_cellphone_api_url(page)
        if api_template is None:
            raise ValueError("cellPhonesCarrierPlansUrl not found in product page")

        # Plain substitution: '{skuId}' is a literal placeholder, not a regex.
        api_response = self.session.get(
            api_template.replace('{skuId}', sku_id),
            timeout=self.REQUEST_TIMEOUT,
        )
        api_response.raise_for_status()
        return self._build_offer_fields(api_response.json())

    def scrape_carrier_data(self, url: str, phone_name: str) -> "Carrier":
        """Scrape one carrier product page, retrying on failure.

        Args:
            url: Product-page URL for one phone/carrier combination.
            phone_name: Phone name, used only in progress messages.

        Returns:
            A ``Carrier`` holding exactly one ``Offer``.  When every attempt
            fails, or no keep-it pricing is available, the offer's fields are
            ``'N/A'`` placeholders (gift card ``'0'``).
        """
        carrier_name = self.extract_carrier_name(url)
        sku_id = self.extract_sku_id(url)
        print(f"Started Extracting data for {phone_name} ({sku_id}) - {carrier_name}")

        for attempt in range(1, self.MAX_RETRIES + 1):
            try:
                fields = self._fetch_offer_fields(url, sku_id)
            except Exception as e:
                # Deliberately broad: network, JSON and schema errors are all
                # treated as "try again".
                if attempt < self.MAX_RETRIES:
                    print(f"Retry {attempt} - Error occurred for {phone_name} - {carrier_name}: {str(e)}")
                    time.sleep(self.RETRY_DELAY)
                continue
            if fields is not None:
                return Carrier(name=carrier_name, link=url, offers=[Offer(**fields)])

        print(f"Failed to load data after {self.MAX_RETRIES} attempts for {phone_name} - {carrier_name}")
        return Carrier(
            name=carrier_name,
            link=url,
            offers=[Offer(
                price_after_gc='N/A',
                gift_card='0',
                total_price='N/A',
                monthly_price='N/A',
                down_payment='N/A',
                bib_premium='N/A',
                bib_monthly='N/A',
                down_return='N/A'
            )]
        )

def write_to_csv(phones: "List[Phone]", file_path: str):
    """Write the scraped offers to a CSV file, one row per offer.

    For every phone the carriers are emitted in a fixed preferred order; a
    preferred carrier with no data gets a row of ``'--'`` placeholders so the
    layout stays constant.  Carriers whose name is not in the preferred list
    (for example ``'Unknown'``) follow in alphabetical order, and several
    entries sharing one carrier name are all written, so no scraped data is
    silently dropped.

    Args:
        phones: Phones to export; each needs ``name`` and ``carriers``, each
            carrier ``name``, ``link`` and ``offers``.
        file_path: Destination path; an existing file is overwritten.
    """
    headers = [
        'Phone', 'Carrier', 'Price After GC', 'Gift Card Amount', 'Total Price',
        'Monthly Price', 'Downpayment', 'BIB Premium', 'BIB Monthly Price',
        'BIB Downpayment', 'Link'
    ]

    carrier_order = ['Fido', 'Rogers', 'Virgin Plus', 'Bell', 'Koodo', 'Telus', 'Freedom Mobile']

    # Every column after 'Phone' and 'Carrier' is blanked for a missing carrier.
    placeholders = ['--'] * (len(headers) - 2)

    with open(file_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(headers)

        for phone in phones:
            # Group instead of a plain name->carrier dict so that duplicate
            # carrier names do not overwrite each other.
            carriers_by_name = {}
            for carrier in phone.carriers:
                carriers_by_name.setdefault(carrier.name, []).append(carrier)

            extra_names = sorted(
                name for name in carriers_by_name if name not in carrier_order
            )

            for carrier_name in carrier_order + extra_names:
                matching = carriers_by_name.get(carrier_name)
                if not matching:
                    writer.writerow([phone.name, carrier_name, *placeholders])
                    continue

                for carrier in matching:
                    for offer in carrier.offers:
                        writer.writerow([
                            phone.name,
                            carrier.name,
                            offer.price_after_gc,
                            offer.gift_card,
                            offer.total_price,
                            offer.monthly_price,
                            offer.down_payment,
                            offer.bib_premium,
                            offer.bib_monthly,
                            offer.down_return,
                            carrier.link
                        ])

def main():
    """Scrape the pinned phone catalogue and export it to a dated CSV file."""
    catalogue_url = "https://raw.githubusercontent.com/Herbrax/PhoneDealsScrapper/22ed5b864d6f2ca18965d8d15d4b3596e8a82c60/bestbuymobile.xml"
    scraped_phones = BestBuyScraper().scrape(catalogue_url)

    # The date stamp is taken after scraping finishes, so it is the export date.
    date_stamp = datetime.now().strftime("%Y%m%d")
    output_path = "bestbuy_" + date_stamp + "_mobiles.csv"

    write_to_csv(scraped_phones, output_path)
    print("All data has been written to CSV files.")


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


Started Extracting data for iPhone 13 128GB (15726923) - Freedom Mobile
Started Extracting data for iPhone 13 128GB (15726985) - KoodoStarted Extracting data for iPhone 13 128GB (15727047) - Telus
Started Extracting data for iPhone 13 128GB (15727109) - Virgin Plus

Started Extracting data for iPhone 13 128GB (15727171) - Bell
Started Extracting data for iPhone 13 128GB (15727233) - Fido
Started Extracting data for iPhone 13 128GB (15727295) - Rogers
Started Extracting data for iPhone 13 256GB (15726918) - Freedom Mobile
Started Extracting data for iPhone 13 256GB (15726980) - Koodo
Started Extracting data for iPhone 13 256GB (15727042) - Telus
Started Extracting data for iPhone 13 256GB (15727104) - Virgin Plus
Started Extracting data for iPhone 13 256GB (15727166) - Bell
Started Extracting data for iPhone 13 256GB (15727228) - Fido
Started Extracting data for iPhone 13 256GB (15727290) - Rogers
Started Extracting data for iPhone 15 128GB (17231531) - Freedom Mobile
Started Extracting