# In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import re

# ---------------------------------------------------------------------------
# Flipkart mobile-listing scraper.
#
# Opens the Flipkart "mobiles" search in Chrome, walks the result pages (the
# user clicks NEXT by hand between pages), collects one row per product and
# writes the rows to a CSV file.
# ---------------------------------------------------------------------------

# Search page to start from.
search_url = "https://www.flipkart.com/search?q=mobiles"

# Stop once this many products have been recorded.
target_rows = 1000

# Destination of the exported CSV.
output_path = r'C:\Users\g702708\flipkart_mobiles.csv'

# Output columns, in the order they are written to the CSV.
COLUMNS = (
    'Mobile_Name', 'Price_INR', 'Original_Price', 'Discount_Percentage',
    'RAM', 'Storage', 'Display_Size', 'Primary_Camera', 'Battery_Capacity',
    'Processor', 'Warranty', 'Rating', 'Num_Reviews', 'Expandable_Storage',
)

# Subset of COLUMNS that is filled from the product's highlight list.
SPEC_COLUMNS = (
    'RAM', 'Storage', 'Display_Size', 'Primary_Camera', 'Battery_Capacity',
    'Processor', 'Warranty', 'Expandable_Storage',
)

# XPath of a product title on the search-results page.
TITLE_XPATH = "//div[contains(@class, 'KzDlHZ')]"

# A count followed by its label; the count may contain any number of comma
# groups (e.g. "1,23,456"), not just one.
_RATINGS_RE = re.compile(r'(\d[\d,]*)\s*Ratings')
_REVIEWS_RE = re.compile(r'(\d[\d,]*)\s*Reviews')


def clean_price(text):
    """Return *text* with the rupee sign and comma separators removed."""
    return text.replace('₹', '').replace(',', '')


def extract_review_count(text):
    """Return the count that precedes "Ratings" in *text* as a digit string.

    Falls back to the count that precedes "Reviews" when there is no
    "Ratings" count, and to '' when neither is present.
    """
    match = _RATINGS_RE.search(text) or _REVIEWS_RE.search(text)
    return match.group(1).replace(',', '') if match else ''


def classify_specs(spec_texts):
    """Sort a product's highlight lines into the spec columns.

    Args:
        spec_texts: iterable of strings, one per highlight line.

    Returns:
        A dict with one entry per name in SPEC_COLUMNS; columns for which no
        line matched are ''. When several lines match the same column the
        last one wins.
    """
    specs = dict.fromkeys(SPEC_COLUMNS, '')
    for text in spec_texts:
        if any(word in text for word in ('RAM', 'ROM', 'Storage', 'Expandable')):
            # NOTE(review): memory details are assumed to share one line,
            # e.g. "4 GB RAM | 64 GB ROM | Expandable Upto 1 TB" -- confirm
            # against the live page. Classifying the whole line would give
            # everything to RAM and leave the storage columns empty, so each
            # '|'-separated part is classified on its own.
            for part in (piece.strip() for piece in text.split('|')):
                if 'Expandable' in part:
                    specs['Expandable_Storage'] = part
                elif 'RAM' in part:
                    specs['RAM'] = part
                elif 'ROM' in part or 'Storage' in part:
                    specs['Storage'] = part
        elif 'Display' in text:
            specs['Display_Size'] = text
        elif 'Camera' in text:
            # Kept whole: a camera line may itself contain '|'.
            specs['Primary_Camera'] = text
        elif 'Battery' in text or 'mAh' in text:
            specs['Battery_Capacity'] = text
        elif 'Processor' in text:
            specs['Processor'] = text
        elif 'Warranty' in text:
            specs['Warranty'] = text
    return specs


def first_text(elements):
    """Return the text of the first element, or '' for an empty list."""
    return elements[0].text if elements else ''


def scrape_product(product):
    """Build one output row (a dict keyed by COLUMNS) from a product container.

    The name and current price are required: if either element is missing,
    the exception raised by ``find_element`` propagates and no row is
    produced. Every other field is optional and defaults to ''.
    """
    name = product.find_element(By.XPATH, ".//div[contains(@class, 'KzDlHZ')]").text
    price = product.find_element(By.XPATH, ".//div[contains(@class, 'Nx9bqj _4b5DiR')]").text

    original_price = first_text(
        product.find_elements(By.XPATH, ".//div[contains(@class, 'yRaY8j')]"))
    # The original price is matched as a <div> above, so the discount lookup
    # must not require a <span> carrying the same class; take the element that
    # directly follows whichever node has it.
    # NOTE(review): verify against the live markup.
    discount = first_text(
        product.find_elements(By.XPATH, ".//*[contains(@class, 'yRaY8j')]/following-sibling::*[1]"))

    spec_items = product.find_elements(By.XPATH, ".//ul[contains(@class, 'G4BRas')]/li")
    rating = first_text(
        product.find_elements(By.XPATH, ".//div[contains(@class, '_3LWZlK')]"))
    reviews = first_text(
        product.find_elements(By.XPATH, ".//span[contains(@class, '_2_R_DZ')]"))

    row = {
        'Mobile_Name': name,
        'Price_INR': clean_price(price),
        'Original_Price': clean_price(original_price),
        'Discount_Percentage': discount,
        'Rating': rating,
        'Num_Reviews': extract_review_count(reviews),
    }
    row.update(classify_specs([item.text for item in spec_items]))
    return row


def close_login_popup(driver):
    """Dismiss the login popup if it shows up within 10 seconds."""
    # Imported here so the module-level import block stays untouched.
    from selenium.common.exceptions import WebDriverException

    try:
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(text(),'✕')]"))
        ).click()
        print("Login popup closed.")
    except WebDriverException:
        # Covers the wait timing out as well as a failed click; unlike a bare
        # ``except`` it lets KeyboardInterrupt/SystemExit through.
        print("Login popup not appeared.")


def report_missing_products(driver):
    """Print diagnostics for a page on which no product container was found."""
    print("No products found. Checking titles...")
    titles = driver.find_elements(By.XPATH, TITLE_XPATH)
    print(f"Found {len(titles)} titles with class 'KzDlHZ'.")
    if titles:
        print("Sample title:", titles[0].text)
    else:
        print("No titles found. Page source snippet:")
        print(driver.page_source[:1000])


def scrape_listings(driver, limit):
    """Collect up to *limit* product rows, page by page.

    Pagination is manual: after each page the user clicks NEXT in the browser
    and confirms at the prompt. Any error at page level ends the crawl but
    keeps the rows gathered so far.

    Returns:
        A list of row dicts as produced by ``scrape_product``.
    """
    rows = []
    page = 1
    while len(rows) < limit:
        print(f"Scraping page {page}...")

        try:
            # Scroll to the bottom to ensure content is loaded.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # Wait for dynamic content to load

            # Wait for mobile titles to load.
            WebDriverWait(driver, 15).until(
                EC.presence_of_all_elements_located((By.XPATH, TITLE_XPATH)))

            # Each product container is the nearest 'row' ancestor of a title.
            products = driver.find_elements(
                By.XPATH, TITLE_XPATH + "/ancestor::div[contains(@class, 'row')][1]")

            print(f"Found {len(products)} product containers on page {page}.")
            if not products:
                report_missing_products(driver)

            for product in products:
                if len(rows) >= limit:
                    break
                try:
                    # The row is built in full before it is stored, so a
                    # failure half-way through a product can never leave the
                    # columns with different lengths.
                    rows.append(scrape_product(product))
                except Exception as e:
                    # Best effort per product: report it and skip the row.
                    print(f"Error scraping a product on page {page}: {e}")
                    continue
                print(f"Scraped {len(rows)} items so far...")

            if len(rows) >= limit:
                print(f"Reached target of {limit} rows. Stopping scraping.")
                break

            # Manual pagination
            print("Please manually click the 'NEXT' button in the browser, then press Enter here to continue...")
            print("If there is no 'NEXT' button, type 'stop' to end.")
            user_input = input("Press Enter to continue after clicking 'NEXT' (or 'stop' to end)...")
            if user_input.strip().lower() == 'stop':
                print("Stopping scraping as per user input.")
                break
            time.sleep(2)  # Allow page to load after user clicks
            page += 1

        except Exception as e:
            # Boundary handler: stop crawling but return what was collected so
            # it can still be saved.
            print(f"Error on page {page}: {e}")
            break

    return rows


def main(url=search_url, limit=target_rows, csv_path=output_path):
    """Run the scraper end to end and return the resulting DataFrame.

    Args:
        url: search-results page to open first.
        limit: maximum number of product rows to collect.
        csv_path: where the CSV is written.
    """
    # Setup Chrome options
    options = Options()
    options.add_argument("--start-maximized")
    options.add_argument("--ignore-certificate-errors")

    # Launch browser
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        close_login_popup(driver)
        rows = scrape_listings(driver, limit)

        # Save data to CSV
        mobile_df = pd.DataFrame(rows, columns=list(COLUMNS))
        mobile_df.to_csv(csv_path, index=False)
        print("Saved to flipkart_mobiles.csv")
    finally:
        # Always close the browser, even when scraping or saving fails.
        driver.quit()
    return mobile_df


if __name__ == "__main__":
    # Keep the frame at module level so it remains available to whatever runs
    # after this script/cell.
    mobile_df = main()