In [89]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import csv
import re


In [90]:
# Search-result pages to scrape; the scraping cell visits each entry in turn
# and follows the "Next" link from there.
urls = [
    'https://www.flipkart.com/search?q=tv&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off',
]

In [91]:
# Start a Chrome session controlled by Selenium; the scraping cell below drives
# the browser through this `driver` object.
# NOTE(review): no visible cell calls driver.quit(), so the browser presumably
# stays open after the notebook finishes — confirm and close it when done.
driver = webdriver.Chrome()

In [92]:
# Scrape every result page reachable from each URL in `urls` and collect one
# row per product in `data_list` (the next cell writes those rows to CSV).

# --- Tunable timings (seconds) ---------------------------------------------
INITIAL_LOAD_DELAY = 4    # pause after driver.get() before the first parse
NEXT_BUTTON_TIMEOUT = 5   # how long to look for the "Next" link
SCROLL_SETTLE_DELAY = 2   # lets the smooth scroll finish before clicking
PAGE_CHANGE_TIMEOUT = 10  # how long to wait for the page indicator to change
PAGE_SETTLE_DELAY = 2     # pause after the page change before re-parsing

# --- Patterns compiled once instead of once per spec line --------------------
# "<label> <W> x <H>", e.g. "ultra hd (4k) 3840 x 2160" (applied to lowercased text).
_RESOLUTION_RE = re.compile(r'([\w\s\(\)-]+)\s+(\d{3,4}\s*x\s*\d{3,4})')
# "<N> Year", e.g. "1 year warranty".
_WARRANTY_RE = re.compile(r"(\d+)\s*Year", re.IGNORECASE)
# "os" as a standalone word, so words that merely contain "os" do not match.
_OS_WORD_RE = re.compile(r"\bos\b")


def _page_label(soup):
    """Return the second whitespace-separated word of the page indicator.

    The indicator is the first <span> inside div._1G0WLw; presumably it reads
    like "Page 3 of 26", in which case "3" is returned. Returns None when the
    div, the span, or a second word is missing.
    """
    page_info = soup.find("div", class_="_1G0WLw")
    span = page_info.find("span") if page_info else None
    words = span.text.split() if span else []
    return words[1] if len(words) > 1 else None


def _tag_text(tag, *tokens_to_remove):
    """Return the tag's text with each token removed and whitespace stripped.

    Returns "N/A" when `tag` is None. The final strip also removes the space
    left behind when a token such as "off" is removed from "25% off".
    """
    if tag is None:
        return "N/A"
    text = tag.text
    for token in tokens_to_remove:
        text = text.replace(token, "")
    return text.strip()


def _parse_counts(tag):
    """Return (num_ratings, num_reviews) as digit-only strings.

    The tag text is split on "&" (e.g. "41,641 Ratings & 3,235 Reviews");
    unless that yields exactly two parts, both values are "N/A".
    """
    if tag is None:
        return "N/A", "N/A"
    parts = tag.get_text(strip=True).split("&")
    if len(parts) != 2:
        return "N/A", "N/A"
    return (
        ''.join(filter(str.isdigit, parts[0])),
        ''.join(filter(str.isdigit, parts[1])),
    )


def _parse_specs(spec_lines):
    """Extract (os, resolution_type, resolution_pixels, launch_year, warranty).

    `spec_lines` is an iterable of spec strings, one per bullet. Every field
    defaults to "N/A". Each line feeds at most one field; the branches are
    tested in the order written below.
    """
    os_name = resolution_type = resolution_pixels = launch_year = warranty = "N/A"

    for raw_line in spec_lines:
        line = raw_line.strip()
        lowered = line.lower()

        if "resolution" in lowered or "hd" in lowered:
            match = _RESOLUTION_RE.search(lowered)
            if match:
                resolution_type = match.group(1).strip()
                resolution_pixels = match.group(2).replace(" ", "")
        elif "launch year" in lowered:
            launch_year = line.replace("Launch Year:", "").strip()
        elif "warranty" in lowered:
            match = _WARRANTY_RE.search(lowered)
            if match:
                warranty = match.group(1)
        elif "operating system" in lowered or _OS_WORD_RE.search(lowered):
            # A plain `"os" in lowered` substring test also matches unrelated
            # lines (e.g. "dolby atmos") and would overwrite the OS field.
            os_name = line.replace("Operating System:", "").strip()

    return os_name, resolution_type, resolution_pixels, launch_year, warranty


def _parse_product(card):
    """Build one output row (a list of 13 strings) from a product card tag.

    Any element that is missing from the card yields "N/A" for its field(s)
    instead of raising, so a single odd card does not abort the whole run.
    """
    link = card.find('a', class_="CGtC98")
    href = link.get('href') if link is not None else None
    product_url = "https://www.flipkart.com" + href if href else "N/A"

    name = _tag_text(card.find('div', class_="KzDlHZ"))

    rating_tag = card.find('div', class_="XQDdHH")
    rating = rating_tag.get_text(strip=True) if rating_tag is not None else "N/A"

    num_rating, num_reviews = _parse_counts(card.find('span', class_="Wphh3N"))

    spec_box = card.find('div', class_="_6NESgJ")
    spec_items = spec_box.find_all('li', class_="J+igdf") if spec_box is not None else []
    os_name, resolution_type, resolution_pixels, launch_year, warranty = _parse_specs(
        item.text for item in spec_items
    )

    price = _tag_text(card.find('div', class_="Nx9bqj _4b5DiR"), "₹", ",")
    original_price = _tag_text(card.find('div', class_="yRaY8j ZYYwLA"), "₹", ",")
    discount = _tag_text(card.find('div', class_="UkUFwK"), "off", "%")

    return [
        product_url, name, rating, num_rating, num_reviews,
        os_name, resolution_type, resolution_pixels, launch_year, warranty,
        price, original_price, discount,
    ]


# List to store scraped data before writing to CSV
data_list = []

for url in urls:
    driver.get(url)
    time.sleep(INITIAL_LOAD_DELAY)
    print(f"Scraping the data from {url}")

    last_page = None

    while True:
        # Parse a snapshot of whatever the browser currently shows.
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        current_page = _page_label(soup)
        if current_page is not None:
            print(f"📄 Scraping details from Page {current_page}...")

            # Safety net: seeing the same page number twice means the site did
            # not advance, so stop instead of collecting duplicates.
            if current_page == last_page:
                print("✅ Scraping completed. No more pages available.")
                break

            last_page = current_page

        for container in soup.find_all('div', class_="DOjaWF gdgoEp"):
            row_blocks = container.find_all('div', class_="cPHDOP col-12-12")

            # Exclude the last two blocks (pagination elements)
            for row_block in row_blocks[:-2]:
                for card in row_block.find_all('div', class_="tUxRFH"):
                    record = _parse_product(card)
                    data_list.append(record)
                    print(f"✅ Scraped: {record[1]}")

        # Locate and click the "Next" link; any failure here ends this URL.
        try:
            next_button = WebDriverWait(driver, NEXT_BUTTON_TIMEOUT).until(
                EC.presence_of_element_located((By.XPATH, "//a[@class='_9QVEpD' and span[text()='Next']]"))
            )

            driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", next_button)
            time.sleep(SCROLL_SETTLE_DELAY)

            next_button.click()
            print("➡️ Moving to the next page...\n")

            # Wait until the page indicator actually changes. A fixed sleep
            # alone can re-read the old page on a slow load, which the
            # same-page guard above would mistake for the end of the results.
            if current_page is not None:
                WebDriverWait(driver, PAGE_CHANGE_TIMEOUT).until(
                    lambda drv: _page_label(BeautifulSoup(drv.page_source, 'html.parser'))
                    not in (None, current_page)
                )
            time.sleep(PAGE_SETTLE_DELAY)
        except Exception as e:
            # Deliberately broad: a missing "Next" link (timeout) is the normal
            # end of pagination, and any other failure should keep the rows
            # already collected rather than abort the cell.
            print("✅ Scraping completed or no more pages available.", e)
            break



Scraping the data from https://www.flipkart.com/search?q=tv&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off
📄 Scraping details from Page 1...
✅ Scraped: InnoQ Sounbar 60 cm (24 inch) HD Ready LED Smart Android TV with 30W Front Boom Speakers | 1000+ Smart...
✅ Scraped: LG UR7500 108 cm (43 inch) Ultra HD (4K) LED Smart WebOS TV with Alpha5 AI Processor 4K Gen6, AI Pictu...
✅ Scraped: Thomson FA Series 108 cm (43 inch) Full HD LED Smart Android TV with Dolby Digital Plus & Android 11
✅ Scraped: TCL L4B 79.97 cm (32 inch) HD Ready LED Smart Android TV 2024 Edition with Metallic Bezel Less and Chr...
✅ Scraped: Thomson Phoenix 80 cm (32 inch) QLED HD Ready Smart Android TV 48W Sound Output
✅ Scraped: Dyanora 60 cm (24 inch) HD Ready LED TV with Noise Reduction, Cinema Zoom, Powerful Audio Box Speakers
✅ Scraped: TCL S5500 79.97 cm (32 inch) Full HD LED Smart Google TV 2024 Edition with 1.5 GB RAM + 16 GB ROM
✅ Scraped: Thomson Alpha 80 cm (32 inch) HD Ready LED Sma

In [93]:
# ✅ Dump everything collected in `data_list` to a CSV file in one go.
csv_filename = "flipkart_product.csv"

# Column titles, listed in the same order as the fields of each scraped row.
header_row = [
    "Product URL",
    "Name",
    "Rating",
    "Number of Ratings",
    "Number of Reviews",
    "OS",
    "Resolution Type",
    "Resolution Pixels",
    "Launch Year",
    "Warranty",
    "Price (₹)",
    "Original Price (₹)",
    "Discount (%)",
]

# newline="" leaves line endings to the csv module; UTF-8 keeps the ₹ sign intact.
with open(csv_filename, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(header_row)
    for row in data_list:
        writer.writerow(row)

print(f"📂 Data successfully saved in '{csv_filename}'")


📂 Data successfully saved in 'flipkart_product.csv'
