In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
def _parse_int(text, *noise):
    """
    Converts a scraped label such as '₹21,999' or 'Extra ₹1,500 off' to an int.

    Every token in `noise` is removed first (in the order given), then
    thousands separators and surrounding whitespace are stripped.

    Raises:
        ValueError: if what is left is not an integer literal.
    """
    for token in noise:
        text = text.replace(token, '')
    return int(text.replace(',', '').strip())


def extract_product_data(new_soup):
    """
    Extracts the headline product information from a parsed product page.

    Args:
        new_soup: object exposing BeautifulSoup-style ``find`` / ``find_all``
            (the caller passes the parsed product page).

    Returns:
        dict with the keys "Product Name", "Original Price",
        "Discounted Price", "Extra Off", "Discount Percentage" and
        "Overall Rating", plus one key per rating category found on the page
        (category label -> rating text).

    Every field is extracted independently: when an element is missing or its
    text cannot be parsed, that field gets its default ("NA" for the name,
    "N/A" for the overall rating, 0 for the numeric fields), the error is
    printed, and the remaining fields are still extracted.
    """
    product_data = {}

    def find_div(css_class):
        return new_soup.find('div', class_=css_class)

    def read_name():
        tag = new_soup.find('span', attrs={'class': 'VU-ZEz'})
        return tag.text.strip() if tag else "NA"

    def read_price(css_class):
        tag = find_div(css_class)
        return _parse_int(tag.text, '₹') if tag else 0

    def read_extra_off():
        # The amount may carry thousands separators ("Extra ₹1,500 off");
        # _parse_int strips them before converting.
        tag = find_div('_2lX4N0')
        return _parse_int(tag.span.text, 'Extra ₹', 'off') if tag and tag.span else 0

    def read_discount_fraction():
        # "25% off" is stored as the fraction 0.25
        tag = find_div('UkUFwK WW8yVX')
        return _parse_int(tag.span.text, '% off') / 100 if tag and tag.span else 0

    def read_overall_rating():
        tag = find_div('ipqd2A')
        return tag.text.strip() if tag else "N/A"

    # (output key, default used on failure, zero-argument reader)
    readers = [
        ("Product Name", "NA", read_name),
        ("Original Price", 0, lambda: read_price('yRaY8j A6+E6v')),
        ("Discounted Price", 0, lambda: read_price('Nx9bqj CxhGGd')),
        ("Extra Off", 0, read_extra_off),
        ("Discount Percentage", 0, read_discount_fraction),
        ("Overall Rating", "N/A", read_overall_rating),
    ]

    for field, default, reader in readers:
        # Isolate each field so one malformed value cannot drop the fields after it
        try:
            product_data[field] = reader()
        except Exception as e:
            print(f"Error extracting product data: {e}")
            product_data[field] = default

    # Get Ratings by Category (category label -> rating text)
    ratings = {}
    try:
        for item in new_soup.find_all('a', class_='col-3-12'):
            category = item.find('div', class_='NTiEl0')
            rating_value = item.find('text', class_='_2DdnFS')
            if category and rating_value:
                ratings[category.text.strip()] = rating_value.text.strip()
    except Exception as e:
        print(f"Error extracting product data: {e}")
    # Keep whatever categories were read before a failure
    product_data.update(ratings)

    return product_data

def extract_specific_product_details(new_soup):
    """
    Reads the specification table of a product page and keeps only a fixed
    set of fields.

    Returns a dict containing every required field; a field that does not
    appear in the table keeps the placeholder "N/A". Table rows whose label is
    not one of the required fields are ignored. Any exception raised while
    reading the table is printed and the values collected so far are returned.
    """
    required_fields = (
        "In The Box",
        "Model Number",
        "Model Name",
        "Color",
        "Display Size",
        "Resolution",
        "GPU",
        "Display Type",
        "Operating System",
        "Processor Brand",
        "Processor Type",
        "Processor Core",
        "Internal Storage",
        "RAM",
        "Expandable Storage",
        "Primary Camera",
        "Secondary Camera",
        "Network Type",
        "Battery Capacity",
        "Depth",
        "Weight",
    )
    # Start with the placeholder everywhere; found rows overwrite it below
    details = dict.fromkeys(required_fields, "N/A")

    try:
        for spec_row in new_soup.find_all('tr', class_='WJdYP6 row'):
            label_cell = spec_row.find('td', class_='+fFi1w col col-3-12')
            content_cell = spec_row.find('td', class_='Izz52n col col-9-12')

            # A row needs both its label cell and its value cell to be usable
            if not (label_cell and content_cell):
                continue

            label = label_cell.text.strip()
            content = content_cell.text.strip()

            # Only labels from the required set are recorded
            if label in details:
                details[label] = content

    except Exception as e:
        print(f"Error extracting specific product details: {e}")

    return details

In [3]:
if __name__ == '__main__':
    # User agent headers for web scraping
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
        "Accept-Language": "en-US, en;q=0.5"
    }

    # Search-results URL; the page number is appended to it
    BASE_URL = "https://www.flipkart.com/q/5g-mobiles-under-30000?page="
    # Product links on the results page are relative to this origin
    PRODUCT_BASE_URL = "https://www.flipkart.com"

    # Upper bound on result pages to scrape; raise it for a fuller scrape
    # (it is always capped by the page count the site reports).
    MAX_PAGES = 41
    # Seconds to wait for a response; without a timeout a stalled connection blocks forever
    REQUEST_TIMEOUT = 30
    # Set to a path such as "flipkart_5g_mobiles.csv" to export the result; None skips the export
    OUTPUT_CSV = None

    # Initialize list to hold all products' data
    all_products = []

    # One session reuses the connection and carries the headers for every request
    with requests.Session() as session:
        session.headers.update(HEADERS)

        # Make initial request to fetch first page and check how many total pages exist
        webpage = session.get(BASE_URL + "1", timeout=REQUEST_TIMEOUT)
        webpage.raise_for_status()
        soup = BeautifulSoup(webpage.content, "html.parser")

        # Extract total number of pages from the pagination info.
        # NOTE(review): the text presumably looks like "Page 1 of 428" (4th token is the count) - confirm.
        pagination = soup.find('div', attrs={'class': '_1G0WLw'})
        page_span = pagination.find('span') if pagination else None
        try:
            total_pages = int(page_span.text.split()[3].replace(',', ''))
        except (AttributeError, IndexError, ValueError):
            # Missing or unexpected pagination markup: fall back to the first page only
            print("Could not read the page count; scraping the first page only")
            total_pages = 1
        print("Total No. of Pages: ", total_pages)

        # Never request more pages than the site reports
        last_page = min(MAX_PAGES, total_pages)

        for page in range(1, last_page + 1):
            print(f"Scraping Page {page}...")

            # Make request for each page
            try:
                webpage = session.get(BASE_URL + str(page), timeout=REQUEST_TIMEOUT)
                webpage.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Request failed: {e}")
                continue  # Skip to the next page if there's a request error

            soup = BeautifulSoup(webpage.content, "html.parser")

            # Extract all product links on the current page; anchors without an href are skipped
            anchors = soup.find_all('a', attrs={'class': 'CGtC98'})
            links_list = [href for href in (anchor.get('href') for anchor in anchors) if href]
            print(f"--> Found {len(links_list)} product links on Page {page}")

            # Loop over each product link and fetch product details
            for i, link in enumerate(links_list):
                try:
                    new_webpage = session.get(PRODUCT_BASE_URL + link, timeout=REQUEST_TIMEOUT)
                    new_webpage.raise_for_status()
                except requests.exceptions.RequestException as e:
                    # A single failed product must not abort the rest of the page
                    print(f"Request failed: {e}")
                    continue

                new_soup = BeautifulSoup(new_webpage.content, "html.parser")

                product_data = extract_product_data(new_soup)
                product_detail = extract_specific_product_details(new_soup)

                # Merge the product data and specific details into one dictionary
                all_products.append({**product_data, **product_detail})

                print(f"Link No: {i + 1} processed...")

            print(f"Page No: {page} successfully scraped...")
            print("*" * 50)

    # Create a DataFrame from the list of product dictionaries
    Flipkart_df = pd.DataFrame(all_products)

    # Output DataFrame to CSV (opt-in via OUTPUT_CSV)
    if OUTPUT_CSV:
        Flipkart_df.to_csv(OUTPUT_CSV, index=False)
        print(f"Data saved to '{OUTPUT_CSV}'")


Total No. of Pages:  428
Scraping Page 1...
--> Found 24 product links on Page 1
Link No: 1 processed...
Link No: 2 processed...
Link No: 3 processed...
Link No: 4 processed...
Link No: 5 processed...
Link No: 6 processed...
Link No: 7 processed...
Link No: 8 processed...
Link No: 9 processed...
Link No: 10 processed...
Link No: 11 processed...
Link No: 12 processed...
Link No: 13 processed...
Link No: 14 processed...
Link No: 15 processed...
Link No: 16 processed...
Link No: 17 processed...
Link No: 18 processed...
Link No: 19 processed...
Link No: 20 processed...
Link No: 21 processed...
Link No: 22 processed...
Link No: 23 processed...
Link No: 24 processed...
Page No: 1 successfully scraped...
**************************************************
Scraping Page 2...
--> Found 24 product links on Page 2
Link No: 1 processed...
Link No: 2 processed...
Link No: 3 processed...
Link No: 4 processed...
Link No: 5 processed...
Link No: 6 processed...
Link No: 7 processed...
Link No: 8 process

In [4]:
# Display the scraped products; the rendered table below is truncated with "..." rows/columns
Flipkart_df

Unnamed: 0,Product Name,Original Price,Discounted Price,Extra Off,Discount Percentage,Overall Rating,Camera,Battery,Display,Design,...,Processor Core,Internal Storage,RAM,Expandable Storage,Primary Camera,Secondary Camera,Network Type,Battery Capacity,Depth,Weight
0,OPPO K12x 5G with 45W SUPERVOOC Charger In-The...,16999,12999,4000.0,0.23,4.5,3.9,4.4,4.2,4.4,...,Octa Core,128 GB,6 GB,1 TB,32MP + 2MP,8MP Front Camera,"5G, 4G, 3G, 2G",5100 mAh,7.68 mm,186 g
1,"Nothing Phone (2a) Plus (Grey, 256 GB) (12 GB...",31999,25999,6000.0,0.18,4.4,4.0,4.1,4.3,4.4,...,Octa Core,256 GB,12 GB,,50MP + 50MP,50MP Front Camera,"5G, 4G, 3G, 2G",5000 mAh,8.55 mm,190 g
2,Motorola Edge 50 Pro 5G with 125W Charger (Lux...,41999,29999,6000.0,0.28,4.4,4.3,4.0,4.5,4.5,...,Octa Core,256 GB,12 GB,,50MP + 13MP + 10MP,50MP Front Camera,"2G, 3G, 4G, 5G",4500 mAh,8.19 mm,186 g
3,Motorola Edge 50 Pro 5G with 125W Charger (Moo...,41999,29999,12000.0,0.28,4.4,4.3,4.0,4.5,4.5,...,Octa Core,256 GB,12 GB,,50MP + 13MP + 10MP,50MP Front Camera,"2G, 3G, 4G, 5G",4500 mAh,8.19 mm,186 g
4,"CMF by Nothing Phone 1 (Orange, 128 GB) (8 GB...",21999,16499,1500.0,0.25,4.4,3.8,4.2,4.3,4.4,...,Octa Core,128 GB,8 GB,2 TB,50MP + 2MP,16MP Front Camera,"5G, 4G, 3G, 2G",5000 mAh,8.2 mm,197 g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
966,"Apple iPhone 16 Plus (White, 128 GB)",0,89900,0.0,0.00,4.7,4.8,4.6,4.8,4.7,...,Hexa Core,128 GB,,,48MP + 12MP,12MP Front Camera,"5G, 4G VOLTE, 4G, 3G, 2G",,7.8 mm,199 g
967,"Tecno Pova 5 Pro 5G (Silver Fantasy, 256 GB) ...",20999,15990,0.0,0.23,4.2,3.9,3.9,4.1,4.3,...,,256 GB,8 GB,1 TB,50MP + 0.08MP,16MP Front Camera,"5G, 4G VOLTE",5000 mAh,9 mm,214 g
968,"Xiaomi 14 (Jade Green, 512 GB) (12 GB RAM)",79999,59999,20000.0,0.25,4.5,4.6,4.1,4.6,4.7,...,Octa Core,512 GB,12 GB,,50MP + 50MP + 50MP,32MP Front Camera,"2G, 3G, 4G, 4G VOLTE, 5G",4610 mAh,8.2 mm,193 g
969,"vivo Y28 5G (Crystal Purple, 128 GB) (6 GB RAM)",19999,14999,0.0,0.25,4.3,4.1,4.2,4.2,4.3,...,Octa Core,128 GB,6 GB,1 TB,50MP + 2MP,8MP Front Camera,"2G, 3G, 4G, 5G",5000 mAh,8.09 mm,186 g


In [5]:
# (number of rows, number of columns) of the scraped DataFrame
Flipkart_df.shape

(971, 31)