In [12]:
import requests
from bs4 import BeautifulSoup
import time
import logging

# Logging: append timestamped INFO-and-above records to scraping.log
logging.basicConfig(
    filename='scraping.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Function to convert dictionary keys to lowercase
def keys_lower_inplace(data):
    """Rename every key of ``data`` to its lowercase form, mutating the dict.

    Args:
        data: Dictionary whose keys all support ``.lower()`` (i.e. strings).

    Returns:
        None. ``data`` itself is modified.
    """
    # Snapshot the keys first: the dict is modified while we walk over them.
    original_keys = tuple(data)
    for original in original_keys:
        value = data.pop(original)
        data[original.lower()] = value

# List to hold the scraped data: one dict of specifications per laptop
all_data = []

# Upper bound (seconds) for every HTTP request. Without a timeout,
# requests.get() can block forever on a stalled connection and freeze the run.
REQUEST_TIMEOUT = 30

begin = time.perf_counter()
for page in range(1, 42):  # search-result pages 1 through 41
    try:
        logging.info(f"Page {page} scraping started...")

        print("**************************************************\n")
        print(f"Page {page} scrapping started...")
        # Start the per-page timer
        start = time.perf_counter()

        # Fetch the HTML and create the soup object for the search-result page
        main_page_url = f"https://www.flipkart.com/search?q=laptops&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page={page}"
        html_main = requests.get(main_page_url, timeout=REQUEST_TIMEOUT).text
        soup_main = BeautifulSoup(html_main, 'lxml')

        # Find the anchor tags of the boxes containing laptops on this page
        # (the original author notes 24 laptops per page)
        laptops = soup_main.find_all(name='a', class_='_1fQZEK')

        # Collect the product-page URLs from this page; an anchor without an
        # href is logged and skipped rather than aborting the page
        laptop_urls = []
        for anchor in laptops:
            try:
                url = "https://www.flipkart.com" + anchor['href']
            except Exception as e:
                logging.error(f"Error while collecting laptop URL: {e}")
            else:
                laptop_urls.append(url)

        # Scrape the specifications of each laptop on this page
        for url in laptop_urls:
            data = {}
            try:
                html_prod = requests.get(url, timeout=REQUEST_TIMEOUT).text
                soup_laptop = BeautifulSoup(html_prod, 'lxml')

                # Name of the laptop. If the element is missing, find() returns
                # None and .text raises; the except below logs it and the
                # laptop is skipped.
                name = soup_laptop.find(name='span', class_='B_NuCI').text
                data['name'] = name

                # Price of the laptop
                price = soup_laptop.find(name='div', class_='_30jeq3 _16Jk6d').text
                data['price'] = price

                # Specification section of the product page
                specs = soup_laptop.find(name='div', class_='_3dtsli')

                # Attribute names (left-hand table cells)
                keys = [cell.text for cell in specs.find_all(name='td', class_='_1hKmbr col col-3-12')]

                # Attribute values (right-hand table cells)
                values = [cell.text for cell in specs.find_all(name='td', class_='URwL2w col col-9-12')]

                # Pair each attribute with its value by position. If there are
                # fewer values than keys the IndexError is caught below and the
                # whole laptop is skipped instead of storing a partial record.
                for idx, key in enumerate(keys):
                    data[key] = values[idx]

                # Convert the keys of the dictionary to lowercase
                keys_lower_inplace(data)

                # Keep the completed record
                all_data.append(data)
            except Exception as e:
                logging.error(f"Error while scraping laptop page data: {e}")

        # Report the per-page timing, then pause before the next page
        end = time.perf_counter()
        logging.info(f"Execution finished for page {page}. Execution time: {end - start} seconds.")
        logging.info("Sleeping for 7 seconds...")
        # Fixed: this message used the literal {1}, so every page was reported as "page 1"
        print(f"Execution finished for page {page}\nExecution time to scrape page {page}: {end-start}\n")
        print("Sleeping for 7 seconds...")
        time.sleep(7)

    except Exception as e:
        # Any failure at page level (including a request timeout) is logged and
        # the loop moves on to the next page
        logging.error(f"Error occurred during main page {page} scraping: {e}")

terminate = time.perf_counter()
logging.info(f"Finished Scraping in {terminate-begin}")
print("\nFinished Scrapping\n")
print(f"Total execution time of the script: {terminate-begin}")

**************************************************

Page 1 scrapping started...
Execution finished for page 1
Execution time to scrape page 1: 16.522067956626415

Sleeping for 7 seconds...
**************************************************

Page 2 scrapping started...
Execution finished for page 2
Execution time to scrape page 1: 17.54845768213272

Sleeping for 7 seconds...
**************************************************

Page 3 scrapping started...
Execution finished for page 3
Execution time to scrape page 1: 17.554554220288992

Sleeping for 7 seconds...
**************************************************

Page 4 scrapping started...
Execution finished for page 4
Execution time to scrape page 1: 17.107618141919374

Sleeping for 7 seconds...
**************************************************

Page 5 scrapping started...
Execution finished for page 5
Execution time to scrape page 1: 16.202299371361732

Sleeping for 7 seconds...
**************************************************

Page

---
### Exporting the data to a JSON file

In [19]:
import json
# Persist every scraped record as JSON, indented by four spaces
with open('laptops.json', 'w') as json_out:
    json.dump(all_data, json_out, indent=4)

---
### Creating the DataFrame

In [13]:
# Load the JSON file

import json
# Read the previously exported records back from disk
with open('laptops.json', 'r') as json_in:
    data = json.load(json_in)

In [14]:
# Check which container type json.load produced for the file's top level
type(data)

list

In [15]:
# Show the first record to see which specification keys are available
data[0]

{'name': 'ASUS Chromebook Touch Intel Celeron Dual Core - (4 GB/64 GB EMMC Storage/Chrome OS) C523NA-A20303 Chromebook\xa0\xa0(15.6 inch, Silver, 1.69 Kg)',
 'price': '₹18,990',
 'sales package': 'Laptop, Power Adaptor, User Guide',
 'model number': 'C523NA-A20303',
 'part number': '90NX01R1-M05250 | 90NX01R1-M03520',
 'series': 'Chromebook Touch Intel',
 'color': 'Silver',
 'type': 'Chromebook',
 'suitable for': 'Processing & Multitasking',
 'power supply': '45W AC Adapter',
 'battery cell': '2 Cell',
 'ms office provided': 'No',
 'processor brand': 'Intel',
 'processor name': 'Celeron Dual Core',
 'ssd': 'No',
 'ram': '4 GB',
 'ram type': 'LPDDR4',
 'emmc storage capacity': '64 GB',
 'processor variant': 'N3350',
 'chipset': 'NA',
 'clock speed': '1.1 GHz with Turbo Boost Upto 2.4 GHz',
 'cache': '2',
 'graphic processor': 'Intel Integrated HD 500',
 'number of cores': '2',
 'operating system': 'Chrome',
 'mic in': 'Yes',
 'usb port': '2 x USB 3.2 Gen 1 Type-A, 2 x USB 3.2 Gen 1 Type

#### Defining the required attributes and creating the DataFrame

In [16]:
# Specification keys (lowercase, as stored in the JSON records) selected for the DataFrame
attr = [
    'name',
    'price',
    'type',
    'processor brand',
    'processor name',
    'processor generation',
    'ssd',
    'ssd capacity',
    'hdd capacity',
    'emmc storage capacity',
    'ram',
    'clock speed',
    'graphic processor',
    'operating system',
    'touchscreen',
    'screen size',
    'screen resolution',
    'screen type',
    'weight',
    'finger print sensor',
    'backlit keyboard',
]

In [17]:
# One empty accumulator list per DataFrame column, grouped by theme.
# Each name is bound to its own list object.

# Identity and price
name_lst, price_lst, type_lst = [], [], []

# Processor
processor_brand_lst, processor_name_lst, processor_gen_lst = [], [], []

# Storage and memory
ssd_lst, ssd_cap_lst, hdd_cap_lst, emmc_lst, ram_lst = [], [], [], [], []

# Performance and software
clk_speed_lst, gpu_lst, os_lst = [], [], []

# Display
touchscreen_lst, screen_size_lst, screen_res_lst, screen_type_lst = [], [], [], []

# Physical attributes and extras
weight_lst, fp_sensor_lst, key_light_lst = [], [], []

In [18]:
# Filling all the lists
import time
import numpy as np

# Empty the accumulator lists in place before filling them. Without this,
# re-running the cell (without re-running the cell that defines the lists)
# appends every record a second time and the DataFrame built from these lists
# ends up with duplicated rows.
for lst in (name_lst, price_lst, type_lst, processor_brand_lst,
            processor_name_lst, processor_gen_lst, ssd_lst, ssd_cap_lst,
            hdd_cap_lst, emmc_lst, ram_lst, clk_speed_lst, gpu_lst, os_lst,
            touchscreen_lst, screen_size_lst, screen_res_lst, screen_type_lst,
            weight_lst, fp_sensor_lst, key_light_lst):
    lst.clear()

start = time.perf_counter()
# s is the 1-based position of the record, used only for the progress message
for s, d in enumerate(data, start=1):
    begin = time.perf_counter()
    print("*******************************\n")
    print(f"filling data of laptop no. {s}")

    # Descriptive attributes: a missing key becomes NaN
    name_lst.append(d.get('name', np.nan))
    price_lst.append(d.get('price', np.nan))
    type_lst.append(d.get('type', np.nan))
    processor_brand_lst.append(d.get('processor brand', np.nan))
    processor_name_lst.append(d.get('processor name', np.nan))
    processor_gen_lst.append(d.get('processor generation', np.nan))

    # Storage / memory: a missing key is treated as "not present" ('No' or 0).
    # NOTE(review): present values are strings such as '512 GB' while the
    # default is the integer 0, so these columns hold mixed types.
    ssd_lst.append(d.get('ssd', 'No'))
    ssd_cap_lst.append(d.get('ssd capacity', 0))
    hdd_cap_lst.append(d.get('hdd capacity', 0))
    emmc_lst.append(d.get('emmc storage capacity', 0))
    ram_lst.append(d.get('ram', 0))

    clk_speed_lst.append(d.get('clock speed', np.nan))
    gpu_lst.append(d.get('graphic processor', np.nan))
    os_lst.append(d.get('operating system', np.nan))

    # Display attributes; a missing 'touchscreen' key defaults to 'No'
    touchscreen_lst.append(d.get('touchscreen', 'No'))
    screen_size_lst.append(d.get('screen size', np.nan))
    screen_res_lst.append(d.get('screen resolution', np.nan))
    screen_type_lst.append(d.get('screen type', np.nan))

    weight_lst.append(d.get('weight', np.nan))

    # Optional features default to 'No' when the key is missing
    fp_sensor_lst.append(d.get('finger print sensor', 'No'))
    key_light_lst.append(d.get('backlit keyboard', 'No'))

    # Report how long this record took
    terminate = time.perf_counter()
    print(f"Finished filling in: {terminate-begin}")

print(f"Script executed in {time.perf_counter()-start}")

*******************************

filling data of laptop no. 1
Finished filling in: 5.539999983739108e-05
*******************************

filling data of laptop no. 2
Finished filling in: 8.7999999323074e-06
*******************************

filling data of laptop no. 3
Finished filling in: 7.60000011723605e-06
*******************************

filling data of laptop no. 4
Finished filling in: 8.199999911084888e-06
*******************************

filling data of laptop no. 5
Finished filling in: 4.229999990457145e-05
*******************************

filling data of laptop no. 6
Finished filling in: 7.500000037907739e-06
*******************************

filling data of laptop no. 7
Finished filling in: 7.000000096013537e-06
*******************************

filling data of laptop no. 8
Finished filling in: 6.900000016685226e-06
*******************************

filling data of laptop no. 9
Finished filling in: 7.599999889862374e-06
*******************************

filling data of laptop no

In [19]:
# Create the DataFrame

import pandas as pd
# Assemble the table from (column title, values) pairs; the pair order below
# is the column order of the resulting DataFrame.
laptops_df = pd.DataFrame(dict([
    ('Name', name_lst),
    ('Price', price_lst),
    ('Type', type_lst),
    ('Processor Brand', processor_brand_lst),
    ('Processor Name', processor_name_lst),
    ('Processor Generation', processor_gen_lst),
    ('SSD', ssd_lst),
    ('SSD Capacity', ssd_cap_lst),
    ('HDD Capacity', hdd_cap_lst),
    ('EMMC Storage Capacity', emmc_lst),
    ('RAM', ram_lst),
    ('Clock Speed', clk_speed_lst),
    ('Graphic Processor', gpu_lst),
    ('Operating System', os_lst),
    ('Touchscreen', touchscreen_lst),
    ('Screen Size', screen_size_lst),
    ('Screen Resolution', screen_res_lst),
    ('Screen Type', screen_type_lst),
    ('Weight', weight_lst),
    ('Fingerprint Sensor', fp_sensor_lst),
    ('Backlit Keyboard', key_light_lst),
]))

In [20]:
# Preview five randomly selected rows (no random_state is passed, so the
# selection is expected to differ from run to run)
laptops_df.sample(5)

Unnamed: 0,Name,Price,Type,Processor Brand,Processor Name,Processor Generation,SSD,SSD Capacity,HDD Capacity,EMMC Storage Capacity,...,Clock Speed,Graphic Processor,Operating System,Touchscreen,Screen Size,Screen Resolution,Screen Type,Weight,Fingerprint Sensor,Backlit Keyboard
706,Lenovo IdeaPad Slim 3 Intel Core i3 11th Gen -...,"₹35,990",Thin and Light Laptop,Intel,Core i3,11th Gen,Yes,512 GB,0,0,...,"Base Clock 1.70 Ghz, Max Turbo Boost Frequency...",Intel Integrated UHD,Windows 11 Home,No,35.56 cm (14 Inch),1920 x 1080 Pixel,"Full HD TN 250nits Anti-glare, 45% NTSC",1.41 Kg,No,No
912,HP Intel Core i3 10th Gen - (8 GB/512 GB SSD/W...,"₹38,990",Thin and Light Laptop,Intel,Core i3,10th Gen,Yes,512 GB,0,0,...,2.1 GHz with Turbo Boost Upto 4.1 GHz,Intel Integrated UHD,Windows 10 Home,No,39.62 cm (15.6 inch),1920 x 1080 Pixel,Full HD LED Backlit Anti-glare Micro-edge Disp...,1.74 kg,No,No
793,Acer Aspire 3 Pentium Quad Core - (4 GB/256 GB...,"₹37,990",Thin and Light Laptop,Intel,Pentium Quad Core,,Yes,256 GB,0,0,...,2.0 GHz up to 3.30 GHz,Intel Integrated UHD,Windows 11 Home,No,35.56 cm (14 Inch),1366 x 768 Pixel,"HD resolution, Acer ComfyViewTM LED-backlit TF...",1.45 Kg,No,Yes
941,Lenovo IdeaPad Slim 3 Intel Core i3 11th Gen -...,"₹35,990",Thin and Light Laptop,Intel,Core i3,11th Gen,Yes,512 GB,0,0,...,"Base Clock 1.70 Ghz, Max Turbo Boost Frequency...",Intel Integrated UHD,Windows 11 Home,No,35.56 cm (14 Inch),1920 x 1080 Pixel,"Full HD TN 250nits Anti-glare, 45% NTSC",1.41 Kg,No,No
120,Primebook 4G Android Based MediaTek MT8788 - (...,"₹13,990",Thin and Light Laptop,MediaTek,MediaTek MT8788,,No,0,0,64 GB,...,Max Frequency Up to 2.0 GHz,MediaTek Integrated ARM Mali G72,Prime OS,No,29.46 cm (11.6 Inch),1366 x 768 Pixels,HD IPS,1.065 Kg,No,No


In [21]:
# (rows, columns) of the assembled table
laptops_df.shape

(984, 21)

In [22]:
# Per-column non-null counts and dtypes, to spot columns with missing values
laptops_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 984 entries, 0 to 983
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Name                   984 non-null    object
 1   Price                  984 non-null    object
 2   Type                   984 non-null    object
 3   Processor Brand        984 non-null    object
 4   Processor Name         984 non-null    object
 5   Processor Generation   569 non-null    object
 6   SSD                    984 non-null    object
 7   SSD Capacity           984 non-null    object
 8   HDD Capacity           984 non-null    object
 9   EMMC Storage Capacity  984 non-null    object
 10  RAM                    984 non-null    object
 11  Clock Speed            848 non-null    object
 12  Graphic Processor      984 non-null    object
 13  Operating System       984 non-null    object
 14  Touchscreen            984 non-null    object
 15  Screen Size            

#### Export `laptops_df` to a CSV or Excel file

In [23]:
# Write the table to an Excel workbook, leaving out the index column.
# CSV alternative (disabled): laptops_df.to_csv("Laptops_data.csv", index=False)
laptops_df.to_excel(excel_writer="Laptops_data.xlsx", index=False)