In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import os
import numpy as np
from itertools import zip_longest

In [2]:
def bs4_soup(pages):
    URL = f'https://www.flipkart.com/search?q=apple+mobiles&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page={pages}'
    request = requests.get(URL)
    soup = bs(request.text,'html.parser')
    soup = soup.find('div',class_ = '_1YokD2 _3Mn1Gg')

    return soup , URL

In [3]:
def mob_specs(soup):
    names = soup.find_all('div', class_='_4rR01T')
    mob_names = [i.text if i else None for i in names]

    ratings = soup.find_all('div', class_='_3LWZlK')
    mob_ratings = [i.text if i else None for i in ratings]

    price = soup.find_all('div', class_='_30jeq3 _1_WHN1')
    mob_price = [i.text if i else None for i in price]

    img = soup.find_all('div', class_='CXW8mj')
    mob_img_URL = [i.find('img')['src'] if i.find('img') else None for i in img]

    return mob_names, mob_ratings, mob_price, mob_img_URL


In [4]:
def features_from_mob_links(soup):
    mobile_links = ['https://www.flipkart.com' + i['href'] for i in soup.find_all('a', class_='_1fQZEK')]
    mobile_req = [requests.get(url=i) for i in mobile_links]
    mobile_soup = [bs(i.text, 'html.parser').find('div', class_='_1YokD2 _2GoDe3') for i in mobile_req]
    mobile_specs_soup = [i.find_all('div', class_='_3k-BhJ') for i in mobile_soup]

    storage_ram = [specs_.find('table', class_='_14cfVK').text if specs_ else None for mob_specs in mobile_specs_soup for specs_ in mob_specs if 'Memory' in str(specs_.text) and 'Storage' in str(specs_.text)]
    storage_ram_cross_checked = [i for i in storage_ram if 'Internal' in i.split()]

    os_processor = [specs_.find('table', class_='_14cfVK').text if specs_ else None for mob_specs in mobile_specs_soup for specs_ in mob_specs if 'Os' in str(specs_.text) and 'Processor' in str(specs_.text)]
    os_processor_cross_checked = [i for i in os_processor if 'Operating' in i.split()]

    camera = [specs_.find('table', class_='_14cfVK').text if specs_ else None for mob_specs in mobile_specs_soup for specs_ in mob_specs if 'Camera' in str(specs_.text) and 'FeaturesPrimary' in str(specs_.text)]
    camera_cross_checked = [i for i in camera if 'Primary' and 'Camera' in i.split()]

    display = [specs_.find('table', class_='_14cfVK').text if specs_ else None for mob_specs in mobile_specs_soup for specs_ in mob_specs if 'Display' in str(specs_.text) and 'FeaturesDisplay' in str(specs_.text)]
    display_cross_checked = [i for i in display if 'Display' and 'cm' in i.split()]

    network = [specs_.find('table', class_='_14cfVK').text if specs_ else None for mob_specs in mobile_specs_soup for specs_ in mob_specs if 'Connectivity' in str(specs_.text) and 'FeaturesNetwork' in str(specs_.text)]
    network_cross_checked = [i for i in network if 'Network' in i.split()]

    battery = [specs_.find('table', class_='_14cfVK').text if specs_ else None for mob_specs in mobile_specs_soup for specs_ in mob_specs if 'Battery' in str(specs_.text) and 'FeaturesBattery' in str(specs_.text)]
    battery_cross_checked = [i for i in battery if 'Battery' in i.split()]

    return mobile_specs_soup, storage_ram_cross_checked, os_processor_cross_checked, camera_cross_checked, display_cross_checked, network_cross_checked, battery_cross_checked


In [5]:
def fill_nan_values_func(mob_names, mob_ratings, mob_price, mob_img_URL, storage_ram, os_processor, camera, display, network, battery):
    tuple_data = zip_longest(mob_names, mob_ratings, mob_price, mob_img_URL, storage_ram, os_processor, camera, display, network, battery,fillvalue=np.nan)
    
    data = [i for i in tuple_data]
    return data
    

In [6]:
def print_len(mob_names, mob_ratings, mob_price, mob_img_URL, storage_ram, os_processor, camera, display, network, battery):
    print(f"Length of mob_names: {len(mob_names)}")
    print(f"Length of mob_ratings: {len(mob_ratings)}")
    print(f"Length of mob_price: {len(mob_price)}")
    print(f"Length of mob_img_URL: {len(mob_img_URL)}")
    print(f"Length of storage_ram: {len(storage_ram)}")
    print(f"Length of os_processor: {len(os_processor)}")
    print(f"Length of camera: {len(camera)}")
    print(f"Length of display: {len(display)}")
    print(f"Length of network: {len(network)}")
    print(f"Length of battery: {len(battery)}")   

In [7]:
df = pd.DataFrame()

page_start = 1
page_ends = 19

for pages in range(page_start, page_ends):

    soup, URL = bs4_soup(pages=pages)
    
    mob_names, mob_ratings, mob_price, mob_img_URL = mob_specs(soup)
    mobile_specs_soup, storage_ram, os_processor, camera, display, network, battery = features_from_mob_links(soup=soup)
    
    data = fill_nan_values_func(mob_names, mob_ratings, mob_price, mob_img_URL, storage_ram, os_processor, camera, display, network, battery)

    if mobile_specs_soup is not None and soup is not None:
        
        print_len(mob_names, mob_ratings, mob_price, mob_img_URL, storage_ram, os_processor, camera, display, network, battery)

        page_df = pd.DataFrame(data=data,columns=['name', 'ratings', 'price', 'imgURL', 'storage_ram','os_processor',
                                                   'camera', 'display', 'network', 'battery'])

        if not page_df.empty:
            df = pd.concat(objs=[df, page_df], ignore_index=True)
            print(f'scraping data from page:{pages}')
            print(f'working in {URL} Page')
            print(f'page df : page_shape : {page_df.shape} \n {page_df.sample(2)}')
            print(f'df : df shape : {df.shape} \n {df.sample(2)}')
            print()

    else:
        print(f"Skipping page {pages} due to empty mobile_specs_soup or soup.")


Length of mob_names: 24
Length of mob_ratings: 24
Length of mob_price: 24
Length of mob_img_URL: 24
Length of storage_ram: 24
Length of os_processor: 24
Length of camera: 24
Length of display: 24
Length of network: 24
Length of battery: 0
scraping data from page:1
working in https://www.flipkart.com/search?q=apple+mobiles&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page=1 Page
page df : page_shape : (24, 10) 
                                        name ratings    price   
10  APPLE iPhone 14 Plus (Midnight, 128 GB)     4.6  ₹79,999  \
1        APPLE iPhone 13 (Midnight, 128 GB)     4.7  ₹61,999   

                                               imgURL             storage_ram   
10  https://rukminim2.flixcart.com/image/312/312/x...  Internal Storage128 GB  \
1   https://rukminim2.flixcart.com/image/312/312/k...  Internal Storage128 GB   

                                         os_processor   
10  Operating SystemiOS 16Processor TypeA15 Bionic...  \
1   Ope

In [None]:
df

Unnamed: 0,name,ratings,price,imgURL,storage_ram,os_processor,camera,display,network,battery
0,"Xiaomi 11Lite NE (Vinyl Black, 128 GB)",4.2,"₹21,430",https://rukminim2.flixcart.com/image/312/312/x...,Internal Storage128 GBRAM6 GBExpandable Storag...,Operating SystemAndroid 11Processor TypeQualco...,Primary Camera AvailableYesPrimary Camera64MP ...,Display Size16.64 cm (6.55 inch)Resolution1080...,"Network Type5G, 4G VOLTESupported Networks5G, ...",Battery Capacity4250 mAhBattery TypeLi-Po
1,"Redmi 9A Sport (Carbon Black, 32 GB)",4.1,"₹7,790",https://rukminim2.flixcart.com/image/312/312/x...,Internal Storage32 GBRAM2 GBMemory Card Slot T...,Operating SystemAndroid Q 10Processor CoreOcta...,Primary Camera13MP Rear Camera,Display Size16.59 cm (6.53 inch)Resolution720 ...,"Network Type4G VOLTE, 4GSupported Networks4G V...",Battery Capacity5000 mAh
2,"REDMI 10 Power (Sporty Orange, 128 GB)",4.2,"₹15,000",https://rukminim2.flixcart.com/image/312/312/x...,Internal Storage128 GBRAM8 GBMemory Card Slot ...,Operating SystemAndroid 13Processor CoreOcta C...,Primary Camera50MP Rear CameraDual Camera Lens...,Display Size17.02 cm (6.7 inch)Resolution2400 ...,Network Type4G VOLTESupported Networks4G LTE,Battery Capacity6000 mAh
3,"Redmi 9A Sport (Coral Green, 32 GB)",4.2,"₹8,990",https://rukminim2.flixcart.com/image/312/312/x...,Internal Storage32 GBRAM3 GBMemory Card Slot T...,Operating SystemAndroid Q 10Processor CoreOcta...,Primary Camera13MP Rear Camera,Display Size16.59 cm (6.53 inch)Resolution1600...,"Network Type4G VOLTE, 4GSupported Networks4G V...",Battery Capacity5000 mAh
4,"Mi 11 Lite (Vinyl Black, 128 GB)",4.2,"₹17,499",https://rukminim2.flixcart.com/image/312/312/k...,Internal Storage128 GBRAM8 GBExpandable Storag...,Operating SystemAndroid 11Processor TypeQualco...,Primary Camera AvailableYesPrimary Camera64MP ...,Display Size16.64 cm (6.55 inch)Resolution2400...,"Network Type4G VOLTE, 4G, 3G, 2GSupported Netw...",Battery Capacity4250 mAhTalk Time30 hrs
...,...,...,...,...,...,...,...,...,...,...
547,"Mi 10T Pro (Lunar Silver, 128 GB)",4.2,"₹26,899",https://rukminim2.flixcart.com/image/312/312/k...,Internal Storage128 GBRAM8 GBCall Log MemoryYes,Operating SystemAndroid 10Processor TypeQualco...,Primary Camera AvailableYesPrimary Camera108MP...,Display Size16.94 cm (6.67 inch)Resolution2400...,"Network Type5G, 4G VOLTE, 4G, 3G, 2GSupported ...",Battery Capacity5000 mAhTalk Time26 hrs
548,"Redmi Y1 (Gold, 32 GB)",4.3,"₹8,999",https://rukminim2.flixcart.com/image/312/312/j...,Internal Storage32 GBRAM3 GBExpandable Storage...,Operating SystemAndroid Nougat 7.1.2Processor ...,Primary Camera AvailableYesPrimary Camera13MP ...,Display Size13.97 cm (5.5 inch)Resolution1280 ...,"Network Type3G, 4G, 2GSupported NetworksGSM, W...",Battery Capacity3080 mAh
549,"Redmi Note 7 Pro (Neptune Blue, 128 GB)",4.5,"₹17,999",https://rukminim2.flixcart.com/image/312/312/j...,Internal Storage128 GBRAM6 GBExpandable Storag...,Operating SystemAndroid Pie 9.0Processor TypeQ...,Primary Camera AvailableYesPrimary Camera48MP ...,Display Size16.0 cm (6.3 inch)Resolution2340 x...,"Network Type3G, 4G VOLTE, 2GSupported Networks...",Battery Capacity4000 mAh
550,"Mi 10i (Pacific Sunrise, 128 GB)",4.3,"₹20,999",https://rukminim2.flixcart.com/image/312/312/k...,Internal Storage128 GBRAM6 GBSupported Memory ...,Operating SystemAndroid Q 11Processor TypeQual...,Primary Camera AvailableYesPrimary Camera108MP...,Display Size16.94 cm (6.67 inch)Resolution2400...,Network Type5GSupported Networks5GInternet Con...,Battery Capacity4820 mAh


In [None]:
print(os.getcwd())
os.chdir(r'd:\vscode_machineLearning\BEST_PROJECTS\mobileRecommenderSystem')
print(os.getcwd())

d:\vscode_machineLearning\BEST_PROJECTS\mobileRecommenderSystem\notebook\data_Ingestion\scraping
d:\vscode_machineLearning\BEST_PROJECTS\mobileRecommenderSystem


In [None]:
df.to_csv(r'data\raw_data\mi_mobile_new_data.csv',index=False)