In [8]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import os
import numpy as np
from itertools import zip_longest

In [9]:
def bs4_soup(pages, timeout=30):
    """Download one Flipkart search-results page for "oneplus mobiles" and parse it.

    Parameters
    ----------
    pages : int or str
        Page number interpolated into the ``page=`` query parameter of the URL.
    timeout : float or tuple, optional
        Timeout in seconds forwarded to ``requests.get``. Without a timeout a
        stalled connection would block the scraping loop indefinitely.

    Returns
    -------
    tuple
        ``(soup, URL)``: ``soup`` is the first ``div`` whose class is
        ``_1YokD2 _3Mn1Gg`` in the parsed page, or ``None`` when the page has
        no such element; ``URL`` is the address that was requested.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (connection errors, timeouts, ...).
    """
    URL = f'https://www.flipkart.com/search?q=oneplus+mobiles&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page={pages}'
    response = requests.get(URL, timeout=timeout)
    page_soup = bs(response.text, 'html.parser')
    # Callers must handle None here: find() returns None when the div is absent.
    results_container = page_soup.find('div', class_='_1YokD2 _3Mn1Gg')

    return results_container, URL

In [10]:
def mob_specs(soup):
    """Extract names, ratings, prices and image URLs from a results container.

    Parameters
    ----------
    soup : object exposing ``find_all(name, class_=...)``
        The parsed search-results container (a BeautifulSoup tag).

    Returns
    -------
    tuple of four lists
        ``(mob_names, mob_ratings, mob_price, mob_img_URL)``. The first three
        hold the ``.text`` of every matching ``div``; the last holds the
        ``src`` of the ``img`` inside each ``CXW8mj`` div, or ``None`` when the
        div has no ``img`` or the ``img`` has no ``src`` attribute.

    Notes
    -----
    NOTE(review): each list is collected independently over the whole
    container, so if some product lacks one of the elements (e.g. no rating)
    the lists differ in length and equal positions may no longer describe the
    same product -- confirm whether per-product extraction is required.
    """
    def _div_texts(css_class):
        # Text of every div carrying the given class string.
        return [tag.text for tag in soup.find_all('div', class_=css_class)]

    mob_names = _div_texts('_4rR01T')
    mob_ratings = _div_texts('_3LWZlK')
    mob_price = _div_texts('_30jeq3 _1_WHN1')

    mob_img_URL = []
    for wrapper in soup.find_all('div', class_='CXW8mj'):
        img_tag = wrapper.find('img')  # looked up once instead of twice
        # .get() avoids a KeyError for images without a src attribute.
        mob_img_URL.append(img_tag.get('src') if img_tag else None)

    return mob_names, mob_ratings, mob_price, mob_img_URL


In [11]:
def features_from_mob_links(soup, timeout=30):
    """Visit every product linked from a results container and pull its spec tables.

    For each ``a`` tag with class ``_1fQZEK`` the product page is downloaded,
    its ``_3k-BhJ`` specification sections are collected, and six feature
    texts are extracted from the ``_14cfVK`` table of the matching section.

    Parameters
    ----------
    soup : BeautifulSoup tag
        The parsed search-results container.
    timeout : float or tuple, optional
        Timeout in seconds forwarded to every ``requests.get`` call so a
        stalled product page cannot hang the scrape.

    Returns
    -------
    tuple
        ``(mobile_specs_soup, storage_ram, os_processor, camera, display,
        network, battery)``. ``mobile_specs_soup`` holds the list of spec
        sections per product (empty when the product container is missing).
        Each of the six feature lists has exactly one entry per product link,
        in link order: the table text, or ``None`` when no section qualifies.
        Keeping one slot per product prevents values from sliding onto the
        wrong product when some page lacks a section.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get``.
    """
    # (substrings required in the section text, words required in the table text)
    feature_rules = (
        (('Memory', 'Storage'), ('Internal',)),
        (('Os', 'Processor'), ('Operating',)),
        # Both words are required; `'Primary' and 'Camera' in words` would only
        # test the second one because a non-empty string literal is truthy.
        (('Camera', 'FeaturesPrimary'), ('Primary', 'Camera')),
        (('Display', 'FeaturesDisplay'), ('Display', 'cm')),
        (('Connectivity', 'FeaturesNetwork'), ('Network',)),
        (('Battery', 'FeaturesBattery'), ('Battery',)),
    )

    def _feature_text(sections, section_markers, table_words):
        # Text of the first section table satisfying both checks, else None.
        for section in sections:
            section_text = str(section.text)
            if not all(marker in section_text for marker in section_markers):
                continue
            table = section.find('table', class_='_14cfVK')
            if table is None:  # section without a spec table
                continue
            table_text = table.text
            words = table_text.split()
            if all(word in words for word in table_words):
                return table_text
        return None

    mobile_links = ['https://www.flipkart.com' + anchor['href']
                    for anchor in soup.find_all('a', class_='_1fQZEK')]

    mobile_specs_soup = []
    for link in mobile_links:
        response = requests.get(url=link, timeout=timeout)
        product_soup = bs(response.text, 'html.parser').find('div', class_='_1YokD2 _2GoDe3')
        # A page without the expected container contributes no sections
        # instead of raising AttributeError on None.
        if product_soup is None:
            mobile_specs_soup.append([])
        else:
            mobile_specs_soup.append(product_soup.find_all('div', class_='_3k-BhJ'))

    feature_columns = [
        [_feature_text(sections, markers, words) for sections in mobile_specs_soup]
        for markers, words in feature_rules
    ]
    (storage_ram_cross_checked, os_processor_cross_checked, camera_cross_checked,
     display_cross_checked, network_cross_checked, battery_cross_checked) = feature_columns

    return mobile_specs_soup, storage_ram_cross_checked, os_processor_cross_checked, camera_cross_checked, display_cross_checked, network_cross_checked, battery_cross_checked


In [12]:
def fill_nan_values_func(mob_names, mob_ratings, mob_price, mob_img_URL, storage_ram, os_processor, camera, display, network, battery):
    """Combine the ten column lists into a list of row tuples.

    Rows are built position by position; columns shorter than the longest
    one are padded with ``np.nan`` so every row has ten fields.
    """
    columns = (
        mob_names, mob_ratings, mob_price, mob_img_URL, storage_ram,
        os_processor, camera, display, network, battery,
    )
    return list(zip_longest(*columns, fillvalue=np.nan))
    

In [13]:
def print_len(mob_names, mob_ratings, mob_price, mob_img_URL, storage_ram, os_processor, camera, display, network, battery):
    """Print the number of entries in each column, one line per column.

    Handy for spotting columns whose lengths disagree before they are zipped
    into rows.
    """
    labelled_columns = (
        ("mob_names", mob_names),
        ("mob_ratings", mob_ratings),
        ("mob_price", mob_price),
        ("mob_img_URL", mob_img_URL),
        ("storage_ram", storage_ram),
        ("os_processor", os_processor),
        ("camera", camera),
        ("display", display),
        ("network", network),
        ("battery", battery),
    )
    for label, values in labelled_columns:
        print(f"Length of {label}: {len(values)}")

In [14]:
# Accumulates the rows scraped from every results page.
df = pd.DataFrame()

# range() excludes its end, so pages page_start .. page_ends - 1 are scraped.
page_start = 1
page_ends = 9

for pages in range(page_start, page_ends):

    soup, URL = bs4_soup(pages=pages)

    # bs4_soup returns None for the container when the results div is missing.
    # Check it BEFORE handing it to the extractors, which call methods on it.
    if soup is None:
        print(f"Skipping page {pages} due to empty mobile_specs_soup or soup.")
        continue

    mob_names, mob_ratings, mob_price, mob_img_URL = mob_specs(soup)
    mobile_specs_soup, storage_ram, os_processor, camera, display, network, battery = features_from_mob_links(soup=soup)

    # Defensive: kept from the original condition in case the helper ever
    # reports "nothing scraped" as None.
    if mobile_specs_soup is None:
        print(f"Skipping page {pages} due to empty mobile_specs_soup or soup.")
        continue

    data = fill_nan_values_func(mob_names, mob_ratings, mob_price, mob_img_URL, storage_ram, os_processor, camera, display, network, battery)

    print_len(mob_names, mob_ratings, mob_price, mob_img_URL, storage_ram, os_processor, camera, display, network, battery)

    page_df = pd.DataFrame(data=data,columns=['name', 'ratings', 'price', 'imgURL', 'storage_ram','os_processor',
                                               'camera', 'display', 'network', 'battery'])

    if not page_df.empty:
        df = pd.concat(objs=[df, page_df], ignore_index=True)
        print(f'scraping data from page:{pages}')
        print(f'working in {URL} Page')
        # sample(2) raises ValueError on a one-row frame, so cap the sample size.
        print(f'page df : page_shape : {page_df.shape} \n {page_df.sample(min(2, len(page_df)))}')
        print(f'df : df shape : {df.shape} \n {df.sample(min(2, len(df)))}')
        print()


Length of mob_names: 24
Length of mob_ratings: 24
Length of mob_price: 24
Length of mob_img_URL: 24
Length of storage_ram: 24
Length of os_processor: 24
Length of camera: 24
Length of display: 24
Length of network: 24
Length of battery: 24
scraping data from page:1
working in https://www.flipkart.com/search?q=oneplus+mobiles&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page=1 Page
page df : page_shape : (24, 10) 
                                        name ratings    price   
3  OnePlus 11R 5G (Galactic Silver, 128 GB)     4.5  ₹39,734  \
5        OnePlus 10R (Forest Green, 256 GB)       4  ₹34,990   

                                              imgURL   
3  https://rukminim2.flixcart.com/image/312/312/x...  \
5  https://rukminim2.flixcart.com/image/312/312/x...   

                                         storage_ram   
3                      Internal Storage128 GBRAM8 GB  \
5  Internal Storage256 GBRAM12 GBMemory Card Slot...   

                        

In [15]:
# Show the accumulated frame as the cell's rich output (pandas elides the middle rows).
df

Unnamed: 0,name,ratings,price,imgURL,storage_ram,os_processor,camera,display,network,battery
0,"OnePlus Nord CE 2 Lite 5G (Blue Tide, 128 GB)",4.4,"₹17,676",https://rukminim2.flixcart.com/image/312/312/x...,Internal Storage128 GBRAM6 GBMemory Card Slot ...,Operating SystemAndroid 10Processor CoreOcta C...,Primary Camera64MP Rear Camera,Display Size16.74 cm (6.59 inch)Resolution2412...,Network Type5GSupported Networks5G,Battery Capacity5000 mAh
1,"OnePlus Nord CE 2 Lite 5G (Black Dusk, 128 GB)",4.4,"₹17,691",https://rukminim2.flixcart.com/image/312/312/x...,Internal Storage128 GBRAM6 GBMemory Card Slot ...,Operating SystemAndroid 13Processor CoreOcta C...,Primary Camera64MP Rear Camera,Display Size16.74 cm (6.59 inch)Resolution2412...,Network Type5GSupported Networks5G,Battery Capacity5000 mAh
2,"OnePlus 11R 5G (Galactic Silver, 256 GB)",4.5,"₹44,690",https://rukminim2.flixcart.com/image/312/312/x...,Internal Storage256 GBRAM16 GB,Operating SystemAndroid OxygenOS based on Andr...,Primary Camera50MP Rear CameraDual Camera Lens...,Display Size17.02 cm (6.7 inch)Resolution2772x...,Network Type5GSupported Networks5G,Battery Capacity5000 mAh
3,"OnePlus 11R 5G (Galactic Silver, 128 GB)",4.5,"₹39,734",https://rukminim2.flixcart.com/image/312/312/x...,Internal Storage128 GBRAM8 GB,Operating SystemAndroid OxygenOS based on Andr...,Primary Camera50MP Rear CameraDual Camera Lens...,Display Size17.02 cm (6.7 inch)Resolution2772x...,Network Type5GSupported Networks5G,Battery Capacity5000 mAh
4,"OnePlus 11R 5G (Sonic Black, 256 GB)",4.5,"₹44,284",https://rukminim2.flixcart.com/image/312/312/x...,Internal Storage256 GBRAM16 GB,Operating SystemAndroid Oxygen OxygenOS based ...,Primary Camera50MP Rear CameraDual Camera Lens...,Display Size17.02 cm (6.7 inch)Resolution2772x...,Network Type5GSupported Networks5G,Battery Capacity5000 mAh
...,...,...,...,...,...,...,...,...,...,...
187,"OnePlus N20 SE (BLUE OASIS, 64 GB)",4.2,"₹15,674",https://rukminim2.flixcart.com/image/312/312/x...,Internal Storage64 GBRAM4 GBMemory Card Slot T...,Operating SystemAndroid Oxygen 12.1Processor C...,Primary Camera50MP Rear CameraDual Camera Lens...,Display Size16.66 cm (6.56 inch)Resolution1612...,Network Type4G VOLTESupported Networks4G VoLTE,Battery Capacity5000 mAh
188,"OnePlus 9 Pro 5G (Morning Mist, 128 GB)",3.8,"₹47,999",https://rukminim2.flixcart.com/image/312/312/x...,Internal Storage128 GBRAM8 GB,Operating SystemAndroid 11Processor CoreDual C...,Primary Camera48MP Rear CameraDual Camera Lens...,Display Size17.02 cm (6.7 inch)Resolution3216 ...,"Network Type5GSupported Networks4G LTE, GSM, C...",Battery Capacity4500 mAh
189,"OnePlus 6 (Mirror Black, 128 GB)",4.2,"₹17,990",https://rukminim2.flixcart.com/image/312/312/x...,Internal Storage128 GBRAM8 GBMemory Card Slot ...,Operating SystemAndroid Oreo 8.1Processor Core...,Primary Camera AvailableYesPrimary Camera20MP ...,Display Size15.95 cm (6.28 inch)Resolution1080...,Network Type4GSupported Networks4G VoLTEWi-FiY...,Battery Capacity3300 mAh
190,"OnePlus 6T McLaren Edition (Speed Orange, 256 GB)",,"₹26,990",https://rukminim2.flixcart.com/image/312/312/x...,Internal Storage256 GBRAM10 GBMemory Card Slot...,Operating SystemAndroid Oxygen 9Processor Core...,Primary Camera AvailableYesPrimary Camera20MP ...,Display Size16.28 cm (6.41 inch)Resolution2280...,"Network Type4G VOLTE, 4GSupported Networks4G V...",Battery Capacity3300 mAh


In [16]:
# Move the kernel's working directory to the project root so the relative
# output path used afterwards resolves there; print the directory before and
# after as a sanity check.
# NOTE: this absolute path is specific to one machine.
project_root = r'd:\vscode_machineLearning\BEST_PROJECTS\mobileRecommenderSystem'
cwd_before = os.getcwd()
print(cwd_before)
os.chdir(project_root)
cwd_after = os.getcwd()
print(cwd_after)

d:\vscode_machineLearning\BEST_PROJECTS\mobileRecommenderSystem\notebook\data_Ingestion\scraping
d:\vscode_machineLearning\BEST_PROJECTS\mobileRecommenderSystem


In [17]:
# Persist the scraped rows without the index column.
# The path is assembled with os.path.join so it resolves into the
# data/raw_data folder on every OS (a literal backslash path only does so on
# Windows), and the folder is created if it does not exist yet.
output_dir = os.path.join('data', 'raw_data')
os.makedirs(output_dir, exist_ok=True)
df.to_csv(os.path.join(output_dir, 'one_pluse_mobile_new_data.csv'), index=False)