In [11]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import os
import numpy as np
from itertools import zip_longest

In [12]:
def bs4_soup(pages):
    URL = f'https://www.flipkart.com/search?q=techno+mobile&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page={pages}'
    request = requests.get(URL)
    soup = bs(request.text,'html.parser')
    soup = soup.find('div',class_ = '_1YokD2 _3Mn1Gg')

    return soup , URL

In [13]:
def mob_specs(soup):
    names = soup.find_all('div', class_='_4rR01T')
    mob_names = [i.text if i else None for i in names]

    ratings = soup.find_all('div', class_='_3LWZlK')
    mob_ratings = [i.text if i else None for i in ratings]

    price = soup.find_all('div', class_='_30jeq3 _1_WHN1')
    mob_price = [i.text if i else None for i in price]

    img = soup.find_all('div', class_='CXW8mj')
    mob_img_URL = [i.find('img')['src'] if i.find('img') else None for i in img]

    return mob_names, mob_ratings, mob_price, mob_img_URL


In [14]:
def features_from_mob_links(soup):
    mobile_links = ['https://www.flipkart.com' + i['href'] for i in soup.find_all('a', class_='_1fQZEK')]
    mobile_req = [requests.get(url=i) for i in mobile_links]
    mobile_soup = [bs(i.text, 'html.parser').find('div', class_='_1YokD2 _2GoDe3') for i in mobile_req]
    mobile_specs_soup = [i.find_all('div', class_='_3k-BhJ') for i in mobile_soup]

    storage_ram = [specs_.find('table', class_='_14cfVK').text if specs_ else None for mob_specs in mobile_specs_soup for specs_ in mob_specs if 'Memory' in str(specs_.text) and 'Storage' in str(specs_.text)]
    storage_ram_cross_checked = [i for i in storage_ram if 'Internal' in i.split()]

    os_processor = [specs_.find('table', class_='_14cfVK').text if specs_ else None for mob_specs in mobile_specs_soup for specs_ in mob_specs if 'Os' in str(specs_.text) and 'Processor' in str(specs_.text)]
    os_processor_cross_checked = [i for i in os_processor if 'Operating' in i.split()]

    camera = [specs_.find('table', class_='_14cfVK').text if specs_ else None for mob_specs in mobile_specs_soup for specs_ in mob_specs if 'Camera' in str(specs_.text) and 'FeaturesPrimary' in str(specs_.text)]
    camera_cross_checked = [i for i in camera if 'Primary' and 'Camera' in i.split()]

    display = [specs_.find('table', class_='_14cfVK').text if specs_ else None for mob_specs in mobile_specs_soup for specs_ in mob_specs if 'Display' in str(specs_.text) and 'FeaturesDisplay' in str(specs_.text)]
    display_cross_checked = [i for i in display if 'Display' and 'cm' in i.split()]

    network = [specs_.find('table', class_='_14cfVK').text if specs_ else None for mob_specs in mobile_specs_soup for specs_ in mob_specs if 'Connectivity' in str(specs_.text) and 'FeaturesNetwork' in str(specs_.text)]
    network_cross_checked = [i for i in network if 'Network' in i.split()]

    battery = [specs_.find('table', class_='_14cfVK').text if specs_ else None for mob_specs in mobile_specs_soup for specs_ in mob_specs if 'Battery' in str(specs_.text) and 'FeaturesBattery' in str(specs_.text)]
    battery_cross_checked = [i for i in battery if 'Battery' in i.split()]

    return mobile_specs_soup, storage_ram_cross_checked, os_processor_cross_checked, camera_cross_checked, display_cross_checked, network_cross_checked, battery_cross_checked


In [15]:
def fill_nan_values_func(mob_names, mob_ratings, mob_price, mob_img_URL, storage_ram, os_processor, camera, display, network, battery):
    tuple_data = zip_longest(mob_names, mob_ratings, mob_price, mob_img_URL, storage_ram, os_processor, camera, display, network, battery,fillvalue=np.nan)
    
    data = [i for i in tuple_data]
    return data
    

In [16]:
def print_len(mob_names, mob_ratings, mob_price, mob_img_URL, storage_ram, os_processor, camera, display, network, battery):
    print(f"Length of mob_names: {len(mob_names)}")
    print(f"Length of mob_ratings: {len(mob_ratings)}")
    print(f"Length of mob_price: {len(mob_price)}")
    print(f"Length of mob_img_URL: {len(mob_img_URL)}")
    print(f"Length of storage_ram: {len(storage_ram)}")
    print(f"Length of os_processor: {len(os_processor)}")
    print(f"Length of camera: {len(camera)}")
    print(f"Length of display: {len(display)}")
    print(f"Length of network: {len(network)}")
    print(f"Length of battery: {len(battery)}")   

In [17]:
df = pd.DataFrame()

page_start = 1
page_ends = 9

for pages in range(page_start, page_ends):

    soup, URL = bs4_soup(pages=pages)
    
    mob_names, mob_ratings, mob_price, mob_img_URL = mob_specs(soup)
    mobile_specs_soup, storage_ram, os_processor, camera, display, network, battery = features_from_mob_links(soup=soup)
    
    data = fill_nan_values_func(mob_names, mob_ratings, mob_price, mob_img_URL, storage_ram, os_processor, camera, display, network, battery)

    if mobile_specs_soup is not None and soup is not None:
        
        print_len(mob_names, mob_ratings, mob_price, mob_img_URL, storage_ram, os_processor, camera, display, network, battery)

        page_df = pd.DataFrame(data=data,columns=['name', 'ratings', 'price', 'imgURL', 'storage_ram', 'camera',
       'oS_Processor', 'display', 'network', 'battery'])

        if not page_df.empty:
            df = pd.concat(objs=[df, page_df], ignore_index=True)
            print(f'scraping data from page:{pages}')
            print(f'working in {URL} Page')
            print(f'page df : page_shape : {page_df.shape} \n {page_df.sample(2)}')
            print(f'df : df shape : {df.shape} \n {df.sample(2)}')
            print()

    else:
        print(f"Skipping page {pages} due to empty mobile_specs_soup or soup.")


In [None]:
df

Unnamed: 0,name,ratings,price,imgURL,storage_ram,camera,oS_Processor,display,network,battery
0,"Tecno Pova 3 (Eco Black, 128 GB)",4.1,"₹13,998",https://rukminim2.flixcart.com/image/312/312/l...,Internal Storage128 GBRAM6 GBExpandable Storag...,Operating SystemAndroid 12Processor TypeMediaT...,Primary Camera AvailableYesPrimary Camera50MP ...,Display Size17.53 cm (6.9 inch)Resolution1080 ...,"Network Type4G VOLTE, 4GSupported Networks4G V...",Battery Capacity7000 mAhBattery TypeLi-Polymer
1,"Tecno Pop 5 Pro (Deepsea Luster, 32 GB)",4.2,"₹6,960",https://rukminim2.flixcart.com/image/312/312/k...,Internal Storage32 GBRAM3 GBExpandable Storage...,Operating SystemAndroid 11 (Go Edition)Process...,Primary Camera AvailableYesPrimary Camera8MP R...,Display Size16.56 cm (6.52 inch)Resolution720 ...,"Network Type4G VOLTE, 4GSupported Networks4G V...",Battery Capacity6000 mAhBattery TypeLi-Po
2,"Tecno Pova 3 (Eco Black, 64 GB)",4.2,"₹10,849",https://rukminim2.flixcart.com/image/312/312/l...,Internal Storage64 GBRAM4 GBExpandable Storage...,Operating SystemAndroid 12Processor TypeMediaT...,Primary Camera AvailableYesPrimary Camera50MP ...,Display Size17.53 cm (6.9 inch)Resolution1080 ...,"Network Type4G VOLTE, 4GSupported Networks4G V...",Battery Capacity7000 mAhBattery TypeLi-Po
3,"Tecno Pova 3 (Tech Silver, 64 GB)",4.2,"₹10,978",https://rukminim2.flixcart.com/image/312/312/l...,Internal Storage64 GBRAM4 GBExpandable Storage...,Operating SystemAndroid 12Processor TypeMediaT...,Primary Camera AvailableYesPrimary Camera50MP ...,Display Size17.53 cm (6.9 inch)Resolution1080 ...,"Network Type4G VOLTE, 4GSupported Networks4G V...",Battery Capacity7000 mAhBattery TypeLi-Po
4,"Tecno Spark Go 2023 (UYUNI BLUE, 32+3 GB)",4.1,"₹6,750",https://rukminim2.flixcart.com/image/312/312/x...,Internal Storage32+3 GBRAM3 GBMemory Card Slot...,Operating SystemAndroid 12Primary Clock Speed2...,Primary Camera13MP Rear CameraDual Camera Lens...,Display Size16.66 cm (6.56 inch)Resolution720x...,Network Type4G VOLTESupported Networks4G VoLTE,Battery Capacity5000 mAh
5,"Tecno Spark 9 (Infinity Black, 64 GB)",4.1,"₹8,541",https://rukminim2.flixcart.com/image/312/312/x...,Internal Storage64 GBRAM4 GBExpandable Storage...,Operating SystemAndroid 12Processor TypeMediaT...,Primary Camera AvailableYesPrimary Camera13MP ...,Display Size16.76 cm (6.6 inch)Resolution720 x...,"Network Type4G VOLTE, 4GSupported Networks4G V...",Battery Capacity5000 mAhBattery TypeLi-Po
6,"Tecno Spark Go 2023 (Endless Black, 32+3 GB)",4.1,"₹6,708",https://rukminim2.flixcart.com/image/312/312/x...,Internal Storage32+3 GBRAM3 GBMemory Card Slot...,"Operating SystemAndroid 12Processor Type2?GHz,...",Primary Camera13MP Rear CameraDual Camera Lens...,Display Size16.66 cm (6.56 inch)Resolution720*...,Network Type4G VOLTESupported Networks4G VoLTE,Battery Capacity5000 mAhBattery TypeNon-Remova...
7,"Tecno Pop 7 pro (?Uyuni Blue, 64 GB)",4.0,"₹6,499",https://rukminim2.flixcart.com/image/312/312/x...,Internal Storage64 GBRAM2 GBMemory Card Slot T...,Operating SystemAndroid Android 12.0Processor ...,Primary Camera12MP Rear CameraDual Camera Lens...,Display Size16.66 cm (6.56 inch)Resolution1612...,Network Type4G VOLTESupported Networks4G VoLTE,Battery Capacity5000 mAh
8,"Tecno Pop 7 pro (?Endless Black, 64 GB)",4.0,"₹6,998",https://rukminim2.flixcart.com/image/312/312/x...,Internal Storage64 GBRAM2 GBMemory Card Slot T...,Operating SystemAndroid Android 12.0Processor ...,Primary Camera12MP Rear CameraDual Camera Lens...,Display Size16.66 cm (6.56 inch)Resolution1612...,Network Type4G VOLTESupported Networks4G VoLTE,Battery Capacity5000 mAh
9,"Tecno Camon 19 (Memphis Green, 128 GB)",4.2,"₹13,648",https://rukminim2.flixcart.com/image/312/312/x...,Internal Storage128 GBRAM6 GBExpandable Storag...,Operating SystemAndroid 12Processor TypeMediaT...,Primary Camera AvailableYesPrimary Camera64MP ...,Display Size17.27 cm (6.8 inch)Resolution1080 ...,"Network Type4G VOLTE, 4GSupported Networks4G V...",Battery Capacity5000 mAhBattery TypeLi-Po


In [None]:
print(os.getcwd())
os.chdir(r'd:\vscode_machineLearning\BEST_PROJECTS\mobileRecommenderSystem')
print(os.getcwd())

d:\vscode_machineLearning\BEST_PROJECTS\mobileRecommenderSystem\notebook\data_Ingestion\scraping
d:\vscode_machineLearning\BEST_PROJECTS\mobileRecommenderSystem


In [None]:
df.to_csv(r'data\raw_data\techno_mobile_data.csv',index=False)