In [22]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import re

from datetime import timedelta,datetime

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

from urllib.parse import quote_plus


from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

def getMongoClient():
    """Create a MongoDB client for the scraper's Atlas cluster.

    Connection settings are taken from the environment so credentials do not
    have to live in the notebook:

    * ``MONGO_USER``     - database user name
    * ``MONGO_PASSWORD`` - database password
    * ``MONGO_HOST``     - cluster host name

    Any variable that is not set falls back to the value that used to be
    hard-coded here, so the default connection string is unchanged.

    Returns:
        MongoClient: client created with ``ServerApi('1')``. The caller is
        responsible for calling ``close()`` on it.
    """
    import os  # local import keeps this cell runnable on its own

    # SECURITY NOTE(review): the fallback literals are real credentials stored
    # in the notebook. Rotate the password and remove the fallbacks once the
    # MONGO_* variables are configured wherever this runs.
    user = os.environ.get("MONGO_USER", "dagulathiya30")
    password = os.environ.get("MONGO_PASSWORD", "Darshan@45")
    host = os.environ.get("MONGO_HOST", "scrape.8yqpmc0.mongodb.net")

    # User name and password are percent-escaped because characters such as
    # '@' would otherwise be read as URI delimiters.
    uri = (
        "mongodb+srv://" + quote_plus(user) + ":" + quote_plus(password)
        + "@" + host + "/?retryWrites=true&w=majority"
    )
    return MongoClient(uri, server_api=ServerApi('1'))



def pressClear(driver):
    """Send two keyboard sequences to the browser controlled by ``driver``.

    The first sequence is TAB pressed three times followed by ENTER, the
    second is a single TAB followed by ENTER. Each sequence is built and
    performed as its own action chain.

    NOTE(review): presumably these keystrokes walk focus to the clear-data
    controls on the settings page opened by ``clearBrowser``; that depends on
    Chrome's focus order - confirm against the Chrome version in use.
    """
    key_sequences = (
        Keys.TAB * 3 + Keys.ENTER,
        Keys.TAB + Keys.ENTER,
    )
    for sequence in key_sequences:
        ActionChains(driver).send_keys(sequence).perform()


def clearBrowser(driver):
    """Run the clear-data key sequence in the first window, then refocus the second.

    Switches to window handle 0, loads ``chrome://settings/?search=clear``,
    calls ``pressClear`` twice, and finally switches to window handle 1.
    Both windows must already exist.
    """
    settings_window = driver.window_handles[0]
    driver.switch_to.window(settings_window)
    driver.get("chrome://settings/?search=clear")

    for _ in range(2):
        pressClear(driver)

    # The handle list is read again instead of being cached up front, exactly
    # as the two separate look-ups did before.
    work_window = driver.window_handles[1]
    driver.switch_to.window(work_window)


def getLinkHTML(driver, itemlink):
    """Load ``itemlink`` in a temporary window and return its parsed HTML.

    ``clearBrowser`` is run first, then a new window is opened with
    ``window.open('')``, the link is loaded in it and the page source is
    captured. The temporary window is always closed and focus is returned to
    window handle 1, even when loading the page raises, so a failed request
    does not leave a stray window behind for later calls.

    Args:
        driver: Selenium WebDriver that already has exactly two windows open;
            the new window is addressed as handle index 2.
        itemlink: URL to load.

    Returns:
        BeautifulSoup: the page source parsed with ``html.parser``.

    Raises:
        Whatever ``driver.get`` / ``driver.page_source`` raise; the exception
        propagates after the temporary window has been cleaned up.
    """
    clearBrowser(driver)

    driver.execute_script("window.open('')")
    driver.switch_to.window(driver.window_handles[2])
    try:
        driver.get(itemlink)
        data = driver.page_source
    finally:
        # Dispose of the temporary window and restore focus no matter what.
        driver.close()
        driver.switch_to.window(driver.window_handles[1])
    return BeautifulSoup(data, 'html.parser')


def scrapeMyntraNewID(driver, mid_base="/dresses?f=Brand%3A", brand_name="SASSAFRAS"):
    """Collect product ids from a brand's newest-first Myntra listing and sync them to MongoDB.

    Steps:
      1. Load ``https://www.myntra.com`` + ``mid_base`` + ``brand_name`` +
         ``&sort=new`` in window handle 1.
      2. In ``Scrape.BrandProductId``, find documents whose ``date`` is more
         than 33 days old, remember their pids and delete every document
         carrying one of those pids.
      3. Walk the listing page by page (following ``li.pagination-next``)
         until a pid already stored for this brand is met or there is no next
         page.
      4. Insert the newly seen pids with the current timestamp.

    Args:
        driver: Selenium WebDriver with at least two windows open.
        mid_base: path and query prefix placed before the brand name.
        brand_name: brand filter value; also stored on inserted documents.

    Returns:
        tuple[list, list]: ``(pids, expired)`` where ``pids`` is the new pids
        in first-seen order (duplicates removed) followed by the pids already
        stored for the brand, and ``expired`` is the pids deleted in step 2.
    """
    base_url = "https://www.myntra.com" + mid_base + brand_name + "&sort=new"

    driver.switch_to.window(driver.window_handles[1])
    driver.get(base_url)
    curr_page_html = BeautifulSoup(driver.page_source, 'html.parser')

    mclient = getMongoClient()
    try:
        pid_mdp = mclient.get_database('Scrape').get_collection("BrandProductId")

        # Delete older entries.
        # NOTE(review): this filter is not restricted to the current site or
        # brand, so expired pids of every brand are removed here.
        cutoff = datetime.today() - timedelta(days=33)
        del_pid = [
            row["pid"]
            for row in pid_mdp.find({"date": {"$lt": cutoff}}, {"_id": 0, "pid": 1})
        ]
        if del_pid:
            pid_mdp.delete_many({"pid": {"$in": del_pid}})

        # Fetch remaining entries for this brand.
        prev_pid = [
            row["pid"]
            for row in pid_mdp.find({"site_name": "Myntra", "brand_name": brand_name})
        ]
        known_pids = set(prev_pid)  # constant-time membership checks

        product_ids = []
        seen = set()
        reached_known = False
        while not reached_known:
            for elem in curr_page_html.find_all("li", {"class": "product-base"}):
                anchor = elem.find('a')
                href = anchor.get('href', '') if anchor is not None else ''
                match = re.search(r"([0-9]+)/buy", href or '')
                if match is None:
                    # Tile without a usable product link: skip it instead of
                    # aborting the whole run.
                    continue
                product_id = match.group(1)
                if product_id in known_pids:
                    # The listing is newest-first, so everything from here on
                    # is treated as already stored.
                    reached_known = True
                    break
                if product_id not in seen:
                    seen.add(product_id)
                    product_ids.append(product_id)

            if reached_known:
                break

            next_item = curr_page_html.find("li", {"class": "pagination-next"})
            next_anchor = next_item.find('a') if next_item is not None else None
            if next_anchor is None or not next_anchor.get('href'):
                break
            curr_page_html = getLinkHTML(driver, next_anchor['href'])

        if product_ids:
            now = datetime.today()
            pid_mdp.insert_many([
                {"site_name": "Myntra", "brand_name": brand_name, "pid": pid, "date": now}
                for pid in product_ids
            ])
    finally:
        # Release the connection even when scraping or a database call fails.
        mclient.close()
    return product_ids + prev_pid, del_pid

def _parseMyntraProduct(hpg, pid):
    """Build the rating/size record for one parsed product page, or ``None`` when it has no ratings."""
    rating_box = hpg.find("div", {"id": "detailedRatingContainer"})
    if rating_box is None:
        return None

    avg_box = rating_box.find("div", {"class": "index-flexRow index-averageRating"})
    avg_span = avg_box.find("span") if avg_box is not None else None
    cnt_box = rating_box.find("div", {"class": "index-countDesc"})
    if avg_span is None or cnt_box is None:
        # Rating container present but incomplete: treat like "no ratings".
        return None

    avg = avg_span.text
    cnt = cnt_box.text.split(" ")[0]  # first space-separated token of the count text

    size_row = dict()
    size_box = hpg.find("div", {"class": "size-buttons-size-buttons"})
    holders = (
        size_box.find_all("div", {"class": "size-buttons-buttonContainer"})
        if size_box is not None else []
    )
    for holder in holders:
        btn = holder.find("button")
        label = btn.find("p") if btn is not None else None
        if label is None:
            continue
        # Only the button's first CSS class is inspected for "disabled".
        first_class = (btn.get("class") or [""])[0].lower()
        size_row[label.text] = "NA" if "disabled" in first_class else "AV"

    return {"pid": pid, "date": datetime.today(), "avg_rating": avg, "user_count": cnt, "Sizes": size_row}


def scrapeMyntra(driver, mid_base="/dresses?f=Brand%3A", brand_name="SASSAFRAS"):
    """Scrape rating and size availability for a brand's products and store them in MongoDB.

    Product ids come from ``scrapeMyntraNewID``. Each product page
    (``https://www.myntra.com/<pid>``) is fetched with ``getLinkHTML``; pages
    without a ``detailedRatingContainer`` are skipped. For the rest a record
    ``{"pid", "date", "avg_rating", "user_count", "Sizes"}`` is printed and
    collected, where ``Sizes`` maps each size label to ``"AV"`` or ``"NA"``
    (``"NA"`` when the button's first CSS class contains "disabled"). Pages
    without size buttons yield an empty ``Sizes`` mapping.

    Afterwards, in ``Scrape.PRD_RT_CNT``, documents for the expired pids
    reported by ``scrapeMyntraNewID`` are deleted and the collected records
    are inserted. The database is not contacted when there is nothing to
    delete and nothing to insert.

    Args:
        driver: Selenium WebDriver prepared as ``getLinkHTML`` expects.
        mid_base: listing path prefix forwarded to ``scrapeMyntraNewID``.
        brand_name: brand forwarded to ``scrapeMyntraNewID``.

    Returns:
        None
    """
    base_url = "https://www.myntra.com/"

    # Get all ids to scrape and the ids whose stored data should be deleted.
    pids, del_pids = scrapeMyntraNewID(driver, mid_base=mid_base, brand_name=brand_name)
    print("page_done")

    data = []
    for pid in pids:
        record = _parseMyntraProduct(getLinkHTML(driver, base_url + str(pid)), pid)
        if record is None:
            continue
        print(record)
        data.append(record)

    if not data and not del_pids:
        return

    mclient = getMongoClient()
    try:
        pid_mdp = mclient.get_database('Scrape').get_collection("PRD_RT_CNT")

        # Expired products are removed even when nothing new was scraped.
        if del_pids:
            pid_mdp.delete_many({"pid": {"$in": del_pids}})

        # insert_many rejects an empty document list, so guard it.
        if data:
            pid_mdp.insert_many(data)
    finally:
        mclient.close()
    return



def threadStarterMyntra(exe_pth = "./chromedriver-mac-arm64",mid_base="/dresses?f=Brand%3A", brand_name="SASSAFRAS"):
    """Start a dedicated Chrome session and run the Myntra scrape for one brand.

    The session is prepared with two windows: handle 0 shows
    ``chrome://settings/?search=clear`` and handle 1 is a blank window that
    is focused before ``scrapeMyntra`` runs.

    The browser is shut down with ``driver.quit()`` in a ``finally`` block, so
    every window and the driver process are released whether the scrape
    succeeds or raises. (``driver.close()`` would only close the currently
    focused window.)

    Args:
        exe_pth: path handed to ``ChromeService(executable_path=...)``.
        mid_base: listing path prefix forwarded to ``scrapeMyntra``.
        brand_name: brand forwarded to ``scrapeMyntra``.

    Returns:
        None

    Raises:
        Any exception raised while setting up the windows or scraping, after
        the browser has been shut down.
    """
    options = webdriver.ChromeOptions()
    service = ChromeService(executable_path=exe_pth)

    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get("chrome://settings/?search=clear")
        driver.execute_script("window.open('')")
        driver.switch_to.window(driver.window_handles[1])

        scrapeMyntra(driver,mid_base=mid_base,brand_name=brand_name)
    finally:
        driver.quit()

def startScraper(exe_pth = "./chromedriver-mac-arm64/chromedriver"):
    """Scrape every configured brand, two at a time, each in its own browser session.

    One ``threadStarterMyntra`` job per brand is submitted to a thread pool
    with two workers. A failing job does not stop the others: its exception
    is printed together with the job label (``mid_base`` + brand name) so the
    failing brand can be identified.

    Args:
        exe_pth: chromedriver path forwarded to every job.

    Returns:
        None
    """
    brands = ["SASSAFRAS","Anouk"]
    mid_base="/dresses?f=Brand%3A"

    with ThreadPoolExecutor(2) as pool:
        # Map each future to a human-readable label used in error reports.
        prom = {
            pool.submit(threadStarterMyntra, exe_pth=exe_pth, mid_base=mid_base, brand_name=brd): mid_base + brd
            for brd in brands
        }
        for task in as_completed(prom):
            try:
                task.result()
            except Exception as e:
                print("{} failed: {}".format(prom[task], e))
    return

# Run the scraper with its default chromedriver path. This launches real
# Chrome sessions and reads from / writes to the MongoDB collections used by
# the functions defined above.
startScraper()

page_done
Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=116.0.5845.110)
Stacktrace:
0   chromedriver                        0x0000000100d1e65c chromedriver + 4318812
1   chromedriver                        0x0000000100d16d00 chromedriver + 4287744
2   chromedriver                        0x00000001009487ec chromedriver + 296940
3   chromedriver                        0x00000001009206e4 chromedriver + 132836
4   chromedriver                        0x00000001009abde4 chromedriver + 703972
5   chromedriver                        0x00000001009be5b8 chromedriver + 779704
6   chromedriver                        0x000000010097a178 chromedriver + 500088
7   chromedriver                        0x000000010097afc0 chromedriver + 503744
8   chromedriver                        0x0000000100cdec40 chromedriver + 4058176
9   chromedriver                        0x0000000100ce3160 chromedriver + 4075872
10  chromedriver               

In [None]:
# Import only `date`: the scraper cell binds the global name `datetime` to the
# class (`from datetime import datetime`), and a plain `import datetime` here
# would rebind it to the module, breaking `datetime.today()` in those
# functions the next time they run.
from datetime import date
date.today()

In [26]:

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

from urllib.parse import quote_plus

# Reuse the connection helper defined with the scraper instead of repeating
# the connection string (and its credentials) in a second place.
client = getMongoClient()

# Round-trip to the server to confirm the deployment is reachable; any
# failure is printed rather than raised so the cell always completes.
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [40]:
dbp = client.get_database('Scrape').get_collection("BrandProductId")
# Count on the server instead of streaming every document to the notebook
# just to increment a counter.
cnt = dbp.count_documents({})
print(cnt)

0


In [39]:
# DESTRUCTIVE: an empty filter matches every document, so this removes all
# documents from whichever collection `dbp` is currently bound to.
dbp.delete_many({})

<pymongo.results.DeleteResult at 0x11914feb0>

In [5]:
import http


AttributeError: module 'http' has no attribute 'cookies'

In [23]:
ddp = client.get_database('Scrape').get_collection("PRD_RT_CNT")
# Ask the server for the count rather than loading the whole collection into
# a list only to take its length.
print(ddp.count_documents({}))

705


In [None]:
import sys

# Parse whitespace-separated numeric rows from standard input into column
# lists a, b, c, d.
# NOTE(review): the original cell was unfinished (`line[]` is a syntax error,
# `split("")` raises ValueError, and `for line in input()` iterates over the
# characters of a single line). Reading rows from sys.stdin and filling `c`
# and `d` from the third and fourth columns is the presumed intent - confirm.
a, b, c, d = [[] for _ in range(4)]

for line in sys.stdin:
    values = list(map(float, line.split()))
    if not values:
        continue  # skip blank lines

    a.append(values[0])
    b.append(values[1])
    c.append(values[2])
    if len(values) > 3:
        d.append(values[3])

Expected Output: True Outcome True
Expected Output: False Outcome False
Expected Output: False Outcome False
Expected Output: True Outcome True
Expected Output: False Outcome False
Expected Output: True Outcome True
