In [1]:
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

from bs4 import BeautifulSoup
import re
import json
import requests

from datetime import timedelta, datetime

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

from urllib.parse import quote_plus

import threading
from concurrent.futures import ThreadPoolExecutor, as_completed


def getMongoClient():
    """Create a MongoDB client for the scraper's cluster.

    The connection string is read from the ``MONGODB_URI`` environment
    variable when it is set, so credentials can be supplied without editing
    the notebook. When the variable is unset or empty, the legacy built-in
    URI is used so existing runs keep working.

    Returns:
        A ``MongoClient`` created with ``server_api=ServerApi('1')``. The
        caller is responsible for calling ``close()`` on it.
    """
    import os  # local import keeps this helper self-contained

    uri = os.environ.get("MONGODB_URI")
    if not uri:
        # SECURITY(review): hard-coded credential kept only as a backward
        # compatible fallback. Rotate this password and set MONGODB_URI
        # instead of relying on the literal below.
        uri = "mongodb+srv://dagulathiya30:" + \
            quote_plus("Darshan@45") + \
            "@scrape.8yqpmc0.mongodb.net/?retryWrites=true&w=majority"
    return MongoClient(uri, server_api=ServerApi('1'))


def pressClear(driver):
    """Send two keyboard sequences to the focused page.

    The first sequence is three TABs followed by ENTER, the second is a
    single TAB followed by ENTER. Each sequence is sent through its own
    ``ActionChains`` instance.

    Args:
        driver: the Selenium WebDriver whose current window receives the keys.
    """
    for tab_presses in (3, 1):
        chain = ActionChains(driver)
        chain.send_keys(Keys.TAB * tab_presses + Keys.ENTER)
        chain.perform()


def safeCheck(driver):
    """Close every window after the first two and focus the second window.

    Args:
        driver: a Selenium WebDriver (or any object exposing
            ``window_handles``, ``switch_to.window(handle)`` and ``close()``).

    Raises:
        IndexError: if fewer than two windows are open, because there is no
            second window to switch to.
    """
    # Work on a snapshot: the live window_handles list shrinks every time a
    # window is closed, so indexing it inside the loop skips windows and
    # eventually runs past the end of the list.
    handles = list(driver.window_handles)

    for extra_handle in handles[2:]:
        driver.switch_to.window(extra_handle)
        driver.close()

    driver.switch_to.window(handles[1])
    return


def clearBrowser(driver):
    """Run the clear-data key sequence twice on the first browser window.

    Switches to the first window, loads the Chrome settings search page for
    "clear" and calls ``pressClear`` on it, repeats that once more, then
    switches to the second window.

    Args:
        driver: the Selenium WebDriver with at least two open windows.
    """
    settings_tab, work_tab = 0, 1

    driver.switch_to.window(driver.window_handles[settings_tab])
    for _ in range(2):
        driver.get("chrome://settings/?search=clear")
        pressClear(driver)

    driver.switch_to.window(driver.window_handles[work_tab])


def getLinkHTML(driver, itemlink):
    """Load ``itemlink`` in a temporary third window and return its parsed HTML.

    The browser is cleared and surplus windows are closed first. A new window
    is then opened, the link is loaded in it, its page source is captured,
    the window is closed and focus returns to the second window.

    Args:
        driver: the Selenium WebDriver to drive.
        itemlink: URL to load.

    Returns:
        A ``BeautifulSoup`` document built from the page source with the
        'html.parser' parser.
    """
    clearBrowser(driver)
    safeCheck(driver)

    # Open a scratch window; it is expected to be the third handle.
    driver.execute_script("window.open('')")
    scratch_tab = driver.window_handles[2]
    driver.switch_to.window(scratch_tab)
    driver.get(itemlink)
    page_markup = driver.page_source

    # Drop the scratch window and go back to the working window.
    driver.close()
    work_tab = driver.window_handles[1]
    driver.switch_to.window(work_tab)

    return BeautifulSoup(page_markup, 'html.parser')


def getProductPage(link):
    """Fetch a page and return the JSON object assigned to ``window.__myx``.

    The page is downloaded with ``requests``, and the first inline
    ``<script>`` whose markup starts with ``<script>window.__myx = `` is
    searched for a ``{...}`` span, which is decoded as JSON.

    Args:
        link: URL of the page to fetch.

    Returns:
        The decoded JSON value, or ``None`` when no matching script exists or
        the matching script contains no ``{...}`` span.

    Raises:
        requests.exceptions.RequestException: on network errors or timeout.
        json.JSONDecodeError: if the extracted span is not valid JSON.
    """
    # requests has no default timeout; without one a stalled connection
    # would block the scraper forever.
    response = requests.get(
        link, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)

    soup_res = BeautifulSoup(response.text, 'html.parser')

    for script in soup_res.find_all('script'):
        if script.string is None:
            continue
        script_html = str(script)
        if not script_html.startswith("<script>window.__myx = "):
            continue

        match_res = re.search("{.*}", script_html)
        if match_res is None:
            # Marker script found but it carries no JSON payload; report
            # "nothing found" instead of failing on match_res.start().
            return None
        return json.loads(match_res.group(0))

    return None


def scrapeMyntraNewID(driver, mid_base="/dresses?f=Brand%3A", brand_name="SASSAFRAS", category="DRESSES", updater=False):
    """Collect product ids from a brand listing and record them in MongoDB.

    Loads ``"https://www.myntra.com" + mid_base + brand_name + "&sort=new"``
    in the second browser window, walks the listing through its
    ``pagination-next`` links and gathers ``[pid, link]`` pairs from every
    ``li.product-base`` entry. Results are stored in the ``BrandProductId``
    collection of the ``Scrape`` database.

    Modes:
        updater=False: crawling stops at the first product id that is already
            stored with a date inside the last 33 days; only ids not seen
            before are inserted.
        updater=True: every page is crawled, the existing entries for this
            site/brand/category are deleted and everything found is inserted.

    Args:
        driver: Selenium WebDriver with at least two open windows.
        mid_base: URL path and query prefix placed before the brand name.
        brand_name: brand to list; also stored on each inserted document.
        category: category label stored on each inserted document.
        updater: selects the mode described above.

    Returns:
        A list of ``[pid, product_link]`` pairs for this site/brand/category
        whose stored date falls inside the last 33 days.
    """
    base_url = "https://www.myntra.com" + mid_base + brand_name + "&sort=new"

    driver.switch_to.window(driver.window_handles[1])
    driver.get(base_url)
    safeCheck(driver)

    curr_page_html = BeautifulSoup(driver.page_source, 'html.parser')

    lookback = timedelta(days=33)
    scope = {"site_name": "Myntra",
             "brand_name": brand_name, "category": category}
    pid_only = {"_id": 0, "pid": 1}

    mclient = getMongoClient()
    try:
        pid_mdp = mclient.get_database(
            'Scrape').get_collection("BrandProductId")

        del_pid = []
        known_pids = set()
        if updater:
            # Remember which stored entries get replaced by this crawl.
            del_pid = [row["pid"] for row in pid_mdp.find(dict(scope), pid_only)]
        else:
            # Known ids are only consulted in normal mode, so the lookup is
            # skipped entirely when updating.
            recent_rows = pid_mdp.find(
                {"date": {"$gte": datetime.today() - lookback}}, pid_only)
            known_pids = {row["pid"] for row in recent_rows}

        product_ids = []
        reached_known = False
        while True:
            for elem in curr_page_html.find_all("li", {"class": "product-base"}):
                anchor = elem.find('a')
                lnk = anchor.get('href') if anchor is not None else None
                id_match = re.search(r"([0-9]+)/buy", lnk) if lnk else None
                if id_match is None:
                    # Tile without a ".../<id>/buy" link: nothing to record.
                    continue
                product_id = id_match.group(1)

                if not updater and product_id in known_pids:
                    # Listing is sorted newest first, so everything after a
                    # known id is treated as already stored.
                    reached_known = True
                    break
                product_ids.append([product_id, lnk])

            if reached_known:
                break

            next_item = curr_page_html.find("li", {"class": "pagination-next"})
            next_anchor = next_item.find('a') if next_item is not None else None
            if next_anchor is None:
                break
            curr_page_html = getLinkHTML(driver, next_anchor['href'])

        # Filter on each row's own pid. The previous code tested the stale
        # loop variable `product_id`, which discarded every new product
        # whenever the crawl had stopped on an already-known id.
        ndata = [
            {"site_name": "Myntra", "brand_name": brand_name,
             "pid": pid, "date": datetime.today(), "category": category,
             "product_link": product_link}
            for pid, product_link in product_ids
            if updater or pid not in known_pids
        ]

        if updater:
            delete_filter = dict(scope)
            delete_filter["pid"] = {"$in": del_pid}
            pid_mdp.delete_many(delete_filter)

        if ndata:
            pid_mdp.insert_many(ndata)

        recent_filter = dict(scope)
        recent_filter["date"] = {"$gte": datetime.today() - lookback}
        product_links = pid_mdp.find(
            recent_filter, {"_id": 0, "product_link": 1, "pid": 1})
        return [[row['pid'], row["product_link"]] for row in product_links]
    finally:
        # Release the connection even when scraping or a query fails.
        mclient.close()


def scrapeMyntra(driver, mid_base="/dresses?f=Brand%3A", brand_name="SASSAFRAS", category="DRESSES"):
    """Scrape price, rating, size and image data for a brand's products.

    Calls ``scrapeMyntraNewID`` to get the ``[pid, link]`` pairs to visit,
    fetches each product page with ``getProductPage`` and stores one document
    per product in the ``PRD_RT_CNT`` collection of the ``Scrape`` database.
    A product whose page cannot be fetched or parsed is reported on stdout
    and skipped.

    Args:
        driver: Selenium WebDriver passed through to ``scrapeMyntraNewID``.
        mid_base: URL path and query prefix placed before the brand name.
        brand_name: brand whose products are scraped.
        category: category label passed through to ``scrapeMyntraNewID``.

    Returns:
        None.
    """
    base_url = "https://www.myntra.com/"

    # Product ids/links to visit (also refreshes the stored id list).
    prd_data = scrapeMyntraNewID(
        driver, mid_base=mid_base, brand_name=brand_name, category=category)
    if not prd_data:
        return

    data = []
    for pid, plink in prd_data:
        try:
            page_state = getProductPage(base_url + plink)
            if not page_state:
                continue

            pdpData = page_state['pdpData']
            # `or {}` / `or []` also covers sections that are present but
            # null, which a .get() default alone does not.
            price = pdpData.get("price") or {}
            ratings = pdpData.get("ratings") or {}
            media = pdpData.get('media') or {}

            size_row = {row['label']: row['available']
                        for row in (pdpData.get('sizes') or [])}
            img_urls = [img["imageURL"]
                        for album in (media.get('albums') or [])
                        if album["name"] == 'default'
                        for img in album["images"]]

            data.append({"pid": pid, "date": datetime.today(),
                         "avg_rating": ratings.get("averageRating", 0),
                         "user_count": ratings.get("totalCount", 0),
                         "Sizes": size_row,
                         "SP": price.get("discounted", 0),
                         "mrp": price.get("mrp", 0),
                         "img_urls": img_urls})

        except Exception as e:
            # Best effort: one bad product must not abort the whole run.
            print("An exception occurred", e)
            print("PID", plink)

    if not data:
        # insert_many rejects an empty document list, so skip the write when
        # every product page failed.
        return

    mclient = getMongoClient()
    try:
        pid_mdp = mclient.get_database('Scrape').get_collection("PRD_RT_CNT")
        pid_mdp.insert_many(data)
    finally:
        mclient.close()
    return


def threadStarterMyntra(exe_pth="./chromedriver-mac-arm64/", mid_base="/dresses?f=Brand%3A", brand_name="SASSAFRAS", category="DRESSES"):
    """Start a Chrome session, scrape one brand/category, then shut it down.

    The first window is pointed at the Chrome settings page and a second
    window is opened for scraping, which is the layout the helper functions
    index into.

    Args:
        exe_pth: accepted for compatibility with callers but not used; the
            driver binary is resolved by ``ChromeDriverManager().install()``.
        mid_base: URL path and query prefix placed before the brand name.
        brand_name: brand to scrape.
        category: category label stored with the results.
    """
    driver = webdriver.Chrome(service=ChromeService(
        ChromeDriverManager().install()))

    try:
        driver.get("chrome://settings/?search=clear")
        driver.execute_script("window.open('')")
        driver.switch_to.window(driver.window_handles[1])

        scrapeMyntra(driver, mid_base=mid_base,
                     brand_name=brand_name, category=category)
    finally:
        # quit() ends the whole session; close() only closed the focused
        # window and left the other window and the driver process running,
        # and was skipped entirely when scraping raised.
        driver.quit()


def startScraper(exe_pth="E:\\Scrap\\chromedriver-win64\\"):
    """Run ``threadStarterMyntra`` for every configured category/brand pair.

    Jobs are submitted to a ``ThreadPoolExecutor`` sized by the module-level
    ``MAX_WORKERS``. As each job finishes, its parameters are printed along
    with either a completion message or the exception it raised.

    Args:
        exe_pth: driver path forwarded to ``threadStarterMyntra``.
    """
    # Put your brands and the URL prefix of each category here, e.g.
    #   "CategoryName": {'url': 'url_prefix_here', 'brands': ['brandname1', 'brandname2']}
    prefix_link_brand = {
        "DRESSES": {'url': "/dresses?f=Brand%3A", "brands": ["SASSAFRAS"]}
    }

    # One [driver path, url prefix, brand, category] entry per job.
    scrape_params = [
        [exe_pth, settings['url'], brand, category]
        for category, settings in prefix_link_brand.items()
        for brand in settings["brands"]
    ]

    with ThreadPoolExecutor(MAX_WORKERS) as executor:
        process = {}
        for process_num, (driver_path, url_prefix, brand, label) in enumerate(scrape_params):
            future = executor.submit(threadStarterMyntra,
                                     exe_pth=driver_path,
                                     mid_base=url_prefix,
                                     brand_name=brand,
                                     category=label)
            process[future] = process_num

        for completed_task in as_completed(process):
            task_num = process[completed_task]
            try:
                completed_task.result()
            except Exception as exp:
                print("Exception Occured while completing:",
                      scrape_params[task_num])
                print(exp)
            else:
                print("Completed Task:", scrape_params[task_num])

#################### Config Params ################
# MAX_WORKERS is the thread-pool size that startScraper passes to
# ThreadPoolExecutor. startScraper reads it as a module-level name when it
# runs, so it must be assigned before the call below.
##################### ---------------------------------################################


MAX_WORKERS = 1
# Kick off scraping for every category/brand pair configured in startScraper.
startScraper(exe_pth="./chromedriver-mac-arm64/chromedriver")


https://www.myntra.com/dresses/sassafras/sassafras-square-neck-sheath-mini-dress/24956648/buy
1
https://www.myntra.com/dresses/sassafras/sassafras-square-neck-knitted-sheath-midi-dress/24956686/buy
2
https://www.myntra.com/dresses/sassafras/sassafras-turtle-neck-cotton-sheath-mini-dress/24956636/buy
3
https://www.myntra.com/dresses/sassafras/sassafras-black-a-line-mini-dress/24991404/buy
4
https://www.myntra.com/dresses/sassafras/sassafras-navy-blue-velvet-t-shirt-dress/24990238/buy
5
https://www.myntra.com/dresses/sassafras/sassafras-long-sleeve-bodycon-midi-dress/24956644/buy
6
https://www.myntra.com/dresses/sassafras/sassafras-rib-bodycon-midi-dress/24956658/buy
7
https://www.myntra.com/dresses/sassafras/sassafras-black-fit--flare-midi-dress/24991392/buy
8
https://www.myntra.com/dresses/sassafras/sassafras-beige-ribbed-sheath-midi-dress/24956654/buy
9
https://www.myntra.com/dresses/sassafras/sassafras-purple-velvet-bodycon-dress/24990244/buy
10
https://www.myntra.com/dresses/sassafr

58
https://www.myntra.com/dresses/sassafras/sassafras-rust-square-neck-long-sleeve-sheath-midi-dress/24340780/buy


In [1]:
# Manual check of the product-page parser on a single product URL.
# NOTE(review): the recorded output is a NameError, so this cell was executed
# in a kernel where getProductPage had not been defined yet; run the cell
# that defines it first.
getProductPage("https://www.myntra.com/dresses/sassafras/sassafras-square-neck-sheath-mini-dress/24956690/buy")

NameError: name 'getProductPage' is not defined

In [None]:
# Import only `date`. A plain `import datetime` rebinds the notebook-wide name
# `datetime` from the class (imported by the scraper cell with
# `from datetime import timedelta, datetime`) to the module, after which the
# scraper's `datetime.today()` calls fail.
from datetime import date
date.today()

In [None]:

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

from urllib.parse import quote_plus

import os

# Prefer a connection string supplied through the MONGODB_URI environment
# variable so credentials do not have to live in the notebook.
# SECURITY(review): the literal below is a hard-coded credential kept only as
# a fallback for existing setups; rotate it and rely on MONGODB_URI.
uri = os.environ.get("MONGODB_URI") or (
    "mongodb+srv://dagulathiya30:" + quote_plus("Darshan@45") +
    "@scrape.8yqpmc0.mongodb.net/?retryWrites=true&w=majority")
client = MongoClient(uri, server_api=ServerApi('1'))

# Connectivity check: report success or print the error without raising.
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

In [None]:
dbp = client.get_database('Scrape').get_collection("BrandProductId")
# Let the server do the counting instead of streaming every document to the
# client just to increment a counter.
cnt = dbp.count_documents({})
print(cnt)

In [None]:
# WARNING: the empty filter matches every document, so this deletes all
# entries in the collection bound to `dbp`. Running the notebook top to
# bottom will execute it; keep it out of routine runs unless a full reset is
# intended.
dbp.delete_many({})

In [None]:
import http


In [None]:
ddp = client.get_database('Scrape').get_collection("PRD_RT_CNT")
# count_documents avoids loading the whole collection into a list only to
# measure its length.
print(ddp.count_documents({}))

In [None]:
import sys
a, b, c, d = [[] for _ in range(4)]

# Read one record per line of whitespace-separated numbers.
# NOTE(review): the original looped over the characters of a single input()
# string, split on an empty separator (a ValueError) and ended with the
# incomplete expression `line[]`. Reading sys.stdin line by line is assumed
# to be the intent given the otherwise unused `import sys`; confirm.
for line in sys.stdin:
    values = list(map(float, line.split()))
    if len(values) < 3:
        continue  # blank or short line: nothing to record

    a.append(values[0])
    b.append(values[1])
    # Index 2 continues the a/b pattern; the original index was left blank.
    c.append(values[2])
    # NOTE(review): `d` is allocated but the original never filled it.

In [2]:
import bs4
import json
import requests
import re
# requests has no default timeout; set one so a stalled connection cannot
# hang the cell indefinitely.
d = requests.get("https://www.myntra.com/saree",
                 headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)

# Parse the listing page and keep its <script> tags for the cells below.
soup_res = bs4.BeautifulSoup(d.text, 'html.parser')
scripts = soup_res.find_all('script')



In [3]:
s = ""
d = soup_res.find_all('code')
# Concatenate the `value` of the first <i> inside each span whose data-id
# contains "21", within divs whose data-tag ends with "93", within <code>
# tags whose data-class starts with "23".
# Indexing a tag with ["attr"] raises KeyError when the attribute is absent,
# so attributes are read with .get() and non-matching elements are skipped.
for code_tag in d:
    if not code_tag.get("data-class", "").startswith("23"):
        continue
    for div in code_tag.find_all('div'):
        if not div.get('data-tag', "").endswith('93'):
            continue
        for span in div.find_all('span'):
            if "21" not in span.get('data-id', ""):
                continue
            marker = span.find('i')
            if marker is not None and marker.has_attr('value'):
                s += marker['value']

In [4]:
# Display the string assembled in `s` (the recorded output is empty).
s

''

In [5]:
# Find the inline script that assigns window.__myx, decode the JSON object it
# carries and pretty-print the product entries of the listing.
for script_tag in scripts:
    markup = str(script_tag)
    if not markup.startswith('<script>window.__myx ='):
        continue

    match_res = re.search("{.*}", markup)
    jdata = json.loads(markup[match_res.start():match_res.end()])
    products = jdata['searchData']['results']['products']
    print(json.dumps(products, indent=2))

[
  {
    "landingPageUrl": "sarees/kalini/kalini-ethnic-motif-woven-design-zari-kanjeevaram-saree/22536178/buy",
    "loyaltyPointsEnabled": false,
    "adId": "",
    "isPLA": false,
    "productId": 22536178,
    "product": "KALINI Ethnic Motif Woven Design Zari Kanjeevaram Saree",
    "productName": "KALINI Ethnic Motif Woven Design Zari Kanjeevaram Saree",
    "rating": 3.75630259513855,
    "ratingCount": 2023,
    "isFastFashion": true,
    "futureDiscountedPrice": 0,
    "futureDiscountStartDate": "",
    "discount": 2508,
    "brand": "KALINI",
    "searchImage": "http://assets.myntassets.com/assets/images/22536178/2023/3/28/060630ba-0cb0-46f5-ab19-59a0d7502af41679989291511KALINIMaroonBlueWovenDesignZariPureSilkKanjeevaramSaree1.jpg",
    "effectiveDiscountPercentageAfterTax": 0,
    "effectiveDiscountAmountAfterTax": 0,
    "buyButtonWinnerSkuId": 71472036,
    "buyButtonWinnerSellerPartnerId": 23284,
    "relatedStylesCount": 0,
    "relatedStylesType": "",
    "productVideo

In [10]:
# Decode the window.__myx payload and print every product entry whose text
# representation mentions the id 24973404.
for script_tag in scripts:
    markup = str(script_tag)
    if not markup.startswith('<script>window.__myx ='):
        continue

    match_res = re.search("{.*}", markup)
    jdata = json.loads(match_res.group(0))
    matching_rows = [row for row in jdata['searchData']['results']['products']
                     if "24973404" in str(row)]
    for row in matching_rows:
        print(row)

        

In [3]:
def getProductPage(link):
    """Verbose variant of the page parser, used for debugging.

    Prints the link, the HTTP response object, the matching script markup and
    the decoded JSON while extracting the object assigned to ``window.__myx``.

    NOTE: this redefines ``getProductPage`` and therefore replaces the
    earlier definition of the same name in the running kernel.

    Args:
        link: URL of the page to fetch.

    Returns:
        The decoded JSON value, or ``None`` when no matching script exists or
        the matching script contains no ``{...}`` span.
    """
    print(link)
    # requests has no default timeout; set one so a stalled connection
    # cannot hang the cell.
    d = requests.get(link, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
    print(d)
    soup_res = bs4.BeautifulSoup(d.text, 'html.parser')

    for script in soup_res.find_all('script'):
        if script.string is None:
            continue
        script = str(script)
        if not script.startswith("<script>window.__myx = "):
            continue

        print(script)
        match_res = re.search("{.*}", script)
        if match_res is None:
            # Marker script without a JSON payload: report "nothing found"
            # instead of failing on match_res.start().
            return None
        jdata = json.loads(match_res.group(0))
        print(jdata)
        return jdata
    return None

# Try the parser on a listing URL (brand filter "Harpa", sort=new, page 1)
# rather than a product page; the recorded output shows a `searchData` payload.
getProductPage("https://www.myntra.com/dresses?f=Brand%3AHarpa&sort=new&p=1")

https://www.myntra.com/dresses?f=Brand%3AHarpa&sort=new&p=1
<Response [200]>
<script>window.__myx = {"searchData":{"results":{"responseType":"LIST","totalCount":454,"totalCountRepresentation":"454","hasNextPage":true,"listPageContext":"fashion","totalPLAShown":0,"totalPLACount":0,"isDiscountFallback":false,"filters":{"savedFilters":[],"primaryFilters":[{"id":"size_facet","filterValues":[{"id":"XS","value":"XS","count":115,"meta":"","pLevel":""},{"id":"S","value":"S","count":367,"meta":"","pLevel":""},{"id":"M","value":"M","count":274,"meta":"","pLevel":""},{"id":"L","value":"L","count":260,"meta":"","pLevel":""},{"id":"XL","value":"XL","count":169,"meta":"","pLevel":""}]},{"id":"Color","filterValues":[{"id":"Black","value":"Black","count":116,"meta":"36454f","pLevel":"pop"},{"id":"Navy Blue","value":"Navy Blue","count":53,"meta":"3c4477","pLevel":"pop"},{"id":"Blue","value":"Blue","count":28,"meta":"0074D9","pLevel":"pop"},{"id":"Beige","value":"Beige","count":9,"meta":"e8e6cf","pLevel

{'searchData': {'results': {'responseType': 'LIST',
   'totalCount': 454,
   'totalCountRepresentation': '454',
   'hasNextPage': True,
   'listPageContext': 'fashion',
   'totalPLAShown': 0,
   'totalPLACount': 0,
   'isDiscountFallback': False,
   'filters': {'savedFilters': [],
    'primaryFilters': [{'id': 'size_facet',
      'filterValues': [{'id': 'XS',
        'value': 'XS',
        'count': 115,
        'meta': '',
        'pLevel': ''},
       {'id': 'S', 'value': 'S', 'count': 367, 'meta': '', 'pLevel': ''},
       {'id': 'M', 'value': 'M', 'count': 274, 'meta': '', 'pLevel': ''},
       {'id': 'L', 'value': 'L', 'count': 260, 'meta': '', 'pLevel': ''},
       {'id': 'XL', 'value': 'XL', 'count': 169, 'meta': '', 'pLevel': ''}]},
     {'id': 'Color',
      'filterValues': [{'id': 'Black',
        'value': 'Black',
        'count': 116,
        'meta': '36454f',
        'pLevel': 'pop'},
       {'id': 'Navy Blue',
        'value': 'Navy Blue',
        'count': 53,
        'met

In [3]:
import re
# Print every inline script; for the one that assigns window.__myx, also
# decode and print its JSON payload (kept in `jdata` for the next cells).
for script_tag in scripts:
    if script_tag.string is None:
        continue

    markup = str(script_tag)
    print(markup)
    if not markup.startswith("<script>window.__myx = "):
        continue

    match_res = re.search("{.*}", markup)
    jdata = json.loads(markup[match_res.start():match_res.end()])
    print(jdata)

<script type="application/ld+json">
	        {
	        	"@context" : "https://schema.org",
	            "@type" : "Organization",
	            "Name" : "Myntra",
	            "URL" : "https://www.myntra.com",
	            "contactPoint" : [{
	            	"@type" : "ContactPoint",
	            	"telephone" : "+91-80-61561999",
	            	"contactType" : "Customer Service"
	            }],
	            "logo" : "https://constant.myntassets.com/web/assets/img/logo_2021.png",
	            "sameAs" : [
	            	"https://www.facebook.com/myntra",
	            	"https://twitter.com/myntra",
	            	"https://plus.google.com/+myntra",
	            	"https://www.instagram.com/myntra",
	            	"https://www.youtube.com/user/myntradotcom"
	            ]
	        }
	    </script>
<script type="application/ld+json">
	        {
	        	"@context" : "https://schema.org",
	            "@type" : "Product",
	            "name" : "Melange by Lifestyle Floral Printed Regular Pure Cot

In [4]:
# Inspect the 'pdpData' section of the JSON decoded into `jdata` earlier.
jdata['pdpData']

{'id': 22451388,
 'name': 'Melange by Lifestyle Floral Printed Regular Pure Cotton Kurta with Trousers',
 'mrp': 3999,
 'manufacturer': 'Lifestyle In ternational Private Limited,77 Town Centre,No 3 West Wing, Off, Yamlur PO, Bengaluru, Karnataka 560037',
 'countryOfOrigin': 'India',
 'colours': None,
 'baseColour': 'Beige',
 'brand': {'uidx': '',
  'name': 'Melange by Lifestyle',
  'image': '',
  'bio': '',
  'brandAttributes': None},
 'brandOrderDetails': '',
 'media': {'videos': [],
  'albums': [{'name': 'default',
    'images': [{'src': 'http://assets.myntassets.com/h_($height),q_($qualityPercentage),w_($width)/v1/assets/images/22451388/2023/3/21/99b110fb-f5f5-40ef-833a-aa1ed64de8481679387653734KurtaSets1.jpg',
      'secureSrc': 'https://assets.myntassets.com/h_($height),q_($qualityPercentage),w_($width)/v1/assets/images/22451388/2023/3/21/99b110fb-f5f5-40ef-833a-aa1ed64de8481679387653734KurtaSets1.jpg',
      'host': None,
      'imageURL': 'http://assets.myntassets.com/assets/ima

In [16]:
# Print the image URL of every picture in the album named 'default'.
default_albums = (album for album in jdata['pdpData']['media']['albums']
                  if album["name"] == 'default')
for album in default_albums:
    for img in album["images"]:
        print(img["imageURL"])

http://assets.myntassets.com/assets/images/22451388/2023/3/21/99b110fb-f5f5-40ef-833a-aa1ed64de8481679387653734KurtaSets1.jpg
http://assets.myntassets.com/assets/images/22451388/2023/3/21/a120e877-7d1a-4504-b15c-b152b0f7b6d91679387653768KurtaSets2.jpg
http://assets.myntassets.com/assets/images/22451388/2023/3/21/7512ae5f-f9e5-45fd-bbbf-bd0f4698fc6b1679387653781KurtaSets3.jpg
http://assets.myntassets.com/assets/images/22451388/2023/3/21/79fa00fe-23be-46c2-8693-ea96984bb5921679387653806KurtaSets4.jpg
http://assets.myntassets.com/assets/images/22451388/2023/3/21/23dbc994-d86a-4c45-92cc-815030b3f7d71679387653751KurtaSets5.jpg
http://assets.myntassets.com/assets/images/22451388/2023/3/21/668769d0-b01f-4cb4-b12b-d59e00ab2c181679387653794KurtaSets6.jpg
