In [None]:
import pandas as pd
import sqlite3
import time
import requests
import concurrent.futures

import warnings
warnings.filterwarnings('ignore')



In [None]:
# Root URL of the Ajio site. It is stored as the "Website" value of every row and
# is the prefix onto which product-page and product-API paths are appended.
base_url = "https://www.ajio.com/"

URLs for AJAX requests

In [None]:
# url1 = "https://www.ajio.com/api/p/462730549_black"
# url2 = "https://www.ajio.com/api/category/8302?fields=SITE&currentPage=0&pageSize=100&format=json&query=%3Arelevance&sortBy=relevance&curated=true&curatedid=men-clothing&gridColumns=3&facets=&advfilter=true"
# url3 = "https://www.ajio.com/api/search?fields=SITE&currentPage=0&pageSize=100&format=json&query={category}%3Arelevance&sortBy=relevance&text={category}%20clothing&gridColumns=3&advfilter=true&platform=site"

# Categories

In [None]:
# Load the category names to scrape: one entry per row of the "Type" column
# in listOfTypes.csv.
types_df = pd.read_csv("listOfTypes.csv")
categories = types_df["Type"].tolist()
categories

# Category-wise Total Pages

In [None]:
# Request page 0 of every category (100 products per page) and record the
# number of pages the API reports. page_nums[i] belongs to categories[i].
page_nums = []

# One session for the whole loop: it is closed when the block exits, instead of
# opening (and never closing) a fresh session for every category.
with requests.Session() as session:
    for category in categories:
        if category == 'men-clothing':
            # 'men-clothing' is read from the curated category endpoint, not from search
            url = "https://www.ajio.com/api/category/8302?fields=SITE&currentPage=0&pageSize=100&format=json&query=%3Arelevance&sortBy=relevance&curated=true&curatedid=men-clothing&gridColumns=3&facets=&advfilter=true"
        else:
            url = f"https://www.ajio.com/api/search?fields=SITE&currentPage=0&pageSize=100&format=json&query={category}%3Arelevance&sortBy=relevance&text={category}%20clothing&gridColumns=3&advfilter=true&platform=site"

        # Without a timeout a single stalled request would hang the notebook indefinitely
        resp = session.get(url, timeout=30)
        # Surface HTTP errors directly rather than as a KeyError on the lookup below
        resp.raise_for_status()

        response = resp.json()
        total = response['pagination']['totalPages']
        print(category, total)
        page_nums.append(total)

# Extract Product Data

Main data extraction method

In [None]:
# Errors that mean "this field is absent, or shaped differently, in the payload".
# Anything else (KeyboardInterrupt, programming errors) is allowed to propagate.
_MISSING_FIELD_ERRORS = (KeyError, IndexError, TypeError)


def _dig(payload, *path):
    """Follow ``path`` (dict keys / list indexes) into ``payload``; None if any step fails."""
    try:
        for step in path:
            payload = payload[step]
    except _MISSING_FIELD_ERRORS:
        return None
    return payload


def _drop_prefix(text, length):
    """Return ``text`` minus its first ``length`` characters, or None if it is not a string."""
    return text[length:] if isinstance(text, str) else None


def extract(item, timeout=30):
    """
    Fetch one product's JSON and append a row of its fields to the global ``data`` list.

    Parameters
    ----------
    item : str
        URL of the product API endpoint to request.
    timeout : float, optional
        Seconds to wait for the server before the request is abandoned (default 30).

    Returns
    -------
    None
        The row is appended to ``data``; nothing is returned.

    Row layout (a field that cannot be read from the payload becomes None):

       • Website: ``base_url``
       • Product Link: ``base_url`` + ``baseOptions[0].options[0].url`` without its first character
       • Product Name: ``name``
       • Product Brand: ``brandName``
       • Product Category: ``brickName``
       • Sizes Available: comma-joined ``variantOptionQualifiers[4].value`` of every variant option
       • Price: ``price.formattedValue`` without its first four characters (kept as a string)
       • MRP: ``wasPriceData.formattedValue`` without its first four characters (kept as a string)
       • Gender: ``brickCategory``
       • Description: ``brickSubCategory``
       • Primary Image Link: ``baseOptions[0].options[0].modelImage.url``
       • Secondary Image Links: comma-joined URLs of images whose format is 'product'
         and whose galleryIndex is not 0; None when there are none

    Network and JSON-decoding errors are not caught here and propagate to the caller.
    """

    # Close the session when done; a timeout prevents a worker from hanging forever
    with requests.Session() as session:
        product = session.get(item, timeout=timeout).json()

    website = base_url

    # Product Link: the payload URL starts with a character that is dropped before joining
    # (presumably a leading "/", since base_url already ends with one - confirm)
    relative_url = _dig(product, 'baseOptions', 0, 'options', 0, 'url')
    product_link = (base_url + relative_url[1:]) if isinstance(relative_url, str) else None

    product_name = _dig(product, 'name')
    product_brand = _dig(product, 'brandName')
    product_category = _dig(product, 'brickName')

    # Sizes Available: one value per variant; a single malformed variant voids the field
    try:
        sizes = ",".join(
            variant['variantOptionQualifiers'][4]['value']
            for variant in product['variantOptions']
        )
    except _MISSING_FIELD_ERRORS:
        sizes = None

    # Price / MRP: the first four characters of the formatted value are discarded
    # (presumably a currency prefix such as "Rs. " - confirm against a live payload)
    price = _drop_prefix(_dig(product, 'price', 'formattedValue'), 4)
    mrp = _drop_prefix(_dig(product, 'wasPriceData', 'formattedValue'), 4)

    gender = _dig(product, 'brickCategory')
    description = _dig(product, 'brickSubCategory')

    p_img_link = _dig(product, 'baseOptions', 0, 'options', 0, 'modelImage', 'url')

    # Secondary Image Links: an empty result is stored as None rather than ""
    try:
        s_img_links = ",".join(
            image['url']
            for image in product['images']
            if image['format'] == 'product' and image['galleryIndex'] != 0
        ) or None
    except _MISSING_FIELD_ERRORS:
        s_img_links = None

    final_results = [website, product_link, product_name, product_brand, product_category, sizes,
                     price, mrp, gender, description, p_img_link, s_img_links]

    # ``data`` is the module-level list of rows collected for the current page
    data.append(final_results)
    

In [None]:
def get_page(url, timeout=30):
    """
    Fetch one listing page and return the product-API URL of every product on it.

    Parameters
    ----------
    url : str
        Listing (category or search) API URL for a single page.
    timeout : float, optional
        Seconds to wait for the server before the request is abandoned (default 30).

    Returns
    -------
    list of str
        ``base_url + "api/p/" + colorGroup`` for each entry of the response's
        ``products`` list, in response order.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    KeyError
        If the response lacks ``products`` or a product lacks
        ``fnlColorVariantData.colorGroup``.
    """
    # The session is closed on exit instead of being left open after every call
    with requests.Session() as session:
        resp = session.get(url, timeout=timeout)
        resp.raise_for_status()
        listing = resp.json()

    return [
        base_url + "api/p/" + product['fnlColorVariantData']['colorGroup']
        for product in listing['products']
    ]

# Create Base Dataframe

Create this CSV file only when starting a scrape from the beginning: running the cell below overwrites any existing file with just the header row.

In [None]:
# Column names of the output CSV, in the order in which each scraped row lists its values
columns = [
    "Website",
    "Product_Link",
    "Product_Name",
    "Product_Brand",
    "Product_Category",
    "Size_Avail",
    "Price",
    "MRP",
    "Gender",
    "Description",
    "Primary_Image_Links",
    "Secondary_Image_Links",
]

# Write a header-only CSV that later cells append rows to
base_df = pd.DataFrame(columns=columns)
base_df.to_csv("myntra.csv", index=False)

# Driver Function

In [None]:
# Main scraping loop: walk every category page by page, fetch the products of
# each page in parallel, and append that page's rows to myntra.csv.

page_lim = 10000  # upper bound on the number of pages scraped per category

counter = 0  # running total of product rows actually scraped

data = []  # rows of the page being processed; extract() appends to this global

# NOTE(review): `headers` and `s` are created here, but get_page() and extract()
# open their own sessions, so neither is used by any request in this notebook.
headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'}
s = requests.Session()

start = time.time()

for category, category_name in enumerate(categories):

    # page_nums[category] is the page count recorded earlier for this category
    for page_num in range(0, min(page_lim, page_nums[category])):

        # Pick the endpoint by category name (the same rule used when the page
        # counts were collected) instead of assuming 'men-clothing' is at index 0.
        if category_name == 'men-clothing':
            url = f"https://www.ajio.com/api/category/8302?fields=SITE&currentPage={page_num}&pageSize=100&format=json&query=%3Arelevance&sortBy=relevance&curated=true&curatedid=men-clothing&gridColumns=3&facets=&advfilter=true"
        else:
            # NOTE(review): the page counts were requested with text={category}%20clothing,
            # while this URL sends text={category}; confirm both yield the same pagination.
            url = f"https://www.ajio.com/api/search?fields=SITE&currentPage={page_num}&pageSize=100&format=json&query={category_name}%3Arelevance&sortBy=relevance&text={category_name}&gridColumns=3&advfilter=true&platform=site"

        try:
            urls = get_page(url)
        except Exception as exc:
            # Abandon the rest of this category but carry on with the next one.
            # `Exception` (not a bare except) lets Ctrl-C still stop the run.
            print("Error occured for category: " + categories[category], "-", repr(exc))
            break

        # Extracting with Multithreading; leaving the `with` block waits for all workers
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(extract, product_url) for product_url in urls]

        # Worker exceptions are stored on the futures; count them so failures are visible
        failed = sum(1 for future in futures if future.exception() is not None)

        # Count the rows that were really collected, not the URLs that were attempted
        counter += len(data)

        print("Total Amount scraped :- ", counter, sep="  ")
        print("Current Category:- ", categories[category])
        print("Page No:- ", page_num)
        if failed:
            print("Failed products on this page:- ", failed)
        print("\nTime Elapsed:- ", round((time.time() - start)/60, 2), "mins\n")
        print()

        # Adding to CSV for every page result
        temp_df = pd.DataFrame(data, columns=columns)
        temp_df.to_csv('myntra.csv', mode='a', header=False, index=False)

        # Re-initializing list for next page
        data = []

end = time.time()
print("\nTotal Time Elapsed:- ", round((end - start)/60, 2), "mins\n")

In [None]:
# Load the scraped rows back from the CSV file into a DataFrame
df = pd.read_csv("myntra.csv")

In [None]:
# Discard repeated rows and rows with any missing value, then add an
# Affiliate_Link column in which every entry is None.
df = df.drop_duplicates().dropna()
df["Affiliate_Link"] = [None] * len(df)

In [None]:
# Show the DataFrame as the cell's output
df

In [None]:
# Open the SQLite database file (sqlite3 creates it if it does not exist yet)
# and obtain a cursor for executing SQL statements.
conn = sqlite3.connect('AjioProductsData.db')
c = conn.cursor()

In [None]:
# Create the product table. IF NOT EXISTS makes the cell safe to re-run: a plain
# CREATE TABLE raises sqlite3.OperationalError as soon as the table already exists.
c.execute('''
    CREATE TABLE IF NOT EXISTS product_details (
        Website               varchar(40) NOT NULL,
        Product_Link          TEXT PRIMARY KEY,
        Product_Name          varchar(50) NOT NULL,
        Product_Brand         varchar(50) NOT NULL,
        Product_Category      varchar(50),
        Size_Avail            varchar(20) NOT NULL,
        Price                 int NOT NULL,
        MRP                   int NOT NULL,
        Gender                varchar(15) NOT NULL,
        Description           TEXT NOT NULL,
        Primary_Image_Links   TEXT NOT NULL,
        Secondary_Image_Links TEXT NOT NULL,
        Affiliate_Link        TEXT NOT NULL
    )
''')
conn.commit()

In [None]:
# Write the DataFrame into the product_details table, without the index column.
# NOTE(review): per the pandas documentation, if_exists='replace' drops an existing
# table before inserting, so column types and constraints declared by an earlier
# CREATE TABLE would not be kept - confirm this is intended.
df.to_sql('product_details', conn, if_exists='replace', index = False)

In [None]:
# Print 20 products: run the query and print at most the first 20 rows it returns
for product_row in c.execute('''  
SELECT * FROM product_details
          ''').fetchmany(size=20):
    print (product_row)