In [1]:
import pymongo
import json
import requests
from bs4 import BeautifulSoup
from pprint import pprint
from tqdm import tqdm
from bson import ObjectId

In [2]:
class Product:
    """A catalogue product together with its images, options and variants.

    A fresh ``ObjectId`` is generated for ``_id`` at construction time.
    ``groupId`` and ``categoryId`` are stored in their string form; every
    other argument is stored unchanged under the attribute of the same name.
    """

    def __init__(self, title, images, options, variants, available, description, sku, publishedDate, groupId, categoryId, createdDate, lastModifiedDate):
        self._id = ObjectId()
        # Stored exactly as passed in.
        self.title, self.images, self.options, self.variants = title, images, options, variants
        self.available, self.description, self.sku = available, description, sku
        self.publishedDate = publishedDate
        # Owning group / category are kept as strings rather than raw ids.
        self.groupId, self.categoryId = str(groupId), str(categoryId)
        self.createdDate, self.lastModifiedDate = createdDate, lastModifiedDate

class ProductVariant:
    """One purchasable variant of a product.

    Args:
        name: Display name of the variant.
        price: Selling price. Numbers and numeric strings (including decimal
            strings such as ``"199000.00"``) are accepted and stored as ``int``,
            truncating any fractional part.
        compareAtPrice: Compare-at price, converted like ``price``. ``None``
            or an empty string means "no compare-at price" and is stored as
            ``None``.
        available: Availability flag, stored as given.
        quantity: Stock quantity, stored as given.
        options: Option values of this variant, stored as given.

    Raises:
        TypeError: If ``price`` is ``None`` or another non-numeric type.
        ValueError: If a price string is not numeric.
    """

    def __init__(self, name, price, compareAtPrice, available, quantity, options):
        self._id = ObjectId()
        self.name = name
        self.price = self._to_int(price)
        # A missing compare-at price must not crash the import: int(None)
        # raises TypeError, so keep the absence explicit as None.
        if compareAtPrice is None or compareAtPrice == "":
            self.compareAtPrice = None
        else:
            self.compareAtPrice = self._to_int(compareAtPrice)
        self.available = available
        self.quantity = quantity
        self.options = options

    @staticmethod
    def _to_int(amount):
        """Convert a price-like value to ``int``, accepting decimal strings."""
        try:
            return int(amount)
        except ValueError:
            # int() rejects strings such as "199000.00"; go through float.
            return int(float(amount))

class Image:
    """A product image: its URL and its position among the product's images."""

    def __init__(self, url, position):
        self.url, self.position = url, position

class Option:
    """A product option: its name, its possible values and its position."""

    def __init__(self, name, values, position):
        self.name, self.values, self.position = name, values, position

class Group:
    """A top-level product group; ``_id`` is a freshly generated ``ObjectId``."""

    def __init__(self, name):
        self._id, self.name = ObjectId(), name
        
class Category:
    """A product category belonging to a group.

    ``_id`` is a freshly generated ``ObjectId``; ``groupId`` is stored in its
    string form.
    """

    def __init__(self, name, groupId):
        self._id, self.name = ObjectId(), name
        self.groupId = str(groupId)


In [3]:
import pymongo

# Client for the MongoDB server on localhost, and the "clothes" database on it.
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient.get_database("clothes")

In [4]:
# Handles to the three collections this notebook writes to.
col_products, col_groups, col_categories = (
    mydb[collection_name]
    for collection_name in ("products", "groups", "categories")
)

In [5]:
# Product groups keyed by an ASCII slug; each Group carries its display name.
groups = {
    slug: Group(label)
    for slug, label in [
        ("ao", "Áo"),
        ("quan", "Quần"),
        ("dam", "Đầm"),
        ("chan_vay", "Chân váy"),
    ]
}

# Categories keyed by slug. Each row is (slug, display name, slug of the
# owning group); the Category is linked to that group's _id.
categories = {
    slug: Category(label, groups[group_slug]._id)
    for slug, label, group_slug in [
        ("ao_2_day", "Áo 2 dây", "ao"),
        ("ao_ba_lo", "Áo ba lỗ", "ao"),
        ("ao_da", "Áo da", "ao"),
        ("ao_dai", "Áo dài", "ao"),
        ("ao_khoac_the_thao", "Áo khoác thể thao", "ao"),
        ("ao_len", "Áo len", "ao"),
        ("ao_mangto", "Áo măngto", "ao"),
        ("ao_so_mi", "Áo sơ mi", "ao"),
        ("ao_thun", "Áo thun", "ao"),
        ("ao_tre_vai", "Áo trễ vai", "ao"),
        ("ao_vest", "Áo vest", "ao"),
        ("do_ngu_do_mac_nha", "Đồ ngủ đồ mặc nhà", "ao"),
        ("chan_vay_dai", "Chân váy dài", "chan_vay"),
        ("chan_vay_ngan", "Chân váy ngắn", "chan_vay"),
        ("dam_maxi", "Đầm maxi", "dam"),
        ("dam_om", "Đầm ôm", "dam"),
        ("quan_dai", "Quần dài", "quan"),
        ("quan_jean", "Quần jean", "quan"),
        ("quan_short", "Quần short", "quan"),
    ]
}

In [6]:
# Persist every group, then every category, one document per insert.
for collection, records in ((col_groups, groups), (col_categories, categories)):
    for record in records.values():
        collection.insert_one(vars(record))

In [7]:
def to_dict(obj):
    """Recursively convert ``obj`` into plain lists, dicts and leaf values.

    Args:
        obj: A list, a dict, an object exposing ``__dict__``, or any other
            value.

    Returns:
        - list: a new list with every item converted.
        - dict: a new dict with the same keys and every value converted.
        - object with ``__dict__``: a dict of its instance attributes, each
          converted.
        - anything else: ``obj`` itself, unchanged.
    """
    if isinstance(obj, list):
        return [to_dict(item) for item in obj]
    # Dicts are checked before ``__dict__`` so that objects stored as dict
    # values are converted too, and so that dict subclasses which also carry
    # a ``__dict__`` keep their items instead of being replaced by it.
    if isinstance(obj, dict):
        return {key: to_dict(value) for key, value in obj.items()}
    if hasattr(obj, '__dict__'):
        return {key: to_dict(value) for key, value in vars(obj).items()}
    return obj
    
def _build_variant(variant):
    """Convert one raw variant dict from the feed into a ``ProductVariant``."""
    # Collect option1, option2, ... up to the first missing or empty one,
    # keyed by their 1-based position as a string.
    v_options = {}
    i = 1
    while variant.get('option' + str(i)):
        v_options[str(i)] = variant['option' + str(i)]
        i += 1
    return ProductVariant(
        variant['title'],
        variant['price'],
        variant['compare_at_price'],
        variant['available'],
        # Fall back to a placeholder stock of 10 when the feed omits it.
        variant.get('inventory_quantity', 10),
        v_options,
    )


def _build_product(raw, groupId, categoryId):
    """Convert one raw product dict from the feed into a ``Product``."""
    images = [Image(image['src'], image['position']) for image in raw['images']]
    options = [Option(option['name'], option['values'], option['position']) for option in raw['options']]
    raw_variants = raw['variants']
    variants = [_build_variant(variant) for variant in raw_variants]
    # The product SKU is taken from its first variant; a product without any
    # variants gets no SKU instead of raising IndexError.
    sku = raw_variants[0]['sku'] if raw_variants else None
    return Product(
        raw['title'],
        images,
        options,
        variants,
        # Product-level flag only (default True when absent). Variant
        # availability lives on each ProductVariant and must not leak here.
        raw.get('available', True),
        raw['body_html'],
        sku,
        raw['published_at'],
        groupId,
        categoryId,
        raw['created_at'],
        raw['updated_at'],
    )


def processProduct(urls, groupId, categoryId, timeout=30):
    """Fetch product feeds and insert every product into ``col_products``.

    For each URL, ``{url}?page=1&limit=500`` is requested, the ``products``
    list of the JSON response is converted to ``Product`` objects and each
    one is inserted as a plain dict.

    NOTE(review): only page 1 is requested, so a collection larger than the
    page size the server actually honours is presumably truncated.

    Args:
        urls: Iterable of ``.../products.json`` feed URLs.
        groupId: Id of the group assigned to every imported product.
        categoryId: Id of the category assigned to every imported product.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        requests.HTTPError: If a feed answers with an error status.
        requests.Timeout: If a feed does not answer within ``timeout``.
        KeyError: If a feed entry lacks one of the required fields.
    """
    for url in urls:
        p_response = requests.get(f'{url}?page=1&limit=500', timeout=timeout)
        # Fail on the HTTP error itself rather than on a confusing JSON or
        # KeyError caused by parsing an error page.
        p_response.raise_for_status()
        products = p_response.json()['products']
        for raw_product in tqdm(products):
            product = _build_product(raw_product, groupId, categoryId)
            col_products.insert_one(to_dict(product))

# Ao 2 day

In [8]:
# Product-feed URLs to import under group "ao", category "ao_2_day".
urls = [
    "https://www.pantio.vn/collections/ao-2-day/products.json",
    "https://callia.vn/collections/ao-hai-day/products.json",
    "https://maybi.com/collections/ao-hai-day/products.json"
]
processProduct(urls, groups["ao"]._id, categories["ao_2_day"]._id)

100%|██████████| 10/10 [00:00<00:00, 251.11it/s]
100%|██████████| 2/2 [00:00<00:00, 662.82it/s]
100%|██████████| 12/12 [00:00<00:00, 1033.67it/s]


# Ao ba lo

In [9]:
# Product-feed URLs to import under group "ao", category "ao_ba_lo".
urls = [
    "https://ash.vn/collections/ao-ba-l%E1%BB%97-bra-n%E1%BB%AF/products.json",
    "https://salehub.com.vn/collections/ao-ba-lo-nu/products.json",
    "https://www.maisononline.vn/collections/ao-ba-lo-nu/products.json"
]
processProduct(urls, groups["ao"]._id, categories["ao_ba_lo"]._id)

100%|██████████| 3/3 [00:00<00:00, 726.50it/s]
100%|██████████| 24/24 [00:00<00:00, 963.31it/s]
100%|██████████| 50/50 [00:00<00:00, 877.86it/s]


# Ao da

In [10]:
# Product-feed URLs to import under group "ao", category "ao_da".
urls = [
    "https://lamerfashion.com/collections/ao-khoac-da/products.json",
    "https://davinet.vn/collections/ao-khoac-da-nu/products.json",
    "https://shop.harley-alnaboodah.vn/collections/ao-khoac-da-nu/products.json",
    "https://dragonmark.vn/collections/ao-da-nu/products.json",
    
]
processProduct(urls, groups["ao"]._id, categories["ao_da"]._id)

100%|██████████| 32/32 [00:00<00:00, 359.02it/s]
100%|██████████| 6/6 [00:00<00:00, 608.47it/s]
100%|██████████| 17/17 [00:00<00:00, 886.39it/s]
100%|██████████| 31/31 [00:00<00:00, 789.10it/s]


# Ao dai

In [11]:
# Product-feed URLs to import under group "ao", category "ao_dai".
urls = [
    "https://econice.vn/collections/ao-dai/products.json",
    "https://vania.com.vn/collections/ao-dai/products.json",
    "https://nemshop.vn/collections/ao-dai/products.json",
    "https://www.pantio.vn/collections/ao-dai/products.json",
    "https://ceilio.vn/collections/ao-dai/products.json",
    "https://sumirestore.com/collections/ao-dai/products.json",
    "https://hongvic.vn/collections/ao-dai/products.json"
    
]
processProduct(urls, groups["ao"]._id, categories["ao_dai"]._id)

100%|██████████| 50/50 [00:00<00:00, 1044.97it/s]
100%|██████████| 50/50 [00:00<00:00, 1140.74it/s]
100%|██████████| 34/34 [00:00<00:00, 1046.97it/s]
100%|██████████| 29/29 [00:00<00:00, 832.43it/s]
100%|██████████| 50/50 [00:00<00:00, 1088.30it/s]
100%|██████████| 50/50 [00:00<00:00, 950.45it/s]
100%|██████████| 17/17 [00:00<00:00, 813.99it/s]


# Ao khoac the thao

In [12]:
# Product-feed URLs to import under group "ao", category "ao_khoac_the_thao".
urls = [
    "https://donglucsport.vn/collections/ao-khoac-nu/products.json",
    "https://fitme.vn/collections/ao-khoac-the-thao-nu/products.json",
    
]
processProduct(urls, groups["ao"]._id, categories["ao_khoac_the_thao"]._id)

100%|██████████| 19/19 [00:00<00:00, 898.22it/s]
100%|██████████| 9/9 [00:00<00:00, 911.19it/s]


# Ao len

In [13]:
# Product-feed URLs to import under group "ao", category "ao_len".
urls = [
    "https://www.pantio.vn/collections/ao-len/products.json",
    "https://lamerfashion.com/collections/ao-len/products.json",
    "https://nemshop.vn/collections/ao-len/products.json",
    "https://johnhenry.vn/collections/ao-len-nu-freelancer/products.json"
]
processProduct(urls, groups["ao"]._id, categories["ao_len"]._id)

100%|██████████| 50/50 [00:00<00:00, 936.28it/s]
100%|██████████| 50/50 [00:00<00:00, 984.64it/s]
100%|██████████| 13/13 [00:00<00:00, 1079.83it/s]
100%|██████████| 5/5 [00:00<00:00, 829.08it/s]


# Ao mangto

In [14]:
# Product-feed URLs to import under group "ao", category "ao_mangto".
urls = [
    "https://www.pantio.vn/collections/ao-mang-to/products.json",
    "https://nemshop.vn/collections/mang-to-1/products.json",
]
processProduct(urls, groups["ao"]._id, categories["ao_mangto"]._id)

100%|██████████| 29/29 [00:00<00:00, 992.86it/s]
100%|██████████| 34/34 [00:00<00:00, 1078.37it/s]


# Ao so mi

In [15]:
# Product-feed URLs to import under group "ao", category "ao_so_mi".
urls = [
    "https://www.pantio.vn/collections/ao-so-mi/products.json",
    "https://evadeeva.com.vn/collections/ao-so-mi/products.json",
    "https://format.com.vn/collections/ao-so-mi-nu/products.json",
    "https://www.maisononline.vn/collections/ao-so-mi-nu-1/products.json"
]
processProduct(urls, groups["ao"]._id, categories["ao_so_mi"]._id)

100%|██████████| 50/50 [00:00<00:00, 961.11it/s]
100%|██████████| 50/50 [00:00<00:00, 1097.22it/s]
100%|██████████| 40/40 [00:00<00:00, 1059.35it/s]
100%|██████████| 50/50 [00:00<00:00, 801.43it/s]


# Ao thun

In [16]:
# Product-feed URLs to import under group "ao", category "ao_thun".
urls = [
    "https://ninomaxxconcept.com/collections/ao-thun-danh-cho-nu/products.json",
    "https://www.maisononline.vn/collections/ao-thun-nu/products.json",
    "https://gavani.vn/collections/ao-thun-nu/products.json",
    "https://johnhenry.vn/collections/ao-thun-nu-freelancer/products.json",
    "https://marc.com.vn/collections/ao-thun-nu/products.json"
]
processProduct(urls, groups["ao"]._id, categories["ao_thun"]._id)

100%|██████████| 50/50 [00:00<00:00, 750.27it/s]
100%|██████████| 50/50 [00:00<00:00, 890.36it/s]
100%|██████████| 27/27 [00:00<00:00, 957.32it/s]
100%|██████████| 38/38 [00:00<00:00, 937.37it/s]
100%|██████████| 21/21 [00:00<00:00, 713.96it/s]


# Ao tre vai

In [17]:
# Product-feed URLs to import under group "ao", category "ao_tre_vai".
urls = [
    "https://cheapy.vn/collections/ao-tre-vai/products.json",
    "https://www.maisononline.vn/collections/ao-tre-vai-nu/products.json",
]
processProduct(urls, groups["ao"]._id, categories["ao_tre_vai"]._id)

100%|██████████| 41/41 [00:00<00:00, 1025.56it/s]
100%|██████████| 8/8 [00:00<00:00, 829.98it/s]


# Ao vest

In [18]:
# Product-feed URLs to import under group "ao", category "ao_vest".
urls = [
    "https://www.pantio.vn/collections/ao-vest/products.json",
    "https://nemshop.vn/collections/ao-vest/products.json",
    "https://lamerfashion.com/collections/ao-vest/products.json",
    "https://chicland.vn/collections/ao-vest/products.json"
]
processProduct(urls, groups["ao"]._id, categories["ao_vest"]._id)

100%|██████████| 41/41 [00:00<00:00, 832.03it/s]
100%|██████████| 44/44 [00:00<00:00, 992.55it/s]
100%|██████████| 44/44 [00:00<00:00, 871.21it/s]
100%|██████████| 44/44 [00:00<00:00, 798.81it/s]


# Do ngu do mac nha

In [19]:
# Product-feed URLs to import under group "ao", category "do_ngu_do_mac_nha".
urls = [
    "https://wannabe.com.vn/collections/do-ngu-wannabe/products.json",
    "https://sunfly.com.vn/collections/bo-mac-nha/products.json",
    "https://cardina.vn/collections/do-bo-nu-mac-nha/products.json",
    "https://www.pantio.vn/collections/do-ngu-do-mac-nha/products.json"
]
processProduct(urls, groups["ao"]._id, categories["do_ngu_do_mac_nha"]._id)

100%|██████████| 50/50 [00:00<00:00, 900.95it/s]
100%|██████████| 50/50 [00:00<00:00, 879.62it/s]
100%|██████████| 50/50 [00:00<00:00, 831.66it/s]
100%|██████████| 16/16 [00:00<00:00, 868.48it/s]


# Chan vay dai

In [20]:
# Product-feed URLs to import under group "chan_vay", category "chan_vay_dai".
urls = [
    "https://www.pantio.vn/collections/chan-vay-dai/products.json",
]
processProduct(urls, groups["chan_vay"]._id, categories["chan_vay_dai"]._id)

100%|██████████| 50/50 [00:00<00:00, 1047.94it/s]


# Chan vay ngan

In [21]:
# Product-feed URLs to import under group "chan_vay", category "chan_vay_ngan".
urls = [
    "https://www.pantio.vn/collections/chan-vay-ngan/products.json",
    "https://somehow.vn/collections/chan-vay-ngan/products.json"
]
processProduct(urls, groups["chan_vay"]._id, categories["chan_vay_ngan"]._id)

100%|██████████| 50/50 [00:00<00:00, 853.76it/s]
100%|██████████| 24/24 [00:00<00:00, 866.49it/s]


# Dam maxi

In [22]:
# Product-feed URLs to import under group "dam", category "dam_maxi".
urls = [
    "https://lamerfashion.com/collections/dam-maxi/products.json",
    "https://nemshop.vn/collections/dam-maxi/products.json",
    "https://evadeeva.com.vn/collections/dam-maxi/products.json",
]
processProduct(urls, groups["dam"]._id, categories["dam_maxi"]._id)

100%|██████████| 50/50 [00:00<00:00, 911.68it/s]
100%|██████████| 28/28 [00:00<00:00, 1034.68it/s]
100%|██████████| 4/4 [00:00<00:00, 851.07it/s]


# Dam om

In [23]:
# Product-feed URLs to import under group "dam", category "dam_om".
urls = [
    "https://www.maisononline.vn/collections/dam-om-body/products.json",
    "https://maybi.com/collections/dam-om/products.json",
]
processProduct(urls, groups["dam"]._id, categories["dam_om"]._id)

100%|██████████| 22/22 [00:00<00:00, 1001.74it/s]
100%|██████████| 24/24 [00:00<00:00, 862.46it/s]


# Quan dai

In [24]:
# Product-feed URLs to import under group "quan", category "quan_dai".
urls = [
    "https://www.pantio.vn/collections/quan-dai/products.json",
    "https://coupletx.com/collections/quan-dai-nu/products.json",
    "https://ninomaxxconcept.com/collections/quan-dai-danh-cho-nu/products.json",
    "https://salehub.com.vn/collections/quan-dai-nu/products.json",
]
processProduct(urls, groups["quan"]._id, categories["quan_dai"]._id)

100%|██████████| 50/50 [00:00<00:00, 944.22it/s]
100%|██████████| 4/4 [00:00<00:00, 667.59it/s]
100%|██████████| 50/50 [00:00<00:00, 971.55it/s]
100%|██████████| 37/37 [00:00<00:00, 1063.38it/s]


# Quan jean

In [25]:
# Product-feed URLs to import under group "quan", category "quan_jean".
urls = [
    "https://www.pantio.vn/collections/quan-bo/products.json",
    "https://genviet.com/collections/quan-dai-nu-jeans/products.json",
    "https://johnhenry.vn/collections/quan-jeans-nu-freelancer/products.json",
    # NOTE(review): this exact feed URL is also in the list imported as
    # "quan_dai", so its products end up stored under both categories.
    # Confirm whether a jeans-specific collection was intended here.
    "https://salehub.com.vn/collections/quan-dai-nu/products.json",
]
processProduct(urls, groups["quan"]._id, categories["quan_jean"]._id)

100%|██████████| 24/24 [00:00<00:00, 849.34it/s]
100%|██████████| 50/50 [00:00<00:00, 835.34it/s]
100%|██████████| 25/25 [00:00<00:00, 973.98it/s]
100%|██████████| 37/37 [00:00<00:00, 972.79it/s]


# Quan short

In [26]:
# Product-feed URLs to import under group "quan", category "quan_short".
urls = [
    "https://www.pantio.vn/collections/quan-short/products.json",
    "https://ninomaxxconcept.com/collections/quan-short-danh-cho-nu/products.json",
    "https://genviet.com/collections/quan-short-nu/products.json",
    "https://winmaxx.com.vn/collections/quan-short-nu/products.json",
    "https://livansport.com/collections/quan-short-nu/products.json",
]
processProduct(urls, groups["quan"]._id, categories["quan_short"]._id)

100%|██████████| 50/50 [00:00<00:00, 910.13it/s]
100%|██████████| 39/39 [00:00<00:00, 843.44it/s]
100%|██████████| 17/17 [00:00<00:00, 783.29it/s]
100%|██████████| 17/17 [00:00<00:00, 630.48it/s]
100%|██████████| 10/10 [00:00<00:00, 848.91it/s]
