In [1]:
import pymongo
import json
import requests
from bs4 import BeautifulSoup
from pprint import pprint
from tqdm import tqdm
from bson import ObjectId

In [2]:
class Product:
    """Plain value object describing one product.

    The constructor copies every argument onto an attribute of the same name,
    in declaration order; nothing is validated or converted.
    """

    def __init__(
        self,
        title,
        images,
        options,
        variants,
        available,
        description,
        sku,
        publishedDate,
        groupId,
        categoryId,
        createdDate,
        lastModifiedDate,
    ):
        # Descriptive data and nested collections (images / options / variants).
        self.title = title
        self.images = images
        self.options = options
        self.variants = variants
        self.available = available
        self.description = description
        self.sku = sku
        self.publishedDate = publishedDate
        # References to the owning group and category.
        self.groupId = groupId
        self.categoryId = categoryId
        # Timestamps.
        self.createdDate = createdDate
        self.lastModifiedDate = lastModifiedDate

class ProductVariant:
    """One purchasable variant of a product.

    Every instance receives its own freshly generated ``ObjectId`` in ``_id``;
    the remaining arguments are stored unchanged under the same names.
    """

    def __init__(
        self,
        name,
        price,
        compareAtPrice,
        available,
        quantity,
        options,
    ):
        # The identifier is generated here, not supplied by the caller.
        self._id = ObjectId()
        self.name = name
        # Pricing.
        self.price = price
        self.compareAtPrice = compareAtPrice
        # Stock.
        self.available = available
        self.quantity = quantity
        # Option values chosen for this variant.
        self.options = options


class Image:
    """A product image: its URL and its position among the product's images."""

    def __init__(self, url, position):
        # Both values are stored exactly as given.
        self.url = url
        self.position = position

class Option:
    """A product option: its name, its values and its position."""

    def __init__(self, name, values, position):
        # All three values are stored exactly as given (``values`` is not copied).
        self.name = name
        self.values = values
        self.position = position

class Group:
    """A top-level product group, identified only by its name."""

    def __init__(self, name):
        self.name = name
        
class Category:
    """A named category together with the id of the group it belongs to."""

    def __init__(self, name, groupId):
        self.name = name
        # Reference to the parent group.
        self.groupId = groupId


In [3]:
# Connect to the local MongoDB server. pymongo is already imported in the
# notebook's first cell, so it is not imported again here.
myclient = pymongo.MongoClient("mongodb://localhost:27017/")

# Database that receives the scraped groups, categories and products.
mydb = myclient["clothes"]

In [4]:
# Collection handles used by the crawl cells below: one collection per entity type.
col_products = mydb["products"]
col_groups = mydb["groups"]
col_categories = mydb["categories"]

In [5]:
# Base URL of the store; site-relative links scraped from its pages are appended to it.
server = 'https://evadeeva.com.vn'

In [6]:
# Fetch the "san-pham" collection page; its menuCollection list holds the
# top-level groups. The URL is built from `server` so the host is defined once.
# The timeout keeps a stalled connection from hanging the notebook.
response = requests.get(server + "/collections/san-pham", timeout=30)
# Stop on an HTTP error instead of parsing an error page as if it were the catalogue.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

In [7]:
# find_all returns every <ul class="menuCollection"> on the page; the crawl
# cell further down reads the group links from the first match.
menuCollection = soup.find_all("ul", class_="menuCollection")
menuCollection

[<ul -="" class="menuCollection" san-pham="">
 <li><a href="/collections/dam">Đầm </a></li>
 <li><a href="/collections/ao">Áo </a></li>
 <li><a href="/collections/chan-vay">Chân váy </a></li>
 <li><a href="/collections/quan">Quần </a></li>
 <li><a href="/collections/jumpsuit">Jumpsuit </a></li>
 <li><a href="/collections/homewear">Homewear </a></li>
 <li><a href="/collections/ao-khoac">Áo khoác </a></li>
 <li><a href="/collections/phu-kien">Phụ kiện </a></li>
 </ul>]

In [8]:
def checkCategoryExist(menuCollection, groupNames=None):
    """Tell whether a menu lists anything other than the top-level groups.

    Args:
        menuCollection: Parsed menu element; must offer ``find_all("li")``
            whose items offer ``find("a")``.
        groupNames: Labels of the top-level groups. Defaults to the group
            names of the store's main menu.

    Returns:
        True if at least one menu entry has a label (link text, stripped) that
        is not in ``groupNames``; False otherwise, including for an empty menu.
    """
    if groupNames is None:
        groupNames = ("Đầm", "Áo", "Chân váy", "Quần", "Jumpsuit", "Homewear", "Áo khoác", "Phụ kiện")
    for item in menuCollection.find_all("li"):
        link = item.find("a")
        # An <li> without a link carries no label to compare; skip it rather
        # than fail on a missing attribute.
        if link is None:
            continue
        if link.text.strip() not in groupNames:
            return True
    return False

In [9]:
def to_dict(obj):
    """Recursively convert an object graph into plain lists and dicts.

    Args:
        obj: Any value.

    Returns:
        * for a list: a new list with every item converted;
        * for a dict: a new dict with the same keys and every value converted,
          so objects stored inside dicts are converted as well;
        * for any other value that has a ``__dict__``: a dict of its instance
          attributes, each converted;
        * anything else: ``obj`` itself, unchanged.
    """
    if isinstance(obj, list):
        return [to_dict(item) for item in obj]
    if isinstance(obj, dict):
        return {key: to_dict(value) for key, value in obj.items()}
    if hasattr(obj, '__dict__'):
        return {key: to_dict(value) for key, value in vars(obj).items()}
    return obj

In [10]:
def processProduct(c_url, groupId, categoryId):
    """Download the products of one collection and insert them into ``col_products``.

    Requests ``{server}{c_url}/products.json`` (page 1, limit 500), turns every
    entry of the returned ``products`` array into a ``Product`` with its
    images, options and variants, and inserts the converted document.

    Args:
        c_url: Site-relative collection path, appended to ``server``.
        groupId: Id stored on every product as ``groupId``.
        categoryId: Id stored on every product as ``categoryId``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond in time.
        KeyError: If an entry lacks one of the expected fields.
    """
    # The timeout keeps one stalled request from hanging the whole crawl.
    p_response = requests.get(f"{server}{c_url}/products.json?page=1&limit=500", timeout=30)
    p_response.raise_for_status()
    # NOTE(review): only the first page is requested; if a collection holds more
    # products than the server returns per page, the rest are skipped - confirm.
    products = p_response.json()['products']
    for product in products:
        title = product['title']
        description = product['body_html']
        createdDate = product['created_at']
        lastModifiedDate = product['updated_at']
        available = product['available']
        publishedDate = product['published_at']
        images = [Image(image['src'], image['position']) for image in product['images']]
        options = [
            Option(option['name'], option['values'], option['position'])
            for option in product['options']
        ]
        variants = []
        for variant in product['variants']:
            # Collect option1, option2, ... up to the first missing or empty one,
            # keyed by their 1-based index as a string.
            v_options = {}
            i = 1
            while variant.get('option' + str(i)):
                v_options[str(i)] = variant['option' + str(i)]
                i += 1
            # The variant's availability is read inline so that it cannot
            # overwrite the product-level `available` captured above.
            variants.append(ProductVariant(
                variant['title'],
                variant['price'],
                variant['compare_at_price'],
                variant['available'],
                variant['inventory_quantity'],
                v_options,
            ))
        # The product SKU is the first variant's SKU; a product without
        # variants gets None instead of raising IndexError.
        sku = product['variants'][0]['sku'] if product['variants'] else None
        productEntity = Product(title, images, options, variants, available, description, sku, publishedDate, groupId, categoryId, createdDate, lastModifiedDate)
        col_products.insert_one(to_dict(productEntity))

In [11]:
# Crawl every top-level group of the site menu: store the group, then either
# its sub-categories (each with its products) or, when the group page shows
# none, the group's own products.
# NOTE: re-running this cell inserts every group, category and product again.
groups = menuCollection[0].find_all("li")
for group in tqdm(groups, desc='Processing groups'):
    groupName = group.find("a").text.strip()
    groupLink = group.find("a")["href"]
    groupEntity = Group(name=groupName)
    # From here on groupEntity is the insert result, whose inserted_id is the
    # new group's id.
    groupEntity = col_groups.insert_one(vars(groupEntity))
    c_groupId = groupEntity.inserted_id

    # The timeout keeps one stalled page from hanging the crawl; an HTTP error
    # stops it instead of an error page being parsed as a menu.
    g_response = requests.get(server + groupLink, timeout=30)
    g_response.raise_for_status()
    g_soup = BeautifulSoup(g_response.content, "html.parser")
    g_menuCollection = g_soup.find_all("ul", class_="menuCollection")
    # A group page without any menu is handled like a group without
    # sub-categories instead of failing on g_menuCollection[0].
    if g_menuCollection and checkCategoryExist(g_menuCollection[0]):
        categories = g_menuCollection[0].find_all("li")
        for category in categories:
            c_name = category.find("a").text.strip()
            # "tất cả" is Vietnamese for "all"; presumably an aggregate link
            # rather than a real category, so it is skipped.
            if c_name.lower().startswith("tất cả"):
                continue
            c_link = category.find("a")["href"]
            categoryEntity = Category(name=c_name, groupId=c_groupId)
            categoryEntity = col_categories.insert_one(vars(categoryEntity))
            processProduct(c_link, c_groupId, categoryEntity.inserted_id)
    else:
        # No sub-categories: the products hang directly off the group.
        processProduct(groupLink, c_groupId, None)
        
    

Processing groups:   0%|          | 0/8 [00:00<?, ?it/s]

Processing groups: 100%|██████████| 8/8 [00:27<00:00,  3.46s/it]
