In [1]:
import requests
import pandas as pd
import numpy
from bs4 import BeautifulSoup
import re
import json
import copy

In [2]:
def post_request(category_id,size,url,timeout=30):
    """POST a catalogue query to ``url`` and return the decoded JSON body.

    Parameters
    ----------
    category_id : value sent as the ``categoryId`` form field.
    size : value sent as the ``size`` form field.
    url : endpoint that receives the form-encoded POST.
    timeout : seconds passed to ``requests.post``. Without a timeout a stalled
        connection blocks the notebook indefinitely; with it, ``requests``
        raises a timeout exception instead.

    Returns
    -------
    dict
        The parsed JSON when the response status is 200, otherwise an empty
        dict (the status code is printed in that case).
    """
    payload = {'categoryId':category_id,
               'customerGroupId':'0',
               'direction':'asc',
               'from':'0',
               'highlightEnabled':'false',
               'loadAggregations':'true',
               'order':'position',
               'searchTerm':'',
               'size': size,
               'store':'default'}

    r = requests.post(url,data = payload,timeout = timeout)
    r_json = {}
    if r.status_code == 200:
        r_json = r.json()
    else:
        print('Response code is ', r.status_code)

    # Progress trace: the URL that was actually requested.
    print('url post >',r.url)

    return r_json


In [3]:
def get_dict_groups(main_url):
    """Collect the top-level groups linked from the navigation of ``main_url``.

    The page is downloaded and the children of the first ``<ul>`` inside
    ``<nav>`` are taken as menu entries, with the first and the last child
    discarded. Each remaining entry's link is passed to ``get_group_id``.

    Returns a dict ``{group_id: {'name': title, 'url': url}}`` containing only
    the links for which a non-empty id was found.
    """
    home_page = requests.get(main_url)
    home_soup = BeautifulSoup(home_page.text, 'lxml')

    menu_items = list(home_soup.nav.ul.children)
    # Drop the first and the last child of the menu.
    del menu_items[0]
    del menu_items[-1]

    # Resolve every href up front, before any category page is requested.
    links = []
    for item in menu_items:
        links.append(item.a.attrs['href'])

    groups = {}
    for link in links:
        print(link)
        found_id, page_title = get_group_id(link)
        print(found_id, page_title)
        if not found_id:
            continue
        groups[found_id] = {'name': page_title, 'url': link}

    return groups

In [4]:
def get_group_id(url):
    """Download a category page and return ``(group_id, title)``.

    ``group_id`` is the first run of digits that follows the text
    ``categoryId`` on the same line inside any ``<script>`` element of the
    page, or ``''`` when no such line exists. ``title`` is the stripped text
    of the page ``<title>`` (``''`` if the page has none).
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.text,'lxml')
    title = soup.title.text.strip() if soup.title else ''

    # Scan every script instead of relying on a fixed position in the page:
    # a positional lookup breaks (IndexError or empty id) as soon as the
    # number or order of <script> tags changes.
    # [^\d\n]* keeps the match on the line that mentions categoryId, and the
    # capture group keeps only the number next to it, so unrelated digits on
    # the same line are not merged into the id.
    pattern = re.compile(r'categoryId[^\d\n]*(\d+)')
    group_id = ''
    for script in soup.find_all('script'):
        found = pattern.search(str(script))
        if found:
            group_id = found.group(1)
            break
    return group_id,title

In [5]:
# Build the group mapping from the store home page and display it.
dict_group = get_dict_groups('https://www.tailoy.com.pe/')
dict_group


https://www.tailoy.com.pe/escolar.html
 Escolar
https://www.tailoy.com.pe/arte-y-dise-o.html
 Arte y Diseño
https://www.tailoy.com.pe/universitario.html
 Universitario
https://www.tailoy.com.pe/zona-de-lectura.html
 Oficina
https://www.tailoy.com.pe/cuidado-personal-y-limpieza.html
 Cuidado Personal y Limpieza
https://www.tailoy.com.pe/tecnologia.html
 Tecnología
https://www.tailoy.com.pe/zona-coleccionista.html
 Zona Coleccionista
https://www.tailoy.com.pe/outdoors-deporte.html
 Outdoors & Deporte
https://www.tailoy.com.pe/fisher-price.html
 Fisher Price
https://www.tailoy.com.pe/jugueteria.html
 Juguetería
https://www.tailoy.com.pe/abarrotes.html
 Abarrotes


{}

In [6]:
# Query the elastic endpoint for category id '5' with size 40 and keep only
# the 'categories' aggregation of the response; the last line displays it.
res = post_request('5',40,'https://www.tailoy.com.pe/elastic.php')
categories_by_group = res.get('aggregations').get('categories')
categories_by_group

url post > https://www.tailoy.com.pe/elastic.php


{'5': {'label': 'Tecnología',
  'level': '2',
  'path': '1/2/5/',
  'count': 486,
  'id': '5',
  'children': True},
 '27': {'label': 'Fotografía',
  'level': '3',
  'path': '1/2/5/27/',
  'parent': '5',
  'count': 59,
  'id': '27',
  'children': True},
 '32': {'label': 'Electrohogar',
  'level': '3',
  'path': '1/2/5/32/',
  'parent': '5',
  'count': 4,
  'id': '32',
  'children': True},
 '33': {'label': 'Accesorios de Computo',
  'level': '3',
  'path': '1/2/5/33/',
  'parent': '5',
  'count': 133,
  'id': '33',
  'children': True},
 '34': {'label': 'Otros',
  'level': '3',
  'path': '1/2/5/34/',
  'parent': '5',
  'count': 55,
  'id': '34',
  'children': True},
 '28': {'label': 'Art Technology',
  'level': '3',
  'path': '1/2/5/28/',
  'parent': '5',
  'count': 19,
  'id': '28',
  'children': True},
 '26': {'label': 'Audio',
  'level': '3',
  'path': '1/2/5/26/',
  'parent': '5',
  'count': 94,
  'id': '26',
  'children': True},
 '31': {'label': 'Celulares',
  'level': '3',
  'path':

In [7]:
def get_all_levels(json_group):
    """Return the distinct ``level`` values of ``json_group`` as ints.

    Every value of ``json_group`` is expected to expose ``.get('level')``;
    the result lists each level once, in order of first appearance.
    """
    # dict keys give de-duplication while keeping first-seen order.
    first_seen = dict.fromkeys(
        int(entry.get('level')) for entry in json_group.values()
    )
    return list(first_seen)

def get_cat_by_level(json_group, num_level):
    """Return the entries of ``json_group`` whose ``level`` equals ``num_level``.

    The ``level`` of each entry is converted with ``int`` before comparison.
    The returned dict maps the same keys to the same value objects (no copy).
    """
    return {
        cat_id: cat
        for cat_id, cat in json_group.items()
        if int(cat.get('level')) == num_level
    }

def _attach_child(container, slot, child_id, child):
    """Store ``child`` under ``container[slot][child_id]``.

    A missing or falsy ``container[slot]`` is replaced by a new one-entry dict;
    otherwise the existing mapping is updated in place.
    """
    existing = container.get(slot, '')
    if existing:
        existing.update({child_id: child})
    else:
        container[slot] = {child_id: child}


def order_dict(res_dict, level_dict):
    """Nest the entries of ``level_dict`` under their parents in ``res_dict``.

    For every entry that has a truthy ``'parent'``:

    * if the parent is a top-level key of ``res_dict``, the entry is stored in
      that parent's ``'cats'`` mapping;
    * otherwise the parent is looked up in the ``'cats'`` of the FIRST
      top-level entry of ``res_dict`` and, if found, the entry is stored in
      that category's ``'sub_cats'`` mapping;
    * if neither lookup succeeds, the entry is recorded as missed.

    Entries without a parent are ignored. ``res_dict`` is mutated in place.

    Returns ``(res_dict, missed_dict)``.
    """
    unplaced = {}
    for child_id, child in level_dict.items():
        parent_id = child.get('parent', '')
        if not parent_id:
            continue

        top_entry = res_dict.get(parent_id, '')
        if top_entry:
            _attach_child(top_entry, 'cats', child_id, child)
            continue

        # Parent is not a top-level key: look one level down, but only inside
        # the first top-level entry.
        first_key = list(res_dict.keys())[0]
        first_level_cats = res_dict[first_key].get('cats', '')
        holder = first_level_cats.get(parent_id, '') if first_level_cats else ''
        if holder:
            _attach_child(holder, 'sub_cats', child_id, child)
        else:
            unplaced[child_id] = child

    return res_dict, unplaced


def get_group_cat_subcat(categories_by_group):
    """Turn the flat category mapping into a nested group/cats/sub_cats tree.

    The distinct levels present in ``categories_by_group`` are processed in
    ascending order. The entries of the lowest level seed the tree; each
    following level is nested into a deep copy of the tree via ``order_dict``.
    A message is printed when more than three levels exist, and another one
    whenever ``order_dict`` reports entries it could not place.

    Returns the nested dict.
    """
    tree = {}
    leftovers = {}
    depth_values = sorted(get_all_levels(categories_by_group))

    if len(depth_values) > 3:
        print('################ ALERT, 1 LEVEL ADDED #################')

    for depth in depth_values:
        entries = get_cat_by_level(categories_by_group, depth)
        if not tree:
            # Nothing collected yet: these entries become the roots.
            tree.update(entries)
            continue

        # order_dict mutates its first argument, so hand it a private copy.
        tree, leftovers = order_dict(copy.deepcopy(tree), entries)
        if leftovers:
            print('###  ERROR:missed values   ###')

    return tree

In [8]:
# Nest the flat category aggregation into a group/cats/sub_cats tree and display it.
ordered_categories = get_group_cat_subcat(categories_by_group)
ordered_categories


{'5': {'label': 'Tecnología',
  'level': '2',
  'path': '1/2/5/',
  'count': 486,
  'id': '5',
  'children': True,
  'cats': {'27': {'label': 'Fotografía',
    'level': '3',
    'path': '1/2/5/27/',
    'parent': '5',
    'count': 59,
    'id': '27',
    'children': True,
    'sub_cats': {'145': {'label': 'Drones',
      'level': '4',
      'path': '1/2/5/27/145/',
      'parent': '27',
      'count': 59,
      'id': '145'},
     '144': {'label': 'Cámaras Profesionales',
      'level': '4',
      'path': '1/2/5/27/144/',
      'parent': '27',
      'count': 59,
      'id': '144'},
     '141': {'label': 'Cámaras Compactas',
      'level': '4',
      'path': '1/2/5/27/141/',
      'parent': '27',
      'count': 59,
      'id': '141'},
     '142': {'label': 'Cámaras de Video',
      'level': '4',
      'path': '1/2/5/27/142/',
      'parent': '27',
      'count': 59,
      'id': '142'},
     '143': {'label': 'Cámaras instantáneas',
      'level': '4',
      'path': '1/2/5/27/143/',
      

## Get products

In [9]:
def _tag_products(products, id_group, name_group, id_cat, name_cat,
                  id_subcat, name_subcat, count_subcat):
    """Build a DataFrame from ``products`` with the hierarchy columns prepended."""
    frame = pd.DataFrame(products)
    leading_columns = [
        ('id_group', id_group),
        ('name_group', name_group),
        ('id_cat', id_cat),
        ('name_cat', name_cat),
        ('id_subcat', id_subcat),
        ('name_subcat', name_subcat),
        ('count_subcat', count_subcat),
    ]
    for position, (column, column_value) in enumerate(leading_columns):
        frame.insert(loc=position, column=column, value=column_value)
    return frame


def iterate_over_group(data_by_group):
    """Download the products of every category in a nested group tree.

    ``data_by_group`` maps group ids to dicts with ``'label'``, ``'id'`` and an
    optional ``'cats'`` mapping; each category may in turn carry a
    ``'sub_cats'`` mapping. Products are fetched with ``get_products`` for each
    sub-category, or for the category itself when it has no sub-categories
    (the sub-category columns are then empty strings).

    Returns one DataFrame with the columns ``id_group, name_group, id_cat,
    name_cat, id_subcat, name_subcat, count_subcat`` followed by the product
    fields. Row indexes are those of the individual per-category frames (they
    are not reset). An empty DataFrame is returned when nothing was fetched.
    """
    # Frames are collected and concatenated once at the end:
    # DataFrame.append no longer exists in pandas 2.x, and growing a frame
    # inside a loop copies all previous rows on every iteration.
    frames = []
    for key, value in data_by_group.items():

        name_group = value.get('label')
        id_group = value.get('id')
        print('group >',name_group,id_group)

        # A group without categories simply contributes no rows.
        cats_dict = value.get('cats') or {}
        for k,v in cats_dict.items():

            name_cat = v.get('label')
            id_cat = v.get('id')
            print('cats >',name_cat,id_cat)

            subcats_dict = v.get('sub_cats','')
            if subcats_dict:
                for x,y in subcats_dict.items():

                    name_subcat = y.get('label')
                    id_subcat = y.get('id')
                    count_subcat = y.get('count')
                    print('subcats >',name_subcat,id_subcat,count_subcat)

                    frames.append(_tag_products(
                        get_products(id_subcat),
                        id_group, name_group, id_cat, name_cat,
                        id_subcat, name_subcat, count_subcat))
            else:
                # No sub-categories: query the category itself.
                frames.append(_tag_products(
                    get_products(id_cat),
                    id_group, name_group, id_cat, name_cat,
                    '', '', ''))

    if not frames:
        return pd.DataFrame([])
    return pd.concat(frames)

In [10]:
def get_products(id_category):
    """Fetch all products of ``id_category`` and flatten them into dicts.

    The elastic endpoint is queried through ``post_request`` with a page size
    of 4000. If the response reports more products than the requested size,
    the request is repeated with the reported total so the list is complete.

    Returns a list with one dict per product (keys ending in ``_pro`` plus
    ``img_url``). An empty list is returned when the response carries no
    products, e.g. when ``post_request`` returned ``{}`` for a failed request.
    """
    endpoint = 'https://www.tailoy.com.pe/elastic.php'
    size_rows = 4000
    while True:
        res = post_request(id_category,size_rows,endpoint)
        total = int((res.get('amount') or {}).get('total') or 0)
        if total <= size_rows:
            # Everything the endpoint reports fits in this response.
            break
        # The response was truncated: ask again for the full amount.
        size_rows = total

    list_products = []
    for product in res.get('products') or []:
        prices = product.get('prices_0') or {}

        # The brand field is joined with ', ' when it is a collection; a plain
        # string is kept as is and a missing value becomes ''.
        brand = product.get('tailoyetl_999_marcav2')
        if brand is None:
            brand = ''
        elif not isinstance(brand, str):
            brand = ', '.join(str(item) for item in brand)

        list_products.append({
            'id_pro': product.get('id'),
            'name_pro': product.get('name'),
            'img_url': product.get('base_image'),
            'codeas_pro': product.get('tailoyetl_999_codeas'),
            'marca_pro': brand,
            'description_pro': clean_description(product.get('description') or ''),
            'keys_pro': product.get('word_search_tailoy'),
            'discount_pro': product.get('discount',''),
            'price_pro': prices.get('price'),
            'final_price_pro': prices.get('final_price'),
            'minimal_price_pro': prices.get('minimal_price'),
            # NOTE(review): 'mix_price_pro' holds 'min_price'; the key looks
            # like a typo but is kept because it becomes an output column name.
            'mix_price_pro': prices.get('min_price'),
            'max_price_pro': prices.get('max_price'),
            'rating_pro': product.get('rating'),
            'review_pro': product.get('review_count'),
            'max_sale_qty_pro': product.get('max_sale_qty'),
            'min_sale_qty_pro': product.get('min_sale_qty'),
            'qty_increments_pro': product.get('qty_increments'),
            'url_pro': product.get('url'),
        })

    return list_products


In [11]:
def clean_description(text):
    """Return the last line of the text content of an HTML fragment, stripped.

    ``text`` is parsed with ``html.parser``; the extracted text is split on
    newlines and only the final piece is kept. ``None`` or an empty string
    yields ``''`` without invoking the parser (a missing description would
    otherwise make the parser raise).
    """
    if not text:
        return ''
    so = BeautifulSoup(text,'html.parser')
    clean_txt = so.text.split('\n')[-1].strip()
    return clean_txt

In [12]:
# Fetch the products for every category and sub-category of ordered_categories.
iterate_over_group(ordered_categories)

group > Tecnología 5
cats > Fotografía 27
subcats > Drones 145 59
url post > https://www.tailoy.com.pe/elastic.php
subcats > Cámaras Profesionales 144 59
url post > https://www.tailoy.com.pe/elastic.php
subcats > Cámaras Compactas 141 59
url post > https://www.tailoy.com.pe/elastic.php
subcats > Cámaras de Video 142 59
url post > https://www.tailoy.com.pe/elastic.php
subcats > Cámaras instantáneas 143 59
url post > https://www.tailoy.com.pe/elastic.php
cats > Electrohogar 32
subcats > Electrodomésticos 170 4
url post > https://www.tailoy.com.pe/elastic.php
cats > Accesorios de Computo 33
subcats > Mousepads 179 5


KeyboardInterrupt: 

## Main

In [18]:
# Group ids are hard-coded below; the live lookup is left disabled:
# dict_group = get_dict_groups('https://www.tailoy.com.pe/')

dict_group = {'9': {'name': 'Escolar', 'url': 'https://www.tailoy.com.pe/escolar.html'},
 '8': {'name': 'Arte y Diseño',
  'url': 'https://www.tailoy.com.pe/arte-y-dise-o.html'},
 '10': {'name': 'Universitario',
  'url': 'https://www.tailoy.com.pe/universitario.html'},
 '11': {'name': 'Oficina',
  'url': 'https://www.tailoy.com.pe/zona-de-lectura.html'},
 '621': {'name': 'Cuidado Personal y Limpieza',
  'url': 'https://www.tailoy.com.pe/cuidado-personal-y-limpieza.html'},
 '5': {'name': 'Tecnología',
  'url': 'https://www.tailoy.com.pe/tecnologia.html'},
 '6': {'name': 'Zona Coleccionista',
  'url': 'https://www.tailoy.com.pe/zona-coleccionista.html'},
 '7': {'name': 'Outdoors & Deporte',
  'url': 'https://www.tailoy.com.pe/outdoors-deporte.html'},
 '16': {'name': 'Fisher Price',
  'url': 'https://www.tailoy.com.pe/fisher-price.html'},
 '3': {'name': 'Juguetería',
  'url': 'https://www.tailoy.com.pe/jugueteria.html'},
 '639': {'name': 'Abarrotes',
  'url': 'https://www.tailoy.com.pe/abarrotes.html'}}

In [19]:
# For every group: fetch its category aggregation, build the nested tree,
# download the products and write them to '<id>_<label>_<rows>.csv'.
for k,v in dict_group.items():

    res = post_request(k,40,'https://www.tailoy.com.pe/elastic.php')
    # post_request returns {} on a non-200 response; without this guard one
    # failed group would raise and abort the export of all remaining groups.
    categories_by_group = (res.get('aggregations') or {}).get('categories')
    if not categories_by_group:
        print('No categories returned for group', k, v.get('name'), '- skipped')
        continue

    tree_by_group = get_group_cat_subcat(categories_by_group)
    print(tree_by_group)
    key_first = list(tree_by_group.keys())[0]
    print(key_first)
    # File name prefix: first tree key + lower-cased label without spaces.
    name_file = key_first + '_' + str(tree_by_group.get(key_first).get('label')).lower().replace(" ",'')

    df_group = iterate_over_group(tree_by_group)
    # The number of exported rows is appended to the file name.
    df_group.to_csv(name_file+'_'+str(df_group.shape[0])+'.csv',index=False,header=True,encoding='utf-8')

url post > https://www.tailoy.com.pe/elastic.php
{'9': {'label': 'Escolar', 'level': '2', 'path': '1/2/9/', 'count': 2829, 'id': '9', 'children': True, 'cats': {'49': {'label': 'Forros y Etiquetas', 'level': '3', 'path': '1/2/9/49/', 'parent': '9', 'count': 79, 'id': '49', 'children': True, 'sub_cats': {'281': {'label': 'Cintas Adhesivas', 'level': '4', 'path': '1/2/9/49/281/', 'parent': '49', 'count': 24, 'id': '281'}, '283': {'label': 'Forros', 'level': '4', 'path': '1/2/9/49/283/', 'parent': '49', 'count': 18, 'id': '283'}, '284': {'label': 'Stickers', 'level': '4', 'path': '1/2/9/49/284/', 'parent': '49', 'count': 5, 'id': '284'}, '282': {'label': 'Etiquetas', 'level': '4', 'path': '1/2/9/49/282/', 'parent': '49', 'count': 32, 'id': '282'}}}, '51': {'label': 'Archivo', 'level': '3', 'path': '1/2/9/51/', 'parent': '9', 'count': 233, 'id': '51', 'children': True, 'sub_cats': {'292': {'label': 'Pioners Escolares', 'level': '4', 'path': '1/2/9/51/292/', 'parent': '51', 'count': 78, 'id

url post > https://www.tailoy.com.pe/elastic.php
subcats > Forros 283 18
url post > https://www.tailoy.com.pe/elastic.php
subcats > Stickers 284 5
url post > https://www.tailoy.com.pe/elastic.php
subcats > Etiquetas 282 32
url post > https://www.tailoy.com.pe/elastic.php
cats > Archivo 51
subcats > Pioners Escolares 292 78
url post > https://www.tailoy.com.pe/elastic.php
subcats > Carpetas y Portatodos 290 23
url post > https://www.tailoy.com.pe/elastic.php
subcats > Folders Escolares 291 154
url post > https://www.tailoy.com.pe/elastic.php
cats > Manualidades 53
subcats > Juegos didácticos 301 44
url post > https://www.tailoy.com.pe/elastic.php
subcats > Tijeras 308 58
url post > https://www.tailoy.com.pe/elastic.php
subcats > Ceramica en frío 299 20
url post > https://www.tailoy.com.pe/elastic.php
subcats > Blocks de manualidades 298 27
url post > https://www.tailoy.com.pe/elastic.php
subcats > Rompecabezas didácticos 307 6
url post > https://www.tailoy.com.pe/elastic.php
subcats > P

url post > https://www.tailoy.com.pe/elastic.php
subcats > Accesorios Pintura 214 39
url post > https://www.tailoy.com.pe/elastic.php
subcats > Juegos Didácticos 218 43
url post > https://www.tailoy.com.pe/elastic.php
subcats > Cerámica en frio 216 18
url post > https://www.tailoy.com.pe/elastic.php
subcats > Blocks de manualidades 215 27
url post > https://www.tailoy.com.pe/elastic.php
subcats > Pinceles Escolares 222 20
url post > https://www.tailoy.com.pe/elastic.php
subcats > Microporoso y Corospum 219 136
url post > https://www.tailoy.com.pe/elastic.php
subcats > Plastilinas 223 43
url post > https://www.tailoy.com.pe/elastic.php
subcats > Pasamanería y Otros 221 77
url post > https://www.tailoy.com.pe/elastic.php
subcats > Papeles de Manualidades 220 133
url post > https://www.tailoy.com.pe/elastic.php
subcats > Rompecabezas Didácticos 224 6
url post > https://www.tailoy.com.pe/elastic.php
subcats > Cinta de Agua 217 7
url post > https://www.tailoy.com.pe/elastic.php
cats > Dibuj

url post > https://www.tailoy.com.pe/elastic.php
subcats > Motas y Tizas 363 17
url post > https://www.tailoy.com.pe/elastic.php
subcats > Engrapadores, sacagrapas y grapas 361 31
url post > https://www.tailoy.com.pe/elastic.php
subcats > Perforadores 365 44
url post > https://www.tailoy.com.pe/elastic.php
subcats > Calculadoras 360 37
url post > https://www.tailoy.com.pe/elastic.php
subcats > Limpiatipos 362 3
url post > https://www.tailoy.com.pe/elastic.php
cats > Útiles 59
subcats > Lapiceros 342 133
url post > https://www.tailoy.com.pe/elastic.php
subcats > Plumones indelebles 345 14
url post > https://www.tailoy.com.pe/elastic.php
subcats > Resaltadores 348 82
url post > https://www.tailoy.com.pe/elastic.php
subcats > Tijeras 350 58
url post > https://www.tailoy.com.pe/elastic.php
subcats > Correctores 341 17
url post > https://www.tailoy.com.pe/elastic.php
subcats > Borradores 340 31
url post > https://www.tailoy.com.pe/elastic.php
subcats > Lápices 343 65
url post > https://www.

url post > https://www.tailoy.com.pe/elastic.php
subcats > Correctores 368 17
url post > https://www.tailoy.com.pe/elastic.php
subcats > Resaltadores 375 82
url post > https://www.tailoy.com.pe/elastic.php
subcats > Lápices 370 65
url post > https://www.tailoy.com.pe/elastic.php
subcats > Tajadores 376 35
url post > https://www.tailoy.com.pe/elastic.php
subcats > Portaminas y minas 374 19
url post > https://www.tailoy.com.pe/elastic.php
subcats > Plumones Indelebles 371 13
url post > https://www.tailoy.com.pe/elastic.php
subcats > Borradores 367 31
url post > https://www.tailoy.com.pe/elastic.php
subcats > Plumones para papel 372 287
url post > https://www.tailoy.com.pe/elastic.php
cats > Papelería 68
subcats > Papel Bond 390 25
url post > https://www.tailoy.com.pe/elastic.php
subcats > Papel Fotocopia 392 16
url post > https://www.tailoy.com.pe/elastic.php
subcats > Papeles Finos 395 1
url post > https://www.tailoy.com.pe/elastic.php
subcats > Papeles Adhesivos 394 2
url post > https:

url post > https://www.tailoy.com.pe/elastic.php
subcats > Drones 145 59
url post > https://www.tailoy.com.pe/elastic.php
subcats > Cámaras Compactas 141 59
url post > https://www.tailoy.com.pe/elastic.php
subcats > Cámaras instantáneas 143 59
url post > https://www.tailoy.com.pe/elastic.php
subcats > Cámaras Profesionales 144 59
url post > https://www.tailoy.com.pe/elastic.php
cats > Electrohogar 32
subcats > Electrodomésticos 170 4
url post > https://www.tailoy.com.pe/elastic.php
cats > Accesorios de Computo 33
subcats > Otros accesorios 180 53
url post > https://www.tailoy.com.pe/elastic.php
subcats > Mouse 178 42
url post > https://www.tailoy.com.pe/elastic.php
subcats > Kit Teclados 175 9
url post > https://www.tailoy.com.pe/elastic.php
subcats > Teclados 181 5
url post > https://www.tailoy.com.pe/elastic.php
subcats > Mousepads 179 5
url post > https://www.tailoy.com.pe/elastic.php
subcats > Discos duros y memorias 174 25
url post > https://www.tailoy.com.pe/elastic.php
cats > Ot

url post > https://www.tailoy.com.pe/elastic.php
{'16': {'label': 'Fisher Price', 'level': '2', 'path': '1/2/16/', 'count': 162, 'id': '16', 'children': True, 'cats': {'612': {'label': 'Gimnasios', 'level': '3', 'path': '1/2/16/612/', 'parent': '16', 'count': 8, 'id': '612'}, '613': {'label': 'Juguetes', 'level': '3', 'path': '1/2/16/613/', 'parent': '16', 'count': 107, 'id': '613'}, '616': {'label': 'Móviles para Bebés', 'level': '3', 'path': '1/2/16/616/', 'parent': '16', 'count': 3, 'id': '616'}, '610': {'label': 'Cuidado del Bebé', 'level': '3', 'path': '1/2/16/610/', 'parent': '16', 'count': 7, 'id': '610'}, '619': {'label': 'Sonajas', 'level': '3', 'path': '1/2/16/619/', 'parent': '16', 'count': 6, 'id': '619'}, '614': {'label': 'Mecedoras', 'level': '3', 'path': '1/2/16/614/', 'parent': '16', 'count': 4, 'id': '614'}, '611': {'label': 'Exteriores', 'level': '3', 'path': '1/2/16/611/', 'parent': '16', 'count': 14, 'id': '611'}, '618': {'label': 'Sillas de Comer', 'level': '3', 'p

url post > https://www.tailoy.com.pe/elastic.php
cats > Abarrotes No Comestibles 641
subcats > Bolsas para alimentos 642 3
url post > https://www.tailoy.com.pe/elastic.php
subcats > Otros 645 4
url post > https://www.tailoy.com.pe/elastic.php


In [None]:
# Scratch literal: a list-based layout for groups -> categories -> sub-categories.
# It is not assigned to any name, so running the cell only displays it.
# (The commas that were missing after the 'name_group' / 'name' entries have
# been added so the cell parses.)
[{
    'id_group':'45',
    'name_group': 'Escolar',
    'cats':[{
        'id_cat' : '56',
        'name_cat': 'Archivos',
        'sub_cat':[{
            'id_subcat':'65456',
            'name_subcat': 'Carpetas y Portatodos'

        },{
            'id_subcat':'65456',
            'name_subcat': 'Papeleria'

        },{
            'id_subcat':'65456',
            'name_subcat': 'xxx'

        }]
    },{
        'id' : '56',
        'name': 'Archivos',
        'sub_sub_cat':{

        }
    }]
},{
    'id':'45',
    'name': 'Tecnologia',
    'sub_cat':[{
        'id' : '56',
        'name': 'Archivos',
        'sub_sub_cat':{

        }
    },{
        'id' : '56',
        'name': 'Archivos',
        'sub_sub_cat':{

        }
    }]
},]