# Astykzhan Shop Web Scraping

In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import re
import time
import json
import ast

import warnings
warnings.filterwarnings("ignore")

Function that opens a catalog page and returns the response together with the number of pages in the category:

* city_num = 1 for Astana
* city_num = 2 for Kostanay

In [2]:
def open_page(category, page_num, city_num, url = 'https://astykzhan.kz/catalog/list/{}/?PAGEN_1={}&city={}'):
    """Fetch one catalog listing page and report how many pages the category has.

    Parameters
    ----------
    category : str
        Category slug as it appears in the catalog URL, e.g. 'avtokosmetika_4944'.
    page_num : int
        Page number substituted into the ``PAGEN_1`` query parameter.
    city_num : int
        City code substituted into the ``city`` query parameter.
    url : str
        URL template with three ``{}`` placeholders (category, page, city).

    Returns
    -------
    tuple
        ``(page, num_pages)`` where ``page`` is the ``requests`` response and
        ``num_pages`` is the largest page number found in the pagination
        elements. ``num_pages`` is 1 when the page has no numeric pagination
        or when the response status is not 200 (the caller is expected to
        check ``page.status_code`` itself).
    """
    page_url = url.format(category, page_num, city_num)

    # Keep retrying until the server answers. The timeout makes a stalled
    # connection surface as an exception instead of blocking forever.
    while True:
        try:
            page = requests.get(page_url, timeout=60)
            break
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            print("TimeOut...Sleep")
            time.sleep(10)

    # Default covers both "no pagination on the page" and "non-200 response";
    # previously the latter left num_pages unassigned and crashed on return.
    num_pages = 1
    if page.status_code == 200:
        soup = BeautifulSoup(page.text, "html.parser")
        page_numbers = []
        for elem in soup.find_all('div', class_='pagenation-elem'):
            label = elem.text.strip()
            # Skip empty cells and non-numeric labels (arrows, ellipsis, ...)
            if label.isdigit():
                page_numbers.append(int(label))
        if page_numbers:
            num_pages = max(page_numbers)
    return page, num_pages

In [3]:
# Manual probe of one listing page: download it and collect the product cards.
url = 'https://astykzhan.kz/catalog/list/avtokosmetika_4944/?PAGEN_1=1&city=1'
page = requests.get(url)
soup = BeautifulSoup(page.text, "html.parser")
content = soup.find_all('div', class_='product-card-elem service_product_element')

# Astana

In [4]:
# Category slugs to scrape. Each entry is the path segment that open_page()
# substitutes into the catalog URL; the scraping loop below iterates over
# this list in order.
categories_to_scrap = ['avtokosmetika_4944',
        'aksessuary_4963',
        'zhidkosti-tehnicheskie_4954',
        'nizhnee-bele-detskoe_5139',
        'nizhnee-bele-zhenskoe_5149',
        'nizhnee-bele-muzhskoe_5158',
        'postelnoe-bele-prinadlezhnosti_5123',
        'tekstil-dlya-vannoj_5137',
        'tekstil-dlya-kuhni_5128',
        'prinadlezhnosti-dlya-uborki_5109',
        'upakovochnye-materialy_5101',
        'golovnye-ubory_5255',
        'sumki_5246',
        'obuv-detskaya-dlya-devochek_5220',
        'obuv-detskaya-dlya-malchikov_5228',
        'obuv-zhenskaya_5236',
        'obuv-muzhskaya_5244',
        'aksessuary-dlya-vannoj-komnaty_4776',
        'emkosti-dlya-produktov_4809',
        'krupnogabaritnyj-plastikovye-i_4806',
        'kuhonnaya-posuda_4749',
        'kuhonnaya-utvar_4759',
        'kuhonnyj-inventar_4766',
        'plastikovaya-posuda_4745',
        'prisposobleniya-dlya-uhoda-za-_4814',
        'stolovaya-posuda_4734',
        'ukrasheniya-i-aksessuary-dlya-_4783',
        'chulochno-nosochnye-izdeliya-d_5161',
        'chulochno-nosochnye-izdeliya-z_5164',
        'chulochno-nosochnye-izdeliya-m_5171',
        'audio-video-tehnika_5038',
        'otoplenie-ventilyaciya_5052',
        'tehnika-dlya-krasoty-i-zdorovy_5015',
        'tehnika-dlya-kuhni_4988',
        'elektropribory-po-uhodu-za-bel_5026',
        'odezhda-detskaya_5184',
        'odezhda-dlya-novorozhdennyh_5173',
        'odezhda-zhenskaya_5198',
        'aksessuary-dlya-prazdnika_4817',
        'knigi_4840',
        'novogodnij-assortiment_4794',
        'odnorazovaya-posuda_4824',
        'pechatnaya-produkciya_4832',
        'pressa_4845',
        'suveniry-podarochnye-izdeliya_4831',
        'aksessuary-elektroustanovochny_4976',
        'lampy-osveshenie_4977',
        'aksessuary-dlya-bani-i-sauny_5277',
        'rasteniya-grunty-prinadlezhnos_4930',
        'sadovaya-mebel_4942',
        'sadovoe-oborudovanie_4911',
        'oborudovanie-stroitelnoe-sante_5088',
        'sredstva-individualnoj-zashity_5060',
        'aksessuary-dlya-pismennogo-sto_4856',
        'bumazhnaya-produkciya_4846',
        'tovary-dlya-pisma-i-tvorchestv_4871',
        'zimnie-vidy-sporta_4887',
        'letnie-vidy-sporta_4889',
        'prinadlezhnosti-dlya-piknika_4908',
        'turizm_4899',
        'dieticheskoe-i-diabeticheskoe-_4074',
       'zavtraki-suhie-hlopya_4073',
       'kashi_4071',
       'krupy_4078',
       'makaronnye-izdeliya_4079',
       'maslo-rastitelnoe_4077',
       'momentalnye-gotovye-blyuda_4080',
       'muka_4070',
       'myusli_4072',
       'sousy_4075',
       'specii-i-pripravy_4082',
       'uksus_4076',
       'chipsy-suhariki-sneki-pop-korn_4081',
       'konditerka-zamorozhenaya_4652',
       'morozhenoe_4647',
       'myaso-i-myasoprodukty-zamorozh_4637',
       'pf-zamorozhennye_4620',
       'ptica-i-produkty-iz-pticy-zamo_4630',
       'rybnaya-produkciya-zamorozhenn_4645',
       'frukty-yagody-ovoshi-griby-zam_4615',
       'novyj-god_4730',
       'hleb_4221',
       'hlebobulochnye-izdeliya_4227',
       'voda_4701',
       'prohladitelnyj-napitok_4693',
       'sok-nektar-mors_4690',
       'zhevatelnaya-rezinka_4178',
       'konditerskie-izdeliya-muchnye_4161',
       'konditerskie-izdeliya-saharist_4171',
       'konfety-shokoladnye_4182',
       'torty-pirozhnye-dlitelnogo-hra_4194',
       'torty-pirozhnye-ohlazhdennye_4191',
       'shokolad_4186',
       'vatnye-izdeliya-i-vlazhnye-sal_4398',
       'detskaya-gigiena_4406',
       'zhenskaya-gigiena_4402',
       'protivozachatochnye-sredstva_4411',
       'salfetki-platochki-polotenca_4393',
       'tualetnaya-bumaga_4386',
       'kakao_4214',
       'kofe_4208',
       'sahar_4218',
       'chaj_4199',
       'insekticidy-i-repellenty_4289',
       'osvezhiteli-vozduha_4264',
       'poroshki_4238',
       'sredstva-dlya-uhoda-za-obuvyu-_4268',
       'sredstva-po-uhodu-za-tkanyami_4244',
       'chistyashie-moyushie-sredstva_4250',
       'konservy-myasnye_4054',
       'konservy-ovoshnye_4038',
       'konservy-olivki-masliny_4056',
       'konservy-rybnye-i-iz-moreprodu_4049',
       'konservy-fruktovo-yagodnye_4055',
       'dlya-devochek_4441',
       'dlya-malchikov_4446',
       'igrushki-dlya-novorozhdennyh_4415',
       'igry-na-vozduhe_4418',
       'myagkie-igrushki_4425',
       'nastolnye-igry-pazzly_4435',
       'razvivayushie-igry_4429',
       'ikra_4550',
       'kolbasa-sosiski-delikatesy-mya_4507',
       'myaso-ptica-ohlazhdennye_4534',
       'pashtety-zakuski_4558',
       'preservy_4554',
       'ryba-i-moreprodukty_4541',
       'rybnaya-gastronomiya_4545',
       'syr_4523',
       'yajco_4560',
       'desertnoe-koktejli_4569',
       'deserty-molochnye_4604',
       'jogurty-i-jogurtovye-produkty_4600',
       'kislomolochnye-napitki_4586',
       'margariny_4577',
       'maslo-slivochnoe-i-spredy_4573',
       'moloko_4564',
       'molochnye-produkty-dlya-detsko_4609',
       'slivki_4580',
       'smetana_4592',
       'soevye-produkty_4613',
       'tvorog-i-tvorozhnye-izdeliya_4594',
       'korm-dlya-koshek_4274',
       'korm-dlya-sobak_4279',
       'sredstva-uhoda-za-zhivotnymi-a_4295',
       'pyure-smesi-soki-detskoe_4232',
       'aksessuary-dlya-uhoda-za-volos_4378',
       'britvennye-sistemy_4331',
       'dezodoranty_4351',
       'dekorativnaya-kosmetika_4366',
       'mylo_4302',
       'nabory-parfyumerno-kosmetiches_4361',
       'parfyumeriya_4370',
       'sredstva-i-prinadlezhnosti-dly_4308',
       'sredstva-posle-britya_4326',
       'sredstva-solncezashitnye-i-dly_4358',
       'sredstva-uhoda-za-volosami_4335',
       'sredstva-uhoda-za-polostyu-rta_4315',
       'sredstva-uhoda-za-telom_4339',
       'sredstva-dlya-britya_4322',
       'zelen_4500',
       'ovoshi-griby_4483',
       'orehi-suhofrukty-ovoshi-sushen_4501',
       'frukty-svezhie_4468',
       'yagody-svezhie_4478',
       'frukty-yagody-ovoshi-griby_800',
       'pekarnya_2132',
       'kopchenosti_5341',
       'holodnaya-kulinariya_5329',
       'goryachaya-kulinariya_5334',
       'baranina_5316',
       'govyadina_5309',
       'konina_5324',
       'ptica_5321',
       'svinina_5303',
       'farsh_5325',
       'hleb-sobstvennogo-proizvodstva_5280',
       'hlebobulochnye-izdeliya-sp_5283',
       'pf-proizvodstva_5335']

In [5]:
%%time

# Parallel result lists: index k in every list describes the same product.
products = []
titles = []
prices = []
old_prices = []
categories = []

# Mirror of `products` used only for constant-time duplicate checks.
seen_names = set()

for category in categories_to_scrap:
    # Category label is the slug without its trailing "_<id>" suffix.
    category_name = category.rsplit('_', 1)[0]

    # This first request both tells us the page count and serves as page 1,
    # so page 1 is not downloaded a second time inside the loop.
    first_page, num_pages = open_page(category, 1, 1)
    for page_num in range(1, num_pages + 1):
        page = first_page if page_num == 1 else open_page(category, page_num, 1)[0]
        print(category, num_pages, page_num)
        if page.status_code != 200:
            print(page.status_code)
            continue

        soup = BeautifulSoup(page.text, "html.parser")
        content = soup.findAll('div', class_ = 'product-card-elem service_product_element')
        for product in content:

            # Long product name; only the first occurrence of a name is kept.
            name = product.find('div', {'class': 'product-card-elem-name'}).text.replace('\n','')
            if name in seen_names:
                continue
            seen_names.add(name)
            products.append(name)

            # Title: every run of 2+ capital letters (Latin or Cyrillic) plus an
            # optional numeric suffix, each followed by a space. re.findall
            # returns '' (not None) for an unmatched optional group, so the
            # pieces can be concatenated directly; no match yields ''.
            title_parts = re.findall(r'([A-ZА-Я]{2,}){1,}([-+,.]\d+)?', name)
            titles.append(''.join(word + suffix + ' ' for word, suffix in title_parts))

            # Current price, kept as the raw text of the element.
            price = product.find('div', {'class': 'product-card-elem-bottom'}).find('span').text
            prices.append(price)

            # Old price: first run of digits in the old-price <span> markup.
            # Falls back to the current price when the block, the span or the
            # digits are missing.
            old_price_div = product.find('div', {'class': 'product-card-elem-oldprice'})
            old_price_span = old_price_div.find('span') if old_price_div is not None else None
            old_price_match = None
            if old_price_span is not None:
                old_price_match = re.search(r'\d+', str(old_price_span))
            old_prices.append(old_price_match[0] if old_price_match else price)

            categories.append(category_name)

avtokosmetika_4944 1 1
aksessuary_4963 1 1
zhidkosti-tehnicheskie_4954 1 1
nizhnee-bele-detskoe_5139 4 1
nizhnee-bele-detskoe_5139 4 2
nizhnee-bele-detskoe_5139 4 3
nizhnee-bele-detskoe_5139 4 4
nizhnee-bele-zhenskoe_5149 3 1
nizhnee-bele-zhenskoe_5149 3 2
nizhnee-bele-zhenskoe_5149 3 3
nizhnee-bele-muzhskoe_5158 4 1
nizhnee-bele-muzhskoe_5158 4 2
nizhnee-bele-muzhskoe_5158 4 3
nizhnee-bele-muzhskoe_5158 4 4
postelnoe-bele-prinadlezhnosti_5123 1 1
tekstil-dlya-vannoj_5137 1 1
tekstil-dlya-kuhni_5128 1 1
prinadlezhnosti-dlya-uborki_5109 6 1
prinadlezhnosti-dlya-uborki_5109 6 2
prinadlezhnosti-dlya-uborki_5109 6 3
prinadlezhnosti-dlya-uborki_5109 6 4
prinadlezhnosti-dlya-uborki_5109 6 5
prinadlezhnosti-dlya-uborki_5109 6 6
upakovochnye-materialy_5101 6 1
upakovochnye-materialy_5101 6 2
upakovochnye-materialy_5101 6 3
upakovochnye-materialy_5101 6 4
upakovochnye-materialy_5101 6 5
upakovochnye-materialy_5101 6 6
golovnye-ubory_5255 1 1
sumki_5246 1 1
obuv-detskaya-dlya-devochek_5220 1 1
o

morozhenoe_4647 26 20
morozhenoe_4647 26 21
morozhenoe_4647 26 22
morozhenoe_4647 26 23
morozhenoe_4647 26 24
morozhenoe_4647 26 25
morozhenoe_4647 26 26
myaso-i-myasoprodukty-zamorozh_4637 2 1
myaso-i-myasoprodukty-zamorozh_4637 2 2
pf-zamorozhennye_4620 13 1
pf-zamorozhennye_4620 13 2
pf-zamorozhennye_4620 13 3
pf-zamorozhennye_4620 13 4
pf-zamorozhennye_4620 13 5
pf-zamorozhennye_4620 13 6
pf-zamorozhennye_4620 13 7
TimeOut...Sleep
pf-zamorozhennye_4620 13 8
pf-zamorozhennye_4620 13 9
pf-zamorozhennye_4620 13 10
pf-zamorozhennye_4620 13 11
pf-zamorozhennye_4620 13 12
pf-zamorozhennye_4620 13 13
ptica-i-produkty-iz-pticy-zamo_4630 3 1
ptica-i-produkty-iz-pticy-zamo_4630 3 2
ptica-i-produkty-iz-pticy-zamo_4630 3 3
rybnaya-produkciya-zamorozhenn_4645 5 1
rybnaya-produkciya-zamorozhenn_4645 5 2
rybnaya-produkciya-zamorozhenn_4645 5 3
rybnaya-produkciya-zamorozhenn_4645 5 4
rybnaya-produkciya-zamorozhenn_4645 5 5
frukty-yagody-ovoshi-griby-zam_4615 13 1
frukty-yagody-ovoshi-griby-zam_461

chistyashie-moyushie-sredstva_4250 17 9
chistyashie-moyushie-sredstva_4250 17 10
chistyashie-moyushie-sredstva_4250 17 11
chistyashie-moyushie-sredstva_4250 17 12
chistyashie-moyushie-sredstva_4250 17 13
chistyashie-moyushie-sredstva_4250 17 14
TimeOut...Sleep
chistyashie-moyushie-sredstva_4250 17 15
chistyashie-moyushie-sredstva_4250 17 16
chistyashie-moyushie-sredstva_4250 17 17
konservy-myasnye_4054 6 1
konservy-myasnye_4054 6 2
konservy-myasnye_4054 6 3
TimeOut...Sleep
konservy-myasnye_4054 6 4
konservy-myasnye_4054 6 5
konservy-myasnye_4054 6 6
konservy-ovoshnye_4038 15 1
konservy-ovoshnye_4038 15 2
konservy-ovoshnye_4038 15 3
konservy-ovoshnye_4038 15 4
konservy-ovoshnye_4038 15 5
konservy-ovoshnye_4038 15 6
konservy-ovoshnye_4038 15 7
konservy-ovoshnye_4038 15 8
konservy-ovoshnye_4038 15 9
konservy-ovoshnye_4038 15 10
konservy-ovoshnye_4038 15 11
konservy-ovoshnye_4038 15 12
konservy-ovoshnye_4038 15 13
konservy-ovoshnye_4038 15 14
konservy-ovoshnye_4038 15 15
konservy-olivki-ma

sredstva-uhoda-za-telom_4339 25 12
sredstva-uhoda-za-telom_4339 25 13
sredstva-uhoda-za-telom_4339 25 14
sredstva-uhoda-za-telom_4339 25 15
sredstva-uhoda-za-telom_4339 25 16
sredstva-uhoda-za-telom_4339 25 17
sredstva-uhoda-za-telom_4339 25 18
sredstva-uhoda-za-telom_4339 25 19
sredstva-uhoda-za-telom_4339 25 20
sredstva-uhoda-za-telom_4339 25 21
sredstva-uhoda-za-telom_4339 25 22
sredstva-uhoda-za-telom_4339 25 23
sredstva-uhoda-za-telom_4339 25 24
sredstva-uhoda-za-telom_4339 25 25
sredstva-dlya-britya_4322 3 1
sredstva-dlya-britya_4322 3 2
sredstva-dlya-britya_4322 3 3
zelen_4500 1 1
ovoshi-griby_4483 3 1
ovoshi-griby_4483 3 2
ovoshi-griby_4483 3 3
orehi-suhofrukty-ovoshi-sushen_4501 2 1
orehi-suhofrukty-ovoshi-sushen_4501 2 2
frukty-svezhie_4468 2 1
frukty-svezhie_4468 2 2
yagody-svezhie_4478 1 1
frukty-yagody-ovoshi-griby_800 1 1
pekarnya_2132 1 1
kopchenosti_5341 1 1
holodnaya-kulinariya_5329 5 1
holodnaya-kulinariya_5329 5 2
holodnaya-kulinariya_5329 5 3
holodnaya-kulinariya_53

In [6]:
# True when all five result lists have the same length.
len({len(products), len(titles), len(prices), len(old_prices), len(categories)}) == 1

True

## Excel

In [7]:
# Assemble the scraped columns into one frame and cast the price text to int.
data = pd.DataFrame(
    {'name': products, 'price': prices, 'category': categories}
).astype({'price': int})

In [8]:
# Print the column dtypes and non-null counts of the assembled frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10751 entries, 0 to 10750
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      10751 non-null  object
 1   price     10751 non-null  int32 
 2   category  10751 non-null  object
dtypes: int32(1), object(2)
memory usage: 210.1+ KB


In [9]:
# Display the frame (pandas truncates the rendering for long frames).
data

Unnamed: 0,name,price,category
0,Ароматизатор LUAZON CRAZY FRUIT для авто блистер,349,avtokosmetika
1,Ароматизатор ВКХ Арома-саше в конвертике для авто,399,avtokosmetika
2,Автосиликон ВКХ Lion водоотталкивающий баллон ...,419,avtokosmetika
3,Ароматизатор ВКХ Арома-саше в сумочке для авто,435,avtokosmetika
4,Ключ AUTOMOBIL Унисма-1 жидкий баллон 440мл,659,avtokosmetika
...,...,...,...
10746,Манты БАРАНИНА пф вес,1905,pf-proizvodstva
10747,Мясо ГОВЯДИНА Фрикадельки пф вес,2000,pf-proizvodstva
10748,Мясо ГОВЯДИНА Биточки пф вес,2175,pf-proizvodstva
10749,Мясо ГОВЯДИНА Тефтели пф вес,2199,pf-proizvodstva


In [10]:
# Occurrences per product name; a count above 1 would indicate a duplicate.
data["name"].value_counts()

ХАЛВА ПОДСОЛНЕЧНАЯ АЛМАТИНСКАЯ пак 800гр                         1
МОРОЖЕНОЕ ПЛОМБИР ВАНИЛЬ ЛАЙКА пак 450гр                         1
Порошок LOSK Color Автомат пак 1,35кг                            1
МОРОЖЕНОЕ ЛЕСНАЯ ЯГОДА ТАЛОСТО живое пл/б 500гр                  1
Салфетки JOHNSONS BABY Влажные детские Нежная забота пак 20шт    1
                                                                ..
СПРЕД МИШКА ШОКОЛАДНОЕ АЛАТАУ 62% вес                            1
Жевательные конфеты NESTLE Бон Пари Кола пак 120г                1
СОУС ЛЕЧО 3 ЖЕЛАНИЯ дой-пак 250гр                                1
Прокладки DISCREET Air Multiform Ежедневные карт/кор 100 шт      1
ПЕЧЕНЬЕ ОВСЯНОЕ ТЕМИРТАУ вес                                     1
Name: name, Length: 10751, dtype: int64

In [11]:
# Write the frame to an Excel workbook without the index column.
data.to_excel(
    'astykzhan_Astana.xlsx',
    engine='xlsxwriter',
    header=['name','price', 'category'],
    index=False,
)

In [14]:
# Upload the exported workbook to the catalog service.
#
# The previous call passed the string 'astykzhan_Astana.xls' as the request
# body (a file name that also does not match the exported '.xlsx' file) and
# 1000 as the `json` argument; the server answered 400. Here the workbook is
# opened and its contents are sent, and 1000 is passed as the timeout in
# seconds, which is presumably what it was meant to be.
# NOTE(review): the multipart field name 'file' is an assumption; confirm
# the field name and upload format against the service's API.
with open('astykzhan_Astana.xlsx', 'rb') as excel_file:
    response = requests.post(
        'http://13.59.5.143:8082/webcatalogitems/excel',
        files={'file': excel_file},
        timeout=1000,
    )
response

<Response [400]>

In [13]:
# Number of distinct product names in the frame.
np.unique(data["name"]).size

10751