# Galmart Shop Web Scraping

In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import re
import time

Category codes (used as URL slugs) mapped to readable names, followed by the function that opens one page of a category:

In [2]:
# Shop category codes (used as the URL slug of a category) paired with a
# readable label. Built from ordered pairs so the iteration order matches the
# order in which the categories are listed here.
cats = dict([
    ('кз0000001', 'grocery'),
    ('кз0000293', 'semi-finished'),
    ('кз0000034', 'textile'),
    ('кз0000048', 'sausages_cheese'),
    ('кз0000061', 'confectionery'),
    ('кз0000081', 'confectionery_selfmade'),
    ('кз0000094', 'preserves'),
    ('кз0000109', 'cosmetics'),
    ('кз0000155', 'gastronomy'),
    ('кз0000181', 'dairy'),
    ('кз0000207', 'detergents'),
    ('кз0000223', 'beverages'),
    ('кз0000255', 'fruits_vegetables'),
    ('кз0000300', 'tableware'),
    ('кз0000319', 'seafood'),
    # NOTE(review): same label as 'кз0000207' - confirm this is intended.
    ('кз0000330', 'detergents'),
    ('кз0000338', 'shoe-care'),
    ('кз0000343', 'tobacco'),
    ('кз0000350', 'sauna'),
    ('кз0000356', 'child'),
    ('кз0000382', 'household'),
    ('кз0000429', 'animals'),
    ('кз0000439', 'holidays'),
    ('кз0000462', 'magazines'),
    ('кз0000470', 'stationery'),
    ('кз0000484', 'bread'),
    ('кз0000499', 'hot-drinks'),
    ('кз0000513', 'socks'),
])

In [3]:
def open_page(category, page_num, url = 'https://store.galmart.kz/product-category/',
              timeout = 30, retry_delay = 10, max_retries = None):
    """Download one listing page of a category and detect the category's page count.

    Parameters
    ----------
    category : str
        Category slug appended to ``url``.
    page_num : int
        Page index, appended to the address as ``/page/<page_num>``.
    url : str
        Base address of the category listings.
    timeout : float
        Seconds to wait for the server before an attempt counts as failed.
    retry_delay : float
        Seconds to sleep between attempts.
    max_retries : int or None
        Number of additional attempts after a failed one. ``None`` (the
        default) keeps retrying until the request succeeds.

    Returns
    -------
    tuple
        ``(page, num_pages)``: the ``requests`` response and the largest page
        number found in the pagination element. ``num_pages`` is 1 when the
        status is not 200 or when no pagination is found.

    Raises
    ------
    requests.exceptions.ConnectionError, requests.exceptions.Timeout
        Only when ``max_retries`` is set and every attempt failed.
    """
    #forming the page address and headers
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:45.0) Gecko/20100101 Firefox/45.0'}
    page_url = url + category + '/page/'+ str(page_num)

    #retrying on network failures; the timeout keeps a stalled connection
    #from blocking the whole run
    failed_attempts = 0
    while True:
        try:
            page = requests.get(page_url, headers = headers, timeout = timeout)
            break
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            if max_retries is not None and failed_attempts >= max_retries:
                raise
            failed_attempts += 1
            print("TimeOut...Sleep")
            time.sleep(retry_delay)

    if page.status_code != 200:
        print(page.status_code)
        return page, 1

    #getting the number of pages
    soup = BeautifulSoup(page.text, "html.parser")
    container = soup.find('div', class_ = 'ast-woocommerce-container')
    # A page without the expected container has no pagination to read.
    pagination = container.find(class_ = 'page-numbers') if container is not None else None
    if pagination is None:
        return page, 1

    # Put every text node on its own line so the numbers are separated even
    # when the markup itself contains no line breaks.
    tokens = pagination.get_text('\n').split()
    pages = [int(token) for token in tokens if token.isdecimal()]
    return page, max(pages, default = 1)

Reading the shop pages one by one. Information about the product is inside the _astra-shop-summary-wrap_ class; the detailed info is inside the _woocommerce-loop-product__title_ class, the price is described in the _woocommerce-Price-amount amount_ class.

In [4]:
%%time
# Parallel result lists: position k of each list describes the same product.
products = []
prices = []
categories = []
# Names already collected; a set keeps the duplicate check constant-time.
seen_names = set()

for cat in cats:
    # The first request yields both page 1 and the page count, so page 1 is
    # reused below instead of being downloaded a second time.
    first_page, num_pages = open_page(cat, 1)

    for i in range(1, num_pages + 1):
        page = first_page if i == 1 else open_page(cat, i)[0]
        print(cat, num_pages, i)

        if page.status_code != 200:
            print(page.status_code)
            continue

        soup = BeautifulSoup(page.text, "html.parser")
        for product in soup.find_all('div', class_ = 'astra-shop-summary-wrap'):
            # Skip summary blocks that carry no product title.
            if product.find(class_='woocommerce-loop-product__title') is None:
                continue

            #product name
            heading = product.find('h2')
            if heading is None:
                continue
            name = heading.text
            if name in seen_names:
                continue
            seen_names.add(name)
            products.append(name)

            #price (0 marks a product listed without a price)
            price_tag = product.find('span', class_='woocommerce-Price-amount amount')
            prices.append(price_tag.text if price_tag is not None else 0)

            #category
            categories.append(cats[cat])

кз0000001 41 1
кз0000001 41 2
кз0000001 41 3
кз0000001 41 4
кз0000001 41 5
кз0000001 41 6
кз0000001 41 7
кз0000001 41 8
кз0000001 41 9
кз0000001 41 10
кз0000001 41 11
кз0000001 41 12
кз0000001 41 13
кз0000001 41 14
кз0000001 41 15
кз0000001 41 16
кз0000001 41 17
кз0000001 41 18
кз0000001 41 19
кз0000001 41 20
кз0000001 41 21
кз0000001 41 22
кз0000001 41 23
кз0000001 41 24
кз0000001 41 25
кз0000001 41 26
кз0000001 41 27
кз0000001 41 28
кз0000001 41 29
кз0000001 41 30
кз0000001 41 31
кз0000001 41 32
кз0000001 41 33
кз0000001 41 34
кз0000001 41 35
кз0000001 41 36
кз0000001 41 37
кз0000001 41 38
кз0000001 41 39
кз0000001 41 40
кз0000001 41 41
кз0000293 1 1
кз0000034 5 1
кз0000034 5 2
кз0000034 5 3
кз0000034 5 4
кз0000034 5 5
кз0000048 4 1
кз0000048 4 2
кз0000048 4 3
кз0000048 4 4
кз0000061 26 1
кз0000061 26 2
кз0000061 26 3
кз0000061 26 4
кз0000061 26 5
кз0000061 26 6
кз0000061 26 7
кз0000061 26 8
кз0000061 26 9
кз0000061 26 10
кз0000061 26 11
кз0000061 26 12
кз0000061 26 13
кз0000061 26 1

Deleting irrelevant symbols from the price:

In [5]:
# Reduce every scraped price to its first run of digits (kept as a string).
# Commas are stripped first, so "1,250 ..." becomes "1250".
prices_clean = []
pattern = r'\d+'
for price in prices:
    digits = re.search(pattern, str(price).replace(',', ''))
    # A price text without any digit falls back to '0', the same value the
    # scraping step uses for products listed without a price.
    prices_clean.append(digits[0] if digits is not None else '0')

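Extracting the product origin from the product name: for names that contain a comma, take the last (or, failing that, the second-to-last) word if it is capitalized; otherwise record 'NONE':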

In [6]:
# For every product name, pick the trailing title-cased word as its origin.
# Only names containing a comma are inspected; everything else gets 'NONE'.
origins = []
for product in products:
    text = str(product).strip()
    origin = 'NONE'
    if ',' in text:
        # Last word first, then the second-to-last. The slice simply yields
        # fewer candidates when the name has a single word, so no index can
        # fall outside the list.
        for word in text.split(' ')[-1:-3:-1]:
            if word.istitle():
                origin = word.strip()
                break
    origins.append(origin)

## Excel

In [7]:
# One row per scraped product; the cleaned price strings are cast to integers.
data = pd.DataFrame(
    {'name': products, 'price': prices_clean, 'category': categories}
).astype({'price': int})

In [8]:
# Row count followed by the number of distinct product names; equal values
# mean that no name occurs twice.
print(len(data.index))
print(len(data['name'].unique()))

5780
5780


In [9]:
# Write the table to an Excel workbook without the index column.
data.to_excel(
    'Galmart.xlsx',
    engine = 'xlsxwriter',
    header = ['name', 'price', 'category'],
    index = False,
)

In [34]:
# answer from StackOverflow
# Upload the exported workbook. The earlier version sent the text
# 'Galmart.xls' as the request body (not the file contents) and referred to a
# file name that is never written - the export above creates 'Galmart.xlsx'.
# NOTE(review): assumes the endpoint expects a multipart form field named
# 'file' plus the 'webStoreId' query parameter - confirm against the server API.
# NOTE(review): plain HTTP to a hardcoded IP address; move to configuration if
# this is kept.
upload_url = 'http://13.59.5.143:8082/webcatalogitems/excel'
params = {'webStoreId': 1001}
with open('Galmart.xlsx', 'rb') as workbook:
    response = requests.post(upload_url, params=params, files={'file': workbook}, timeout=60)
response

<Response [500]>