In [462]:
import urllib2 as u2
import json
import pandas as pd
import time
import numpy as np
import html5lib
import re
import os
import requests
from bs4 import BeautifulSoup

## Learning the tools

In [388]:
url = 'http://www.wine.co.za/wine/wine.aspx?WINEID='+str(39031)

#Query the website and return the html to the variable 'page'
page = u2.urlopen(url).read()

#Parse the html in the 'page' variable, and store it in 
#Beautiful Soup format
soup = BeautifulSoup(page,'html.parser')

In [389]:
data_json = soup.find('script', type='application/ld+json')
data = json.loads(data_json.text)
print(data)
data.pop('brand')

{u'category': u'Sauvignon Blanc', u'sku': u'39031', u'description': u'Passion fruit, ruby grapefruit, gun flint and freshly cut grass on the nose followed by fresh citrus notes. The palate shows beautiful richness, strong mineral core with an herbaceous undertone and a bright acidity running through the wine.', u'image': [u'http://images.wine.co.za/GetWineImage.ashx?ImageSize=social&IMAGEID=258093'], u'brand': {u'@type': u'Brand', u'name': u'Steenberg'}, u'@context': u'http://schema.org', u'@id': u'https://wine.co.za/wine/wine.aspx?WINEID=39031', u'@type': u'Product', u'name': u'Steenberg Sauvignon Blanc 2016'}


{u'@type': u'Brand', u'name': u'Steenberg'}

In [390]:
print(soup.find('span',id="lbl_foodsuggest").text)
print(soup.find('span',id="lbl_origin").text)
print(soup.find('span',id="lbl_mainvariety").text)
print(soup.find('span',id="lbl_winemaker").text)


Constantia
Sauvignon Blanc
JD Pretorius


In [391]:
analysis = soup.find('span', id="lbl_analysis")
analysis.find_all('span')
for s in ['alc : (\d+)', 'rs : (\d+)',
          'pH : (\d+)', 'ta : (\d+)']:
    try:
        result = re.search(s, 
                           analysis.text).group(0) 
        r = result.split(' : ') 
        print(r)
        data[r[0]] = r[1] 
    except AttributeError:
        pass

[u'alc', u'13']
[u'rs', u'1']
[u'pH', u'3']
[u'ta', u'6']


In [392]:
spans = soup.find('span', id='lbl_type')
spans = spans.text.replace(u'\u2003', u'')
for s in ['type : (\w+)', 'style : (\w+)',
          'body : (\w+)', 'taste : (\w+)',
          ' (\w+)$']:
    try:
        result = re.search(s, spans).group(0)
        try:
            r = result.split(' : ') 
            print(r)
            data[r[0]] = r[1]
        except IndexError:
            data[u'wood'] = result
    except AttributeError:
        pass

[u'type', u'White']
[u'style', u'Dry']
[u'taste', u'Fruity']


In [393]:

packing = soup.find('span', id='lbl_pack').text
for s in ['pack : (\w+)', 'size : (\w+)',
          'closure : (\w+)']:
    try:
        result = re.search(s, packing).group(0)
        r = result.split(' : ') 
        print(r)
        data[r[0]] = r[1]
    except AttributeError:
        pass


[u'pack', u'Bottle']
[u'size', u'750ml']
[u'closure', u'Screwcap']


In [394]:
print(repr(data))
df = pd.DataFrame(data)
df

{u'category': u'Sauvignon Blanc', u'sku': u'39031', u'style': u'Dry', u'description': u'Passion fruit, ruby grapefruit, gun flint and freshly cut grass on the nose followed by fresh citrus notes. The palate shows beautiful richness, strong mineral core with an herbaceous undertone and a bright acidity running through the wine.', u'closure': u'Screwcap', u'rs': u'1', u'taste': u'Fruity', u'type': u'White', u'image': [u'http://images.wine.co.za/GetWineImage.ashx?ImageSize=social&IMAGEID=258093'], u'size': u'750ml', u'alc': u'13', u'pack': u'Bottle', u'@context': u'http://schema.org', u'pH': u'3', u'ta': u'6', u'@id': u'https://wine.co.za/wine/wine.aspx?WINEID=39031', u'@type': u'Product', u'name': u'Steenberg Sauvignon Blanc 2016'}


Unnamed: 0,@context,@id,@type,alc,category,closure,description,image,name,pH,pack,rs,size,sku,style,ta,taste,type
0,http://schema.org,https://wine.co.za/wine/wine.aspx?WINEID=39031,Product,13,Sauvignon Blanc,Screwcap,"Passion fruit, ruby grapefruit, gun flint and ...",http://images.wine.co.za/GetWineImage.ashx?Ima...,Steenberg Sauvignon Blanc 2016,3,Bottle,1,750ml,39031,Dry,6,Fruity,White


## Set up functions

After investigating the source of a few different pages, it made sense to try to read a variety of flags, allowing for many to be missing

In [507]:
def collect_specific_tags(soup, data):
    """
    Collect embedded data from the website
        - foodnotes
        - alcohol, pH, rs, ta
        - type
        - style
        - wooded
        - packing
        - size
        - closure
    :param soup: BeautifulSoup object
    :param data: dict
        dictionary of data from webpage
    """
    labels_to_process = {
        'foodnote': 'lbl_foodsuggest',
        'winemaker': 'lbl_winemaker',
        'origin': 'lbl_origin',
        'variety': 'lbl_mainvariety',
    }
    
    for labelname in labels_to_process.keys():
        label = labels_to_process[labelname]
        try:
            data[labelname] = soup.find('span',
                                  id=label).text
        except:
            try:
                data[labelname] = soup.find('span',
                                  id=label)
            except AttributeError:
                # item not in wine summary
                data[labelname] = None 
                
    try:
        data['winery'] = soup.find('a',
                          id='hl_winery')
    except AttributeError:
        # item not in wine summary
        data['winery'] = None 
        
    ids_to_process = {
        "lbl_analysis" : ['alc : (\d+)', 'rs : (\d+)',
                          'pH : (\d+)', 'ta : (\d+)'],
        'lbl_type' : ['type : (\w+) ', 'style : (\w+) ',
                      ' (\w+)$'],
        'lbl_pack' : ['pack : (\w+)', 'size : (\w+)',
                      'closure : (\w+)']
    }
    for id_key in ids_to_process.keys():
        subtext_labels = ids_to_process[id_key]
        subtext=None
        try:
            subtext = soup.find('span', id = id_key)
            try:
                subtext = subtext.text.replace(u'\u2003', u'')
            except AttributeError as a:
                pass 
        except Exception as e:
            print e
        if subtext is not None:
            for s in subtext_labels:
                try:
                    result = re.search(s, 
                                       subtext).group(0) 
                    try:
                        r = result.split(' : ') 
                        data[r[0]] = r[1]
                    except IndexError: 
                        # wooded type is on it's own
                        if id_key == 'lbl_type':
                            data[u'wood'] = result
                        else: 
                            print 'Error with', id_key, s
                except AttributeError:
                    pass # tag does not exist in html


    return data



In [519]:
def save_html(n):
    url = 'http://www.wine.co.za/wine/wine.aspx?WINEID=' \
        + str(n)
    page = u2.urlopen(url).read()
    soup = BeautifulSoup(page,'html.parser')
    with open('~/Google Drive/Data Science/wine_htmls/winehtml_' 
              + str(n) + '.txt', 'w') as f:
        f.write(str(soup))
    
def save_all_htmls(start, stop):
    for n in range(start, stop):
        save_html(n)
        sleeptime = np.random.random() + 1 
        time.sleep(sleeptime)

def collect_data(n):
    """
    Collect jsons from various webpages and store
    :param url_range: integer range
        range of integers to try in url
    """
    data = None 
    name = None
    with open('~/Google Drive/Data Science/wine_htmls/winehtml_' 
      + str(n) + '.txt', 'r') as f:
        html_data = f.read()
    soup = BeautifulSoup(html_data, 'html.parser')
    if soup.find('title').text != '\r\n\twine.co.za\r\n':
        data_json = soup.find('script',
                              type='application/ld+json')
        try:
            data = json.loads(data_json.text)
        except ValueError:
            pass

    if data is not None:
        subdicts = ['brand', 'offers']
        for key in subdicts:
            try:
                s = data.pop(key)
                if key == 'brand':
                    data[key+'_name'] = s[key][name] 
                if key == 'offers':
                    
            except:
                pass

        data = collect_specific_tags(soup, data)
        name = data['name']

    return data, name
    
    
def collection_of_data(start, stop): 
    """
    Collect jsons from the stored webpages and store as df
    :param url_range: integer range
        range of integers pointing to stored webpages
    """
    scraped_data = []

    for n in range(start, stop):
        data, name = collect_data(n)
        if data == '':
            pass
        else:
            try:
                scraped_data.append(pd.DataFrame(data))
            except ValueError:
                print data
                
    print(len(scraped_data))
    if len(scraped_data)>0:
        data = pd.concat(scraped_data)
        data.to_csv('~/Google Drive/Data Science/WineData/wineData_' + \
                    str(start)+'_'+str(stop) + '.csv',
                    columns = data.columns, encoding='utf-8')

## Collect data from www.wine.co.za

Under the assumption that the website was not set up to manage high volumes, I made sure to scrape the sight at intervals a few seconds apart because I didn't want to affect the other users

In [523]:
save_all_htmls(1,44000)

In [524]:
collection_of_data(1, len(os.listdir(
    '~/Google Drive/Data Science/wine_htmls/')))

43999


In [526]:
wines = pd.read_csv(
    '~/Google Drive/Data Science/WineData/wineData_1_44000.csv',
    index_col=0)
wines.head()

Unnamed: 0,@context,@id,@type,alc,category,closure,description,foodnote,image,name,...,pack,rs,size,sku,style,ta,type,variety,winemaker,wood
0,http://schema.org,https://wine.co.za/wine/wine.aspx?WINEID=2,Product,17.0,Muscat dAlexandrie,,,,http://images.wine.co.za/GetWineImage.ashx?Ima...,L Emigre Muscat d Alexandrie 1998,...,,,,2,,,,Muscat dAlexandrie,,
0,http://schema.org,https://wine.co.za/wine/wine.aspx?WINEID=4,Product,12.0,Cabernet Sauvignon,,,,http://images.wine.co.za/GetWineImage.ashx?Ima...,Genesis Cabernet Sauvignon 1997/1998,...,,1.0,,4,,5.0,Red,Cabernet Sauvignon,Chris Kelly,
0,http://schema.org,https://wine.co.za/wine/wine.aspx?WINEID=5,Product,12.0,Cabernet Sauvignon,,"Immense in all proportions, this wine defines ...",,http://images.wine.co.za/GetWineImage.ashx?Ima...,Rustenberg Peter Barlow 1996,...,Bottle,1.0,,5,Dry,6.0,Red,Cabernet Sauvignon,Rod Easthope,wooded
0,http://schema.org,https://wine.co.za/wine/wine.aspx?WINEID=6,Product,13.0,Shiraz,,This is an elegant and flavoursome Shiraz with...,,http://images.wine.co.za/GetWineImage.ashx?Ima...,Henri Roselt Shiraz 1999,...,,2.0,,6,,5.0,Red,Shiraz,Andr van Dyk,
0,http://schema.org,https://wine.co.za/wine/wine.aspx?WINEID=7,Product,12.0,Sauvignon Blanc,,"A well balanced, full tropical flavoured with ...",,http://images.wine.co.za/GetWineImage.ashx?Ima...,Boland Sauvignon Blanc 2000,...,Bottle,4.0,,7,,6.0,White,Sauvignon Blanc,Johan Joubert,


In [527]:
wines.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31972 entries, 0 to 0
Data columns (total 22 columns):
@context       31972 non-null object
@id            31972 non-null object
@type          31972 non-null object
alc            29432 non-null float64
category       31235 non-null object
closure        22665 non-null object
description    27687 non-null object
foodnote       14345 non-null object
image          31972 non-null object
name           31972 non-null object
origin         27051 non-null object
pH             28053 non-null float64
pack           28043 non-null object
rs             28424 non-null float64
size           9698 non-null object
sku            31972 non-null int64
style          18686 non-null object
ta             27871 non-null float64
type           28870 non-null object
variety        30838 non-null object
winemaker      30603 non-null object
wood           13804 non-null object
dtypes: float64(4), int64(1), object(17)
memory usage: 5.6+ MB
