# For letters

In [1]:
import pandas as pd # library for data analysis
import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML documents
import re
import shutil

In [15]:
ids = [] # id on web is id+1
headers = []
contents = []
sketches = []

#current_id = 1
for i in range(1, 903):
    try:
        ids.append(i)
    
        letter_url = "https://vangoghletters.org/vg/letters/let" + str(i).zfill(3) + "/letter.html"
        response = requests.get(letter_url)
        soup = BeautifulSoup(response.text, 'html.parser')
    
        if_sketch = False
        if soup.find_all('i', text = 'sketch'):
            if_sketch = True
        sketches.append(if_sketch)
        
    # remove markup that is not refering to a person
        r1 = soup.find_all("span", {"class": re.compile("^pers")})
        r2 = soup.find_all("span")
        
        for m in r2:
            if m not in r1:
                m.extract()

        this_header = soup.find('div',{'id':"header"}).find('h2').text
        headers.append(this_header)
    
        this_text = ''
        for paragraph in soup.find('div',{'id':"tab-container-1"}).find_all('div',{'class':'p'}):
            this_text += paragraph.text
            this_text += ' '
        
    # clean text
        this_text = re.sub(r'\[.*?\]+', '', this_text)
        this_text = this_text.replace('\n', '')
        contents.append(this_text)
    
    except:
        print(i)


In [14]:
# testing
value = '<div><p>Hello <span class = "pers"><a>there<a></span> my friend!<span><a>233</a></span></p></div>'
soup = BeautifulSoup(value)

r1 = soup.find_all("span", {"class": re.compile("^pers")})
r2 = soup.find_all("span")

for item in r2:
    if item not in r1:
        item.extract()
        
for s in soup.select('span'):
    # only remove the markup that is not refering to a person 
    # soup.find_all('div',{'class':re.compile("^span3")})
    print(s.text)
    
this_text = ''
for paragraph in soup.find('div').find_all('p'):
    this_text += paragraph.text
    this_text += ' '
print(this_text)

there
Hello there my friend! 


In [16]:
letter_d = {'id': ids, 'header': headers, 'content':contents, 'sketch':sketches}
letter_raw = pd.DataFrame(data=letter_d)
letter_raw.to_csv('letter_raw.csv',index=False)

In [17]:
letter_raw.head()

Unnamed: 0,id,header,content,sketch
0,1,"To Theo van Gogh. The Hague, Sunday, 29 Septem...","The Hague, 29 September 1872. My dear Theo, Th...",False
1,2,"To Theo van Gogh. The Hague, Friday, 13 Decemb...","The Hague, 13 December 1872. My dear Theo, Tha...",False
2,3,"To Theo van Gogh. The Hague, mid-January 1873.","The Hague, January 1873 My dear Theo, I heard ...",False
3,4,"To Theo van Gogh. The Hague, Tuesday, 28 Janua...","The Hague, 28 Jan. 1873 My dear Theo, It’s goo...",False
4,5,"To Theo van Gogh. The Hague, Monday, 17 March ...","The Hague, 17 March 1873 My dear Theo, It’s ti...",False


In [18]:
letter_raw['sketch'].value_counts()

False    766
True     136
Name: sketch, dtype: int64

## Collect info from letter_raw

Ectract from, to, place, date from header info.

In [19]:
letter_from = []
letter_to = []
place = []
date = []

for index, row in letter_raw.iterrows():
    from_to = row['header'].split('. ')[0]
    place_time = row['header'].split('. ')[-1]
    place.append(place_time.split(', ')[0])
    date.append(place_time.split(', ')[-1])# string need to convert to time format
    
    if(from_to.startswith('To')):
        letter_to.append(from_to.split('To ')[-1])
        letter_from.append('Vincent van Gogh')
    else:
        letter_to.append(from_to.split(' to ')[-1])
        letter_from.append(from_to.split(' to ')[0])

In [20]:
letter_raw['from'] = letter_from
letter_raw['to'] = letter_to
letter_raw['place'] = place
letter_raw['date'] = date

In [21]:
letter_raw.to_csv('letter_raw.csv',index=False)

# For paintings

In [15]:
# get links to all of his artwork from wikiart
urls = []

artwork_url = 'https://www.wikiart.org/en/vincent-van-gogh/all-works/text-list'
response = requests.get(artwork_url)
soup = BeautifulSoup(response.text, 'html.parser')
    

for li in soup.find_all('li', {'class':'painting-list-text-row'}):
    url = li.find('a')['href']
    urls.append('https://www.wikiart.org/'+url)

In [17]:
urls[100]

'https://www.wikiart.org//en/vincent-van-gogh/young-woman-sewing-1881'

In [25]:
# get labels and painting from each url, consider only oil paintings
names = []
dates = [] # may contain year and location
styles = []
genres = []
#medias = []
painting_url = []

for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # check if media contain oil/canvas
    if(not soup.find('s', text = 'Media:')):
        continue
        
    main_content = soup.find('div', {'class':'wiki-layout-artist-info wiki-layout-artwork-info'})
    media_c = main_content.find('s', text = 'Media:').parent.text
    if('oil' in media_c or 'canvas' in media_c):
        # get img url
        aside = main_content.find('aside')
        painting_url.append(aside.find('img')['src'].split('!')[0])
        
        # get artical
        artical = main_content.find('article')
        names.append(artical.find('h3').text)
        
        for li in artical.find_all('li'):
            if(li.find('s')):
                if('Date:' in li.find('s').text):
                    dates.append(li.find('span').text)
                if('Style:' in li.find('s').text):
                    styles.append(li.find('span').text)
                if('Genre:' in li.find('s').text):
                    genres.append(li.find('span').text)

In [38]:
# extract year and place from header
places = []
years = []
for date in dates:
    place = None
    if(len(date.split('; '))>1):
        place = date.split('; ')[-1]
    year = date.split('; ')[0].split('.')[-1]
    places.append(place)
    years.append(year)

In [40]:
painting_d = {'name': names, 'year': years, 'place':places, 'style':styles, 'genre':genres, 'painting_url':painting_url}
paintings = pd.DataFrame(data=painting_d)
paintings.to_csv('paintings.csv')

In [41]:
paintings.head()

Unnamed: 0,name,year,place,style,genre,painting_url
0,Still Life with Beer Mug and Fruit,1881,Netherlands,\nRealism\n,\nstill life\n,https://uploads0.wikiart.org/images/vincent-va...
1,Still Life with Cabbage and Clogs,1881,"Haag / Den Haag / La Haye / The Hague, Netherl...",\nRealism\n,\nstill life\n,https://uploads8.wikiart.org/images/vincent-va...
2,Windmils at Dordrecht,1881,Netherlands,\nRealism\n,\nlandscape\n,https://uploads5.wikiart.org/images/vincent-va...
3,Beach at Scheveningen in Calm Weather,1882,"Haag / Den Haag / La Haye / The Hague, Netherl...",\nRealism\n,\nlandscape\n,https://uploads6.wikiart.org/images/vincent-va...
4,Cluster of Old Houses with the New Church in T...,1882,"Haag / Den Haag / La Haye / The Hague, Netherl...",\nRealism\n,\ncityscape\n,https://uploads5.wikiart.org/images/vincent-va...


In [42]:
paintings['painting_url'][0]

'https://uploads0.wikiart.org/images/vincent-van-gogh/still-life-with-beer-mug-and-fruit-1881.jpg'

In [47]:
# download images
for index, row in paintings.iterrows():
    filename = row['year']+'_'+row['painting_url'].split("/")[-1] #####

    response = requests.get(row['painting_url'], stream = True)

    # check if the image was retrieved successfully
    if response.status_code == 200:
        # Set decode_content value to True, otherwise the downloaded image file's size will be zero.
        response.raw.decode_content = True    
        
        with open(filename,'wb') as f:
            shutil.copyfileobj(response.raw, f)
            
    else:
        print(filename)