# Pyladies_8_webscraping  <img src="https://raw.githubusercontent.com/pyladies/pyladies-assets/master/geek/png/pylady_geek_partial.png" style="display:inline" width="150"  align="right">
# Task description

    - Visit the website: https://www.boat24.com/en/secondhandboats/
    - Collect the links of the first page into a list
    - Write a function that collects one ship's data in a dictionary and returns it
    - Map your function to the list of links (only the first 5 elements are enough), 
    - then make a table from the result
    - Bonus task:
        - Click on the second result page, see how the website link changes
        - Collect the links of all the ships on the first 5 pages and download their data
        - Data cleaning can be done after data collection
        - Create some diagrams with the plotnine package -- do EDA on the data
    - While working on the solution, follow the Clean Code principles as closely as you can

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd


# auxiliary functions for scraping
def read_html(link, timeout=30):
    """Download a page and parse it into a BeautifulSoup object.

    Parameters
    ----------
    link : str
        URL of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block forever on an unresponsive server.

    Returns
    -------
    BeautifulSoup
        The parsed document.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so that an error page is
        never parsed as if it were a real listing.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    response = requests.get(link, timeout=timeout)
    # fail loudly on error responses instead of scraping an error page
    response.raise_for_status()
    # 'html' is a markup type, not a parser name: BeautifulSoup picks an installed HTML parser
    return BeautifulSoup(response.text, 'html')

def get_texts(link_nodes):
    """Return the ``.string`` attribute of every node, in order.

    An entry is ``None`` whenever the node's ``.string`` is ``None``.
    """
    texts = []
    for node in link_nodes:
        texts.append(node.string)
    return texts

def clean_lower(my_texts):
    """Lower-case and strip every text; replace ``None`` entries with the string 'None'."""
    cleaned = []
    for text in my_texts:
        if text is None:
            # keep a placeholder so the list length (and any positional pairing) is preserved
            cleaned.append('None')
        else:
            cleaned.append(text.lower().strip())
    return cleaned

## Collect links from the first page

In [2]:
# download the first result page and parse it into a soup
first_page_url = 'https://www.boat24.com/en/secondhandboats/'
soup = read_html(first_page_url)

In [3]:
# find the elements whose class contains the ship links, then read each link's href
class_elements = soup.find_all(class_='blurb__title')
links = []
for element in class_elements:
    anchor = element.find('a')
    links.append(anchor['href'])
links

['https://www.boat24.com/en/powerboats/luhrs/luhrs-240-tournament-open/detail/564259/',
 'https://www.boat24.com/en/sailingboats/dufour/dufour-32-classic/detail/564635/',
 'https://www.boat24.com/en/powerboats/bayliner/bayliner-175/detail/562563/',
 'https://www.boat24.com/en/powerboats/jeanneau/jeanneau-merry-fisher-895-offshore/detail/552146/',
 'https://www.boat24.com/en/sailingboats/hallberg-rassy/hallberg-rassy-36mkii/detail/544578/',
 'https://www.boat24.com/en/powerboats/terhi/terhi-sea-fun/detail/565148/',
 'https://www.boat24.com/en/powerboats/inter/inter-7700-nor-line-dutch-edition/detail/557752/',
 'https://www.boat24.com/en/powerboats/maxum/maxum-2400-sr/detail/565291/',
 'https://www.boat24.com/en/sailingboats/amel/amel-maramu/detail/565290/',
 'https://www.boat24.com/en/sailingboats/dehler/dehler-36-cws/detail/565204/',
 'https://www.boat24.com/en/powerboats/cigala-bertinetti/cigala-bertinetti-shark-45/detail/565287/',
 'https://www.boat24.com/en/sailingboats/albin/albin-

## Write a function that collects one ship's data in a dictionary and returns it

In [4]:
# Why the keys are fixed in advance:
# - when the df was first built, the values of some rows were shifted compared to the others,
#   probably because some ship pages carry no information for certain keys;
# - in retrospect, the .select() used for the keys / values did not scrape the ship price correctly;
# - some extra information about each ship is useful: name, price, location, equipments.
# So the list of desired keys is defined here (based on the keys found on the first 23 boats),
# and every ship's dict is guaranteed to contain each of them, even if only with an empty value.

expected_keys = [
    'name',
    'price',
    'location',
    'equipments',
    'year built',
    'condition',
    'length x beam',
    'draught',
    'displacement',
    'material',
    'hull type',
    'ce design category',
    'certified no. of persons',
    'headroom',
    'no. of cabins',
    'no. of berths',
    'no. of bathrooms',
    'fresh water tank',
    'propulsion',
    'engine',
    'engine performance',
    'fuel type',
    'engine hours',
    'mainsail',
    'jib',
    'genoa',
    'spinnaker',
    'hull color',
    'keel type',
    'toilets',
    'showers',
    'engine year',
    'incl. vat',
    'excl. vat',
    'steering',
    'holding tank',
]

def get_one_ship(link):
    """Scrape one ship's detail page into a flat dictionary.

    Parameters
    ----------
    link : str
        URL of the ship's detail page.

    Returns
    -------
    dict
        Contains every key from ``expected_keys`` (value ``'NaN'`` when the
        page offers nothing for it) plus any additional key found in the
        page's key/value list. The list keys and values are lower-cased and
        stripped.

    Notes
    -----
    Every page element is optional: when the name, price, location or
    equipment element is missing, the ``'NaN'`` placeholder is kept instead of
    raising ``AttributeError``, so one incomplete page cannot abort a long
    scraping run.
    """
    boat_info = read_html(link)

    # start from all expected keys so every ship shares the same base columns
    data = {key: 'NaN' for key in expected_keys}

    # name and price: only overwrite the placeholder when the element exists
    name_node = boat_info.select_one('.heading__title span')
    if name_node is not None:
        data['name'] = name_node.get_text(strip=True)

    price_node = boat_info.select_one('.contact-box__price')
    if price_node is not None:
        data['price'] = price_node.get_text(strip=True)

    # location: both the section and the text node inside it may be absent
    location_section = boat_info.find(id='location')
    if location_section is not None:
        location_text = location_section.find(class_='text')
        if location_text is not None:
            data['location'] = location_text.text

    # equipments: join all list items into one comma-separated string
    equipments_list = boat_info.select_one('#equipment ul')
    if equipments_list is not None:
        equipments_items = [li.get_text(strip=True) for li in equipments_list.find_all('li')]
        data['equipments'] = ', '.join(equipments_items)

    # remaining data: keys and values are paired purely by their position on the page
    keys = clean_lower(get_texts(boat_info.select('.list__key')))
    values = clean_lower(get_texts(boat_info.select('.list__value')))
    data.update(zip(keys, values))

    # remove the keys that are known to be scraped badly
    for bad_key in ('basis for negotiation', 'basis for negotiation, eu taxes paid'):
        data.pop(bad_key, None)

    return data

In [5]:
# try the function on a single ship page
sample_ship_link = 'https://www.boat24.com/en/sailingboats/bavaria/bavaria-cruiser-50/detail/563671/'
get_one_ship(sample_ship_link)

{'name': 'Bavaria Cruiser 50',
 'price': 'EUR 194.500,-',
 'location': 'Spain » Mallorca » Palma',
 'equipments': 'Shore Connection, Battery, Battery Charger, Inverter, Fire Extinguisher, Gas Detector, Anchor, Bow Anchor Capstan, Bilge Pump, Life Raft, Navigation Lights, Bow Thruster, Water Pressure System, Warm Water System, Board Computer, Compass, GPS, Autopilot, VHF Radio, Radar, Radar Reflector, Chartplotter, Wind Instrument, Depth Instrument, Speed Instrument, AIS, EPIRB, MOB System, Microwave, Oven, Sink, Cool Box, Fridge, Gas Stove, Sprayhood, Bimini Top, Swim Ladder, Bathing platform electric, Gangway, Deck Shower, Heating, Radio, Satellite Phone, Cockpit Table, Underwater Paint',
 'year built': '2012',
 'condition': 'good condition',
 'length x beam': '14.99 m x 4.67 m',
 'draught': '1.91 m',
 'displacement': "14'600 kg",
 'material': 'grp',
 'hull type': 'NaN',
 'ce design category': 'a - ocean',
 'certified no. of persons': '12 persons',
 'headroom': '211 cm',
 'no. of cabi

## Map the get_one_ship function to the list of links and make a DF

In [6]:
# scrape only the first five ships and build a table with one row per ship
first_five_ships = [get_one_ship(link) for link in links[:5]]
df = pd.DataFrame(first_five_ships)
df.head()

Unnamed: 0,name,price,location,equipments,year built,condition,length x beam,draught,displacement,material,...,keel type,toilets,showers,engine year,incl. vat,excl. vat,steering,holding tank,None,eu taxes paid
0,Luhrs 240 Tournament open,"EUR 28.000,-",Spain » Islas Canarias » Puerto Colón,"Shore Connection, Battery, Anchor, Navigation ...",1992,good condition,7.48 m x 2.86 m,,3'400 kg,carbon fiber,...,,,,,,,,,,
1,Dufour 32 Classic,Under Offer,Netherlands » Fryslân » Sloten Friesland.,"Shore Connection, Battery, Battery Charger, Fi...",2004,very good condition,9.96 m x 3.30 m,1.25 m,4'430 kg,grp,...,fin keel,,,,,,tiller steering,60 l waste water,under offer,
2,Bayliner 175,"EUR 8.600,-",Norway » Østlandet » Moss,"Battery, Bilge Pump, Navigation Lights, Depth ...",2005,very good condition,5.33 m x 2.13 m,0.50 m,870 kg,,...,,,,,,,,,"eur 8.600,-",
3,Jeanneau Merry Fisher 895 Offshore,"EUR 169.000,-",Germany » Baden-Württemberg » Kembs,"Shore Connection, Battery, Battery Charger, In...",2020,very good condition,9.07 m x 2.99 m,0.64 m,4'500 kg,grp,...,,,,,,,,80 l waste water,,"eur 169.000,-"
4,Hallberg-Rassy 36MKII,Price on Request,Italy » Liguria » Sanremo,,2001,very good condition,11.00 m,,,,...,,,,,,,,,,price on request


## Collect the links of all the ships on the first 5 pages and download their data

        1. page: https://www.boat24.com/en/secondhandboats/
        2. page: https://www.boat24.com/en/secondhandboats/?page=20
        3. page: https://www.boat24.com/en/secondhandboats/?page=40
        4. page: https://www.boat24.com/en/secondhandboats/?page=60
        5. page: https://www.boat24.com/en/secondhandboats/?page=80

In [7]:
# collect the ship links from the first 5 result pages

base_url = "https://www.boat24.com/en/secondhandboats/"
pages = [0, 20, 40, 60, 80]

five_page_links = []

for page in pages:
    # the first page has no query string; later pages are addressed as ?page=<offset>
    page_url = base_url if page == 0 else f"{base_url}?page={page}"

    # Loop-local names on purpose: reusing `soup`, `class_elements` and `links` here
    # would overwrite the first-page results created by the earlier cells, so
    # re-running those cells afterwards would silently work on different ships.
    page_soup = read_html(page_url)
    page_titles = page_soup.find_all(class_='blurb__title')
    five_page_links.extend(title.find('a')['href'] for title in page_titles)

In [8]:
# sanity check: each result page lists 23 ships, so 5 pages should yield 5 * 23 links
ships_per_page = 23
page_count = 5
len(five_page_links) == page_count * ships_per_page

True

In [9]:
# scrape every collected link and build a table with one row per ship
all_ships = [get_one_ship(link) for link in five_page_links]
df = pd.DataFrame(all_ships)
print(f'Number of rows and columns: {df.shape}')
df.head()

Number of rows and columns: (115, 50)


Unnamed: 0,name,price,location,equipments,year built,condition,length x beam,draught,displacement,material,...,"basis for negotiation, when importing into the eu, customs and import sales tax may be charged","when importing into the eu, customs and import sales tax may be charged",None,ballast,boat name,gennaker,starting price,"incl. vat, eu taxes paid",weight,incl. 7.7% swiss vat
0,Azimut 46 Evolution 2007,"EUR 390.000,-",Croatia » Primorje-Gorski kotar » Punat,"Shore Connection, Generator, Battery, Battery ...",2007,very good condition,14.60 m x 4.41 m,1.20 m,,grp,...,,,,,,,,,,
1,Sunseeker Portofino 47,"EUR 360.000,-",Germany » Lübecker Bucht » Neustadt,"Shore Connection, Generator, Battery, Battery ...",2008,very good condition,14.96 m x 4.29 m,1.20 m,16'400 kg,grp,...,,,,,,,,,,
2,Bénéteau Antares 7 OB V2,"EUR 81.150,-",Italy » Marche » Marina Dorica - Ancona,"Stove, Gas Stove, Sink, Fridge, Compass, GPS, ...",2023,,7.48 m x 2.53 m,0.80 m,1'651 kg,grp,...,,,,,,,,,,
3,Jeanneau Merry Fisher 895 Offshore,"EUR 169.000,-",Germany » Baden-Württemberg » Kembs,"Shore Connection, Battery, Battery Charger, In...",2020,very good condition,9.07 m x 2.99 m,0.64 m,4'500 kg,grp,...,,,,,,,,,,
4,Hallberg-Rassy 36MKII,Price on Request,Italy » Liguria » Sanremo,,2001,very good condition,11.00 m,,,,...,,,,,,,,,,


In [10]:
# The site regenerates / rearranges its listings on every new request, so re-running
# the cells above is not guaranteed to return the same ships that are analysed now.
# To keep working with exactly this df later, save a snapshot of it to a csv file.

csv_path = 'boat24_five_page_ships.csv'
df.to_csv(csv_path, index=False)
