# Maria Yasin
## GPX Data Webscraping Project

In [4]:
#import essential libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
import re
import os
import requests
import gpxpy
import gpxpy.gpx

### Web Scraping

In [5]:
#Extract the hyperlinks for the GPX files using BeautifulSoup
url = 'http://mlg.ucd.ie/modules/python/EVdata'
reqs = requests.get(url)
soup = bs(reqs.text)

#Putting urls in a list
urls = []
for link in soup.find_all('a'):
    urls.append(url + '/' + link.get('href'))
type(urls)

list

In [6]:
urls

['http://mlg.ucd.ie/modules/python/EVdata/ev1.html',
 'http://mlg.ucd.ie/modules/python/EVdata/ev2.html',
 'http://mlg.ucd.ie/modules/python/EVdata/ev3.html',
 'http://mlg.ucd.ie/modules/python/EVdata/ev4.html',
 'http://mlg.ucd.ie/modules/python/EVdata/ev5.html',
 'http://mlg.ucd.ie/modules/python/EVdata/ev6.html',
 'http://mlg.ucd.ie/modules/python/EVdata/ev7.html',
 'http://mlg.ucd.ie/modules/python/EVdata/ev8.html',
 'http://mlg.ucd.ie/modules/python/EVdata/ev9.html',
 'http://mlg.ucd.ie/modules/python/EVdata/ev10.html',
 'http://mlg.ucd.ie/modules/python/EVdata/ev11.html',
 'http://mlg.ucd.ie/modules/python/EVdata/ev12.html',
 'http://mlg.ucd.ie/modules/python/EVdata/ev13.html',
 'http://mlg.ucd.ie/modules/python/EVdata/ev14.html',
 'http://mlg.ucd.ie/modules/python/EVdata/ev15.html',
 'http://mlg.ucd.ie/modules/python/EVdata/ev17.html',
 'http://mlg.ucd.ie/modules/python/EVdata/ev19.html']

In [7]:
#Downloading gpx files to a folder within the directory
URL= 'http://mlg.ucd.ie/modules/python/EVdata/' 
directory_name = '\Python_Assignment2'
order = r'[0-9]?[0-9][.]'
os.mkdir('Python_Assignment2')
path = os.getcwd()+ directory_name

In [8]:
def get_soup(url): 
    return bs(requests.get(url).text, 'html.parser')

In [9]:
count=0
# to iterate through all <a> tags 
for link in get_soup(URL).find_all('a'):
    innerlink = get_soup(URL+link.get('href'))
    file_name = re.sub(order,'', innerlink.find('h1').get_text()).strip().replace(' ','-') + '.gpx' 
    download_link= innerlink.find('a').get('href')
    with open(os.path.join(path, file_name), 'wb') as file: 
        response = requests.get(URL+download_link)
        file.write(response.content) 
        count=count+1
        print('Downladed: ',file_name) 

Downladed:  Atlantic-Coast-Route.gpx
Downladed:  Capitals-Route.gpx
Downladed:  Pilgrims-Route.gpx
Downladed:  Central-Europe-Route.gpx
Downladed:  Via-Romea-(Francigena).gpx
Downladed:  Atlantic-â-Black-Sea.gpx
Downladed:  Sun-Route.gpx
Downladed:  Mediterranean-Route.gpx
Downladed:  Baltic-â-Adriatic.gpx
Downladed:  Baltic-Sea-Cycle-Route.gpx
Downladed:  East-Europe-Route.gpx
Downladed:  North-Sea-Cycle-Route.gpx
Downladed:  Iron-Curtain-Trail.gpx
Downladed:  Waters-of-Central-Europe.gpx
Downladed:  Rhine-Cycle-Route.gpx
Downladed:  Rhone-Cycle-Route.gpx
Downladed:  Meuse-Cycle-Route.gpx


In [10]:
# Open the file in read mode and parse it
#If for any reason an error occurs due to the directory path, please change the path according to your specific directory
files = os.listdir(r'C:\Users\35383\Documents\Python_Assignment2')
gpx_files = [f for f in files if f.endswith('.gpx')]
for gpx_file in gpx_files:
    with open(gpx_file, 'r', encoding ='utf-8') as f:
        gpx = gpxpy.parse(f)

for track in gpx.tracks:
    for segment in track.segments:
        for point in segment.points:
            print('Point at ({0},{1}) -> {2}'.format(point.latitude, point.longitude, point.elevation))

for waypoint in gpx.waypoints:
    print('waypoint {0} -> ({1},{2})'.format(waypoint.name, waypoint.latitude, waypoint.longitude))

for route in gpx.routes:
    print('Route:')
    for point in route.points:
        print('Point at ({0},{1}) -> {2}'.format(point.latitude, point.longitude, point.elevation))

# There are many more utility methods and functions:
# You can manipulate/add/remove tracks, segments, points, waypoints and routes and
# get the GPX XML file from the resulting object:

print('GPX:', gpx.to_xml())

Point at (50.311012560046,17.350070178509) -> 339.5
Point at (50.306256332908,17.342838943005) -> 309.3
Point at (50.30722058705,17.338770031929) -> 311.3
Point at (50.304655724678,17.332625091076) -> 314.2
Point at (50.305728608284,17.329894602299) -> 315.3
Point at (50.302515992435,17.323722839356) -> 319.8
Point at (50.300283723981,17.323579341173) -> 320.0
Point at (50.297253498345,17.318656146526) -> 323.5
Point at (50.292427533773,17.295704483986) -> 359.7
Point at (50.284541168713,17.284618914127) -> 347.6
Point at (50.282153332136,17.283945679665) -> 350.7
Point at (50.282899656795,17.277773916721) -> 368.0
Point at (50.279731297395,17.270380407572) -> 387.4
Point at (50.277331390877,17.257921546698) -> 399.4
Point at (50.273944431442,17.25531578064) -> 390.4
Point at (50.272434347766,17.249854803085) -> 382.4
Point at (50.262075656545,17.243009805679) -> 392.4
Point at (50.260651403558,17.237062007189) -> 395.0
Point at (50.256027945766,17.23202213645) -> 404.4
Point at (50.25

The gpx files give us information about the track name, longitude, latitude and elevation for all the routes.

### Converting the route ev6 to a dataframe

In [11]:
#Converting the route ev6 to a dataframe
#Using the manually downloaded gpx files due to the ease of shorter file names
gpx_file = 'ev6.gpx'
with open(gpx_file) as gpx_file:
    gpx_file = gpxpy.parse(gpx_file)
# Create a dataframe to store information
df_ev6 = pd.DataFrame(columns=['Track Name', 'Length (km)', 'Uphill', 'Downhill'])

# Loop through each track and track segment
for track in gpx_file.tracks:
    for segment in track.segments:
        # Calculate total length
        length = segment.length_2d() / 1000
        
        # Calculate number of uphill and downhill points
        uphill, downhill = 0, 0
        prev_elevation = segment.points[0].elevation
        for point in segment.points:
            if point.elevation > prev_elevation:
                uphill += 1
            elif point.elevation < prev_elevation:
                downhill += 1
            prev_elevation = point.elevation
        
        # Add information to the dataframe
        df_ev6 = df_ev6.append({
            'Track Name':track.name,
            'Length (km)':length,
            'Uphill':uphill,
            'Downhill':downhill
        }, ignore_index=True)

# Show the dataframe
df_ev6

  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
 

  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
  df_ev6 = df_ev6.append({
 

Unnamed: 0,Track Name,Length (km),Uphill,Downhill
0,01: Le Pellerin â€“ Saint-Brevin-les-Pins (Dev...,36.22356,19,16
1,02: Nantes â€“ Le Pellerin (Developed with signs),23.217574,24,20
2,03: Morlaix Train Station â€“ Saint-Florent-le...,51.967279,23,29
3,04: Saint-Florent-le-Vieil â€“ Angers (Develop...,50.928535,28,22
4,05: Angers â€“ Saumur (Developed with signs),56.595694,34,38
...,...,...,...,...
88,89: Kovin â€“ Bela Crvka (Developed with signs),56.288756,23,20
89,90: Bela Crvka â€“ Brnjica (Developed with signs),48.61998,17,22
90,91: Brnjica â€“ Doni Milanovac (Developed with...,43.832472,21,19
91,92: Doni Milanovac â€“ Kladovo (Developed with...,62.189773,26,24


In [12]:
#Cleaning the name column
df_ev6['Track Name'] = df_ev6['Track Name'].replace('â€', '–', regex=True)

In [13]:
df_ev6

Unnamed: 0,Track Name,Length (km),Uphill,Downhill
0,01: Le Pellerin –“ Saint-Brevin-les-Pins (Deve...,36.22356,19,16
1,02: Nantes –“ Le Pellerin (Developed with signs),23.217574,24,20
2,03: Morlaix Train Station –“ Saint-Florent-le-...,51.967279,23,29
3,04: Saint-Florent-le-Vieil –“ Angers (Develope...,50.928535,28,22
4,05: Angers –“ Saumur (Developed with signs),56.595694,34,38
...,...,...,...,...
88,89: Kovin –“ Bela Crvka (Developed with signs),56.288756,23,20
89,90: Bela Crvka –“ Brnjica (Developed with signs),48.61998,17,22
90,91: Brnjica –“ Doni Milanovac (Developed with ...,43.832472,21,19
91,92: Doni Milanovac –“ Kladovo (Developed with ...,62.189773,26,24


### Exploring the different stages in route Eurvelo 6

In [14]:
# get the longest stage in Eurvelo 6
#We use the same name that we gave in the previous part for Eurvelo 6 route i.e. gpx_file
longest_stage = max(gpx_file.tracks, key=lambda t: t.length_3d())

#Print the name of the longest stage
Longest_stage_name = print(longest_stage.name)

37: Tuttlingen â€“ Ulm (Developed with signs)


So, the longest stage of Eurvelo 6 is 37: Tuttlingen – Ulm (Developed with signs).

### The track with the most uphill in route Eurvelo 1

In [15]:
#Eurvelo 1
ev1 = 'ev1.gpx'
with open(ev1) as ev1:
    ev1 = gpxpy.parse(ev1) 
#initialize variables
max_uphill = 0
uphill_track = None

# iterate over tracks in gpx
for track in ev1.tracks:
    
    # count the number of uphill points
    uphill_count = 0
    for segment in track.segments:
        for point in segment.points:
            if point.elevation > 0:
                uphill_count += 1
    
    # compare to max_uphill and update if necessary
    if uphill_count > max_uphill:
        max_uphill = uphill_count
        uphill_track = track.name

# print the track with the most uphill points
print(uphill_track)

054: Inverness â€“ Kingussie (Developed with signs)


The track with the most uphill in Eurvelo 1 is 054: Inverness – Kingussie (Developed with signs).

### The 3 flattest stages  in route Eurvelo 2

In [16]:
#Eurvelo 2
ev2 = 'ev2.gpx'
with open(ev2) as ev2:
    ev2 = gpxpy.parse(ev2) 
    
# Get the three flattest tracks
flattest_tracks = sorted(ev2.tracks, key=lambda x: x.get_uphill_downhill().uphill)[:3]

# Get the names of the flattest tracks
flattest_track_names = [track.name for track in flattest_tracks]

print(flattest_track_names)

['03: Maynooth â€“ Dublin (Developed)', '16: Newbury â€“ Reading (Developed with signs)', '02: Kinnegad â€“ Maynooth (Developed)']


The 3 flattest stages are:

03: Maynooth – Dublin (Developed)

16: Newbury – Reading (Developed with signs)

02: Kinnegad – Maynooth (Developed)

### The 5 most hilliest stages (most uphill) in Eurvelo 1 route

In [17]:
#Get the 5 most hilliest stages(most uphill) in Eurvelo 1 route
#sort the tracks by elevation
tracks_by_elevation = sorted(ev1.tracks, key=lambda t: t.get_uphill_downhill().uphill, reverse=True)

#get the names of the five most hilliest contiguous stages
most_hilly_stages = [track.name for track in tracks_by_elevation[0:5]]

print(most_hilly_stages)

['025: Kilboghavn â€“ Nesna (Developed)', '169: Santesteban â€“ Sarasate (Developed with signs)', '045: Forde â€“ Askvoll (Developed)', '099: Waterville â€“ Kenmare (Developed with signs)', '008: Burfjord â€“ Oksfjordhamn (Developed)']


The 5 hilliest stages in Eurvelo 1 are:

025: Kilboghavn – Nesna (Developed)

169: Santesteban – Sarasate (Developed with signs)

045: Forde – Askvoll (Developed)

099: Waterville – Kenmare (Developed with signs)

008: Burfjord – Oksfjordhamn (Developed)

(Note: These stages have been selected based on the highest elevation and and not the number of total uphills).

### Eurvelo 19 route Track Length

In [18]:
# Load the GPX file
ev19 = open('ev19.gpx')
ev19 = gpxpy.parse(ev19)

# Obtain the gpxpy track length
track_length = ev19.length_3d()/1000

print (track_length)

1103.6077229878247


So, we have calculated that route ev19 is roughly 1103 km long.