# Scraping the Eurovision Song Contest data from 1956 until 2019

In [2]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pprint import pprint
import re
%matplotlib inline

### DataFrame of Eurovision Song Contest Winners per Year (We did not use this list)

In [3]:
# Scrape the first sortable wikitable from Wikipedia's list of Eurovision
# winners, store it as a CSV file and read the file back as a sanity check.
url = 'https://en.wikipedia.org/wiki/List_of_Eurovision_Song_Contest_winners'
res = requests.get(url, timeout=30)  # never hang forever on a stalled connection
res.raise_for_status()  # fail loudly instead of parsing an HTTP error page
soup = BeautifulSoup(res.content, 'html.parser')
table = soup.body.find_all('table', {'class':'wikitable sortable'})[0]

# Every <tr> is flattened to text; the cells of a row are separated by a blank
# line in that text, hence the split on '\n\n'.
rows = [row.text.strip().split('\n\n') for row in table.find_all('tr')]
colnames = rows[0]  # first row holds the column headers
data = rows[1:]
df = pd.DataFrame(data, columns=colnames)

df.to_csv('winners_per_year.csv', index=False)
check = pd.read_csv('winners_per_year.csv')
check.head()

Unnamed: 0,Year,Host City,Date,Winner,Song,Performer(s),Songwriter(s),Language,Points
0,1956,Lugano,24 May,Switzerland,"""Refrain""",Lys Assia,Géo VoumardÉmile Gardaz,French,Not announced
1,1957,Frankfurt,3 March,Netherlands,"""Net als toen""",Corry Brokken,Guus JansenWilly van Hemert,Dutch,31
2,1958,Hilversum,12 March,France,"""Dors, mon amour""",André Claveau,Hubert GiraudPierre Delanoë,French,27
3,1959,Cannes,11 March,Netherlands,"""Een beetje""",Teddy Scholten,Dick SchalliesWilly van Hemert,Dutch,21
4,1960,London,29 March,France,"""Tom Pillibi""",Jacqueline Boyer,André PoppPierre Cour,French,32


### DataFrame of Eurovision Song Contest per Year (I cannot scrape this table)

In [4]:
# Three attempts at locating the voting table on eurovisionworld.com for one
# contest year. Only the last assignment to `table` is displayed by the cell.
year = 2019
url = f'https://eurovisionworld.com/eurovision/{year}'
res = requests.get(url)
soup = BeautifulSoup(res.content, 'html.parser')

# try1: take the whole <main> element
table = soup.body.main

# try2: look for a <div> whose id is "voting_table"
table = soup.body.main.find('div', {'id':'voting_table'})

# try3: look for every <table> whose class is "voting_table"
table = soup.body.main.find_all('table', {'class':'voting_table'})
table

#CANNOT SCRAPE THE TABLE

[]

### Eurovision Song Contest Website (Did not work for all the different years)

In [4]:
# try1: flatten each table row to text and pick the columns by position.
url_end = ['tel-aviv-2019/grand-final', 'lisbon-2018/grand-final', 'kyiv-2017/grand-final', '', 'hilversum-1958/final', 'frankfurt-1957/final', 'lugano-1956/final']
url = f'https://eurovision.tv/event/{url_end[2]}'
res = requests.get(url)
soup = BeautifulSoup(res.content, 'html.parser')
table = soup.body.main.find_all('table', {'class': 'event-table'})[0]

# One list of text fragments per <tr>; the first row is the header.
rows = [tr.text.strip().split('\n') for tr in table.find_all('tr')]
data = rows[1:]

pd.options.display.max_columns = None  # show every column of the wide frame

# Keep only the fragment positions that carry the wanted fields and give them
# readable names; the positions are specific to this page layout.
df = (
    pd.DataFrame(data)[[0,7,15,21,27,30]]
    .head(10)
    .rename(columns={0:'R/O', 7:'Country', 15:'Contestant', 21:'Song', 27:'Points', 30:'Place'})
)
df.head()
# DOES NOT WORK WITH ALL THE URLS

Unnamed: 0,R/O,Country,Contestant,Song,Points,Place
0,1,Israel,IMRI,I Feel Alive,,
1,2,Poland,Kasia Moś,Flashlight,,
2,3,Belarus,Naviband,Story of My Life,,
3,4,Austria,Nathan Trent,Running On Air,,
4,5,Armenia,Artsvik,Fly With Me,,


## Eurovision Song Contest Website (Try2)

In [5]:
# The last URL segment depends on the year: contests from 2004 onwards end in
# '/grand-final', contests before 2004 end in '/final'.
url_end = [
    'tel-aviv-2019/grand-final',
    'lisbon-2018/grand-final',
    'kyiv-2017/grand-final',
    'hilversum-1958/final',
    'frankfurt-1957/final',
    'lugano-1956/final',
]
url = f'https://eurovision.tv/event/{url_end[0]}'
res = requests.get(url)
soup = BeautifulSoup(res.content, 'html.parser')
table = soup.body.main.find_all('table', {'class': 'event-table'})[0]
rows = table.find_all('tr')

# Keep one raw <tr> element to develop the per-column helpers against.
sample_row = rows[2]
sample_row

<tr class="border-b border-grey">
<td>
2
</td>
<td>
<div class="flex items-center">
<a class="text-blue-dark hover:text-blue-darkest font-bold no-underline flex items-center" href="https://eurovision.tv/country/albania">
<div class="flex-no-shrink mr-10">
<img alt="🇦🇱" class="emojione" src="https://cdn.jsdelivr.net/emojione/assets/4.5/png/64/1f1e6-1f1f1.png" title=":flag_al:"/> </div>
<span>Albania</span>
</a>
</div>
</td>
<td>
<div class="flex items-center">
<img class="mr-10 rounded-full h-8 w-8" src="https://static.eurovision.tv/hb-cgi/images/8d9181f7-788b-4844-a6e6-906e0443f747/avatar_small.jpg"/>
<a class="font-bold text-blue-dark hover:text-blue-darkest no-underline" href="https://eurovision.tv/participant/jonida-maliqi">
Jonida Maliqi
</a>
</div>
</td>
<td>
<div class="flex items-center">
Ktheju tokës
<a class="flex items-center" href="https://youtube.com/watch?v=-NAbYUoxIfg">
<svg class="icon ml-10" height="15" viewbox="0 0 20 15" width="20" xmlns="http://www.w3.org/2000/svg"><

### Functions for scraping data of the columns of the Scorecard table

In [12]:
# get R/O
#sample_row.find('td').text.strip()

def get_ro(row):
    """Return the R/O (running order) value of a scorecard row.

    Args:
        row: A parsed ``<tr>`` element exposing ``find``.

    Returns:
        str: Text of the first ``<td>`` in the row, with surrounding
        whitespace removed.
    """
    first_cell = row.find('td')
    return first_cell.text.strip()
    
get_ro(sample_row)

'2'

In [13]:
# get country
#sample_row.span.text

def get_country(row):
    """Return the country name of a scorecard row.

    Args:
        row: A parsed ``<tr>`` element.

    Returns:
        str: Text of the first ``<span>`` inside the row, returned as is
        (no whitespace stripping).
    """
    country_span = row.span
    return country_span.text

get_country(sample_row)

'Albania'

In [14]:
# get contestant
#sample_row.find_all('a')[1].text.strip()

def get_contestant(row):
    """Return the contestant name of a scorecard row.

    Args:
        row: A parsed ``<tr>`` element exposing ``find_all``.

    Returns:
        str: Stripped text of the second ``<a>`` element in the row (the
        first one is the country link in the sample row).

    Raises:
        IndexError: If the row contains fewer than two ``<a>`` elements.
    """
    links = row.find_all('a')
    participant_link = links[1]
    return participant_link.text.strip()

get_contestant(sample_row)

'Jonida Maliqi'

In [15]:
# get song
#sample_row.find_all('div')[3].text.strip()

def get_song(row):
    """Return the song title of a scorecard row.

    Args:
        row: A parsed ``<tr>`` element exposing ``find_all``.

    Returns:
        str: Stripped text of the fourth ``<div>`` element in the row.

    Raises:
        IndexError: If the row contains fewer than four ``<div>`` elements.
    """
    divs = row.find_all('div')
    song_div = divs[3]
    return song_div.text.strip()

get_song(sample_row)

'Ktheju tokës'

In [16]:
# get points
#sample_row.find_all('td', class_='text-right')[0].text.strip()

def get_points(row):
    """Return the points of a scorecard row.

    Args:
        row: A parsed ``<tr>`` element exposing ``find_all``.

    Returns:
        str: Stripped text of the first ``<td>`` carrying the
        ``text-right`` class.

    Raises:
        IndexError: If the row has no such cell.
    """
    right_aligned_cells = row.find_all('td', class_='text-right')
    points_cell = right_aligned_cells[0]
    return points_cell.text.strip()

get_points(sample_row)

'90'

In [17]:
# get place
#sample_row.find_all('td', class_='text-right')[1].text.strip()

def get_place(row):
    """Return the final placing of a scorecard row.

    Args:
        row: A parsed ``<tr>`` element exposing ``find_all``.

    Returns:
        str: Stripped text of the second ``<td>`` carrying the
        ``text-right`` class, e.g. ``'17th'``.

    Raises:
        IndexError: If the row has fewer than two such cells.
    """
    right_aligned_cells = row.find_all('td', class_='text-right')
    place_cell = right_aligned_cells[1]
    return place_cell.text.strip()
    
get_place(sample_row)

'17th'

### Functions for scraping the year and the host city from the urls

In [18]:
# get year
#url

def get_year(url):
    """Return the contest year encoded in an event URL.

    Args:
        url: URL string such as
            ``https://eurovision.tv/event/tel-aviv-2019/grand-final``.

    Returns:
        int: The first run of four consecutive digits found in ``url``.

    Raises:
        IndexError: If ``url`` contains no four-digit run.
    """
    four_digit_runs = re.findall(r'(\d{4})', url)
    first_run = four_digit_runs[0]
    return int(first_run)

get_year(url)

2019

In [19]:
#get host city
#url

def get_host_city(url):
    """Return the host city encoded in an event URL.

    The city is the lowercase, possibly hyphenated slug that directly
    precedes the four-digit year, e.g. ``tel-aviv`` in
    ``https://eurovision.tv/event/tel-aviv-2019/grand-final``.

    Args:
        url: URL string containing a ``<city>-<year>`` segment.

    Returns:
        str: The city slug in title case with hyphens kept, e.g.
        ``'Tel-Aviv'``.

    Raises:
        IndexError: If ``url`` contains no ``<city>-<year>`` segment.
    """
    # Anchor on the trailing year so that hyphenated text elsewhere in the URL
    # (for example in the host name) is never mistaken for the city.
    host_city_pattern = r'([a-z-]*[a-z]+)-\d{4}'
    host_city = re.findall(host_city_pattern, url)[0]
    return host_city.title()
    
get_host_city(url)

'Tel-Aviv'

### Put the scraping functions together and create a dictionary

In [20]:
def create_row(row, url):
    """Assemble one scorecard record from a table row and its page URL.

    Args:
        row: A parsed ``<tr>`` element of the scorecard table.
        url: URL of the page the row came from; the year and host city
            are derived from it.

    Returns:
        dict: Keys ``'R/O'``, ``'Year'``, ``'Host_City'``, ``'Country'``,
        ``'Contestant'``, ``'Song'``, ``'Points'`` and ``'Place'``, in that
        order.
    """
    record = {}
    record['R/O'] = get_ro(row)
    # Contest-level fields come from the URL, not from the row itself.
    record['Year'] = get_year(url)
    record['Host_City'] = get_host_city(url)
    record['Country'] = get_country(row)
    record['Contestant'] = get_contestant(row)
    record['Song'] = get_song(row)
    record['Points'] = get_points(row)
    record['Place'] = get_place(row)
    return record

create_row(sample_row, url)

{'R/O': '2',
 'Year': 2019,
 'Host_City': 'Tel-Aviv',
 'Country': 'Albania',
 'Contestant': 'Jonida Maliqi',
 'Song': 'Ktheju tokës',
 'Points': '90',
 'Place': '17th'}

### Create a sample DataFrame

In [21]:
# rows[0] is the table header, so records are built from rows[1:] only.
contestant_rows = rows[1:]
data = [create_row(tr, url) for tr in contestant_rows]

In [22]:
df_from_data = pd.DataFrame(data)
df_from_data.head()

Unnamed: 0,R/O,Year,Host_City,Country,Contestant,Song,Points,Place
0,1,2019,Tel-Aviv,Malta,Michela,Chameleon,107,14th
1,2,2019,Tel-Aviv,Albania,Jonida Maliqi,Ktheju tokës,90,17th
2,3,2019,Tel-Aviv,Czech Republic,Lake Malawi,Friend of a Friend,157,11th
3,4,2019,Tel-Aviv,Germany,S!sters,Sister,24,25th
4,5,2019,Tel-Aviv,Russia,Sergey Lazarev,Scream,370,3rd


## Put everything together

### 1. Get all the URLs from 1956 until 2019

In [24]:
def get_eurovision_urls():
    """Collect the final-show URLs of the contests listed on eurovision.tv.

    The events overview page is fetched once and every ``<a>`` element with
    the ``text-blue-darkest`` class is read. The matching links are split by
    position into two groups that need different URL suffixes.

    Returns:
        list[str]: Event URLs with ``'/grand-final'`` appended for the first
        group and ``'/final'`` appended for the remaining ones, in page order.

    Raises:
        requests.HTTPError: If the events page answers with an error status.
    """
    urls_source = 'https://eurovision.tv/events'
    res = requests.get(urls_source, timeout=30)  # never hang on a stalled connection
    res.raise_for_status()  # do not parse an HTTP error page as if it were the listing
    soup = BeautifulSoup(res.content, 'html.parser')

    # Query the document once and slice the result instead of searching twice.
    event_links = soup.find_all('a', class_='text-blue-darkest')

    # NOTE(review): the first two matching links are skipped - presumably they
    # are not past contests; confirm against the live page.
    # Positions 2..17 take the '/grand-final' suffix (2019 back to 2004 when
    # the notebook was written), everything after takes '/final'.
    grand_final_links = event_links[2:18]
    final_links = event_links[18:]

    urls = [link['href'] + '/grand-final' for link in grand_final_links]
    urls += [link['href'] + '/final' for link in final_links]
    return urls

In [25]:
urls = get_eurovision_urls()
urls[0]

'https://eurovision.tv/event/tel-aviv-2019/grand-final'

### 2. A function that creates a DataFrame after scraping the URLs

In [28]:
def scrape_eurovision(url):
    """Scrape the scorecard of one eurovision.tv result page.

    Args:
        url: Address of the result page. It is also handed to ``create_row``
            so that the year and host city can be derived from it.

    Returns:
        pandas.DataFrame: One row per ``<tr>`` of the first ``event-table``
        on the page, header row excluded.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If the page contains no ``event-table``.
    """
    res = requests.get(url, timeout=30)  # one stalled page must not block the whole run
    res.raise_for_status()  # fail loudly instead of parsing an HTTP error page
    soup = BeautifulSoup(res.content, 'html.parser')
    table = soup.body.main.find_all('table', {'class': 'event-table'})[0]
    rows = table.find_all('tr')
    # rows[0] is the header row, so only rows[1:] become records.
    data = [create_row(row, url) for row in rows[1:]]
    return pd.DataFrame(data)

In [29]:
# test the function
frame1 = scrape_eurovision(urls[0])

### 3. Run the function for all the URLs

In [30]:
# One DataFrame per contest URL, in the same order as `urls`.
frames = list(map(scrape_eurovision, urls))

### 4. Concat all the DataFrames of the different years

In [31]:
# Stack the per-contest frames into one table with a fresh 0..n-1 index.
result = pd.concat(objs=frames, ignore_index=True)
result.head(n=5)

Unnamed: 0,R/O,Year,Host_City,Country,Contestant,Song,Points,Place
0,1,2019,Tel-Aviv,Malta,Michela,Chameleon,107,14th
1,2,2019,Tel-Aviv,Albania,Jonida Maliqi,Ktheju tokës,90,17th
2,3,2019,Tel-Aviv,Czech Republic,Lake Malawi,Friend of a Friend,157,11th
3,4,2019,Tel-Aviv,Germany,S!sters,Sister,24,25th
4,5,2019,Tel-Aviv,Russia,Sergey Lazarev,Scream,370,3rd


In [32]:
result.tail()

Unnamed: 0,R/O,Year,Host_City,Country,Contestant,Song,Points,Place
1317,10,1956,Lugano,Belgium,Mony Marc,Le Plus Beau Jour De Ma Vie,—,2nd
1318,11,1956,Lugano,Germany,Freddy Quinn,So Geht Das Jede Nacht,—,2nd
1319,12,1956,Lugano,France,Dany Dauberson,Il Est Là,—,2nd
1320,13,1956,Lugano,Luxembourg,Michèle Arnaud,Les Amants De Minuit,—,2nd
1321,14,1956,Lugano,Italy,Tonina Torielli,Amami Se Vuoi,—,2nd


### 5. Save the final result as a CSV file

In [33]:
# Write the combined table without the index column, then read the file back
# to confirm that it round-trips.
result.to_csv(path_or_buf='eurovision_per_year.csv', index=False)
check = pd.read_csv(filepath_or_buffer='eurovision_per_year.csv')
check.head(n=10)

Unnamed: 0,R/O,Year,Host_City,Country,Contestant,Song,Points,Place
0,1,2019,Tel-Aviv,Malta,Michela,Chameleon,107,14th
1,2,2019,Tel-Aviv,Albania,Jonida Maliqi,Ktheju tokës,90,17th
2,3,2019,Tel-Aviv,Czech Republic,Lake Malawi,Friend of a Friend,157,11th
3,4,2019,Tel-Aviv,Germany,S!sters,Sister,24,25th
4,5,2019,Tel-Aviv,Russia,Sergey Lazarev,Scream,370,3rd
5,6,2019,Tel-Aviv,Denmark,Leonora,Love Is Forever,120,12th
6,7,2019,Tel-Aviv,San Marino,Serhat,Say Na Na Na,77,19th
7,8,2019,Tel-Aviv,North Macedonia,Tamara Todevska,Proud,305,7th
8,9,2019,Tel-Aviv,Sweden,John Lundvik,Too Late For Love,334,5th
9,10,2019,Tel-Aviv,Slovenia,Zala Kralj & Gašper Šantl,Sebi,105,15th


In [34]:
check.tail(10)

Unnamed: 0,R/O,Year,Host_City,Country,Contestant,Song,Points,Place
1312,5,1956,Lugano,France,Mathé Altéry,Le Temps Perdu,—,2nd
1313,6,1956,Lugano,Luxembourg,Michèle Arnaud,Ne Crois Pas,—,2nd
1314,7,1956,Lugano,Italy,Franca Raimondi,Aprite Le Finestre,—,2nd
1315,8,1956,Lugano,The Netherlands,Corry Brokken,Voorgoed Voorbij,—,2nd
1316,9,1956,Lugano,Switzerland,Lys Assia,Refrain,—,1st
1317,10,1956,Lugano,Belgium,Mony Marc,Le Plus Beau Jour De Ma Vie,—,2nd
1318,11,1956,Lugano,Germany,Freddy Quinn,So Geht Das Jede Nacht,—,2nd
1319,12,1956,Lugano,France,Dany Dauberson,Il Est Là,—,2nd
1320,13,1956,Lugano,Luxembourg,Michèle Arnaud,Les Amants De Minuit,—,2nd
1321,14,1956,Lugano,Italy,Tonina Torielli,Amami Se Vuoi,—,2nd
