# Data rework

An attempt to rebuild all of the data cleanly, starting from the original sources.

## Voting scores

Start with this. 
 - 1975-2019 data from Kaggle: https://www.kaggle.com/datasets/datagraver/eurovision-song-contest-scores-19752019
 - 2020 was cancelled
 - 2021, 2022 scraped from Wikipedia

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
## Load the 1975-2019 voting spreadsheet.

votes_1975_2019 = pd.read_excel("./data/eurovision_song_contest_1975_2019.xlsx")

# Normalise the headers first: trim whitespace, lower-case, underscores instead of spaces
normalised_headers = []
for header in votes_1975_2019.columns:
    normalised_headers.append(header.strip().lower().replace(' ', '_'))
votes_1975_2019.columns = normalised_headers

print(votes_1975_2019.shape)
votes_1975_2019.head()

(49832, 8)


Unnamed: 0,year,(semi-)_final,edition,jury_or_televoting,from_country,to_country,points,duplicate
0,1975,f,1975f,J,Belgium,Belgium,0,x
1,1975,f,1975f,J,Belgium,Finland,0,
2,1975,f,1975f,J,Belgium,France,2,
3,1975,f,1975f,J,Belgium,Germany,0,
4,1975,f,1975f,J,Belgium,Ireland,12,


In [3]:
## Restrict the dataset to the rows and columns used from here on.

# Keep votes cast in a final only, for 1998 and every later year
is_final = votes_1975_2019['(semi-)_final'] == 'f'
is_1998_or_later = votes_1975_2019['year'] >= 1998
votes_1998_2019 = votes_1975_2019[is_final & is_1998_or_later]

# Keep just the columns that are needed
columns_to_keep = ["year", "from_country", "to_country", "points", "jury_or_televoting"]
votes_1998_2019 = votes_1998_2019[columns_to_keep]
# Clean up country names
def standardise_country(c):
    """Return the lower-cased country name `c` with this notebook's naming fixes applied.

    Every (old, new) pair below is applied, in the order listed, as a plain
    substring replacement on the lower-cased name.
    """
    substitutions = (
        ('-', ' '),
        ('&', 'and'),
        ('netherands', 'netherlands'),
        # FYR Macedonia was formally renamed as North Macedonia in 2019
        ('f.y.r. macedonia', 'north macedonia'),
        ('russia', 'russian federation'),
        ('the netherlands', 'netherlands'),
        ('czech republic', 'czechia'),
        # Yugoslavia dissolved in 2002; most of it became 'Serbia and Montenegro', until 2006,
        # when Serbia and Montenegro split ways.
        ('serbia and montenegro', 'yugoslavia'),
        ('moldova', 'moldova, republic of'),
    )
    name = c.lower()
    for old, new in substitutions:
        name = name.replace(old, new)
    return name
# Standardise both country columns. `assign` returns a new DataFrame, so nothing
# is written into a frame that was produced by filtering another one (item
# assignment on such a frame can trigger pandas' SettingWithCopyWarning).
votes_1998_2019 = votes_1998_2019.assign(
    from_country=votes_1998_2019['from_country'].map(standardise_country),
    to_country=votes_1998_2019['to_country'].map(standardise_country),
)

# Drop rows which correspond to the same vote (there are two Belarus -> Russia in 2019, for example)
votes_1998_2019 = votes_1998_2019.drop_duplicates(subset=['year', 'from_country', 'to_country', 'jury_or_televoting'])

# Drop Lithuania in 2003 (they didn't participate - I don't know why it's still in the dataset)
votes_1998_2019 = votes_1998_2019[~((votes_1998_2019['to_country'] == 'lithuania') & (votes_1998_2019['year'] == 2003))]

# Drop "votes" from a country to itself
votes_1998_2019 = votes_1998_2019[votes_1998_2019['from_country'] != votes_1998_2019['to_country']]

votes_1998_2019.sample(n=10)


Unnamed: 0,year,from_country,to_country,points,jury_or_televoting
26691,2011,austria,slovenia,3,J
40043,2017,austria,italy,6,T
13443,2003,poland,bosnia and herzegovina,0,J
49715,2019,montenegro,slovenia,0,J
23612,2009,hungary,estonia,6,J
12835,2002,malta,austria,0,J
21624,2008,finland,greece,0,J
36332,2016,united kingdom,bulgaria,5,J
31014,2013,"moldova, republic of",belarus,4,J
23714,2009,latvia,france,5,J


In [4]:
## Now we need to fetch some data from Wikipedia for the 2021 and 2022 contests.

import requests
from bs4 import BeautifulSoup

def import_votes_from_wp(year: int, jury_table_id: int = 16,
                         televoting_table_id: int = 17,
                         timeout: float = 30.0) -> pd.DataFrame:
    """Scrape the jury and televoting scores for `year` from English Wikipedia.

    Args:
        year: Contest year; used to build the article URL.
        jury_table_id: Position of the jury results table among the page's
            "wikitable" tables.
        televoting_table_id: Position of the televoting results table among
            the page's "wikitable" tables.
        timeout: Seconds to wait for Wikipedia before giving up.

    Returns:
        A long-form DataFrame with the columns year, from_country, to_country,
        points and jury_or_televoting ('J' or 'T'). Jury rows come first and
        each part keeps its own 0-based index.

    Raises:
        requests.HTTPError: If Wikipedia answers with an error status.
        IndexError: If the page has no "wikitable" at a requested position.

    NOTE(review): the default table positions are the ones this notebook uses
    for the 2021 and 2022 articles; they depend on the page layout, so confirm
    them before using this for another year.
    """
    from io import StringIO  # only needed here, to feed markup to read_html

    url = f"https://en.wikipedia.org/wiki/Eurovision_Song_Contest_{year}#Final_2"
    # Fail fast on a hung connection or an error page instead of parsing garbage.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table', {'class': "wikitable"})

    def parse_table_from_id(wp_id: int, jury_or_tele: str) -> pd.DataFrame:
        """Convert the table at position `wp_id` into long-form rows tagged `jury_or_tele`."""
        if wp_id >= len(tables):
            raise IndexError(
                f"Expected a wikitable at position {wp_id} on {url}, "
                f"but the page only has {len(tables)}."
            )
        # Wrap the markup in StringIO: passing a literal HTML string to
        # read_html is deprecated in recent pandas releases.
        df_table = pd.read_html(StringIO(str(tables[wp_id])))[0]

        # remove redundant rows/columns
        df_table = df_table.drop(df_table.columns[[0, 2, 3, 4]], axis=1)
        df_table = df_table.drop(df_table.index[[0, 2]], axis=0)

        # set the index to the first column
        df_table = df_table.set_index(df_table.columns[0])

        # set the column names as the first row
        df_table.columns = df_table.iloc[0]
        df_table = df_table.drop(df_table.index[0])

        # replace NaN with 0
        df_table = df_table.fillna(0)

        # squash the column index with stack: one row per (table row, table column) pair
        df_table = df_table.stack().reset_index()

        df_table.columns = ['to_country', 'from_country', 'points']
        df_table['jury_or_televoting'] = jury_or_tele
        df_table['year'] = year

        # re-order the columns to match the original data; copy so that the
        # assignments below write to an independent frame
        df_table = df_table[['year', 'from_country', 'to_country', 'points', 'jury_or_televoting']].copy()

        df_table['points'] = df_table['points'].astype(int)

        # Clean up countries as before
        for column in ['from_country', 'to_country']:
            df_table[column] = df_table[column].map(standardise_country)

        return df_table

    jury_table = parse_table_from_id(jury_table_id, jury_or_tele='J')
    tele_table = parse_table_from_id(televoting_table_id, jury_or_tele='T')
    return pd.concat([jury_table, tele_table])

# Append the scraped 2021 and 2022 results to the 1998-2019 data.
scraped_votes = [import_votes_from_wp(wp_year) for wp_year in (2021, 2022)]
votes_1998_2022 = pd.concat([votes_1998_2019, *scraped_votes])
votes_1998_2022.sample(n=10)

Unnamed: 0,year,from_country,to_country,points,jury_or_televoting
325,2021,slovenia,united kingdom,0,J
14153,2004,israel,spain,8,J
27620,2011,netherlands,ukraine,0,J
914,2021,"moldova, republic of",italy,8,T
30392,2013,armenia,denmark,4,J
38986,2017,azerbaijan,ukraine,7,J
30718,2013,france,norway,0,J
44180,2018,hungary,italy,6,T
21650,2008,france,iceland,0,J
843,2022,malta,united kingdom,12,T


In [5]:
# This cell is a sanity check to make sure that all countries participating in a given year got the same number of votes.
# We hope to see the 'is_consistent' column be True for all years in the output.

def all_entries_same(arr : np.ndarray) -> bool:
    """Return True if all non-NaN entries of `arr` have the same value.

    An array without any non-NaN entry (empty, or NaN only) has nothing that
    could disagree, so it counts as consistent and yields True.
    """
    values = arr[~np.isnan(arr)]
    if values.size == 0:
        # Nothing to compare; indexing values[0] would raise IndexError.
        return True
    # Convert numpy's bool_ to the plain bool promised by the annotation.
    return bool(np.all(values == values[0]))

# Count how many scores each country received per year, then spread the counts out
# so that there is one row per year and one column per receiving country
score_counts = votes_1998_2022.groupby(by=['year', 'to_country'])['points'].count()
grouped_votes = score_counts.reset_index().pivot(index="year", columns="to_country", values="points")
# Work out the final column order ("is_consistent" first) before adding that column
col_names = grouped_votes.columns
new_col_names = ["is_consistent", *col_names]
grouped_votes["is_consistent"] = grouped_votes.apply(all_entries_same, axis=1, raw=True)
# Show the whole table, without truncating rows or columns
show_everything = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*show_everything):
    display(grouped_votes.reindex(columns=new_col_names))

# If is_consistent is False for some year, the entries in that row show where the discrepancy lies.
# That's how I found Lithuania 2003, at least. is_consistent = True is no guarantee that everything
# is *correct*, but it at least increases our confidence, I think.

# Perfect...!

to_country,is_consistent,albania,armenia,australia,austria,azerbaijan,belarus,belgium,bosnia and herzegovina,bulgaria,croatia,cyprus,czechia,denmark,estonia,finland,france,georgia,germany,greece,hungary,iceland,ireland,israel,italy,latvia,lithuania,malta,"moldova, republic of",montenegro,netherlands,north macedonia,norway,poland,portugal,romania,russian federation,san marino,serbia,slovakia,slovenia,spain,sweden,switzerland,turkey,ukraine,united kingdom,yugoslavia
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1
1998,True,,,,,,,24.0,,,24.0,24.0,,,24.0,24.0,24.0,,24.0,24.0,24.0,,24.0,24.0,,,,24.0,,,24.0,24.0,24.0,24.0,24.0,24.0,,,,24.0,24.0,24.0,24.0,24.0,24.0,,24.0,
1999,True,,,,22.0,,,22.0,22.0,,22.0,22.0,,22.0,22.0,,22.0,,22.0,,,22.0,22.0,22.0,,,22.0,22.0,,,22.0,,22.0,22.0,22.0,,,,,,22.0,22.0,22.0,,22.0,,22.0,
2000,True,,,,23.0,,,23.0,,,23.0,23.0,,23.0,23.0,23.0,23.0,,23.0,,,23.0,23.0,23.0,,23.0,,23.0,,,23.0,23.0,23.0,,,23.0,23.0,,,,,23.0,23.0,23.0,23.0,,23.0,
2001,True,,,,,,,,22.0,,22.0,,,22.0,22.0,,22.0,,22.0,22.0,,22.0,22.0,22.0,,22.0,22.0,22.0,,,22.0,,22.0,22.0,22.0,,22.0,,,,22.0,22.0,22.0,,22.0,,22.0,
2002,True,,,,23.0,,,23.0,23.0,,23.0,23.0,,23.0,23.0,23.0,23.0,,23.0,23.0,,,,23.0,,23.0,23.0,23.0,,,,23.0,,,,23.0,23.0,,,,23.0,23.0,23.0,23.0,23.0,,23.0,
2003,True,,,,25.0,,,25.0,25.0,,25.0,25.0,,,25.0,,25.0,,25.0,25.0,,25.0,25.0,25.0,,,,25.0,,,25.0,,25.0,25.0,25.0,25.0,25.0,,,,25.0,25.0,25.0,,25.0,25.0,25.0,
2004,True,35.0,,,35.0,,,35.0,35.0,,35.0,35.0,,,,,35.0,,35.0,35.0,,35.0,35.0,,,,,35.0,,,35.0,35.0,35.0,35.0,,35.0,35.0,,,,,35.0,35.0,,35.0,35.0,35.0,35.0
2005,True,38.0,,,,,,,38.0,,38.0,38.0,,38.0,,,38.0,,38.0,38.0,38.0,,,38.0,,38.0,,38.0,38.0,,,38.0,38.0,,,38.0,38.0,,,,,38.0,38.0,38.0,38.0,38.0,38.0,38.0
2006,True,,37.0,,,,,,37.0,,37.0,,,37.0,,37.0,37.0,,37.0,37.0,,,37.0,37.0,,37.0,37.0,37.0,37.0,,,37.0,37.0,,,37.0,37.0,,,,,37.0,37.0,37.0,37.0,37.0,37.0,
2007,True,,41.0,,,,41.0,,41.0,41.0,,,,,,41.0,41.0,41.0,41.0,41.0,41.0,,41.0,,,41.0,41.0,,41.0,,,41.0,,,,41.0,41.0,,41.0,,41.0,41.0,41.0,,41.0,41.0,41.0,


In [6]:
# Next step: combine the jury and televoting scores. First find the years that have both.

voting_kind = votes_1998_2022['jury_or_televoting']
# Years with at least one jury ('J') row
jury_years = np.unique(votes_1998_2022.loc[voting_kind == 'J', 'year'])
# Years with at least one televoting ('T') row
televoting_years = np.unique(votes_1998_2022.loc[voting_kind == 'T', 'year'])
# Years appearing in both lists (i.e. the intersection)
double_voting_years = np.intersect1d(jury_years, televoting_years)
double_voting_years

array([2016, 2017, 2018, 2019, 2021, 2022])

In [7]:
# Split the votes by whether their year has both jury and televoting scores.
in_double_voting_year = votes_1998_2022['year'].isin(double_voting_years)

# Years with a single set of scores: the points can be used as they are.
votes_to_keep = votes_1998_2022[~in_double_voting_year].drop(columns=['jury_or_televoting'])

# Years with two sets of scores need processing: add up the J and T scores, then re-rank
# them and give 12 points to the highest total, 10 to the next-highest, etc.
votes_to_process = votes_1998_2022[in_double_voting_year]
vote_key = ['year', 'from_country', 'to_country']
summed_votes = votes_to_process.sort_values(by=vote_key)
summed_votes = summed_votes.groupby(by=vote_key).sum(numeric_only=True)

def rescale_points(pts: pd.Series) -> pd.Series:
    """Re-rank summed points onto the 12, 10, 8, 7, ..., 1 scale.

    `pts` is a pd.Series of summed points, intended to hold one combination of
    'year' and 'from_country'. The highest value gets 12 points, the next 10,
    and so on down to 1 point for rank 10; anything ranked lower gets 0.
    Equal values share the best rank among them, and the ranks directly below
    are skipped (two values tied at rank 6 both get 5 points; the next value
    is rank 8).

    Returns a Series with the same index as `pts`.
    """
    ranks_to_rescaled_points = {1: 12, 2: 10, 3: 8, 4: 7, 5: 6, 6: 5, 7: 4, 8: 3, 9: 2, 10: 1}

    # Sort once, and remember the first (best) position at which each value appears.
    best_rank = {}
    for position, value in enumerate(sorted(pts, reverse=True), start=1):
        best_rank.setdefault(value, position)

    rescaled_points = {value: ranks_to_rescaled_points.get(rank, 0)
                       for value, rank in best_rank.items()}
    return pts.map(rescaled_points)

# Rescale the summed points separately for each (year, from_country) group,
# then turn the index levels back into ordinary columns.
votes_by_voter = summed_votes.groupby(by=['year', 'from_country'])
processed_votes = votes_by_voter.transform(rescale_points)
processed_votes = processed_votes.reset_index()
processed_votes.head()

Unnamed: 0,year,from_country,to_country,points
0,2016,albania,armenia,0
1,2016,albania,australia,12
2,2016,albania,austria,0
3,2016,albania,azerbaijan,0
4,2016,albania,belgium,0


In [8]:
# Sanity check: show summed and rescaled points side by side for one voting country
vote_key = ["year", "from_country", "to_country"]
x = processed_votes.rename(columns={"points": "rescaled"}).set_index(vote_key)
v = summed_votes.reset_index().set_index(vote_key)
joined = v.join(x, how="outer").reset_index()
is_albania_2016 = (joined['year'] == 2016) & (joined['from_country'] == 'albania')
joined[is_albania_2016].sort_values(by="points", ascending=False)

Unnamed: 0,year,from_country,to_country,points,rescaled
1,2016,albania,australia,24,12
14,2016,albania,italy,18,10
20,2016,albania,russian federation,14,8
5,2016,albania,bulgaria,12,7
9,2016,albania,france,10,6
24,2016,albania,ukraine,6,5
22,2016,albania,spain,6,5
25,2016,albania,united kingdom,5,3
19,2016,albania,poland,5,3
16,2016,albania,lithuania,4,1


In [9]:
# Add in country codes, and that's our final voting data.
import pycountry
import itertools

# Stack the years used as-is on top of the re-ranked years, renumbering the rows from 0.
votes = pd.concat([votes_to_keep, processed_votes], ignore_index=True)
votes

def get_country_codes(name):
    """Return the (alpha_2, alpha_3) codes that pycountry holds for the country `name`.

    'yugoslavia' is looked up among pycountry's historic countries; every other
    name is looked up among the current ones. Raises KeyError when the lookup
    yields None.
    """
    if name != 'yugoslavia':
        country = pycountry.countries.get(name=name)
    else:
        # Yugoslavia is stored in pycountry under this longer name:
        # https://github.com/flyingcircusio/pycountry/blob/main/src/pycountry/databases/iso3166-3.json
        country = pycountry.historic_countries.get(name='yugoslavia, socialist federal republic of')

    if country is not None:
        return country.alpha_2, country.alpha_3

    raise KeyError("Country name " + name + " not found in pycountry. This really shouldn't happen.")

# Add the 2- and 3-letter codes for both the voting and the receiving country.
for ft in ('from', 'to'):
    code_pairs = votes[f'{ft}_country'].map(get_country_codes)
    votes[f'{ft}_code2'] = [pair[0] for pair in code_pairs]
    votes[f'{ft}_code3'] = [pair[1] for pair in code_pairs]

votes

Unnamed: 0,year,from_country,to_country,points,from_code2,from_code3,to_code2,to_code3
0,1998,belgium,croatia,5,BE,BEL,HR,HRV
1,1998,belgium,cyprus,2,BE,BEL,CY,CYP
2,1998,belgium,estonia,0,BE,BEL,EE,EST
3,1998,belgium,north macedonia,0,BE,BEL,MK,MKD
4,1998,belgium,finland,0,BE,BEL,FI,FIN
...,...,...,...,...,...,...,...,...
21356,2022,united kingdom,spain,8,GB,GBR,ES,ESP
21357,2022,united kingdom,sweden,10,GB,GBR,SE,SWE
21358,2022,united kingdom,switzerland,0,GB,GBR,CH,CHE
21359,2022,united kingdom,ukraine,7,GB,GBR,UA,UKR


In [10]:
# Sanity check: count the rows for every (from_country, year) pair
temp = votes[['from_country', 'year']].value_counts()

# For each year, print the distinct counts found among its voting countries
for year, counts_in_year in temp.groupby(level='year'):
    print(year, counts_in_year.unique())

1998 [24]
1999 [22]
2000 [23]
2001 [22]
2002 [23]
2003 [25 24]
2004 [24 23]
2005 [24 23]
2006 [24 23]
2007 [24 23]
2008 [25 24]
2009 [25 24]
2010 [25 24]
2011 [25 24]
2012 [26 25]
2013 [26 25]
2014 [26 25]
2015 [27 26]
2016 [26 25]
2017 [26 25]
2018 [26 25]
2019 [26 25]
2021 [26]
2022 [25]


In [11]:
# Repeat the earlier consistency check, this time on the combined `votes` table.

# One row per year, one column per receiving country; each value is the number of scores received
score_counts = votes.groupby(by=['year', 'to_country'])['points'].count()
grouped_votes = score_counts.reset_index().pivot(index="year", columns="to_country", values="points")
# Fix the column order ("is_consistent" first) before the new column is added
col_names = grouped_votes.columns
new_col_names = ["is_consistent", *col_names]
grouped_votes["is_consistent"] = grouped_votes.apply(all_entries_same, axis=1, raw=True)
# Show the whole table, without truncating rows or columns
show_everything = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*show_everything):
    display(grouped_votes.reindex(columns=new_col_names))

to_country,is_consistent,albania,armenia,australia,austria,azerbaijan,belarus,belgium,bosnia and herzegovina,bulgaria,croatia,cyprus,czechia,denmark,estonia,finland,france,georgia,germany,greece,hungary,iceland,ireland,israel,italy,latvia,lithuania,malta,"moldova, republic of",montenegro,netherlands,north macedonia,norway,poland,portugal,romania,russian federation,san marino,serbia,slovakia,slovenia,spain,sweden,switzerland,turkey,ukraine,united kingdom,yugoslavia
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1
1998,True,,,,,,,24.0,,,24.0,24.0,,,24.0,24.0,24.0,,24.0,24.0,24.0,,24.0,24.0,,,,24.0,,,24.0,24.0,24.0,24.0,24.0,24.0,,,,24.0,24.0,24.0,24.0,24.0,24.0,,24.0,
1999,True,,,,22.0,,,22.0,22.0,,22.0,22.0,,22.0,22.0,,22.0,,22.0,,,22.0,22.0,22.0,,,22.0,22.0,,,22.0,,22.0,22.0,22.0,,,,,,22.0,22.0,22.0,,22.0,,22.0,
2000,True,,,,23.0,,,23.0,,,23.0,23.0,,23.0,23.0,23.0,23.0,,23.0,,,23.0,23.0,23.0,,23.0,,23.0,,,23.0,23.0,23.0,,,23.0,23.0,,,,,23.0,23.0,23.0,23.0,,23.0,
2001,True,,,,,,,,22.0,,22.0,,,22.0,22.0,,22.0,,22.0,22.0,,22.0,22.0,22.0,,22.0,22.0,22.0,,,22.0,,22.0,22.0,22.0,,22.0,,,,22.0,22.0,22.0,,22.0,,22.0,
2002,True,,,,23.0,,,23.0,23.0,,23.0,23.0,,23.0,23.0,23.0,23.0,,23.0,23.0,,,,23.0,,23.0,23.0,23.0,,,,23.0,,,,23.0,23.0,,,,23.0,23.0,23.0,23.0,23.0,,23.0,
2003,True,,,,25.0,,,25.0,25.0,,25.0,25.0,,,25.0,,25.0,,25.0,25.0,,25.0,25.0,25.0,,,,25.0,,,25.0,,25.0,25.0,25.0,25.0,25.0,,,,25.0,25.0,25.0,,25.0,25.0,25.0,
2004,True,35.0,,,35.0,,,35.0,35.0,,35.0,35.0,,,,,35.0,,35.0,35.0,,35.0,35.0,,,,,35.0,,,35.0,35.0,35.0,35.0,,35.0,35.0,,,,,35.0,35.0,,35.0,35.0,35.0,35.0
2005,True,38.0,,,,,,,38.0,,38.0,38.0,,38.0,,,38.0,,38.0,38.0,38.0,,,38.0,,38.0,,38.0,38.0,,,38.0,38.0,,,38.0,38.0,,,,,38.0,38.0,38.0,38.0,38.0,38.0,38.0
2006,True,,37.0,,,,,,37.0,,37.0,,,37.0,,37.0,37.0,,37.0,37.0,,,37.0,37.0,,37.0,37.0,37.0,37.0,,,37.0,37.0,,,37.0,37.0,,,,,37.0,37.0,37.0,37.0,37.0,37.0,
2007,True,,41.0,,,,41.0,,41.0,41.0,,,,,,41.0,41.0,41.0,41.0,41.0,41.0,,41.0,,,41.0,41.0,,41.0,,,41.0,,,,41.0,41.0,,41.0,,41.0,41.0,41.0,,41.0,41.0,41.0,


# Song language

https://github.com/KatrionaGoldmann/Eurovision_TDS/blob/jp/data-wrangling-notebook/eurovision/notebooks/get_language_scores.ipynb

Import the lyric data from https://www.kaggle.com/datasets/minitree/eurovision-song-lyrics?select=eurovision-lyrics-2022.json

In [15]:
# Load the lyrics dataset.
songs = pd.read_json(path_or_buf='./data/eurovision-lyrics-2022.json')

In [16]:
# Keep the year, country and language of every song. Take an explicit copy so
# that the edits below change our own frame rather than a slice of the
# transposed `songs` table (which can trigger SettingWithCopyWarning).
language = songs.T[['Year', 'Country', 'Language']].copy()

# Rename the Language column to Language_sung
language = language.rename(columns={'Language': 'Language_sung'})

# Rename some countries to the names used for the voting data.
# Only exact, whole-name matches are replaced.
country_renames = {
    'Macedonia': 'North Macedonia',
    'Russia': 'russian federation',
    'Serbia and Montenegro': 'yugoslavia',
    'Moldova': 'moldova, republic of',
    'Czech Republic': 'czechia',
    'The Netherlands': 'netherlands',
}
language['Country'] = language['Country'].replace(country_renames)

# Keep the songs from 1998 onwards (copy again: we keep writing to the result)
language['Year'] = pd.to_numeric(language['Year'])
language = language[language['Year'] > 1997].copy()

# Convert language['Country'] to lower case
language['Country'] = language['Country'].str.lower()

# Add country code columns
language['Country_code2'], language['Country_code3'] = zip(*language['Country'].map(get_country_codes))


language.head()


Unnamed: 0,Year,Country,Language_sung,Country_code2,Country_code3
772,1998,croatia,Croatian,HR,HRV
773,1998,greece,Greek,GR,GRC
774,1998,france,French,FR,FRA
775,1998,spain,Spanish,ES,ESP
776,1998,switzerland,German,CH,CHE


In [20]:
# Tidy up the language sung column
language['Language_sung'] = language['Language_sung'].str.lower()
# Strip qualifier words. regex=True is passed explicitly: the pattern relies on
# alternation, and from pandas 2.0 `str.replace` treats patterns as literal
# text by default. "and" is removed as a whole word only, so that language
# names containing it (e.g. "icelandic") stay intact.
language['Language_sung'] = language['Language_sung'].str.replace(
    r'partly|dialect|title|\band\b', '', regex=True)

# Expand abbreviations: each key is a regex pattern that is replaced with its value
replace_strings = {
    r'fr\.': 'french', r'eng\.': 'english', r'gr\.': 'greek',
    r'sp\.': 'spanish', r'rom\.': 'romanian', r'russ\.': 'russian',
    r'it\.': 'italian', r'germ\.': 'german', r'pol\.': 'polish',
    'sign language': 'sign-language'
}

for key, value in replace_strings.items():
    language['Language_sung'] = language['Language_sung'].str.replace(key, value, regex=True)

language['Contains_English'] = language['Language_sung'].str.contains('english', regex=False).astype(int)

# Tidied copy of the language column: trim the ends, squash double spaces,
# drop brackets and turn slashes into spaces (all literal replacements).
# NOTE(review): the ends are trimmed *before* brackets and slashes are replaced, so
# a space uncovered by those replacements stays in the value - confirm this is intended.
language['Language_sung_tidied'] = (
    language['Language_sung']
    .str.strip()
    .str.replace('  ', ' ', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
    .str.replace('/', ' ', regex=False)
)

# A song contains a non-English language if anything is left once 'english' is removed
language['Contains_NonEnglish'] = (
    language['Language_sung_tidied'].str.replace('english', '', regex=False).ne('').astype(int)
)

  language['Language_sung'] = language['Language_sung'].str.replace('partly|dialect|title|and', '')
  language['Language_sung'] = language['Language_sung'].str.replace(key, value)


In [21]:
# Count the songs for each combination of the two flags.
print('Number of songs containing English or non-English:')

english_flags = language[['Contains_English', 'Contains_NonEnglish']]
english_flags.value_counts()

Number of songs containing English or non-English:


Contains_English  Contains_NonEnglish
1                 0                      543
0                 1                      220
1                 1                      149
dtype: int64

Next we want to see whether countries are singing in their official language. We can get the official language from [Wikipedia](https://en.wikipedia.org/wiki/List_of_official_languages_by_country_and_territory).

In [28]:
# Get the official languages from Wikipedia

import re
from io import StringIO

import pycountry
import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/List_of_official_languages_by_country_and_territory"
# Fail fast on a hung connection or an error page instead of parsing garbage.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
tables = soup.find_all('table', {'class': "wikitable"})

table = tables[0]

# Wrap the markup in StringIO: passing a literal HTML string to read_html is
# deprecated in recent pandas releases.
df_table = pd.read_html(StringIO(str(table)))[0]

# replace United Kingdom and Crown dependencies etc with United Kingdom
df_table['Country/Region'] = df_table['Country/Region'].replace('United Kingdom and Crown dependencies etc.', 'United Kingdom')

# Tidy the columns: blank out missing cells and remove bracketed markers such as "[9]"
df_table = df_table.fillna('')
bracketed_marker = re.compile(r"\[.*?\]")
df_table['Country/Region'] = df_table['Country/Region'].apply(lambda x: bracketed_marker.sub("", x))
df_table = df_table.rename(columns={'Official language': 'Official_languages'})
df_table['Official_languages'] = df_table['Official_languages'].apply(lambda x: bracketed_marker.sub("", x))
df_table['Country/Region'] = df_table['Country/Region'].str.lower()

# Tidy the country names (exact, whole-name matches only)
df_table['Country/Region'] = df_table['Country/Region'].replace({
    'russia': 'russian federation',
    'serbia and montenegro': 'yugoslavia',
    'moldova': 'moldova, republic of',
    'czech republic': 'czechia',
})
# Add a row for Yugoslavia by hand. pd.concat is used because DataFrame.append
# is deprecated and no longer exists in pandas 2.0.
yugoslavia_row = pd.DataFrame([{'Country/Region': 'yugoslavia', 'Official_languages': 'serbian montenegrin'}])
df_table = pd.concat([df_table, yugoslavia_row], ignore_index=True)

# Countries in the song data that have no row in the Wikipedia table (should be empty)
unmatched_countries = set(language['Country'].unique()) - set(df_table['Country/Region'].unique())

# Keep only the countries that appear in the song data; copy because we keep writing to the result
df_table = df_table.loc[df_table['Country/Region'].isin(language['Country'].unique())].copy()

df_table['Country_code2'], df_table['Country_code3'] = zip(*df_table['Country/Region'].map(get_country_codes))

# Tidy the language column (all replacements are literal)
df_table['Official_languages'] = (
    df_table['Official_languages']
    .str.lower()
    .str.replace('all have de facto status', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Manually add missing languages, in front of the scraped ones
extra_languages = {
    'LT': 'samogitian ',
    'FR': 'breton corsican ',
    'SI': 'slovenian ',
    'EE': 'võro ',
}
for code2, prefix in extra_languages.items():
    is_country = df_table['Country_code2'] == code2
    df_table.loc[is_country, 'Official_languages'] = prefix + df_table.loc[is_country, 'Official_languages']

df_table.head()

  df_table = df_table.append({'Country/Region':'yugoslavia', 'Official_languages':'serbian montenegrin'}, ignore_index=True)


Unnamed: 0,Country/Region,Official_languages,Regional language,Minority language,National language,Widely spoken,Country_code2,Country_code3
2,albania,albanian,,Greek Macedonian Aromanian,,Italian,AL,ALB
4,andorra,catalan,,Spanish French Portuguese,,,AD,AND
8,armenia,armenian,,,Armenian (state language)[9],,AM,ARM
10,australia,none english has de facto status,Norfuk language (on Norfolk island),,,,AU,AUS
11,austria,german,Burgenland Croatian (parts of Burgenland) Hung...,Slovene Czech Hungarian Slovak Romani Serbian,German (state language),,AT,AUT


In [29]:
# Show the rows for Yugoslavia, Serbia and Montenegro.
yugoslavia_related = df_table['Country/Region'].isin(['yugoslavia', 'serbia', 'montenegro'])
df_table.loc[yugoslavia_related]

Unnamed: 0,Country/Region,Official_languages,Regional language,Minority language,National language,Widely spoken,Country_code2,Country_code3
124,montenegro,montenegrin,,,,,ME,MNE
165,serbia,serbian,,(15 languages),,,RS,SRB
211,yugoslavia,serbian montenegrin,,,,,YU,YUG


In [30]:
# Make sure that every country code in `language` also appears in `df_table`
missing_codes = set(language['Country_code2']) - set(df_table['Country_code2'])
if missing_codes:
    # Turn the set into text first: adding a set to a str raises TypeError,
    # which would hide the KeyError that is meant to be raised here.
    raise KeyError("Country code(s) " + ", ".join(sorted(missing_codes))
                   + " not found in the official languages table. This really shouldn't happen.")

# Add the official languages to `language`, matching rows on the 2-letter country code
language = pd.merge(language, df_table[['Country_code2', 'Official_languages']], on='Country_code2', how='left')

In [31]:
# Tidy the languages column: rows without official languages get a single space
language['Official_languages'] = language['Official_languages'].fillna(' ')

# Split the tidied language string on whitespace and work from the pieces
sung_tokens = language['Language_sung_tidied'].apply(lambda sung: sung.split())
language['Contains_Multiple_Languages'] = sung_tokens.apply(lambda tokens: 1 if len(tokens) > 1 else 0)
language['Number_of_Languages'] = sung_tokens.apply(len)
# 1 when at least one sung token is also among the official-language tokens
language['Contains_Own_Language'] = [
    1 if set(sung.split()) & set(official.split()) else 0
    for sung, official in zip(language['Language_sung_tidied'], language['Official_languages'])
]

# "6 other" / "10 other" were each counted above as two tokens; top the count up
# so that the phrase contributes 6 or 10 in total.
for phrase, extra in (('6 other', 4), ('10 other', 8)):
    mentions_phrase = language['Language_sung_tidied'].str.contains(phrase)
    language.loc[mentions_phrase, 'Number_of_Languages'] += extra
# Only afterwards remove the phrases from the tidied string
for phrase in ('6 other', '10 other'):
    language['Language_sung_tidied'] = language['Language_sung_tidied'].apply(lambda x, p=phrase: x.replace(p, ''))

In [32]:
# Preview the first five rows of the voting data.
votes.head(n=5)

Unnamed: 0,year,from_country,to_country,points,from_code2,from_code3,to_code2,to_code3
0,1998,belgium,croatia,5,BE,BEL,HR,HRV
1,1998,belgium,cyprus,2,BE,BEL,CY,CYP
2,1998,belgium,estonia,0,BE,BEL,EE,EST
3,1998,belgium,north macedonia,0,BE,BEL,MK,MKD
4,1998,belgium,finland,0,BE,BEL,FI,FIN


In [33]:
# Attach the language information of the receiving country's song to each vote
df_main = votes.merge(language, left_on=['to_code2', 'year'], right_on=['Country_code2', 'Year'], how='left')

# Check that the columns coming from `language` agree with their counterparts in `votes`
column_pairs = (('Country', 'to_country'), ('Country_code2', 'to_code2'), ('Year', 'year'))
for language_column, votes_column in column_pairs:
    print(all(df_main[language_column] == df_main[votes_column]))


# Keep (and order) the columns used from here on
main_columns = [
    'Year',
    'from_country', 'to_country', 'points', 'from_code2', 'from_code3', 'to_code2', 'to_code3',
    'Official_languages', 'Language_sung', 'Language_sung_tidied',
    'Contains_English', 'Contains_NonEnglish', 'Contains_Multiple_Languages',
    'Number_of_Languages', 'Contains_Own_Language',
]
df_main = df_main[main_columns]


df_main.head()

True
True
True


Unnamed: 0,Year,from_country,to_country,points,from_code2,from_code3,to_code2,to_code3,Official_languages,Language_sung,Language_sung_tidied,Contains_English,Contains_NonEnglish,Contains_Multiple_Languages,Number_of_Languages,Contains_Own_Language
0,1998,belgium,croatia,5,BE,BEL,HR,HRV,croatian,croatian,croatian,0,1,0,1,1
1,1998,belgium,cyprus,2,BE,BEL,CY,CYP,greek turkish,greek,greek,0,1,0,1,1
2,1998,belgium,estonia,0,BE,BEL,EE,EST,võro estonian,estonian,estonian,0,1,0,1,1
3,1998,belgium,north macedonia,0,BE,BEL,MK,MKD,macedonian albanian,macedonian,macedonian,0,1,0,1,1
4,1998,belgium,finland,0,BE,BEL,FI,FIN,finnish swedish,finnish,finnish,0,1,0,1,1


In [34]:
# Sanity check: count the rows for every (from_code2, Year) pair
temp = df_main[['from_code2', 'Year']].value_counts()

# Print the per-country counts for 2004 and 2005
for year in (2004, 2005):
    counts_in_year = temp.loc[:, year]
    print(year, counts_in_year)

2004 from_code2
SI    24
MC    24
PT    24
AD    24
EE    24
DK    24
FI    24
CH    24
BY    24
LT    24
IL    24
LV    24
DE    23
SE    23
CY    23
MK    23
UA    23
AL    23
YU    23
IS    23
BE    23
GR    23
BA    23
AT    23
TR    23
RU    23
FR    23
HR    23
ES    23
PL    23
NO    23
NL    23
GB    23
IE    23
RO    23
MT    23
dtype: int64
2005 from_code2
SI    24
PL    24
MC    24
PT    24
NL    24
EE    24
FI    24
AT    24
BY    24
BG    24
BE    24
IE    24
AD    24
LT    24
IS    24
CH    23
IL    23
GR    23
DE    23
CY    23
SE    23
MK    23
UA    23
MD    23
AL    23
LV    23
YU    23
TR    23
BA    23
RU    23
FR    23
HR    23
HU    23
NO    23
GB    23
MT    23
DK    23
ES    23
RO    23
dtype: int64


# Performer gender

https://github.com/KatrionaGoldmann/Eurovision_TDS/blob/jp/data-wrangling-notebook/eurovision/get_gender.py

# Migration

Do it both ways round: migration_v2p and migration_p2v

https://github.com/KatrionaGoldmann/Eurovision_TDS/blob/jp/data-wrangling-notebook/eurovision/get_migration.ipynb

# Competitions without a win

https://github.com/KatrionaGoldmann/Eurovision_TDS/blob/jp/data-wrangling-notebook/eurovision/get_comps_without_win.py