# Data rework

This notebook reworks the data from scratch, aiming to build every dataset cleanly from its original source.

## Voting scores

We start with the voting scores, which come from the following sources:
 - 1975-2019 data from Kaggle: https://www.kaggle.com/datasets/datagraver/eurovision-song-contest-scores-19752019
 - 2020 was cancelled
 - 2021, 2022 scraped from Wikipedia

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import re
import pycountry
import json

In [2]:
## Load the raw 1975-2019 voting spreadsheet.

votes_1975_2019 = pd.read_excel("../../data/eurovision_song_contest_1975_2019.xlsx")

# Normalise the headers first: trim whitespace, lowercase, underscores instead of spaces.
votes_1975_2019.columns = [
    header.strip().lower().replace(' ', '_') for header in votes_1975_2019.columns
]

print(votes_1975_2019.shape)
votes_1975_2019.head()

(49832, 8)


Unnamed: 0,year,(semi-)_final,edition,jury_or_televoting,from_country,to_country,points,duplicate
0,1975,f,1975f,J,Belgium,Belgium,0,x
1,1975,f,1975f,J,Belgium,Finland,0,
2,1975,f,1975f,J,Belgium,France,2,
3,1975,f,1975f,J,Belgium,Germany,0,
4,1975,f,1975f,J,Belgium,Ireland,12,


In [3]:
## Clean up this dataset.

# Keep only votes cast in a final ('f') from 1998 onwards (inclusive), and only the
# columns used downstream. Rows and columns are selected in a single .loc call and the
# result is copied explicitly: the frame is modified column-by-column further down this
# cell, and assigning into an un-copied slice of votes_1975_2019 triggers pandas'
# SettingWithCopyWarning.
is_final = votes_1975_2019['(semi-)_final'] == 'f'
is_1998_or_later = votes_1975_2019['year'] >= 1998
votes_1998_2019 = votes_1975_2019.loc[
    is_final & is_1998_or_later,
    ["year", "from_country", "to_country", "points", "jury_or_televoting"],
].copy()

# Clean up country names
def standardise_country(c: str) -> str:
    """Return the lower-cased, standardised spelling of a country name.

    The name is lower-cased and then a fixed list of substring replacements is
    applied in order (punctuation fixes, a typo fix, and renames to the names
    used in the rest of this notebook).

    The function is idempotent: a name that is already standardised is returned
    unchanged. This matters for replacements whose result contains the text
    being searched for ('russia' -> 'russian federation',
    'moldova' -> 'moldova, republic of'); a plain str.replace would turn
    'russian federation' into 'russian federationn federation'.

    Parameters
    ----------
    c : str
        Country name in any capitalisation.

    Returns
    -------
    str
        The standardised, lower-case country name.
    """
    replacements = [('-', ' '), ('&', 'and'), ('netherands', 'netherlands'),
                    # FYR Macedonia was formally renamed as North Macedonia in 2019
                    ('f.y.r. macedonia', 'north macedonia'),
                    ('russia', 'russian federation'),
                    ('the netherlands', 'netherlands'),
                    ('czech republic', 'czechia'),
                    # Yugoslavia dissolved in 2002; most of it became 'Serbia and Montenegro', until 2006, when Serbia and Montenegro split ways.
                    ('serbia and montenegro', 'yugoslavia'),
                    ('moldova', 'moldova, republic of')]
    c = c.lower()
    for old, new in replacements:
        if old in new:
            # The replacement text itself contains the search text. Split on the
            # already-standardised form first so those occurrences are left
            # untouched, and only replace inside the remaining pieces.
            c = new.join(part.replace(old, new) for part in c.split(new))
        else:
            c = c.replace(old, new)
    return c
# Standardise the names in both country columns.
votes_1998_2019 = votes_1998_2019.assign(
    **{
        country_column: votes_1998_2019[country_column].map(standardise_country)
        for country_column in ['from_country', 'to_country']
    }
)

# Drop rows which record the same vote more than once (there are two Belarus -> Russia in 2019, for example)
duplicate_key = ['year', 'from_country', 'to_country', 'jury_or_televoting']
votes_1998_2019 = votes_1998_2019.drop_duplicates(subset=duplicate_key)

# Drop votes *for* Lithuania in 2003 (they didn't participate - I don't know why it's still in the dataset)
not_lithuania_2003 = (votes_1998_2019['to_country'] != 'lithuania') | (votes_1998_2019['year'] != 2003)
votes_1998_2019 = votes_1998_2019[not_lithuania_2003]

# Drop "votes" from one country to herself
votes_1998_2019 = votes_1998_2019[votes_1998_2019['from_country'].ne(votes_1998_2019['to_country'])]

votes_1998_2019.sample(n=10)


Unnamed: 0,year,from_country,to_country,points,jury_or_televoting
19487,2007,georgia,united kingdom,0,J
43524,2018,spain,finland,0,J
12938,2002,spain,north macedonia,0,J
14201,2004,lithuania,spain,0,J
31175,2013,serbia,france,0,J
32254,2014,finland,armenia,4,J
26667,2011,armenia,spain,0,J
38927,2017,australia,norway,0,J
28663,2012,bosnia and herzegovina,russian federation,3,J
35563,2016,cyprus,united kingdom,0,J


In [4]:
## Now we need to fetch some data from Wikipedia for the 2021 and 2022 contests.

import requests
from bs4 import BeautifulSoup

def import_votes_from_wp(year: int, jury_table_id: int = 16, televoting_table_id: int = 17,
                         timeout: float = 30) -> pd.DataFrame:
    """Scrape the jury and televoting points of one contest from English Wikipedia.

    Parameters
    ----------
    year : int
        Contest year; used to build the article URL and stored in the 'year' column.
    jury_table_id, televoting_table_id : int
        Zero-based positions of the jury and televoting tables among the page's
        ``wikitable`` tables. The defaults (16 and 17) are the positions this
        notebook uses for the 2021 and 2022 articles; they depend on the current
        layout of the page, so pass other values if the article changes.
    timeout : float
        Seconds to wait for Wikipedia before giving up.

    Returns
    -------
    pd.DataFrame
        Long-form votes with columns 'year', 'from_country', 'to_country',
        'points' and 'jury_or_televoting' ('J' rows followed by 'T' rows).

    Raises
    ------
    requests.HTTPError
        If Wikipedia answers with an error status.
    IndexError
        If the page has no wikitable at one of the requested positions.
    """
    # Imported locally so the function does not rely on imports made elsewhere.
    from io import StringIO

    url = f"https://en.wikipedia.org/wiki/Eurovision_Song_Contest_{year}#Final_2"
    response = requests.get(url, timeout=timeout)
    # Fail on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table', {'class': "wikitable"})

    def parse_table_from_id(wp_id: int, jury_or_tele: str) -> pd.DataFrame:
        """Turn the wikitable at position wp_id into long-form votes tagged jury_or_tele."""
        try:
            html_table = tables[wp_id]
        except IndexError:
            raise IndexError(
                f"Wikipedia page for {year} has {len(tables)} wikitables; "
                f"there is no table at position {wp_id}."
            ) from None

        # Newer pandas versions deprecate passing literal HTML, so wrap it in a file-like object.
        df_table = pd.read_html(StringIO(str(html_table)))[0]

        # remove redundant rows/columns
        # NOTE(review): presumably header/total rows and columns - confirm against the page layout.
        df_table = df_table.drop(df_table.columns[[0, 2, 3, 4]], axis=1)
        df_table = df_table.drop(df_table.index[[0, 2]], axis=0)

        # set the index to the first column
        df_table = df_table.set_index(df_table.columns[0])

        # set the column names as the first row
        df_table.columns = df_table.iloc[0]
        df_table = df_table.drop(df_table.index[0])

        # replace NaN with 0
        df_table = df_table.fillna(0)

        # squash the column index with stack: one row per (table row, table column) pair
        df_table = df_table.stack().reset_index()

        # Table rows become the receiving country, table columns the voting country.
        df_table.columns = ['to_country', 'from_country', 'points']
        df_table['points'] = df_table['points'].astype(int)
        df_table['jury_or_televoting'] = jury_or_tele
        df_table['year'] = year

        # Clean up countries as before
        for column in ['from_country', 'to_country']:
            df_table[column] = df_table[column].map(standardise_country)

        # re-order the columns to match the original data
        return df_table[['year', 'from_country', 'to_country', 'points', 'jury_or_televoting']]

    jury_table = parse_table_from_id(jury_table_id, jury_or_tele='J')
    tele_table = parse_table_from_id(televoting_table_id, jury_or_tele='T')
    return pd.concat([jury_table, tele_table])

# Stack the spreadsheet votes with the two contests scraped from Wikipedia.
scraped_votes = [import_votes_from_wp(contest_year) for contest_year in (2021, 2022)]
votes_1998_2022 = pd.concat([votes_1998_2019, *scraped_votes])

# Again, drop "votes" from one country to herself
is_self_vote = votes_1998_2022['from_country'] == votes_1998_2022['to_country']
votes_1998_2022 = votes_1998_2022[~is_self_vote]

votes_1998_2022.sample(n=10)

Unnamed: 0,year,from_country,to_country,points,jury_or_televoting
40548,2017,italy,armenia,0,T
23850,2009,norway,russian federation,0,J
43176,2018,italy,ukraine,0,J
34292,2015,norway,united kingdom,0,J
44503,2018,portugal,united kingdom,0,T
43376,2018,portugal,"moldova, republic of",0,J
666,2021,albania,lithuania,0,J
23724,2009,latvia,romania,0,J
34339,2015,portugal,romania,4,J
29456,2012,sweden,north macedonia,0,J


In [5]:
# This cell is a sanity check to make sure that all countries participating in a given year got the same number of votes.
# We hope to see the 'is_consistent' column be True for all years in the output.

def check_consistency(df):
    """Display, per year, how many scores each country received and whether they all agree.

    df needs the columns 'year', 'to_country' and 'points'. The displayed table has one
    row per year and one column per receiving country (NaN where a country has no rows
    for that year), preceded by an 'is_consistent' flag that is True when every non-NaN
    count in the row is the same. Nothing is returned.
    """
    def all_entries_same(arr : np.ndarray) -> bool:
        # True when every non-NaN entry of the array equals the first non-NaN entry.
        present = arr[~np.isnan(arr)]
        return np.all(present == present[0])

    # Count the scores per (year, receiving country), then spread the countries out into columns.
    score_counts = df.groupby(by=['year', 'to_country'])['points'].count()
    wide_counts = score_counts.unstack('to_country')

    # Compute the flag from the count columns only, then put it in front of them.
    flags = wide_counts.apply(all_entries_same, axis=1, raw=True)
    wide_counts.insert(0, "is_consistent", flags)

    # Show data
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(wide_counts)
        
# Run the check on the combined 1998-2022 votes.
check_consistency(votes_1998_2022)
    
# At this point, if is_consistent is False for any year, look at the entries in that row to find
# out where the discrepancy lies: the odd count out points at the problematic country.
# That's how I found Lithuania 2003, at least. is_consistent = True is no guarantee that everything
# is *correct*, but it at least increases our confidence, I think.
    
# Perfect...!

to_country,is_consistent,albania,armenia,australia,austria,azerbaijan,belarus,belgium,bosnia and herzegovina,bulgaria,croatia,cyprus,czechia,denmark,estonia,finland,france,georgia,germany,greece,hungary,iceland,ireland,israel,italy,latvia,lithuania,malta,"moldova, republic of",montenegro,netherlands,north macedonia,norway,poland,portugal,romania,russian federation,san marino,serbia,slovakia,slovenia,spain,sweden,switzerland,turkey,ukraine,united kingdom,yugoslavia
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1
1998,True,,,,,,,24.0,,,24.0,24.0,,,24.0,24.0,24.0,,24.0,24.0,24.0,,24.0,24.0,,,,24.0,,,24.0,24.0,24.0,24.0,24.0,24.0,,,,24.0,24.0,24.0,24.0,24.0,24.0,,24.0,
1999,True,,,,22.0,,,22.0,22.0,,22.0,22.0,,22.0,22.0,,22.0,,22.0,,,22.0,22.0,22.0,,,22.0,22.0,,,22.0,,22.0,22.0,22.0,,,,,,22.0,22.0,22.0,,22.0,,22.0,
2000,True,,,,23.0,,,23.0,,,23.0,23.0,,23.0,23.0,23.0,23.0,,23.0,,,23.0,23.0,23.0,,23.0,,23.0,,,23.0,23.0,23.0,,,23.0,23.0,,,,,23.0,23.0,23.0,23.0,,23.0,
2001,True,,,,,,,,22.0,,22.0,,,22.0,22.0,,22.0,,22.0,22.0,,22.0,22.0,22.0,,22.0,22.0,22.0,,,22.0,,22.0,22.0,22.0,,22.0,,,,22.0,22.0,22.0,,22.0,,22.0,
2002,True,,,,23.0,,,23.0,23.0,,23.0,23.0,,23.0,23.0,23.0,23.0,,23.0,23.0,,,,23.0,,23.0,23.0,23.0,,,,23.0,,,,23.0,23.0,,,,23.0,23.0,23.0,23.0,23.0,,23.0,
2003,True,,,,25.0,,,25.0,25.0,,25.0,25.0,,,25.0,,25.0,,25.0,25.0,,25.0,25.0,25.0,,,,25.0,,,25.0,,25.0,25.0,25.0,25.0,25.0,,,,25.0,25.0,25.0,,25.0,25.0,25.0,
2004,True,35.0,,,35.0,,,35.0,35.0,,35.0,35.0,,,,,35.0,,35.0,35.0,,35.0,35.0,,,,,35.0,,,35.0,35.0,35.0,35.0,,35.0,35.0,,,,,35.0,35.0,,35.0,35.0,35.0,35.0
2005,True,38.0,,,,,,,38.0,,38.0,38.0,,38.0,,,38.0,,38.0,38.0,38.0,,,38.0,,38.0,,38.0,38.0,,,38.0,38.0,,,38.0,38.0,,,,,38.0,38.0,38.0,38.0,38.0,38.0,38.0
2006,True,,37.0,,,,,,37.0,,37.0,,,37.0,,37.0,37.0,,37.0,37.0,,,37.0,37.0,,37.0,37.0,37.0,37.0,,,37.0,37.0,,,37.0,37.0,,,,,37.0,37.0,37.0,37.0,37.0,37.0,
2007,True,,41.0,,,,41.0,,41.0,41.0,,,,,,41.0,41.0,41.0,41.0,41.0,41.0,,41.0,,,41.0,41.0,,41.0,,,41.0,,,,41.0,41.0,,41.0,,41.0,41.0,41.0,,41.0,41.0,41.0,


In [6]:
# Now we need to combine jury and televoting scores.

vote_type = votes_1998_2022['jury_or_televoting']

# Years with jury ('J') rows
jury_years = np.unique(votes_1998_2022.loc[vote_type == 'J', 'year'])
# Years with televoting ('T') rows
televoting_years = np.unique(votes_1998_2022.loc[vote_type == 'T', 'year'])
# Years with both kinds of rows (i.e. the intersection)
double_voting_years = np.intersect1d(jury_years, televoting_years)
double_voting_years

array([2016, 2017, 2018, 2019, 2021, 2022])

In [7]:
is_double_year = votes_1998_2022['year'].isin(double_voting_years)

# Years with a single set of scores: the points can just be used as-is.
votes_to_keep = votes_1998_2022[~is_double_year].drop(columns=['jury_or_televoting'])

# Years with both jury and televoting scores need processing.
# We add up the J and T scores for every (year, voter, receiver); the sums are then re-ranked
# and 12 points go to the highest sum, 10 to the next-highest, etc.
votes_to_process = votes_1998_2022[is_double_year]
summed_votes = (
    votes_to_process
    .sort_values(by=['year', 'from_country', 'to_country'])
    .groupby(by=['year', 'from_country', 'to_country'])
    .sum(numeric_only=True)
)

def rescale_points(pts: pd.Series) -> pd.Series:
    """Re-rank one voter's summed points onto the 12, 10, 8, 7, ..., 1 scale.

    pts is a pd.Series of summed points corresponding to one combination of
    'year' and 'from_country'. The highest value gets 12 points, the next 10,
    then 8, 7, 6, 5, 4, 3, 2, 1; everything ranked 11th or lower gets 0.
    Equal values share the best rank among them, and the ranks they would
    otherwise have occupied are skipped (two values tied for 6th both get
    5 points, and the next value is ranked 8th).

    Returns a Series with the same index as pts.
    """
    ranks_to_rescaled_points = {1: 12, 2: 10, 3: 8, 4: 7, 5: 6, 6: 5, 7: 4, 8: 3, 9: 2, 10: 1}

    # Sort once (rather than once per element) and remember the first, i.e. best,
    # position at which each distinct value appears.
    best_rank = {}
    for position, pt in enumerate(sorted(pts, reverse=True), start=1):
        best_rank.setdefault(pt, position)

    rescaled_points = {pt: ranks_to_rescaled_points.get(rank, 0) for pt, rank in best_rank.items()}
    return pts.map(rescaled_points)

# Rescale within each (year, voting country) group, then turn the index back into columns.
votes_per_voter = summed_votes.groupby(by=['year', 'from_country'])
processed_votes = votes_per_voter.transform(rescale_points).reset_index()
processed_votes.head()

Unnamed: 0,year,from_country,to_country,points
0,2016,albania,armenia,0
1,2016,albania,australia,12
2,2016,albania,austria,0
3,2016,albania,azerbaijan,0
4,2016,albania,belgium,0


In [8]:
# Sanity check: show the summed points next to the rescaled points for one voter (Albania, 2016).
pair_key = ["year", "from_country", "to_country"]
x = processed_votes.rename(columns={"points": "rescaled"}).set_index(pair_key)
v = summed_votes.reset_index().set_index(pair_key)
joined = v.join(x, how="outer").reset_index()
is_albania_2016 = (joined['year'] == 2016) & (joined['from_country'] == 'albania')
joined[is_albania_2016].sort_values(by="points", ascending=False)

Unnamed: 0,year,from_country,to_country,points,rescaled
1,2016,albania,australia,24,12
14,2016,albania,italy,18,10
20,2016,albania,russian federation,14,8
5,2016,albania,bulgaria,12,7
9,2016,albania,france,10,6
24,2016,albania,ukraine,6,5
22,2016,albania,spain,6,5
25,2016,albania,united kingdom,5,3
19,2016,albania,poland,5,3
16,2016,albania,lithuania,4,1


In [9]:
# Put the as-is years and the rescaled years together with a fresh 0..n-1 index.
# Once the country codes are added, that's our final voting data.

votes = pd.concat([votes_to_keep, processed_votes], ignore_index=True)

def get_country_codes(name):
    """Look up a country name in pycountry and return its (alpha_2, alpha_3) codes.

    'yugoslavia' is looked up among pycountry's historic countries; every other
    name among the current countries. Raises KeyError when the lookup returns None.
    """
    if name != 'yugoslavia':
        match = pycountry.countries.get(name=name)
    else:
        # Yugoslavia only exists in the historic (ISO 3166-3) database, under this name.
        # https://github.com/flyingcircusio/pycountry/blob/main/src/pycountry/databases/iso3166-3.json
        match = pycountry.historic_countries.get(name='yugoslavia, socialist federal republic of')

    if match is not None:
        return match.alpha_2, match.alpha_3

    raise KeyError(f"Country name {name} not found in pycountry. This really shouldn't happen.")

# Add two- and three-letter country codes for both the voting and the receiving country.
for ft in ['from', 'to']:
    code2_values, code3_values = zip(*votes[f'{ft}_country'].map(get_country_codes))
    votes[f'{ft}_code2'] = code2_values
    votes[f'{ft}_code3'] = code3_values

# For each receiving country and year, add the total number of points received.
points_by_receiver = votes.groupby(by=['year', 'to_country'])['points']
votes['total_points'] = points_by_receiver.transform('sum')

# For each year, rank the countries by total points received (rank 1 = most points).
# method='first' breaks ties by order of appearance, so every country gets a distinct rank.
# NOTE(review): the comment here used to say that draws get the same value; if that is what
# is wanted, method='min' would be needed instead - confirm the intent.
temp = votes[['year', 'to_country', 'total_points']].drop_duplicates()
totals_by_year = temp.groupby(by=['year'])['total_points']
temp['rank'] = totals_by_year.rank(method='first', ascending=False)

# merge votes with ranks
votes = votes.merge(temp, on=['year', 'to_country', 'total_points'], how='left')


votes

Unnamed: 0,year,from_country,to_country,points,from_code2,from_code3,to_code2,to_code3,total_points,rank
0,1998,belgium,croatia,5,BE,BEL,HR,HRV,131,5.0
1,1998,belgium,cyprus,2,BE,BEL,CY,CYP,37,11.0
2,1998,belgium,estonia,0,BE,BEL,EE,EST,36,12.0
3,1998,belgium,north macedonia,0,BE,BEL,MK,MKD,16,19.0
4,1998,belgium,finland,0,BE,BEL,FI,FIN,22,15.0
...,...,...,...,...,...,...,...,...,...,...
21305,2022,united kingdom,serbia,0,GB,GBR,RS,SRB,169,5.0
21306,2022,united kingdom,spain,8,GB,GBR,ES,ESP,282,3.0
21307,2022,united kingdom,sweden,10,GB,GBR,SE,SWE,245,4.0
21308,2022,united kingdom,switzerland,0,GB,GBR,CH,CHE,28,18.0


In [10]:
# Sanity check: count the rows for every (voting country, year) pair.
temp = votes.value_counts(subset=['from_country', 'year'])

# For each year, print the distinct row counts found among that year's voting countries.
for year, group in temp.groupby(level='year'):
    print(year, group.unique())

1998 [24]
1999 [22]
2000 [23]
2001 [22]
2002 [23]
2003 [25 24]
2004 [24 23]
2005 [24 23]
2006 [24 23]
2007 [24 23]
2008 [25 24]
2009 [25 24]
2010 [25 24]
2011 [25 24]
2012 [26 25]
2013 [26 25]
2014 [26 25]
2015 [27 26]
2016 [26 25]
2017 [26 25]
2018 [26 25]
2019 [26 25]
2021 [26 25]
2022 [25 24]


In [11]:
# Repeat the consistency check on the final voting data (as-is years plus rescaled years).
check_consistency(votes)

to_country,is_consistent,albania,armenia,australia,austria,azerbaijan,belarus,belgium,bosnia and herzegovina,bulgaria,croatia,cyprus,czechia,denmark,estonia,finland,france,georgia,germany,greece,hungary,iceland,ireland,israel,italy,latvia,lithuania,malta,"moldova, republic of",montenegro,netherlands,north macedonia,norway,poland,portugal,romania,russian federation,san marino,serbia,slovakia,slovenia,spain,sweden,switzerland,turkey,ukraine,united kingdom,yugoslavia
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1
1998,True,,,,,,,24.0,,,24.0,24.0,,,24.0,24.0,24.0,,24.0,24.0,24.0,,24.0,24.0,,,,24.0,,,24.0,24.0,24.0,24.0,24.0,24.0,,,,24.0,24.0,24.0,24.0,24.0,24.0,,24.0,
1999,True,,,,22.0,,,22.0,22.0,,22.0,22.0,,22.0,22.0,,22.0,,22.0,,,22.0,22.0,22.0,,,22.0,22.0,,,22.0,,22.0,22.0,22.0,,,,,,22.0,22.0,22.0,,22.0,,22.0,
2000,True,,,,23.0,,,23.0,,,23.0,23.0,,23.0,23.0,23.0,23.0,,23.0,,,23.0,23.0,23.0,,23.0,,23.0,,,23.0,23.0,23.0,,,23.0,23.0,,,,,23.0,23.0,23.0,23.0,,23.0,
2001,True,,,,,,,,22.0,,22.0,,,22.0,22.0,,22.0,,22.0,22.0,,22.0,22.0,22.0,,22.0,22.0,22.0,,,22.0,,22.0,22.0,22.0,,22.0,,,,22.0,22.0,22.0,,22.0,,22.0,
2002,True,,,,23.0,,,23.0,23.0,,23.0,23.0,,23.0,23.0,23.0,23.0,,23.0,23.0,,,,23.0,,23.0,23.0,23.0,,,,23.0,,,,23.0,23.0,,,,23.0,23.0,23.0,23.0,23.0,,23.0,
2003,True,,,,25.0,,,25.0,25.0,,25.0,25.0,,,25.0,,25.0,,25.0,25.0,,25.0,25.0,25.0,,,,25.0,,,25.0,,25.0,25.0,25.0,25.0,25.0,,,,25.0,25.0,25.0,,25.0,25.0,25.0,
2004,True,35.0,,,35.0,,,35.0,35.0,,35.0,35.0,,,,,35.0,,35.0,35.0,,35.0,35.0,,,,,35.0,,,35.0,35.0,35.0,35.0,,35.0,35.0,,,,,35.0,35.0,,35.0,35.0,35.0,35.0
2005,True,38.0,,,,,,,38.0,,38.0,38.0,,38.0,,,38.0,,38.0,38.0,38.0,,,38.0,,38.0,,38.0,38.0,,,38.0,38.0,,,38.0,38.0,,,,,38.0,38.0,38.0,38.0,38.0,38.0,38.0
2006,True,,37.0,,,,,,37.0,,37.0,,,37.0,,37.0,37.0,,37.0,37.0,,,37.0,37.0,,37.0,37.0,37.0,37.0,,,37.0,37.0,,,37.0,37.0,,,,,37.0,37.0,37.0,37.0,37.0,37.0,
2007,True,,41.0,,,,41.0,,41.0,41.0,,,,,,41.0,41.0,41.0,41.0,41.0,41.0,,41.0,,,41.0,41.0,,41.0,,,41.0,,,,41.0,41.0,,41.0,,41.0,41.0,41.0,,41.0,41.0,41.0,


# Song language

- Performance language from Kaggle: https://www.kaggle.com/datasets/minitree/eurovision-song-lyrics?select=eurovision-lyrics-2022.json
- Official country language from wikipedia: https://en.wikipedia.org/wiki/List_of_official_languages_by_country_and_territory

In [12]:
# Load the lyrics dataset, transposed with .T, and keep only the columns we need.
songs = pd.read_json('../../data/eurovision-lyrics-2022.json').T
songs = songs[['Country', 'Artist', 'Language', 'Year']]

# 'Language' is the language the song was performed in; rename it to make that explicit.
songs = songs.rename(columns={'Language': 'Language_sung'})

# Tidy up country names so that they match the names used in the voting data
for original, replacement in [('Macedonia', 'North Macedonia'),
                              ('Russia', 'russian federation'),
                              ('Serbia and Montenegro', 'yugoslavia'),
                              ('Moldova', 'moldova, republic of'),
                              ('Czech Republic', 'czechia'),
                              ('The Netherlands', 'netherlands')]:
    songs.loc[songs['Country'] == original, 'Country'] = replacement
songs['Country'] = songs['Country'].str.lower()

# Limit to 1998 and later.
# The filtered frame is copied explicitly because new columns are assigned to it below;
# assigning into an un-copied row selection triggers pandas' SettingWithCopyWarning.
songs['Year'] = pd.to_numeric(songs['Year'])
songs = songs[songs['Year'] > 1997].copy()

# Add country code columns
songs['Country_code2'], songs['Country_code3'] = zip(
    *songs['Country'].map(get_country_codes))

songs.head()

Unnamed: 0,Country,Artist,Language_sung,Year,Country_code2,Country_code3
772,croatia,Danijela,Croatian,1998,HR,HRV
773,greece,Thalassa,Greek,1998,GR,GRC
774,france,Marie Line,French,1998,FR,FRA
775,spain,Mikel Herzog,Spanish,1998,ES,ESP
776,switzerland,Gunvor,German,1998,CH,CHE


In [13]:
# Tidy up the language sung column
songs['Language_sung'] = songs['Language_sung'].str.lower()
# Remove filler words. 'and' is only removed as a whole word (\b...\b): removing it as a bare
# substring would also mangle language names that contain it, e.g. 'icelandic' -> 'icelic'.
songs['Language_sung'] = songs['Language_sung'].str.replace(r'partly|dialect|title|\band\b', '', regex=True)

# Expand abbreviations: each key is a regex (the dot is escaped) and is replaced by its value
replace_strings = {
    'fr\\.': 'french', 'eng\\.': 'english', 'gr\\.': 'greek', 
    'sp\\.': 'spanish', 'rom\\.': 'romanian', 'russ\\.': 'russian',
    'it\\.': 'italian', 'germ\\.': 'german', 'pol\\.': 'polish', 
    'sign language': 'sign-language'
}

for key, value in replace_strings.items():
    songs['Language_sung'] = songs['Language_sung'].str.replace(key, value, regex=True)

def extract_languages(lang_string):
    """Split a language_sung string on '/', '(' and ')' into a list of language names.

    Surrounding whitespace is stripped from every piece and empty pieces are dropped.
    """
    pieces = (piece.strip() for piece in re.split(r'\s*[/()]\s*', lang_string))
    return [piece for piece in pieces if piece]

# Turn every language string into a list of languages.
songs['Language_sung'] = songs['Language_sung'].map(extract_languages)
songs.tail(n=10)

Unnamed: 0,Country,Artist,Language_sung,Year,Country_code2,Country_code3
1674,romania,WRS,"[english, spanish]",2022,RO,ROU
1675,san marino,Achille Lauro,"[italian, english]",2022,SM,SMR
1676,serbia,Konstrakta,[serbian],2022,RS,SRB
1677,slovenia,LPS,[slovenian],2022,SI,SVN
1678,spain,Chanel,"[spanish, english]",2022,ES,ESP
1679,sweden,Cornelia Jakobs,[english],2022,SE,SWE
1680,switzerland,Marius Bear,[english],2022,CH,CHE
1681,netherlands,S10,[dutch],2022,NL,NLD
1682,ukraine,Kalush Orchestra,[ukrainian],2022,UA,UKR
1683,united kingdom,Sam Ryder,[english],2022,GB,GBR


In [14]:
# Flag songs with English among their languages, and songs that are anything other than English-only.
songs['Contains_English'] = ['english' in languages for languages in songs['Language_sung']]
songs['Contains_NonEnglish'] = [languages != ['english'] for languages in songs['Language_sung']]

songs.value_counts(subset=['Contains_English', 'Contains_NonEnglish'])

Contains_English  Contains_NonEnglish
True              False                  543
False             True                   220
True              True                   149
dtype: int64

In [15]:
print('Number of songs containing English or non-English:')

# Count the songs for each combination of the two flags.
english_flag_columns = ['Contains_English', 'Contains_NonEnglish']
songs[english_flag_columns].value_counts()

Number of songs containing English or non-English:


Contains_English  Contains_NonEnglish
True              False                  543
False             True                   220
True              True                   149
dtype: int64

Next we want to see whether countries are singing in their official language. We can get the official language from [Wikipedia](https://en.wikipedia.org/wiki/List_of_official_languages_by_country_and_territory).

In [16]:
# Get the official languages from Wikipedia

import requests
from bs4 import BeautifulSoup
import re

# Imported here so this cell does not rely on imports made elsewhere.
from io import StringIO

url = "https://en.wikipedia.org/wiki/List_of_official_languages_by_country_and_territory"
response = requests.get(url, timeout=30)
# Fail on an HTTP error instead of trying to parse an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
tables = soup.find_all("table", {"class": "wikitable"})

# Use the first wikitable on the page.
table = tables[0]
# Newer pandas versions deprecate passing literal HTML, so wrap it in a file-like object.
df_table = pd.read_html(StringIO(str(table)))[0]

# Tidy the columns
df_table = df_table.fillna("")

# Square-bracketed markers such as "[1]" are stripped from both text columns.
# A raw string is used because "\[" is an invalid escape sequence in a normal string
# literal (a warning on current Python versions).
bracketed_marker = r"\[.*?\]"
df_table["Country/Region"] = df_table["Country/Region"].apply(
    lambda x: re.sub(bracketed_marker, "", x)
)
df_table.rename(columns={"Official language": "Official_languages"}, inplace=True)
df_table["Official_languages"] = df_table["Official_languages"].apply(
    lambda x: re.sub(bracketed_marker, "", x)
)

# Tidy the country names so that they match the names used in songs
df_table["Country/Region"] = df_table["Country/Region"].str.lower()

country_renames = {
    "united kingdom and crown dependencies etc.": "united kingdom",
    "russia": "russian federation",
    "serbia and montenegro": "yugoslavia",
    "moldova": "moldova, republic of",
    "czech republic": "czechia",
}
for wikipedia_name, standard_name in country_renames.items():
    df_table.loc[
        df_table["Country/Region"] == wikipedia_name, "Country/Region"
    ] = standard_name
# Append a row for Yugoslavia with its official languages filled in by hand.
yugoslavia_row = (
    pd.Series(
        {
            "Country/Region": "yugoslavia",
            "Official_languages": "serbian montenegrin",
        }
    )
    .to_frame()
    .T
)
df_table = pd.concat([df_table, yugoslavia_row], ignore_index=True)

# Countries in songs that have no row in df_table. This bare expression is only shown
# when it is run on its own; as part of the cell it has no effect.
set(songs["Country"].unique()) - set(df_table["Country/Region"].unique())

# Keep only the countries that appear in songs. The selection is copied explicitly because
# columns are assigned to it below; assigning into an un-copied row selection triggers
# pandas' SettingWithCopyWarning.
df_table = df_table.loc[df_table["Country/Region"].isin(songs["Country"].unique())].copy()

df_table["Country_code2"], df_table["Country_code3"] = zip(
    *df_table["Country/Region"].map(get_country_codes)
)

# Tidy the language column
def _strip_language_noise(text):
    """Remove the 'de facto status' remark, commas and parentheses from a language string."""
    for unwanted in ("all have de facto status", ",", "(", ")"):
        text = text.replace(unwanted, "")
    return text


# Lowercase first: the remark above is matched in lower case.
df_table["Official_languages"] = (
    df_table["Official_languages"].str.lower().apply(_strip_language_noise)
)

# Manually add missing languages: each prefix (note the trailing space) is put in front of
# the official languages of the country with that two-letter code.
extra_language_prefixes = {
    "LT": "samogitian ",
    "FR": "breton corsican ",
    "SI": "slovenian ",
    "EE": "võro ",
}
for code2, prefix in extra_language_prefixes.items():
    is_country = df_table["Country_code2"] == code2
    df_table.loc[is_country, "Official_languages"] = (
        prefix + df_table.loc[is_country, "Official_languages"]
    )

df_table.tail()

Unnamed: 0,Country/Region,Official_languages,Regional language,Minority language,National language,Widely spoken,Country_code2,Country_code3
182,switzerland,french bern fribourg geneva jura neuchâtel val...,,,,,CH,CHE
194,turkey,turkish,,Kurdish,Turkish,,TR,TUR
198,ukraine,ukrainian,Russian (Autonomous Republic of Crimea) Crimea...,,,,UA,UKR
200,united kingdom,none english has de facto status,Irish and Ulster-Scots (in Northern Ireland) S...,,,,GB,GBR
211,yugoslavia,serbian montenegrin,,,,,YU,YUG


In [17]:
# Stop with an error if any country code in songs has no matching row in df_table
missing_codes = set(songs['Country_code2']) - set(df_table['Country_code2'])
if missing_codes:
    countries = list(missing_codes)
    raise KeyError("Country name " + ', '.join(countries) + " was in songs, but not in df_table.")

# Attach the official languages to every song via the two-letter country code
songs = songs.merge(df_table[['Country_code2', 'Official_languages']], on='Country_code2', how='left')

In [18]:
# Tidy the languages column: songs without official languages get a single space,
# so the column contains strings only.
has_official_languages = songs['Official_languages'].notna()
songs['Official_languages'] = songs['Official_languages'].where(has_official_languages, ' ')

# Add more columns
def get_n_languages(langs):
    """Get the number of languages in a list of languages.

    Every ordinary entry counts as one language. An entry of the form
    '<N> other' (e.g. '6 other', '10 other') stands for N unnamed languages
    and counts as N, whatever the number is:

        ["english", "6 other"]    -> 7
        ["bulgarian", "10 other"] -> 11
    """
    total = 0
    for lang in langs:
        placeholder = re.fullmatch(r'(\d+) other', lang)
        total += int(placeholder.group(1)) if placeholder else 1
    return total
def _sings_official_language(song):
    """True when a sung language is among the whitespace-separated official languages."""
    return not set(song['Language_sung']).isdisjoint(song['Official_languages'].split())


songs['Contains_Multiple_Languages'] = songs['Language_sung'].map(len) > 1
songs['Number_of_Languages'] = songs['Language_sung'].map(get_n_languages)
songs['Contains_Own_Language'] = songs.apply(_sings_official_language, axis=1)

# Show the songs performed in more than three languages
songs[songs['Number_of_Languages'] > 3]

Unnamed: 0,Country,Artist,Language_sung,Year,Country_code2,Country_code3,Contains_English,Contains_NonEnglish,Official_languages,Contains_Multiple_Languages,Number_of_Languages,Contains_Own_Language
45,germany,Sürpriz,"[german, turkish, english, hebrew]",1999,DE,DEU,True,True,german,True,4,True
79,lithuania,Skamp,"[english, lithuanian, german, french]",2001,LT,LTU,True,True,samogitian lithuanian,True,4,True
131,israel,Lior Narkis,"[hebrew, english, greek, french, spanish]",2003,IL,ISR,True,True,hebrew,True,5,True
214,ukraine,GreenJolly,"[ukrainian, english, 6 other]",2005,UA,UKR,True,True,ukrainian,True,8,True
231,poland,Ich Troje (2) feat. Real McCoy,"[english, polish, german, russian, spanish]",2006,PL,POL,True,True,polish,True,5,True
273,portugal,Sabrina,"[portuguese, english, french, spanish]",2007,PT,PRT,True,True,portuguese,True,4,True
295,ukraine,Verka Serduchka,"[german, english, ukrainian, russian]",2007,UA,UKR,True,True,ukrainian,True,4,True
297,romania,Todomondo,"[english, italian, spanish, russian, french, r...",2007,RO,ROU,True,True,romanian,True,6,True
309,ireland,Dustin the Turkey,"[english, french, german, italian, spanish]",2008,IE,IRL,True,True,irish english,True,5,True
491,bulgaria,Sofi Marinova,"[bulgarian, 10 other]",2012,BG,BGR,False,True,bulgarian,True,11,True


In [19]:
songs.head()

Unnamed: 0,Country,Artist,Language_sung,Year,Country_code2,Country_code3,Contains_English,Contains_NonEnglish,Official_languages,Contains_Multiple_Languages,Number_of_Languages,Contains_Own_Language
0,croatia,Danijela,[croatian],1998,HR,HRV,False,True,croatian,False,1,True
1,greece,Thalassa,[greek],1998,GR,GRC,False,True,greek,False,1,True
2,france,Marie Line,[french],1998,FR,FRA,False,True,breton corsican french,False,1,True
3,spain,Mikel Herzog,[spanish],1998,ES,ESP,False,True,spanish,False,1,True
4,switzerland,Gunvor,[german],1998,CH,CHE,False,True,french bern fribourg geneva jura neuchâtel val...,False,1,True


In [20]:
# Combine votes and language: attach to every vote the song of the receiving country in that year
df_VL = votes.merge(songs, left_on=['to_code2', 'year'], right_on=['Country_code2', 'Year'], how='left')

# Every merged row must agree on country name, country code and year between the two tables
# (a vote row that found no song has missing values and fails this check too)
checks_pass = [
    (df_VL[song_column] == df_VL[vote_column]).all()
    for song_column, vote_column in [('Country', 'to_country'),
                                     ('Country_code2', 'to_code2'),
                                     ('Year', 'year')]
]
if not all(checks_pass):
    raise ValueError("Mismatch in the merge - check this out!")

# Keep one copy of the key columns and put the columns in their final order
final_columns = [
    'year', 'Artist',
    'from_country', 'to_country', 'points', 'total_points',
    'rank', 'from_code2', 'from_code3', 'to_code2', 'to_code3',
    'Official_languages', 'Language_sung',
    'Contains_English', 'Contains_NonEnglish', 'Contains_Multiple_Languages',
    'Number_of_Languages', 'Contains_Own_Language',
]
df_VL = df_VL[final_columns]


df_VL.shape

(21310, 18)

# Performer gender

Synchronous code: https://github.com/KatrionaGoldmann/Eurovision_TDS/blob/jp/data-wrangling-notebook/eurovision/get_gender.py

Refactored version using asyncio is below.

In [21]:
import aiohttp
import asyncio

async def get_property(session, concept_id, property_id):
    """Return the English label of a Wikidata property value for an entity.

    Async reimplementation of wikipeople.get_property
    https://github.com/samvanstroud/wikipeople/blob/master/wikipeople/wikipeople.py

    Two requests are made: `wbgetclaims` yields the id of the value entity,
    and `wbgetentities` then resolves that id to its English label.

    Args:
        session: an aiohttp ClientSession (only `session.get` is used).
        concept_id: Wikidata entity id, e.g. as returned by get_concept_id.
        property_id: Wikidata property id, e.g. "P21" (gender).

    Returns:
        The English label as a string, or None if the entity, the property,
        a usable value id or the English label cannot be found.
        If the property has several claims with a value id, the LAST one wins.

    Raises:
        Whatever `resp.json()` raises when a response cannot be decoded
        (the response is printed first to help debugging).

    e.g. "Q219655" is the concept_id for Carey Mulligan; "P21" is the property_id for gender. So we have that
        get_property(session, "Q219655", "P21") -> "female"
    """
    url = 'https://www.wikidata.org/w/api.php'
    params = {'action': 'wbgetclaims',
              'entity': concept_id,
              'property': property_id,
              'language': 'en',
              'format': 'json'}
    async with session.get(url, params=params) as resp:
        try:
            res = await resp.json()
        except Exception:
            print(resp)
            raise

    # Error replies carry no 'claims' key, and an empty claim set may not be a
    # mapping at all; both mean "not found" rather than a crash.
    claims = res.get('claims') if isinstance(res, dict) else None
    if not isinstance(claims, dict) or property_id not in claims:
        return None

    # This gives yet another 'id', and we then need to perform yet another HTTP
    # request to find the actual *label* that this corresponds to.
    value_id = None
    for claim in claims[property_id]:
        try:
            value_id = claim['mainsnak']['datavalue']['value']['id']
        except (KeyError, TypeError):
            # Claim without an entity-valued datavalue (e.g. "unknown value").
            continue

    if value_id is None:
        return None

    new_params = {'action': 'wbgetentities',
                  'ids': value_id,
                  'languages': 'en',
                  'format': 'json',
                  'props': 'labels'}
    async with session.get(url, params=new_params) as resp:
        try:
            res = await resp.json()
        except Exception:
            print(resp)
            raise

    try:
        return res['entities'][value_id]['labels']['en']['value']
    except (KeyError, TypeError):
        # No English label (or an error reply) for the value entity.
        return None

async def get_concept_id(session, page_name):
    """
    Get the Wikidata concept_id corresponding to a particular Wikipedia page name.

    The name is searched on Wikidata. The first hit is used by default, but a hit
    whose description mentions music (see `music_markers`) AND whose 'P1344'
    property label, as returned by get_property, contains "Eurovision" takes
    precedence. If several hits qualify, the last one wins.

    Args:
        session: an aiohttp ClientSession (only `session.get` is used here).
        page_name: the name to search for.

    Returns:
        The concept id as a string, or None if the search returns nothing
        (for some odd reason, some Wikipedia pages don't have concept IDs)
        or the reply has no 'search' results at all.

    e.g. get_concept_id(session, "Carey Mulligan") -> "Q219655"
    """
    url = 'https://www.wikidata.org/w/api.php'
    params = {'action': 'wbsearchentities',
              'search': page_name,
              'language': 'en',
              'format': 'json'}
    music_markers = [
        'singer', 'artist', 'musician', 'music',
        'band', 'group', 'duo', 'ensemble'
    ]

    async with session.get(url, params=params) as resp:
        # Named `payload` so the imported `json` module is not shadowed.
        payload = await resp.json()

    # An error reply has no 'search' key; treat it like an empty result.
    results = payload.get('search', [])
    if not results:
        # Couldn't find a concept id for the person/group
        return None

    # By default, choose the first result from the list
    target = 0
    # But check the other results to see if any of them are musicians (as
    # indicated by the markers) and Eurovision contestants
    for i, candidate in enumerate(results):
        display = candidate.get('display', {})
        if 'description' not in display:
            continue
        description = display['description']['value']
        if not any(marker in description for marker in music_markers):
            continue
        contestant_in = await get_property(session, candidate['id'], 'P1344')
        if contestant_in is not None and "Eurovision" in contestant_in:
            target = i
    # Return the concept ID of the result found
    return results[target]['id']

async def lookup_gender(session, page_name):
    """Look up the gender of a performing act from its Wikipedia page name.

    The page name is resolved to a Wikidata concept id, whose 'P21' and 'P31'
    properties are then fetched. A P21 value, when present, is returned as is.
    Otherwise the act is labelled "group" if its P31 label contains one of the
    group keywords.

    Returns None if no concept id is found, if neither property is available,
    or if only P31 is available and it does not look like a group.
    """
    concept_id = await get_concept_id(session, page_name)
    if concept_id is None:
        return None

    gender = await get_property(session, concept_id, 'P21')
    instance = await get_property(session, concept_id, 'P31')

    # A gender recorded on Wikidata always takes priority.
    if gender is not None:
        return gender
    if instance is None:
        return None

    # No gender on record: decide from the P31 label whether this is a group.
    group_checks = ("group", "duo", "trio", "music", "band", "ensemble")
    looks_like_group = any(keyword in instance for keyword in group_checks)
    return "group" if looks_like_group else None

async def get_pages(session, name):
    """Search English Wikipedia for `name` and return the matching page titles.

    `session` is an aiohttp ClientSession. The titles are element 1 of the
    JSON reply to the `opensearch` request.
    """
    search_query = {"action": "opensearch", "namespace": "0", "search": name,
                    "limit": "10000", "format": "json"}
    endpoint = "https://en.wikipedia.org/w/api.php"
    async with session.get(endpoint, params=search_query) as response:
        payload = await response.json()
    # Titles of WP pages that match the search query.
    return payload[1]

async def get_artist_gender(session, name):
    """Best-effort gender ('group' for multi-person acts) of a performer.

    Only the first Wikipedia search hit for `name` is consulted. If that gives
    nothing, names containing '&' or 'feat.' are assumed to be groups.
    Returns None when no gender could be determined.
    """
    # Get the WP page for this person/group; if there's one, try to get their
    # gender from the first page
    pages = await get_pages(session, name)
    if len(pages) > 0:
        gender = await lookup_gender(session, pages[0])
    else:
        gender = None

    # Finally we use some heuristics to cover some edge cases
    name_suggests_group = '&' in name or 'feat.' in name
    if gender is None and name_suggests_group:
        return 'group'
    return gender

In [22]:
# Build `gender_dict`, a mapping from performer name to gender label.
# Check whether the gender data has already been saved. If so, load it in.
p = Path("../../data/gender_dict.json")
if p.is_file():
    with open(p, 'r') as file:
        gender_dict = json.load(file)
        print('Loaded performer genders from file')
        
else:
    # If not, now that we have all the necessary functionality, we can fetch the data from Wikipedia.
    all_performers = df_VL['Artist'].unique().tolist()
    n_performers = len(all_performers)
    MAX_CONCURRENT = 40   # To stop Wikipedia from complaining about 'too many requests'
    USER_AGENT = 'Eurovision study @ The Alan Turing Institute mailto:jyong@turing.ac.uk'

    async def get_all_genders():
        """Fetch the gender of every performer in `all_performers`.

        Performers are looked up in consecutive batches of MAX_CONCURRENT
        concurrent requests, with a pause between batches.
        Returns a dict mapping performer name -> gender (None if not found).
        """
        genders = []
        print(f'We need to fetch the genders of {n_performers} performers, in batches of {MAX_CONCURRENT}. Hold tight...')
        async with aiohttp.ClientSession(headers={'User-Agent': USER_AGENT}) as session:
            # [start, end) is the slice of all_performers handled by the current batch.
            start = 0
            end = MAX_CONCURRENT
            while start < n_performers:
                # NOTE: for the very first batch `end` is not capped at n_performers, so
                # the printed upper bound can exceed the real count; the slice is unaffected.
                print(f'Getting genders for performers #{start + 1} to #{end}... ', end='')
                # gather() returns results in the same order as the tasks were passed in,
                # which is what makes the final zip() with all_performers valid.
                batch_tasks = asyncio.gather(*[get_artist_gender(session, p) for p in all_performers[start:end]])
                batch_genders = await batch_tasks
                print(f'Got {len(batch_genders)} results, {len([g for g in batch_genders if g is None])} of which were None.')
                genders = genders + batch_genders
                start = end
                end = min(end + MAX_CONCURRENT, n_performers)
                await asyncio.sleep(1.5)   # Put a pause between batches to avoid being timed out
        # now pray that I didn't make an off-by-one error somewhere
        assert len(genders) == n_performers
        print('Finished downloading gender data.')
        return dict(zip(all_performers, genders))
        
    # Top-level `await`: this relies on the notebook's own event loop and is not
    # valid in a plain Python script.
    gender_dict = await get_all_genders()
    
    # Manually assign missing entries (the Nones).
    male = ['Michael Hajiyanni', 'Charlie', 'Tüzmen', 'Mietek Szcześniak', 'Olexandr', 'Max', 'Brinck',
            'Sakis Rouvas (2)', 'Gianluca', 'Frans', 'Chingiz', 'Mahmood', 'Serhat (2)', 'Miki', 'Stefan']
    female = ['Gunvor', 'Selma', 'Charlotte Nilsson (Perrelli)', 'Karolina', 'Laura', 'Rosa', 'Lou', 'Nicola',
            'Karmen', 'Sanda', 'Ortal', 'Gracia', 'Chiara (2)', 'Hanna', 'Chiara (3)', 'Elena', 'Lena (2)',
            'Birgit', 'Samra', 'ZAA Sanja Vučić', 'Anja', 'Alma', 'Netta', 'Michela', 'Efendi', 'Victoria',
            'Destiny', 'Amanda Georgiadi Tenfjord', 'MARO']
    group = ['Eden', 'Voice', 'Taxi', 'One', 'Prime Minister', 'Fame', 'Regina (band)', 'ESDM',
            'Tolmachevy Sisters', 'Minus One', 'AWS']
    # NOTE: these loops rebind `p`, so from here on it no longer refers to the Path above.
    for p in male:
        gender_dict[p] = "male"
    for p in female:
        gender_dict[p] = "female"
    for p in group:
        gender_dict[p] = "group"

    # Wikipedia needs to learn that 'trans woman' is 'female'.
    # (Only values of existing keys are replaced, so mutating while iterating is safe.)
    for k, v in gender_dict.items():
        if v == 'trans woman':
            gender_dict[k] = 'female'
            
    # Save it to a file (same path as checked at the top of the cell; spelled
    # out again because `p` was rebound by the loops above).
    with open('../../data/gender_dict.json', 'w') as file:
        json.dump(gender_dict, file)

Loaded performer genders from file


In [23]:
# Add it to the dataframe: look up each artist's gender in gender_dict and store
# it in a new 'gender' column (artists missing from the dict get NaN).

artist_genders = df_VL['Artist'].map(gender_dict)
df_VLG = df_VL.assign(gender=artist_genders)
df_VLG.head()

Unnamed: 0,year,Artist,from_country,to_country,points,total_points,rank,from_code2,from_code3,to_code2,to_code3,Official_languages,Language_sung,Contains_English,Contains_NonEnglish,Contains_Multiple_Languages,Number_of_Languages,Contains_Own_Language,gender
0,1998,Danijela,belgium,croatia,5,131,5.0,BE,BEL,HR,HRV,croatian,[croatian],False,True,False,1,True,female
1,1998,Michael Hajiyanni,belgium,cyprus,2,37,11.0,BE,BEL,CY,CYP,greek turkish,[greek],False,True,False,1,True,male
2,1998,Koit Toome,belgium,estonia,0,36,12.0,BE,BEL,EE,EST,võro estonian,[estonian],False,True,False,1,True,male
3,1998,Vlado Janevski,belgium,north macedonia,0,16,19.0,BE,BEL,MK,MKD,macedonian albanian,[macedonian],False,True,False,1,True,male
4,1998,Edea,belgium,finland,0,22,15.0,BE,BEL,FI,FIN,finnish swedish,[finnish],False,True,False,1,True,group


In [24]:
# Run the consistency check (check_consistency is defined outside this section)
# on the dataframe now that the gender column has been added.
check_consistency(df_VLG)

to_country,is_consistent,albania,armenia,australia,austria,azerbaijan,belarus,belgium,bosnia and herzegovina,bulgaria,croatia,cyprus,czechia,denmark,estonia,finland,france,georgia,germany,greece,hungary,iceland,ireland,israel,italy,latvia,lithuania,malta,"moldova, republic of",montenegro,netherlands,north macedonia,norway,poland,portugal,romania,russian federation,san marino,serbia,slovakia,slovenia,spain,sweden,switzerland,turkey,ukraine,united kingdom,yugoslavia
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1
1998,True,,,,,,,24.0,,,24.0,24.0,,,24.0,24.0,24.0,,24.0,24.0,24.0,,24.0,24.0,,,,24.0,,,24.0,24.0,24.0,24.0,24.0,24.0,,,,24.0,24.0,24.0,24.0,24.0,24.0,,24.0,
1999,True,,,,22.0,,,22.0,22.0,,22.0,22.0,,22.0,22.0,,22.0,,22.0,,,22.0,22.0,22.0,,,22.0,22.0,,,22.0,,22.0,22.0,22.0,,,,,,22.0,22.0,22.0,,22.0,,22.0,
2000,True,,,,23.0,,,23.0,,,23.0,23.0,,23.0,23.0,23.0,23.0,,23.0,,,23.0,23.0,23.0,,23.0,,23.0,,,23.0,23.0,23.0,,,23.0,23.0,,,,,23.0,23.0,23.0,23.0,,23.0,
2001,True,,,,,,,,22.0,,22.0,,,22.0,22.0,,22.0,,22.0,22.0,,22.0,22.0,22.0,,22.0,22.0,22.0,,,22.0,,22.0,22.0,22.0,,22.0,,,,22.0,22.0,22.0,,22.0,,22.0,
2002,True,,,,23.0,,,23.0,23.0,,23.0,23.0,,23.0,23.0,23.0,23.0,,23.0,23.0,,,,23.0,,23.0,23.0,23.0,,,,23.0,,,,23.0,23.0,,,,23.0,23.0,23.0,23.0,23.0,,23.0,
2003,True,,,,25.0,,,25.0,25.0,,25.0,25.0,,,25.0,,25.0,,25.0,25.0,,25.0,25.0,25.0,,,,25.0,,,25.0,,25.0,25.0,25.0,25.0,25.0,,,,25.0,25.0,25.0,,25.0,25.0,25.0,
2004,True,35.0,,,35.0,,,35.0,35.0,,35.0,35.0,,,,,35.0,,35.0,35.0,,35.0,35.0,,,,,35.0,,,35.0,35.0,35.0,35.0,,35.0,35.0,,,,,35.0,35.0,,35.0,35.0,35.0,35.0
2005,True,38.0,,,,,,,38.0,,38.0,38.0,,38.0,,,38.0,,38.0,38.0,38.0,,,38.0,,38.0,,38.0,38.0,,,38.0,38.0,,,38.0,38.0,,,,,38.0,38.0,38.0,38.0,38.0,38.0,38.0
2006,True,,37.0,,,,,,37.0,,37.0,,,37.0,,37.0,37.0,,37.0,37.0,,,37.0,37.0,,37.0,37.0,37.0,37.0,,,37.0,37.0,,,37.0,37.0,,,,,37.0,37.0,37.0,37.0,37.0,37.0,
2007,True,,41.0,,,,41.0,,41.0,41.0,,,,,,41.0,41.0,41.0,41.0,41.0,41.0,,41.0,,,41.0,41.0,,41.0,,,41.0,,,,41.0,41.0,,41.0,,41.0,41.0,41.0,,41.0,41.0,41.0,


# Migration

- The `migration-flows.csv` data is from [Our World in Data](https://ourworldindata.org/migration) on international migration, under the 'Explore data on where people migrate from and to' section.
- Original source is from the UN.
- Data shows total number of immigrants in each country split by country of origin in the years 1990-2020, recorded at intervals of every 5 years.
- Additional population size data (`pop_sizes.csv`) is taken from the [World Bank](https://data.worldbank.org/indicator/SP.POP.TOTL?end=2021&start=2021&view=map).

In [25]:
# Load the migration data and reshape it into one row per
# (destination country, year, origin country) with an integer migrant count.
migration = pd.read_csv('../../data/migration-flows.csv')

# Martin actually writes good pandas code, unlike me

migration = (migration
    .pipe(pd.melt, id_vars=['Country', 'Year'], var_name='Migration', value_name='Count')  # to long format
    .loc[lambda x: x['Migration'].str.contains('Emigrants')]                               # filter for emigrant rows
    .pipe(lambda x: x.rename(columns = {col: col.lower() for col in x.columns}))           # lowercase column names
    .assign(migration = lambda x: x.migration.str.replace('Emigrants from ', ''))          # strip the prefix, leaving only the origin country name
    .rename(columns={'migration': 'emigrated_from', 'country': 'emigrated_to'})            # name the two country columns by direction
    .query('count >= 0')                                                                   # negative counts are just total emigrants from country; missing counts are dropped here too
    .pipe(lambda x: x.assign(count = x['count'].astype(int)))                              # convert count to int
)
migration.head()

Unnamed: 0,emigrated_to,year,emigrated_from,count
56,Argentina,1990,Afghanistan,20
57,Argentina,1995,Afghanistan,20
58,Argentina,2000,Afghanistan,20
59,Argentina,2005,Afghanistan,16
60,Argentina,2010,Afghanistan,9


In [26]:
# Clean up country names: lowercase everything and switch to the spellings used
# in the votes data.
name_fixes = {'moldova': 'moldova, republic of', 'russia': 'russian federation'}
for column in ('emigrated_from', 'emigrated_to'):
    migration[column] = migration[column].str.lower().replace(name_fixes)

# Remove countries we don't care about, i.e. keep only rows where both ends of
# the migration flow are Eurovision countries.
ev_countries = set(df_VLG['from_country'].unique()) | set(df_VLG['to_country'].unique())
destination_in_ev = migration['emigrated_to'].isin(ev_countries)
origin_in_ev = migration['emigrated_from'].isin(ev_countries)
migration = migration[destination_in_ev & origin_in_ev]

migration_countries = set(migration['emigrated_to'].unique()) | set(migration['emigrated_from'].unique())
print(ev_countries - migration_countries)  # No data for Yugoslavia.

# Add in country codes (get_country_codes yields a (code2, code3) pair per name)
for direction in ('from', 'to'):
    code_pairs = migration[f'emigrated_{direction}'].map(get_country_codes)
    code2_values, code3_values = zip(*code_pairs)
    migration[f'emigrated_{direction}_code2'] = code2_values
    migration[f'emigrated_{direction}_code3'] = code3_values

migration = migration.reset_index(drop=True)
migration.head()

{'yugoslavia'}


Unnamed: 0,emigrated_to,year,emigrated_from,count,emigrated_from_code2,emigrated_from_code3,emigrated_to_code2,emigrated_to_code3
0,australia,1990,albania,984,AL,ALB,AU,AUS
1,australia,1995,albania,1315,AL,ALB,AU,AUS
2,australia,2000,albania,1530,AL,ALB,AU,AUS
3,australia,2005,albania,2270,AL,ALB,AU,AUS
4,australia,2010,albania,2880,AL,ALB,AU,AUS


In [27]:
# Load population sizes and reshape them to one row per (code3, year).
# NOTE(review): the column handling below presumably matches the World Bank
# export layout (three leading descriptive columns, then a country-code column
# and one column per year) - confirm against pop_sizes.csv.
pop_size = (pd.read_csv('../../data/pop_sizes.csv')
           .iloc[:, 3:]                                                                          # drop the first three columns
           .rename(columns=lambda x: x.lower().replace(' ', '_'))                                # snake_case column names
           .pipe(pd.melt, id_vars=['country_code'], var_name='year', value_name='population')    # to long format
           .assign(year=lambda x: x['year'].apply(lambda y: y.split('_')[0]))                    # keep only the part of the header before the first '_'
           .assign(year=lambda x: x['year'].astype(int))
           .rename(columns={'country_code': 'code3'})
           .dropna()
           # Non-numeric entries become NaN here; as this runs after dropna(),
           # such NaNs stay in the result.
           .assign(population=lambda x: pd.to_numeric(x['population'], errors='coerce'))
)
# Not the last expression of the cell, so this preview is not displayed.
pop_size.head()

# Attach the population of the destination country (for the same year) to each
# migration row, and express the migrant count as a share of that population.
migration_and_pop = (migration.merge(pop_size, left_on=['year', 'emigrated_to_code3'], right_on=['year', 'code3'], how='left')
                     .rename(columns={'population': 'population_to'})
                    .assign(prop_emigrants=lambda x: x['count'] / x['population_to'])
                    #.reindex(columns=['country', 'code', 'code3', 'population', 'year', 'emigrated_from_code', 'count', 'prop_emigrants'])
                    )
migration_and_pop.head(n=20)

Unnamed: 0,emigrated_to,year,emigrated_from,count,emigrated_from_code2,emigrated_from_code3,emigrated_to_code2,emigrated_to_code3,code3,population_to,prop_emigrants
0,australia,1990,albania,984,AL,ALB,AU,AUS,AUS,17065128.0,5.8e-05
1,australia,1995,albania,1315,AL,ALB,AU,AUS,AUS,18004882.0,7.3e-05
2,australia,2000,albania,1530,AL,ALB,AU,AUS,AUS,19028802.0,8e-05
3,australia,2005,albania,2270,AL,ALB,AU,AUS,AUS,20176844.0,0.000113
4,australia,2010,albania,2880,AL,ALB,AU,AUS,AUS,22031750.0,0.000131
5,australia,2015,albania,3460,AL,ALB,AU,AUS,AUS,23815995.0,0.000145
6,australia,2020,albania,3941,AL,ALB,AU,AUS,AUS,25655289.0,0.000154
7,austria,1990,albania,1733,AL,ALB,AT,AUT,AUT,7677850.0,0.000226
8,austria,1995,albania,1955,AL,ALB,AT,AUT,AUT,7948278.0,0.000246
9,austria,2000,albania,2177,AL,ALB,AT,AUT,AUT,8011566.0,0.000272


Because we don't have migration data for every year, when merging with the main dataset, we take the last migration data point before the competition.
So, for example, the 2012 entries will contain migration data from 2010.

To do this, we'll first make 5 copies of each row from the `migration_and_pop` dataframe, each with a different year.

In [28]:
# Remember which 5-yearly data point each row comes from before the 'year'
# column is shifted below.
migration_and_pop['migration_pop_year'] = migration_and_pop['year']

# The data point recorded for year Y is reused for years Y, Y+1, ..., Y+4:
# build one copy of the table per offset and stack them.
yearly_copies = [
    migration_and_pop.assign(year=migration_and_pop['year'] + offset)
    for offset in range(5)
]
output_columns = ['emigrated_from_code2', 'emigrated_to_code2', 'year', 'count',
                  'population_to', 'prop_emigrants', 'migration_pop_year']
total_migration_and_pop = (
    pd.concat(yearly_copies, ignore_index=True)
    .sort_values(by=["emigrated_from", "emigrated_to", "year"])
    .loc[:, output_columns]
)

total_migration_and_pop.head(n=20)

Unnamed: 0,emigrated_from_code2,emigrated_to_code2,year,count,population_to,prop_emigrants,migration_pop_year
0,AL,AU,1990,984,17065128.0,5.8e-05,1990
12313,AL,AU,1991,984,17065128.0,5.8e-05,1990
24626,AL,AU,1992,984,17065128.0,5.8e-05,1990
36939,AL,AU,1993,984,17065128.0,5.8e-05,1990
49252,AL,AU,1994,984,17065128.0,5.8e-05,1990
1,AL,AU,1995,1315,18004882.0,7.3e-05,1995
12314,AL,AU,1996,1315,18004882.0,7.3e-05,1995
24627,AL,AU,1997,1315,18004882.0,7.3e-05,1995
36940,AL,AU,1998,1315,18004882.0,7.3e-05,1995
49253,AL,AU,1999,1315,18004882.0,7.3e-05,1995


In [29]:
# Now we can join with the main dataframe.
# The migration table is merged in twice, once per direction of migration
# between the voting ("from") and performing ("to") country.

# migration_v2p      -> number of migrants from voting country to performing country
# population_p       -> population of performing country
# prop_emigrants_v2p -> proportion of migrants from voting country in population of performing country
df_VLGM = df_VLG.merge(total_migration_and_pop, how='left', left_on=['from_code2', 'to_code2', 'year'], right_on=['emigrated_from_code2', 'emigrated_to_code2', 'year'])
df_VLGM = (df_VLGM
           # migration_pop_year is dropped here so that it does not clash with the
           # one brought in by the second merge.
           .drop(columns=['emigrated_from_code2', 'emigrated_to_code2', 'migration_pop_year'])
           .rename(columns={'count': 'migration_v2p', 'population_to': 'population_p', 'prop_emigrants': 'prop_emigrants_v2p'})
)

# migration_p2v      -> number of migrants from performing country to voting country
# population_v       -> population of voting country
# prop_emigrants_p2v -> proportion of migrants from performing country in population of voting country
# migration_pop_year -> year from which the migration and population data is taken
#                       (taken from this second merge only, so it is NaN whenever the p2v data is missing)
df_VLGM = df_VLGM.merge(total_migration_and_pop, how='left', left_on=['from_code2', 'to_code2', 'year'], right_on=['emigrated_to_code2', 'emigrated_from_code2', 'year'])
df_VLGM = (df_VLGM
           .drop(columns=['emigrated_from_code2', 'emigrated_to_code2'])
           .rename(columns={'count': 'migration_p2v', 'population_to': 'population_v', 'prop_emigrants': 'prop_emigrants_p2v'})
)
# NOTE(review): with NaN's present the int conversion fails and errors='ignore'
# returns the column unchanged, so it stays float (see 1995.0 in the output).
df_VLGM['migration_pop_year'] = df_VLGM['migration_pop_year'].astype(int, errors='ignore')   # ignore NaN's.

df_VLGM.head(n=10)

Unnamed: 0,year,Artist,from_country,to_country,points,total_points,rank,from_code2,from_code3,to_code2,...,Number_of_Languages,Contains_Own_Language,gender,migration_v2p,population_p,prop_emigrants_v2p,migration_p2v,population_v,prop_emigrants_p2v,migration_pop_year
0,1998,Danijela,belgium,croatia,5,131,5.0,BE,BEL,HR,...,1,True,female,205.0,4620030.0,4.4e-05,72.0,10136811.0,7e-06,1995.0
1,1998,Michael Hajiyanni,belgium,cyprus,2,37,11.0,BE,BEL,CY,...,1,True,male,92.0,862418.0,0.000107,77.0,10136811.0,8e-06,1995.0
2,1998,Koit Toome,belgium,estonia,0,36,12.0,BE,BEL,EE,...,1,True,male,0.0,1436634.0,0.0,57.0,10136811.0,6e-06,1995.0
3,1998,Vlado Janevski,belgium,north macedonia,0,16,19.0,BE,BEL,MK,...,1,True,male,,,,120.0,10136811.0,1.2e-05,1995.0
4,1998,Edea,belgium,finland,0,22,15.0,BE,BEL,FI,...,1,True,group,144.0,5107790.0,2.8e-05,1541.0,10136811.0,0.000152,1995.0
5,1998,Marie Line,belgium,france,0,3,24.0,BE,BEL,FR,...,1,True,female,132113.0,59543659.0,0.002219,123438.0,10136811.0,0.012177,1995.0
6,1998,Guildo Horn feat. Die Orthopädischen Strümpfe,belgium,germany,7,86,7.0,BE,BEL,DE,...,1,True,group,22307.0,81678051.0,0.000273,65226.0,10136811.0,0.006435,1995.0
7,1998,Thalassa,belgium,greece,0,12,20.0,BE,BEL,GR,...,1,True,group,4916.0,10562153.0,0.000465,18488.0,10136811.0,0.001824,1995.0
8,1998,Charlie,belgium,hungary,0,4,23.0,BE,BEL,HU,...,1,True,male,601.0,10328965.0,5.8e-05,1593.0,10136811.0,0.000157,1995.0
9,1998,Dawn Martin,belgium,ireland,0,64,9.0,BE,BEL,IE,...,1,True,female,507.0,3608841.0,0.00014,2195.0,10136811.0,0.000217,1995.0


In [30]:
# Lots of NaN's though... :(
# The data for these simply aren't in the source csv. Is there anything we can do about it?
# Only one direction is displayed at a time; swap which of the two lines below
# is commented out to inspect the other.

# df_VLGM[df_VLGM['migration_v2p'].isna()]   # 3436 empty rows
df_VLGM[df_VLGM['migration_p2v'].isna()]   # 3345 empty rows

Unnamed: 0,year,Artist,from_country,to_country,points,total_points,rank,from_code2,from_code3,to_code2,...,Number_of_Languages,Contains_Own_Language,gender,migration_v2p,population_p,prop_emigrants_v2p,migration_p2v,population_v,prop_emigrants_p2v,migration_pop_year
25,1998,Michael Hajiyanni,croatia,cyprus,4,37,11.0,HR,HRV,CY,...,1,True,male,30.0,862418.0,0.000035,,,,
35,1998,Chiara,croatia,malta,7,165,3.0,HR,HRV,MT,...,1,True,female,22.0,377419.0,0.000058,,,,
96,1998,Mélanie Cohl,north macedonia,belgium,6,122,6.0,MK,MKD,BE,...,1,True,female,120.0,10136811.0,0.000012,,,,
98,1998,Michael Hajiyanni,north macedonia,cyprus,0,37,11.0,MK,MKD,CY,...,1,True,male,7.0,862418.0,0.000008,,,,
99,1998,Koit Toome,north macedonia,estonia,0,36,12.0,MK,MKD,EE,...,1,True,male,0.0,1436634.0,0.000000,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21281,2022,Konstrakta,ukraine,serbia,0,169,5.0,UA,UKR,RS,...,1,True,female,1297.0,6899126.0,0.000188,,,,
21282,2022,Chanel,ukraine,spain,0,282,3.0,UA,UKR,ES,...,2,True,female,106373.0,47365655.0,0.002246,,,,
21283,2022,Cornelia Jakobs,ukraine,sweden,0,245,4.0,UA,UKR,SE,...,1,False,female,10176.0,10353442.0,0.000983,,,,
21284,2022,Marius Bear,ukraine,switzerland,3,28,18.0,UA,UKR,CH,...,1,False,male,10631.0,8638167.0,0.001231,,,,


# Comps without win

https://github.com/KatrionaGoldmann/Eurovision_TDS/blob/jp/data-wrangling-notebook/eurovision/get_comps_without_win.py

Copy the winners from Wikipedia. Note that Luxembourg withdrew from the contest in 1994, so it is not included in our data and would not merge; we therefore remove its wins from the list.

In [31]:
# (year, winning country) for every contest. 1969 appears four times (one entry
# per listed winner) and there is no entry for 2020.
winners = [(1956, "Switzerland"), (1957, "Netherlands"), (1958, "France"),
           (1959, "Netherlands"), (1960, "France"), (1961, "Luxembourg"),
           (1962, "France"), (1963, "Denmark"), (1964, "Italy"),
           (1965, "Luxembourg"), (1966, "Austria"), (1967, "United Kingdom"),
           (1968, "Spain"), (1969, "Spain"), (1969, "United Kingdom"),
           (1969, "Netherlands"), (1969, "France"), (1970, "Ireland"),
           (1971, "Monaco"), (1972, "Luxembourg"), (1973, "Luxembourg"),
           (1974, "Sweden"), (1975, "Netherlands"), (1976, "United Kingdom"),
           (1977, "France"), (1978, "Israel"), (1979, "Israel"),
           (1980, "Ireland"), (1981, "United Kingdom"), (1982, "Germany"),
           (1983, "Luxembourg"), (1984, "Sweden"), (1985, "Norway"),
           (1986, "Belgium"), (1987, "Ireland"), (1988, "Switzerland"),
           (1989, "Yugoslavia"), (1990, "Italy"), (1991, "Sweden"),
           (1992, "Ireland"), (1993, "Ireland"), (1994, "Ireland"),
           (1995, "Norway"), (1996, "Ireland"), (1997, "United Kingdom"),
           (1998, "Israel"), (1999, "Sweden"), (2000, "Denmark"),
           (2001, "Estonia"), (2002, "Latvia"), (2003, "Turkey"),
           (2004, "Ukraine"), (2005, "Greece"), (2006, "Finland"),
           (2007, "Serbia"), (2008, "Russian federation"), (2009, "Norway"),
           (2010, "Germany"), (2011, "Azerbaijan"), (2012, "Sweden"),
           (2013, "Denmark"), (2014, "Austria"), (2015, "Sweden"),
           (2016, "Ukraine"), (2017, "Portugal"), (2018, "Israel"),
           (2019, "Netherlands"), (2021, "Italy"), (2022, "Ukraine")]

# Replace the country names with their 2-letter codes (names are lowercased
# before the lookup), and drop every Luxembourg win (1961, 1965, 1972, 1973
# and 1983), not just the first one.
winners = [(year, get_country_codes(country.lower())[0]) for year, country in winners if country != 'Luxembourg']

print(winners)

[(1956, 'CH'), (1957, 'NL'), (1958, 'FR'), (1959, 'NL'), (1960, 'FR'), (1962, 'FR'), (1963, 'DK'), (1964, 'IT'), (1966, 'AT'), (1967, 'GB'), (1968, 'ES'), (1969, 'ES'), (1969, 'GB'), (1969, 'NL'), (1969, 'FR'), (1970, 'IE'), (1971, 'MC'), (1974, 'SE'), (1975, 'NL'), (1976, 'GB'), (1977, 'FR'), (1978, 'IL'), (1979, 'IL'), (1980, 'IE'), (1981, 'GB'), (1982, 'DE'), (1984, 'SE'), (1985, 'NO'), (1986, 'BE'), (1987, 'IE'), (1988, 'CH'), (1989, 'YU'), (1990, 'IT'), (1991, 'SE'), (1992, 'IE'), (1993, 'IE'), (1994, 'IE'), (1995, 'NO'), (1996, 'IE'), (1997, 'GB'), (1998, 'IL'), (1999, 'SE'), (2000, 'DK'), (2001, 'EE'), (2002, 'LV'), (2003, 'TR'), (2004, 'UA'), (2005, 'GR'), (2006, 'FI'), (2007, 'RS'), (2008, 'RU'), (2009, 'NO'), (2010, 'DE'), (2011, 'AZ'), (2012, 'SE'), (2013, 'DK'), (2014, 'AT'), (2015, 'SE'), (2016, 'UA'), (2017, 'PT'), (2018, 'IL'), (2019, 'NL'), (2021, 'IT'), (2022, 'UA')]


In [32]:
# Construct a dictionary mapping each country code to the list of years in
# which that country won (in the order they appear in `winners`).

all_wins = {}

for win_year, country_code in winners:
    # First win for this code creates the list; later wins extend it.
    all_wins.setdefault(country_code, []).append(win_year)

print(all_wins)

{'CH': [1956, 1988], 'NL': [1957, 1959, 1969, 1975, 2019], 'FR': [1958, 1960, 1962, 1969, 1977], 'DK': [1963, 2000, 2013], 'IT': [1964, 1990, 2021], 'AT': [1966, 2014], 'GB': [1967, 1969, 1976, 1981, 1997], 'ES': [1968, 1969], 'IE': [1970, 1980, 1987, 1992, 1993, 1994, 1996], 'MC': [1971], 'SE': [1974, 1984, 1991, 1999, 2012, 2015], 'IL': [1978, 1979, 1998, 2018], 'DE': [1982, 2010], 'NO': [1985, 1995, 2009], 'BE': [1986], 'YU': [1989], 'EE': [2001], 'LV': [2002], 'TR': [2003], 'UA': [2004, 2016, 2022], 'GR': [2005], 'FI': [2006], 'RS': [2007], 'RU': [2008], 'AZ': [2011], 'PT': [2017]}


In [33]:
# Work on a copy so that df_VLGM itself is left without the new column.
df_VLGMC = df_VLGM.copy()

def comps_without_win(code, year):
    """Number of contests held since `code` last won, strictly before `year`.

    `code` is looked up in the module-level `all_wins` dict. Countries with no
    win before `year` are counted from 1955, the year before ESC started.
    The cancelled 2020 contest is not counted.
    """
    # Most recent win strictly before `year`, or the 1955 sentinel.
    earlier_wins = [y for y in all_wins.get(code, []) if y < year]
    last_win = max(earlier_wins, default=1955)

    # Contests strictly between the last win and `year`...
    comps = year - last_win - 1
    # ...minus the cancelled 2020 contest when it falls inside that span.
    if last_win < 2020 < year:
        comps -= 1

    return comps

# Some quick tests: (country code, year) -> expected contests without a win.
expected_gaps = {
    ("UA", 2023): 0,    # won in 2022
    ("GB", 2023): 24,   # won in 1997
    ("AU", 2023): 66,   # never won
    ("SE", 1983): 8,    # won in 1974
    ("SE", 2019): 3,    # won in 2015
    ("NL", 2019): 43,   # won in 1975
}
for (test_code, test_year), expected in expected_gaps.items():
    assert comps_without_win(test_code, test_year) == expected

# One value per vote row, computed for the performing country in that row's year.
df_VLGMC['comps_without_win'] = [
    comps_without_win(performer_code, contest_year)
    for performer_code, contest_year in zip(df_VLGMC['to_code2'], df_VLGMC['year'])
]

In [34]:
df_VLGMC.tail()

Unnamed: 0,year,Artist,from_country,to_country,points,total_points,rank,from_code2,from_code3,to_code2,...,Contains_Own_Language,gender,migration_v2p,population_p,prop_emigrants_v2p,migration_p2v,population_v,prop_emigrants_p2v,migration_pop_year,comps_without_win
21305,2022,Konstrakta,united kingdom,serbia,0,169,5.0,GB,GBR,RS,...,True,female,658.0,6899126.0,9.5e-05,9343.0,67081000.0,0.000139,2020.0,13
21306,2022,Chanel,united kingdom,spain,8,282,3.0,GB,GBR,ES,...,True,female,303193.0,47365655.0,0.006401,150892.0,67081000.0,0.002249,2020.0,51
21307,2022,Cornelia Jakobs,united kingdom,sweden,10,245,4.0,GB,GBR,SE,...,False,female,29715.0,10353442.0,0.00287,35824.0,67081000.0,0.000534,2020.0,5
21308,2022,Marius Bear,united kingdom,switzerland,0,28,18.0,GB,GBR,CH,...,False,male,45951.0,8638167.0,0.00532,23649.0,67081000.0,0.000353,2020.0,32
21309,2022,Kalush Orchestra,united kingdom,ukraine,7,379,1.0,GB,GBR,UA,...,True,group,,,,22119.0,67081000.0,0.00033,2020.0,4


# Border data

Raw data is obtained from GeoDataSource: https://github.com/geodatasource/country-borders/

In [35]:
# Load the GeoDataSource border list. As the preview shows, each row pairs a
# country (country_code / country_name) with one of its neighbours
# (country_border_code / country_border_name).
border = pd.read_csv('../../data/geodatasource-country-borders.csv')
border.head()

Unnamed: 0,country_code,country_name,country_border_code,country_border_name
0,AD,Andorra,FR,France
1,AD,Andorra,ES,Spain
2,AE,United Arab Emirates,OM,Oman
3,AE,United Arab Emirates,SA,Saudi Arabia
4,AF,Afghanistan,CN,China


In [36]:
# Probably the easiest way to do this is to turn the border data into a list of tuples.

# Clean up the data first; subset to only Eurovision countries
ev_code2s = set(df_VLGMC['from_code2'].unique()).union(set(df_VLGMC['to_code2'].unique()))
border = border[(border['country_code'].isin(ev_code2s)) & (border['country_border_code'].isin(ev_code2s))]

# Generate a list of (country_code, country_border_code) tuples
border_tuples = list(border[['country_code', 'country_border_code']].itertuples(index=False, name=None))

# Sanity check to make sure the list is symmetric, i.e. every (a, b) also
# appears as (b, a). A bare expression in the middle of a cell is never
# displayed, so report any violation explicitly. A set keeps each membership
# test O(1) instead of scanning the whole list.
border_pairs = set(border_tuples)
one_way_borders = sorted(pair for pair in border_pairs if pair[::-1] not in border_pairs)
if one_way_borders:
    print(f'Warning: border data is not symmetric for {one_way_borders}')

# Yugoslavia needs a manual exception. For now, Yugoslavia shares a border with country X if X shares a border with either Serbia or Montenegro.
def has_border(cty1, cty2):
    """Return True if the two countries (2-letter codes) share a border.

    A pair counts as neighbours when it appears in the module-level
    `border_tuples`. The code 'YU' is not looked up directly: it is replaced
    by both 'RS' and 'ME', and a border with either one is enough.
    """
    yugoslavia_parts = ('RS', 'ME')
    first_options = yugoslavia_parts if cty1 == 'YU' else (cty1,)
    second_options = yugoslavia_parts if cty2 == 'YU' else (cty2,)
    return any(
        (first, second) in border_tuples
        for first in first_options
        for second in second_options
    )
# TODO: CHECK IF THIS IS HISTORICALLY CORRECT

In [37]:
# Spot check of the Yugoslavia special case: Bosnia and Herzegovina ('BA') against 'YU'.
has_border(cty1='BA', cty2='YU')

True

In [38]:
# Then just add a new column to df_VLGMC that is True if the two countries are neighbours.

# Work on a copy so df_VLGMC itself is left untouched.
df_VLGMCB = df_VLGMC.copy()
# Evaluate has_border once per row, pairing the two alpha-2 code columns.
df_VLGMCB["has_border"] = [
    has_border(src_code, dst_code)
    for src_code, dst_code in zip(df_VLGMCB['from_code2'], df_VLGMCB['to_code2'])
]
df_VLGMCB.head(20)

Unnamed: 0,year,Artist,from_country,to_country,points,total_points,rank,from_code2,from_code3,to_code2,...,gender,migration_v2p,population_p,prop_emigrants_v2p,migration_p2v,population_v,prop_emigrants_p2v,migration_pop_year,comps_without_win,has_border
0,1998,Danijela,belgium,croatia,5,131,5.0,BE,BEL,HR,...,female,205.0,4620030.0,4.4e-05,72.0,10136811.0,7e-06,1995.0,42,False
1,1998,Michael Hajiyanni,belgium,cyprus,2,37,11.0,BE,BEL,CY,...,male,92.0,862418.0,0.000107,77.0,10136811.0,8e-06,1995.0,42,False
2,1998,Koit Toome,belgium,estonia,0,36,12.0,BE,BEL,EE,...,male,0.0,1436634.0,0.0,57.0,10136811.0,6e-06,1995.0,42,False
3,1998,Vlado Janevski,belgium,north macedonia,0,16,19.0,BE,BEL,MK,...,male,,,,120.0,10136811.0,1.2e-05,1995.0,42,False
4,1998,Edea,belgium,finland,0,22,15.0,BE,BEL,FI,...,group,144.0,5107790.0,2.8e-05,1541.0,10136811.0,0.000152,1995.0,42,False
5,1998,Marie Line,belgium,france,0,3,24.0,BE,BEL,FR,...,female,132113.0,59543659.0,0.002219,123438.0,10136811.0,0.012177,1995.0,20,True
6,1998,Guildo Horn feat. Die Orthopädischen Strümpfe,belgium,germany,7,86,7.0,BE,BEL,DE,...,group,22307.0,81678051.0,0.000273,65226.0,10136811.0,0.006435,1995.0,15,True
7,1998,Thalassa,belgium,greece,0,12,20.0,BE,BEL,GR,...,group,4916.0,10562153.0,0.000465,18488.0,10136811.0,0.001824,1995.0,42,False
8,1998,Charlie,belgium,hungary,0,4,23.0,BE,BEL,HU,...,male,601.0,10328965.0,5.8e-05,1593.0,10136811.0,0.000157,1995.0,42,False
9,1998,Dawn Martin,belgium,ireland,0,64,9.0,BE,BEL,IE,...,female,507.0,3608841.0,0.00014,2195.0,10136811.0,0.000217,1995.0,1,False


# Store the data as CSV

In [39]:
# Write the table (including the has_border column) to disk without the row index.
df_VLGMCB.to_csv(path_or_buf='../../data/df_main.csv', index=False)