In [46]:
import stan
import arviz as az
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import arviz as az
import math
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import json
import pickle

# work around to get stan working in a notebook
import nest_asyncio
nest_asyncio.apply()
del nest_asyncio

In [41]:
df_past = pd.read_csv('df_main.csv')
print(df_past.columns)
df_past.head()

Index(['year', 'Artist', 'from_country', 'to_country', 'points',
       'total_points', 'rank', 'from_code2', 'from_code3', 'to_code2',
       'to_code3', 'Official_languages', 'Language_sung', 'Contains_English',
       'Contains_NonEnglish', 'Contains_Multiple_Languages',
       'Number_of_Languages', 'Contains_Own_Language', 'gender',
       'migration_v2p', 'population_p', 'prop_emigrants_v2p', 'migration_p2v',
       'population_v', 'prop_emigrants_p2v', 'migration_pop_year',
       'comps_without_win', 'has_border'],
      dtype='object')


Unnamed: 0,year,Artist,from_country,to_country,points,total_points,rank,from_code2,from_code3,to_code2,...,gender,migration_v2p,population_p,prop_emigrants_v2p,migration_p2v,population_v,prop_emigrants_p2v,migration_pop_year,comps_without_win,has_border
0,1998,Danijela,belgium,croatia,5,131,5.0,BE,BEL,HR,...,female,205.0,4620030.0,4.4e-05,72.0,10136811.0,7e-06,1995.0,42,False
1,1998,Michael Hajiyanni,belgium,cyprus,2,37,11.0,BE,BEL,CY,...,male,92.0,862418.0,0.000107,77.0,10136811.0,8e-06,1995.0,42,False
2,1998,Koit Toome,belgium,estonia,0,36,12.0,BE,BEL,EE,...,male,0.0,1436634.0,0.0,57.0,10136811.0,6e-06,1995.0,42,False
3,1998,Vlado Janevski,belgium,north macedonia,0,16,19.0,BE,BEL,MK,...,male,,,,120.0,10136811.0,1.2e-05,1995.0,42,False
4,1998,Edea,belgium,finland,0,22,15.0,BE,BEL,FI,...,group,144.0,5107790.0,2.8e-05,1541.0,10136811.0,0.000152,1995.0,42,False


In [50]:
df_past.loc[ df_past['to_code2'].isin(['NO','HR','CY']) ]

Unnamed: 0,year,Artist,from_country,to_country,points,total_points,rank,from_code2,from_code3,to_code2,...,population_v,prop_emigrants_p2v,migration_pop_year,comps_without_win,has_border,indexed_votes,male,female,Contains_English_bin,Contains_Own_Language_bin
0,1998,Danijela,belgium,croatia,5,131,5.0,BE,BEL,HR,...,10136811.0,0.000007,1995.0,42,False,6,0,1,0,1
1,1998,Michael Hajiyanni,belgium,cyprus,2,37,11.0,BE,BEL,CY,...,10136811.0,0.000008,1995.0,42,False,3,1,0,0,1
12,1998,Lars A. Fredriksen,belgium,norway,4,79,8.0,BE,BEL,NO,...,10136811.0,0.000096,1995.0,2,False,5,1,0,0,1
25,1998,Michael Hajiyanni,croatia,cyprus,4,37,11.0,HR,HRV,CY,...,,,,42,False,5,1,0,0,1
36,1998,Lars A. Fredriksen,croatia,norway,0,79,8.0,HR,HRV,NO,...,4620030.0,0.000003,1995.0,2,False,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21205,2022,Subwoolfer,spain,norway,0,70,13.0,ES,ESP,NO,...,47365655.0,0.000317,2020.0,11,False,1,0,0,1,0
21229,2022,Subwoolfer,sweden,norway,8,70,13.0,SE,SWE,NO,...,10353442.0,0.004135,2020.0,11,True,9,0,0,1,0
21253,2022,Subwoolfer,switzerland,norway,0,70,13.0,CH,CHE,NO,...,8638167.0,0.000260,2020.0,11,False,1,0,0,1,0
21277,2022,Subwoolfer,ukraine,norway,0,70,13.0,UA,UKR,NO,...,,,,11,False,1,0,0,1,0


In [44]:
def format_votes(x):
  if x == 12.:
    return 10
  elif x == 10.:
    return 9
  return int(x)
df_past['indexed_votes'] = df_past['points'].apply(format_votes) + 1

# Given gender is a categoric variable with 3 classes, encode as binary w.r.t default gender='group'
df_past['male'] = [1 if gender=='male' else 0 for gender in df_past['gender']]
df_past['female'] = [1 if gender=='female' else 0 for gender in df_past['gender']]

# Evaluate binary variables for boolean covariates to be used
df_past['Contains_English_bin'] = df_past['Contains_English'].apply(lambda x: 1 if x else 0)
df_past['Contains_Own_Language_bin'] = df_past['Contains_Own_Language'].apply(lambda x: 1 if x else 0)

In [48]:
scaler = MinMaxScaler()
xbeta = df_past.loc[:,['Contains_English_bin','Contains_Own_Language_bin','male','female','comps_without_win']].values
print(xbeta.shape)
xbeta_norm = scaler.fit_transform(xbeta)
with open('model_output/scaler_test.pkl', 'wb') as f:
  pickle.dump(scaler, f)

(21310, 5)


In [27]:
df_future = pd.read_csv('df_2023_compressed.csv')
print(df_future.columns)
df_future.head()

Index(['year', 'Artist', 'from_country', 'to_country', 'points',
       'total_points', 'rank', 'from_code2', 'from_code3', 'to_code2',
       'to_code3', 'Official_languages', 'Language_sung', 'Contains_English',
       'Contains_NonEnglish', 'Contains_Multiple_Languages',
       'Number_of_Languages', 'Contains_Own_Language', 'gender',
       'migration_v2p', 'population_p', 'prop_emigrants_v2p', 'migration_p2v',
       'population_v', 'prop_emigrants_p2v', 'migration_pop_year',
       'comps_without_win', 'has_border', 'comp_round'],
      dtype='object')


Unnamed: 0,year,Artist,from_country,to_country,points,total_points,rank,from_code2,from_code3,to_code2,...,migration_v2p,population_p,prop_emigrants_v2p,migration_p2v,population_v,prop_emigrants_p2v,migration_pop_year,comps_without_win,has_border,comp_round
0,2023,Alessandra,,norway,,,,,,NO,...,,5379475.0,,,,,,12,,sf1
1,2023,The Busker,,malta,,,,,,MT,...,,515332.0,,,,,,66,,sf1
2,2023,Luke Black,,serbia,,,,,,RS,...,,6899126.0,,,,,,14,,sf1
3,2023,Sudden Lights,,latvia,,,,,,LV,...,,1977527.0,,,,,,20,,sf1
4,2023,Mimicat,,portugal,,,,,,PT,...,,10297081.0,,,,,,4,,sf1


In [30]:
df = df_past.append(df_future,ignore_index=True, verify_integrity=True)
df.head()

  df = df_past.append(df_future,ignore_index=True, verify_integrity=True)
  df = df_past.append(df_future,ignore_index=True, verify_integrity=True)


Unnamed: 0,year,Artist,from_country,to_country,points,total_points,rank,from_code2,from_code3,to_code2,...,population_p,prop_emigrants_v2p,migration_p2v,population_v,prop_emigrants_p2v,migration_pop_year,comps_without_win,has_border,indexed_votes,comp_round
0,1998,Danijela,belgium,croatia,5.0,131.0,5.0,BE,BEL,HR,...,4620030.0,4.4e-05,72.0,10136811.0,7e-06,1995.0,42,0.0,6.0,
1,1998,Michael Hajiyanni,belgium,cyprus,2.0,37.0,11.0,BE,BEL,CY,...,862418.0,0.000107,77.0,10136811.0,8e-06,1995.0,42,0.0,3.0,
2,1998,Koit Toome,belgium,estonia,0.0,36.0,12.0,BE,BEL,EE,...,1436634.0,0.0,57.0,10136811.0,6e-06,1995.0,42,0.0,1.0,
3,1998,Vlado Janevski,belgium,north macedonia,0.0,16.0,19.0,BE,BEL,MK,...,,,120.0,10136811.0,1.2e-05,1995.0,42,0.0,1.0,
4,1998,Edea,belgium,finland,0.0,22.0,15.0,BE,BEL,FI,...,5107790.0,2.8e-05,1541.0,10136811.0,0.000152,1995.0,42,0.0,1.0,
