## Data preparation

This dataset contains ESC final results from 1956 to 2019.

This notebook cleans the data and replaces categorical values with numeric ones.

In [2]:
import pandas as pd
import numpy as np

Read in data from csv file

In [3]:
data = pd.read_csv("../data/ESCDB2.csv", sep=";", encoding="latin-1")

# Removing potential unneccesary spaces from dataframe columns
for column in data.select_dtypes(include='object'):
    data[column] = data[column].str.strip()

# Dropping duplicate column "songid"
data.drop('songid', axis=1)

# Dropping 1956 contest result, since no points/places were awarded
data = data.drop(data[data['Year'] == 1956].index)

# Replacing "Macedonia" with "North Macedonia", since the country changed its name
data["Country"] = data["Country"].replace("Macedonia", "North Macedonia")

data

Unnamed: 0,song_id,Year,Order,Country,Singer,Title,Points,Place,in_english,songid,lovementions,population
14,15,1957,1,Belgium,Bobbejaan Schoepen,Straatdeuntje,5,8,False,15,0,11350000
15,16,1957,2,Luxembourg,DaniÃ¨le DuprÃ©,Tant De Peine,8,5,False,16,5,590667
16,17,1957,3,United Kingdom,Patricia Bredin,All,6,7,True,17,1,66020000
17,18,1957,4,Italy,Nunzio Gallo,Corde Della Mia Chitarra,7,6,False,18,0,60590000
18,19,1957,5,Austria,Bob Martin,Wohin Kleines Pony,3,10,False,19,0,8773000
...,...,...,...,...,...,...,...,...,...,...,...,...
1368,1369,2022,21,Australia,Sheldon Riley,Not the Same,125,15,True,1369,0,24600000
1369,1370,2022,22,United Kingdom,Sam Ryder,Space Man,466,2,True,1370,0,66020000
1370,1371,2022,23,Poland,Ochman,River,151,12,True,1371,0,37970000
1371,1372,2022,24,Serbia,Konstrakta,In corpore sano,312,5,False,1372,0,7240000


Making era column based on years:

- years 1957-1986 -> 0 (very old contests)
- years 1987-1999 -> 1 (roughly after soviet union collapse)
- years 2000-2013 -> 2 (prior to invasion of ukraine)
- years 2014-2022 -> 3 (modern era of the contest)

In [4]:
def era(row):
    if row["Year"] >= 1957 and row["Year"] <= 1986:
        return 0
    if row["Year"] >= 1987 and row["Year"] <= 1999:
        return 1
    if row["Year"] >= 2000 and row["Year"] <= 2013:
        return 2
    if row["Year"] >= 2014 and row["Year"] <= 2022:
        return 3
    
data["Era"] = data.apply(lambda row: era(row), axis = 1)

Making country column numeric

In [5]:
countries = data["Country"].unique()
countries.sort()
countries

countries_to_numbers = {}

for i in range(len(countries)):
    countries_to_numbers[countries[i]] = i

data["Country"] = data["Country"].map(countries_to_numbers)

Making the singer column numeric

This will be a boolean value:
* **0** if the singer has only performed once in ESC
* **1** if the singer has performed multiple times in ESC

In [6]:
data["Singer"] = data["Singer"].map(lambda x: 0 if data["Singer"].value_counts()[x] == 1 else 1)

Making title column numeric.

We are extracting two attributes:
* 1 if song title included word "Love" 0 otherwise
* 1 if song title is more than 1 word, 0 otherwise 

In [7]:
data["love_in_title"] = data["Title"].map(lambda x: 1 if 'love' in x.lower() else 0)

In [8]:
data["Title"] = data["Title"].map(lambda x: len(x.split()))
data = data.rename(columns={"Title" : "title_word_count"})

Making "in_english" column numeric.

In [9]:
data["in_english"] = data["in_english"].map(lambda x: 1 if x else 0)

In [10]:
# display out the result
data

Unnamed: 0,song_id,Year,Order,Country,Singer,title_word_count,Points,Place,in_english,songid,lovementions,population,Era,love_in_title
14,15,1957,1,6,0,1,5,8,0,15,0,11350000,0,0
15,16,1957,2,26,0,3,8,5,0,16,5,590667,0,0
16,17,1957,3,49,0,1,6,7,1,17,1,66020000,0,0
17,18,1957,4,23,0,4,7,6,0,18,0,60590000,0,0
18,19,1957,5,3,0,3,3,10,0,19,0,8773000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1368,1369,2022,21,2,0,3,125,15,1,1369,0,24600000,3,0
1369,1370,2022,22,49,0,2,466,2,1,1370,0,66020000,3,0
1370,1371,2022,23,35,0,1,151,12,1,1371,0,37970000,3,0
1371,1372,2022,24,40,0,3,312,5,0,1372,0,7240000,3,0


---

In [11]:
# unneccesary column
data = data.drop('song_id', axis=1)

# not needed
data = data.drop(columns=['Place'], axis=1)

In [34]:
year_to_predict = 2016

y = data.iloc[:, [0, 5]] # year | points

# trying to predict 2018
y_train = y[y["Year"] != year_to_predict]["Points"] # y_train
y_test = y[y["Year"] == year_to_predict]["Points"]  # y_test

X_train = data[data["Year"] != year_to_predict]     # X_train
X_test = data[data["Year"] == year_to_predict]      # X_test

In [39]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

# classifier = RandomForestClassifier()
# classifier = SVC(kernel="linear") # too slow?
# classifier = SVC(kernel="rbf")
# classifier = Lasso() # SEE ANNAB 100% õiged ?
classifier = Ridge() # ka 100% õiged?

classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [40]:
numbers_to_countries = dict((v, k) for k, v in countries_to_numbers.items())

result = X_test.iloc[:, [2, 5]]

result["Country"] = result["Country"].map(numbers_to_countries)
result = result.assign(Predicted=list(y_pred))

result = result.sort_values("Predicted", ascending=False)
result = result.assign(pred_place=range(1, len(result) + 1))

result = result.sort_values("Points", ascending=False)
result = result.assign(actual_place=range(1, len(result) + 1))

result["correct"] = abs(result["pred_place"] - result["actual_place"])
result["correct"] = result["correct"].apply(lambda x: x <= 3)

result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result["Country"] = result["Country"].map(numbers_to_countries)


Unnamed: 0,Country,Points,Predicted,pred_place,actual_place,correct
1238,Ukraine,534,533.999942,1,1,True
1230,Australia,511,510.999942,2,2,True
1235,Russia,491,490.999951,3,3,True
1225,Bulgaria,307,306.999969,4,4,True
1226,Sweden,261,260.999979,5,5,True
1228,France,257,256.99998,6,6,True
1243,Armenia,249,248.999982,7,7,True
1229,Poland,229,228.999985,8,8,True
1233,Lithuania,200,199.999991,9,9,True
1218,Belgium,181,180.999988,10,10,True


In [41]:
score = result["correct"].value_counts("True")

score

True    1.0
Name: correct, dtype: float64