## Data preparation

This dataset contains ESC final results from 1956 to 2019.

This notebook cleans the data and replaces categorical values with numeric ones.

In [1]:
import pandas as pd
import numpy as np

Read in data from csv file

In [2]:
data = pd.read_csv("../data/ESCDB2.csv", sep=";", encoding="latin-1")

# Removing potential unneccesary spaces from dataframe columns
for column in data.select_dtypes(include='object'):
    data[column] = data[column].str.strip()

# Dropping duplicate column "songid"
data.drop('songid', axis=1)

# Dropping 1956 contest result, since no points/places were awarded
data = data.drop(data[data['Year'] == 1956].index)

# Replacing "Macedonia" with "North Macedonia", since the country changed its name
data["Country"] = data["Country"].replace("Macedonia", "North Macedonia")

data

Unnamed: 0,song_id,Year,Order,Country,Singer,Title,Points,Place,in_english,songid,lovementions,population
14,15,1957,1,Belgium,Bobbejaan Schoepen,Straatdeuntje,5,8,False,15,0,11350000
15,16,1957,2,Luxembourg,DaniÃ¨le DuprÃ©,Tant De Peine,8,5,False,16,5,590667
16,17,1957,3,United Kingdom,Patricia Bredin,All,6,7,True,17,1,66020000
17,18,1957,4,Italy,Nunzio Gallo,Corde Della Mia Chitarra,7,6,False,18,0,60590000
18,19,1957,5,Austria,Bob Martin,Wohin Kleines Pony,3,10,False,19,0,8773000
...,...,...,...,...,...,...,...,...,...,...,...,...
1368,1369,2022,21,Australia,Sheldon Riley,Not the Same,125,15,True,1369,0,24600000
1369,1370,2022,22,United Kingdom,Sam Ryder,Space Man,466,2,True,1370,0,66020000
1370,1371,2022,23,Poland,Ochman,River,151,12,True,1371,0,37970000
1371,1372,2022,24,Serbia,Konstrakta,In corpore sano,312,5,False,1372,0,7240000


Making country column numeric

In [3]:
countries = data["Country"].unique()
countries.sort()
countries

countries_to_numbers = {}

for i in range(len(countries)):
    countries_to_numbers[countries[i]] = i

data["Country"] = data["Country"].map(countries_to_numbers)

Making the singer column numeric

This will be a boolean value:
* **0** if the singer has only performed once in ESC
* **1** if the singer has performed multiple times in ESC

In [4]:
data["Singer"] = data["Singer"].map(lambda x: 0 if data["Singer"].value_counts()[x] == 1 else 1)

Making title column numeric.

We are extracting two attributes:
* 1 if song title included word "Love" 0 otherwise
* 1 if song title is more than 1 word, 0 otherwise 

In [5]:
data["love_in_title"] = data["Title"].map(lambda x: 1 if 'love' in x.lower() else 0)

In [6]:
data["Title"] = data["Title"].map(lambda x: len(x.split()))
data = data.rename(columns={"Title" : "title_word_count"})

Making "in_english" column numeric.

In [7]:
data["in_english"] = data["in_english"].map(lambda x: 1 if x else 0)

In [8]:
# display out the result
data

Unnamed: 0,song_id,Year,Order,Country,Singer,title_word_count,Points,Place,in_english,songid,lovementions,population,love_in_title
14,15,1957,1,6,0,1,5,8,0,15,0,11350000,0
15,16,1957,2,26,0,3,8,5,0,16,5,590667,0
16,17,1957,3,49,0,1,6,7,1,17,1,66020000,0
17,18,1957,4,23,0,4,7,6,0,18,0,60590000,0
18,19,1957,5,3,0,3,3,10,0,19,0,8773000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1368,1369,2022,21,2,0,3,125,15,1,1369,0,24600000,0
1369,1370,2022,22,49,0,2,466,2,1,1370,0,66020000,0
1370,1371,2022,23,35,0,1,151,12,1,1371,0,37970000,0
1371,1372,2022,24,40,0,3,312,5,0,1372,0,7240000,0


---

In [9]:
data = data.drop('song_id', axis=1)
data.drop(columns=['Place'], axis=1)
y = data.iloc[:, [0, 5]]

y_train = y[y["Year"] != 2018]["Points"] # y_train
y_test = y[y["Year"] == 2018]["Points"] # y_test

X_train = data[data["Year"] != 2018] # X_train
X_test = data[data["Year"] == 2018] # X_test

In [10]:
X_train

Unnamed: 0,Year,Order,Country,Singer,title_word_count,Points,Place,in_english,songid,lovementions,population,love_in_title
14,1957,1,6,0,1,5,8,0,15,0,11350000,0
15,1957,2,26,0,3,8,5,0,16,5,590667,0
16,1957,3,49,0,1,6,7,1,17,1,66020000,0
17,1957,4,23,0,4,7,6,0,18,0,60590000,0
18,1957,5,3,0,3,3,10,0,19,0,8773000,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1368,2022,21,2,0,3,125,15,1,1369,0,24600000,0
1369,2022,22,49,0,2,466,2,1,1370,0,66020000,0
1370,2022,23,35,0,1,151,12,1,1371,0,37970000,0
1371,2022,24,40,0,3,312,5,0,1372,0,7240000,0


In [11]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier() # SVC(kernel="linear") # see on liiga aeglane
classifier.fit(X_train, y_train)

accuracy = classifier.score(X_test, y_test)

y_pred = classifier.predict(X_test)

y_pred

array([ 93,  64,  64, 153, 290, 200, 111,  35,  36,  89, 491, 146,  82,
       301, 128,  77,  31, 120, 158, 344,  89, 498,  77, 132, 615, 524],
      dtype=int64)