## Data preparation

This dataset contains ESC final results from 1956 to 2019.

This notebook cleans the data and replaces categorical values with numeric ones.

In [1]:
import pandas as pd
import numpy as np

Read in data from csv file

In [2]:
data = pd.read_csv("../data/ESCDB2.csv", sep=";", encoding="UTF-8")

# Removing potential unneccesary spaces from dataframe columns
for column in data.select_dtypes(include='object'):
    data[column] = data[column].str.strip()

# Dropping duplicate column "songid"
data.drop('songid', axis=1)

# Dropping 1956 contest result, since no points/places were awarded
data = data.drop(data[data['Year'] == 1956].index)

# Temporarily dropping 2019 since the dataset is outdated and doesn't contain information about it
data = data.drop(data[data['Year'] == 2019].index)

Making country column numeric

In [3]:
countries = data["Country"].unique()
countries.sort()
countries

countries_to_numbers = {}

for i in range(len(countries)):
    countries_to_numbers[countries[i]] = i

data["Country"] = data["Country"].map(countries_to_numbers)

Making the singer column numeric

This will be a boolean value:
* **0** if the singer has only performed once in ESC
* **1** if the singer has performed multiple times in ESC

In [4]:
data["Singer"] = data["Singer"].map(lambda x: 0 if data["Singer"].value_counts()[x] == 1 else 1)

Making title column numeric.

We are extracting two attributes:
* 1 if song title included word "Love" 0 otherwise
* 1 if song title is more than 1 word, 0 otherwise 

In [5]:
data["love_in_title"] = data["Title"].map(lambda x: 1 if 'love' in x.lower() else 0)

In [6]:
data["Title"] = data["Title"].map(lambda x: len(x.split()))
data = data.rename(columns={"Title" : "title_word_count"})

Making "in_english" column numeric.

In [7]:
data["in_english"] = data["in_english"].map(lambda x: 1 if x else 0)

In [8]:
# display out the result
data

Unnamed: 0,song_id,Year,Order,Country,Singer,title_word_count,Points,Place,in_english,songid,lovementions,population,love_in_title
14,15,1957,1,6,0,1,5.0,8.0,0,15.0,0.0,11350000.0,0
15,16,1957,2,26,0,3,8.0,5.0,0,16.0,5.0,590667.0,0
16,17,1957,3,49,0,1,6.0,7.0,1,17.0,1.0,66020000.0,0
17,18,1957,4,23,0,4,7.0,6.0,0,18.0,0.0,60590000.0,0
18,19,1957,5,3,0,3,3.0,10.0,0,19.0,0.0,8773000.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1291,1292,2018,22,22,0,1,529.0,1.0,1,1292.0,0.0,8712000.0,0
1292,1293,2018,23,33,0,3,121.0,18.0,1,1293.0,0.0,17000000.0,0
1293,1294,2018,24,21,0,1,136.0,16.0,1,1294.0,0.0,4784000.0,0
1294,1295,2018,25,10,0,1,436.0,2.0,1,1295.0,1.0,1180000.0,0


---

In [10]:
data = data.drop('song_id', axis=1)
data.drop(columns=['Place'], axis=1)
y = data.iloc[:, [0, 5]]

y_train = y[y["Year"] != 2018]["Points"] # y_train
y_test = y[y["Year"] == 2018]["Points"] # y_test

X_train = data[data["Year"] != 2018] # X_train
X_test = data[data["Year"] == 2018] # X_test

In [11]:
X_train

Unnamed: 0,Year,Order,Country,Singer,title_word_count,Points,Place,in_english,songid,lovementions,population,love_in_title
14,1957,1,6,0,1,5.0,8.0,0,15.0,0.0,11350000.0,0
15,1957,2,26,0,3,8.0,5.0,0,16.0,5.0,590667.0,0
16,1957,3,49,0,1,6.0,7.0,1,17.0,1.0,66020000.0,0
17,1957,4,23,0,4,7.0,6.0,0,18.0,0.0,60590000.0,0
18,1957,5,3,0,3,3.0,10.0,0,19.0,0.0,8773000.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1265,2017,22,48,0,1,36.0,24.0,1,1266.0,0.0,44830000.0,0
1266,2017,23,6,0,2,363.0,4.0,1,1267.0,0.0,11350000.0,0
1267,2017,24,45,0,4,344.0,5.0,1,1268.0,2.0,9995000.0,0
1268,2017,25,8,0,2,615.0,2.0,1,1269.0,6.0,7102000.0,0


In [12]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier() # SVC(kernel="linear") # see on liiga aeglane
classifier.fit(X_train, y_train)

accuracy = classifier.score(X_test, y_test)

y_pred = classifier.predict(X_test)

y_pred

array([ 39.,  64.,  68., 153., 363., 200., 111.,  36.,  36.,  77., 344.,
       146.,  82., 615., 200.,  77.,  68., 135., 158., 344.,  77., 615.,
        77.,  68., 615., 344.])

Unnamed: 0,Year,Order,Country,Singer,title_word_count,Points,Place,in_english,songid,lovementions,population,love_in_title
1270,2018,1,48,0,3,130.0,17.0,1,1271.0,0.0,44830000.0,0
1271,2018,2,44,0,2,61.0,23.0,0,1272.0,0.0,46570000.0,0
1272,2018,3,43,0,2,64.0,22.0,0,1273.0,0.0,1990000.0,0
1273,2018,4,25,0,3,181.0,12.0,1,1274.0,0.0,2848000.0,0
1274,2018,5,3,0,3,342.0,3.0,1,1275.0,3.0,8773000.0,0
1275,2018,6,13,0,2,245.0,8.0,0,1276.0,3.0,1316000.0,0
1276,2018,7,34,1,6,144.0,15.0,1,1277.0,0.0,5258000.0,0
1277,2018,8,36,0,2,39.0,26.0,1,1278.0,1.0,10310000.0,0
1278,2018,9,49,0,1,48.0,24.0,1,1279.0,3.0,66020000.0,0
1279,2018,10,40,0,5,113.0,19.0,0,1280.0,0.0,7240000.0,0
