In [1]:
import os
import json
import matplotlib.pyplot as plt
import numpy as np
import fileinput
from operator import itemgetter
import pandas as pd
from sklearn.linear_model import Lasso
import h2o
from h2o.estimators import H2OGeneralizedLinearEstimator
from h2o.grid.grid_search import H2OGridSearch


# Country names are not spelled consistently across the data sources, and renaming them everywhere takes a lot of
# time, so for now a converter dictionary is used. TODO: normalise the names at the source when there is spare time.
# Lookup table for country-name spellings. The alias pairs are stored in both directions, so looking up
# one spelling yields the other; every remaining country simply maps to itself.
landNameConverter = {
    source: target
    for first, second in (
        ("United KingdomUK", "United Kingdom"),
        ("SerbiaMontenegro", "Serbia and Montenegro"),
        ("BosniaHerzegovina", "Bosnia and Herzegovina"),
        ("North Macedonia", "North MacedoniaNorth MacedoniaN"),
    )
    for source, target in ((first, second), (second, first))
}
landNameConverter.update({
    name: name
    for name in (
        "Albania", "Andorra", "Armenia", "Australia", "Austria", "Azerbaijan",
        "Belarus", "Belgium", "Bulgaria", "Croatia", "Cyprus", "Czech Republic",
        "Denmark", "Estonia", "Finland", "France", "Georgia", "Germany",
        "Greece", "Hungary", "Iceland", "Ireland", "Israel", "Italy",
        "Latvia", "Lithuania", "Luxembourg", "Malta", "Moldova", "Monaco",
        "Montenegro", "Morocco", "Netherlands", "Norway", "Poland", "Portugal",
        "Romania", "Russia", "San Marino", "Serbia", "Slovakia", "Slovenia",
        "Spain", "Sweden", "Switzerland", "Turkey", "Ukraine", "Yugoslavia",
    )
})

In [2]:
# Read the positions the countries achieved from the files in "realRankings" (one file per year; the
# part of the file name before the first dot is used as the year key).
# realRankings[year][country] is the position field of a finalist, or ("S", position field) for a
# country listed in the semi-final part of the file.
files = os.listdir("realRankings")
realRankings = dict()
for f in files:
    year = f.split(".")[0]
    semiFinal = False
    realRankings[year] = dict()
    # A context manager closes the file even when a line is malformed. The module-level fileinput
    # state used before stayed active after an exception, so re-running the cell then failed with
    # "input() already active".
    with open("realRankings/"+f) as rankingFile:
        for lineNumber, line in enumerate(rankingFile, start=1):
            # The first line is a header.
            if lineNumber == 1:
                continue
            # Blank lines (e.g. a trailing empty line) carry no ranking and would crash on l[2].
            if not line.strip():
                continue
            l = line.split(",")
            # A line starting with 'Country' is the header of the semi-final part of the file.
            if l[0].startswith("Country"):
                semiFinal = True
                continue
            if semiFinal:
                # For a semi-finalist only the fact that it did not reach the final matters; the
                # semi-final position is kept next to the "S" marker.
                realRankings[year][l[0]] = ("S", l[2])
            else:
                realRankings[year][l[0]] = l[2]

In [3]:
# Category 0: winner, position is 1.
# Category 1: Top 3, position is 2 or 3.
# Category 2: Top 5, position is 4 or 5.
# Category 3: Top 10, position is between 6 and 10.
# Category 4: In final, position is 11 or lower (numerically greater than 10).
# Category 5: Not in final, position is defined as 'S'.
def inCategory0(year, country):
    """Return True when `country` finished first in the final of `year`.

    Returns False when the year or country is unknown, or when the stored entry is not a plain
    final position (semi-finalists are stored as an ("S", position) tuple, which int() rejects).
    """
    try:
        position = int(realRankings[year][country])
    # Only lookup and conversion failures are expected here. A bare except would also hide e.g. the
    # NameError raised when this runs before realRankings has been defined.
    except (KeyError, TypeError, ValueError):
        return False
    return position == 1
    
def inCategory1(year, country):
    """Return True when `country` finished in the top 3 of the final of `year`.

    The check is cumulative: position 1 also passes, so getCategory has to test inCategory0 first.
    Returns False when the year or country is unknown, or when the stored entry is not a plain
    final position (semi-finalists are stored as an ("S", position) tuple, which int() rejects).
    """
    try:
        position = int(realRankings[year][country])
    # Only lookup and conversion failures are expected here. A bare except would also hide e.g. the
    # NameError raised when this runs before realRankings has been defined.
    except (KeyError, TypeError, ValueError):
        return False
    return position <= 3
    
def inCategory2(year, country):
    """Return True when `country` finished in the top 5 of the final of `year`.

    The check is cumulative: better positions also pass, so getCategory tests the stricter
    categories first. Returns False when the year or country is unknown, or when the stored entry
    is not a plain final position (semi-finalists are stored as an ("S", position) tuple).
    """
    try:
        position = int(realRankings[year][country])
    # Only lookup and conversion failures are expected here. A bare except would also hide e.g. the
    # NameError raised when this runs before realRankings has been defined.
    except (KeyError, TypeError, ValueError):
        return False
    return position <= 5
    
def inCategory3(year, country):
    """Return True when `country` finished in the top 10 of the final of `year`.

    The check is cumulative: better positions also pass, so getCategory tests the stricter
    categories first. Returns False when the year or country is unknown, or when the stored entry
    is not a plain final position (semi-finalists are stored as an ("S", position) tuple).
    """
    try:
        position = int(realRankings[year][country])
    # Only lookup and conversion failures are expected here. A bare except would also hide e.g. the
    # NameError raised when this runs before realRankings has been defined.
    except (KeyError, TypeError, ValueError):
        return False
    return position <= 10
    
def inCategory4(year, country):
    """Return True when `country` has a plain final position in `year`, whatever its value.

    Returns False when the year or country is unknown, or when the stored entry cannot be
    converted with int() (semi-finalists are stored as an ("S", position) tuple).
    """
    try:
        int(realRankings[year][country])
    # Only lookup and conversion failures are expected here. A bare except would also hide e.g. the
    # NameError raised when this runs before realRankings has been defined.
    except (KeyError, TypeError, ValueError):
        return False
    return True
    
def inCategory5(year, country):
    """Return True when `country` did not reach the final of `year`.

    Semi-finalists are stored as an ("S", position) tuple, so the first element of their entry is
    the marker "S". For a finalist the first element is the first character of the position string.
    Raises KeyError when the year or country is unknown.
    """
    marker = realRankings[year][country][0]
    return type(marker) == str and marker == "S"
    
def getCategory(year, country):
    """Return the category (0-5) of `country` in `year`, or None when no category applies.

    The category checks are cumulative (a winner is also in the top 3, top 5, ...), so they are
    tried from strictest to loosest and the first one that holds decides the category.
    """
    checks = (inCategory0, inCategory1, inCategory2, inCategory3, inCategory4, inCategory5)
    for category, belongs in enumerate(checks):
        if belongs(year, country):
            return category
    return None

# Years in ascending order. The keys of realRankings follow the order in which os.listdir returned the
# files, which is arbitrary, so they are sorted explicitly (string sort; assumes four-digit years).
yearsInOrder = sorted(realRankings)

In [4]:
# Function for trying to divide semi finals in a correct way, because the first place in a semi final is not 
# first place in the final as well.
def getMaxFinalPositionAndMinSemi(year):
    """Return (highest final position number, lowest semi-final position number) for `year`.

    Finalists are stored as a position string, semi-finalists as an ("S", position) tuple.
    Without finalists the first value stays -1; without semi-finalists the second stays 44.
    """
    worstFinal = -1
    bestSemi = 44
    for entry in realRankings[year].values():
        if type(entry) == tuple:
            # Semi-finalist: track the best (numerically lowest) semi-final place.
            bestSemi = min(bestSemi, int(entry[1]))
        else:
            # Finalist: track the last (numerically highest) place of the final.
            worstFinal = max(worstFinal, int(entry))
    return worstFinal, bestSemi
  
# Divide the countries of every year into nQuants categories of equal size (20% quantiles for
# nQuants = 5); quants[year][country] is the index of the quantile the country falls in.
quants = dict()
nQuants = 5
# Loop over years, skip 1956.
for year in yearsInOrder[1:]:
    quants[year] = dict()
    qSize = int(len(realRankings[year])/nQuants)
    toBeSorted = dict()
    # Last place of the final and best place of the semi-finals, used to line the semi-finalists up
    # below the finalists. This is now determined for every year: it used to be computed only for
    # years after 2003, so an earlier year containing a semi-final entry hit a NameError (or silently
    # reused the values left over from a previous run of the cell).
    ma, mi = getMaxFinalPositionAndMinSemi(year)
    for c in realRankings[year]:
        # A country that ended in a semi-final gets a place below the final places, so that one
        # complete ranking is formed.
        if type(realRankings[year][c]) == tuple:
            toBeSorted[c] = ma+(int(realRankings[year][c][1]) - mi + 1)
        else:
            toBeSorted[c] = int(realRankings[year][c])
    sortedPlaces = sorted(toBeSorted.items(), key=itemgetter(1))
    sortedCountries = [cc[0] for cc in sortedPlaces]
    # nQuants - 1 slices of qSize countries each; the last slice takes whatever remains. The slices
    # are derived from nQuants instead of being written out for exactly five quantiles.
    q = [sortedCountries[qSize*k:qSize*(k+1)] for k in range(nQuants-1)]
    q.append(sortedCountries[qSize*(nQuants-1):])
    # Store the quantile index of every country.
    for i in range(len(q)):
        for c in q[i]:
            quants[year][c] = i

In [23]:
# Load the extracted audio features. segmentedJsons[year][key] holds the parsed JSON of one file in
# "extractedFrames/<year>"; the key is the part of the file name after the first underscore, cut off at
# the next underscore or dot (country name plus segment number, judging by how the key is used later).
segmentedJsons = dict()
for year in yearsInOrder:
    yearFolder = "extractedFrames/"+year
    segmentedJsons[year] = dict()
    files = os.listdir(yearFolder)
    for fileName in files:
        afterUnderscore = fileName.split("_")[1]
        country = afterUnderscore.split(".")[0]
        with open(yearFolder+"/"+fileName) as jsonFile:
            segmentedJsons[year][country] = json.load(jsonFile)

In [27]:
# Read the voting data set and rename the columns that are queried later, so that their names contain
# no spaces or punctuation.
votes = pd.read_excel('eurovision_song_contest_1975_2019v3.xlsx')
renamedColumns = {
    "(semi-) final": "round",
    "Jury or Televoting": "Jury_or_Televoting",
    "From country": "from_country",
    "To country": "to_country",
}
new = votes.rename(columns=renamedColumns)


FileNotFoundError: [Errno 2] No such file or directory: 'eurovision_song_contest_1975_2019v3.xlsx'

In [25]:
# No correct names in the votes dataset.
def getAnEqualName(n):
    """Translate a country name from the votes data set to the spelling used for the categories.

    Names without a known alternative spelling are returned unchanged.
    """
    knownAliases = (
        ("Bosnia & Herzegovina", "Bosnia and Herzegovina"),
        ("The Netherlands", "Netherlands"),
        ("F.Y.R. Macedonia", "North Macedonia"),
    )
    for alias, preferred in knownAliases:
        if n == alias:
            return preferred
    return n

# Count all points that one country got, since the data set consits of how many points one country gave to another
# country.
def pointsReceivedCounter(df):
    """Sum the points every country received.

    df holds one row per vote given from one country to another. It needs a 'to_country' column,
    and the points are read from its second-to-last column. Returns a dict {country: total points}
    whose keys appear in order of first occurrence in df.
    """
    counted = dict()
    # .iloc selects the second-to-last column explicitly by position. The former r[-2] relied on
    # Series.__getitem__ treating an integer key as a position, which pandas has deprecated.
    # Walking the two columns directly also avoids building a Series per row with iterrows().
    for country, points in zip(df['to_country'], df.iloc[:, -2]):
        counted[country] = counted.get(country, 0) + points
    return counted

# To remove countries, that are in both lists.
def removeIntersect(l1, l2):
    """Return the (key, value) pairs of l2 whose key does not occur as a key in l1, in l2's order."""
    seenKeys = [pair[0] for pair in l1]
    return [(key, value) for key, value in l2 if key not in seenKeys]

# Points received per country and year, split by voter type:
# jury[year] and tele[year] are (totals in the final, totals in the semi-finals).
jury, tele = dict(), dict()
# From 2016.
for y in yearsInOrder[60:]:
    # The query strings below refer to this variable as @yy, so its name has to stay as it is.
    yy = int(y)
    juryFinal = new.query("Year==@yy and Jury_or_Televoting=='J' and round=='f'")
    jurySemis = new.query("Year==@yy and Jury_or_Televoting=='J' and (round=='sf1' or round=='sf2')")
    teleFinal = new.query("Year==@yy and Jury_or_Televoting=='T' and round=='f'")
    teleSemis = new.query("Year==@yy and Jury_or_Televoting=='T' and (round=='sf1' or round=='sf2')")
    jury[y] = (pointsReceivedCounter(juryFinal), pointsReceivedCounter(jurySemis))
    tele[y] = (pointsReceivedCounter(teleFinal), pointsReceivedCounter(teleSemis))
    
for y in yearsInOrder[60:]:
    # Jury and televote totals are turned into one full ranking per year in exactly the same way.
    for received in (jury, tele):
        finalTotals, semiTotals = received[y]
        # Sort ascending and then reverse (instead of sorting with reverse=True) so that countries
        # with equal points keep the same relative order as before.
        finalRanking = sorted(finalTotals.items(), key=itemgetter(1))
        finalRanking.reverse()
        semiRanking = sorted(semiTotals.items(), key=itemgetter(1))
        semiRanking.reverse()
        # Countries that reached the final appear in a semi-final list as well; only the countries
        # that are not ranked yet are appended below the finalists.
        finalRanking.extend(removeIntersect(finalRanking, semiRanking))
        received[y] = finalRanking
        
# Categories per year according to the jury ranking (quantsJ) and the televote ranking (quantsT).
quantsJ = dict()
quantsT = dict()
for year in yearsInOrder[60:]:
    quantsJ[year] = dict()
    quantsT[year] = dict()

    # Divide jury and tele votes according to the division of the first model (winner, top 3, top 5,
    # top 10, rest of the final, not in the final). From 2016 on the split between final and
    # semi-finals is consistent, hence the fixed boundary at 26.
    qJ = [jury[year][:1], jury[year][1:3], jury[year][3:5],
          jury[year][5:10], jury[year][10:26], jury[year][26:]]

    qT = [tele[year][:1], tele[year][1:3], tele[year][3:5],
          tele[year][5:10], tele[year][10:26], tele[year][26:]]

    # Each ranking is walked on its own. Indexing qT with positions taken from qJ, as was done before,
    # raised an IndexError or silently dropped countries whenever the two rankings differed in length.
    for buckets, categories in ((qJ, quantsJ[year]), (qT, quantsT[year])):
        for i, bucket in enumerate(buckets):
            for name, _points in bucket:
                categories[getAnEqualName(name)] = i


NameError: name 'new' is not defined

In [34]:
# Audio features per main feature group, in the fixed order in which they become data frame columns.
# columns[i] lists the features that are read from the JSON section named main[i].
columns = [
    # rhythm
    [
        'onset_rate', 'danceability', 'bpm_histogram_second_peak_spread', 'beats_count',
        'beats_loudness', 'bpm', 'bpm_histogram_first_peak_bpm', 'bpm_histogram_first_peak_weight',
        'bpm_histogram_second_peak_bpm', 'bpm_histogram_second_peak_weight',
    ],
    # lowlevel
    [
        'melbands_flatness_db', 'erbbands_kurtosis', 'melbands_crest', 'pitch_salience', 'hfc',
        'barkbands_kurtosis', 'barkbands_spread', 'spectral_energyband_low', 'dynamic_complexity',
        'dissonance', 'spectral_skewness', 'average_loudness', 'spectral_rolloff',
        'spectral_complexity', 'erbbands_flatness_db', 'erbbands_crest', 'silence_rate_60dB',
        'barkbands_flatness_db', 'spectral_flux', 'erbbands_skewness', 'melbands_skewness',
        'erbbands_spread', 'spectral_kurtosis', 'melbands_kurtosis', 'barkbands_crest',
        'silence_rate_30dB', 'barkbands_skewness', 'spectral_spread', 'spectral_centroid',
        'spectral_strongpeak', 'spectral_energyband_high', 'spectral_energyband_middle_high',
        'spectral_energyband_middle_low', 'melbands_spread', 'spectral_rms', 'spectral_entropy',
        'spectral_energy', 'zerocrossingrate',
    ],
    # tonal
    [
        'tuning_nontempered_energy_ratio', 'tuning_equal_tempered_deviation',
        'tuning_diatonic_strength', 'chords_strength', 'chords_changes_rate', 'hpcp_entropy',
        'hpcp_crest', 'chords_number_rate', 'tuning_frequency', 'chords_key', 'chords_scale',
    ],
]
main = ["rhythm", "lowlevel", "tonal"]
# Integer codes for the two textual features. A chord key gets the index of its name in the list below;
# sharp and flat spellings are listed as separate names and so get separate codes.
dummyForChordsKey = {
    keyName: code
    for code, keyName in enumerate(
        ['C', 'C#', 'D#', 'D', 'Eb', 'E', 'F#', 'F', 'G', 'G#', 'Ab', 'A', 'A#', 'Bb', 'B'])
}
dummyForChordsScale = {'major': 0, 'minor': 1}

# Get the audio feature values, since itis defined in dicts with means and etc.
def convertDataPerYear(year):
    """Build one row of model input per country segment of `year`.

    Every row holds the audio feature values in the order given by `columns`, followed by the
    category of the country and the id "<year>_<country+segment>".

    Raises ValueError when a feature value is neither a number nor a dict with a "mean" entry.
    Such values used to be skipped silently, which shortened the row and shifted the category and
    id into the wrong columns.
    """
    data = []
    for countryN, features in segmentedJsons[year].items():
        entry = []
        # Only the country name, without the trailing segment character.
        country = countryN[:-1]
        for i, mainFeature in enumerate(main):
            for audioFeature in columns[i]:
                value = features[mainFeature][audioFeature]

                # The two textual features are replaced by their dummy value.
                if audioFeature == "chords_key":
                    entry.append(dummyForChordsKey[value])
                elif audioFeature == "chords_scale":
                    entry.append(dummyForChordsScale[value])
                # A dict with mean, stdev, max, min, etc. is represented by its mean.
                elif type(value) == dict and "mean" in value:
                    entry.append(value["mean"])
                # Plain numbers are taken as they are.
                elif type(value) == int or type(value) == float:
                    entry.append(value)
                else:
                    raise ValueError("Unsupported value for feature %s.%s of %s_%s: %r"
                                     % (mainFeature, audioFeature, year, countryN, value))

        # Append the category. Exactly one of the following lines should be active; comment and
        # uncomment according to the model you need.

        # Uneven model (winner, top 3, top 5, top 10, final, semi-final)
        entry.append(getCategory(year, landNameConverter[country]))

        # Even model (quantiles)
        # entry.append(quants[year][landNameConverter[country]])

        # Uneven jury model
        # entry.append(quantsJ[year][landNameConverter[country]])

        # Uneven tele model
        # entry.append(quantsT[year][landNameConverter[country]])

        entry.append(year+"_"+countryN)
        data.append(entry)

    return data

# Get audio features for multiple years.
def doAConvertForMultiYears(years):
    """Return the rows of convertDataPerYear for every year in `years`, concatenated in that order."""
    return [row for y in years for row in convertDataPerYear(y)]

In [29]:
# One flat list with all audio feature names, in column order.
audioFeatures = [featureName for featureGroup in columns for featureName in featureGroup]

In [30]:
# Check if document is relevent according to given query.
def isRelevant(document, query):
    """Return True when the true category of `document` is `query` or numerically lower."""
    return bool(document['trueCat'] <= query)

# Calculate average precision with a given query.
def AP(rank, query):
    """Return the average precision of the ranking `rank` for `query`.

    rank is a data frame whose rows are ordered from first to last ranked; a row counts as
    relevant when isRelevant(row, query) holds. The result is the mean, over the relevant rows, of
    the precision at the position of that row. Returns 0.0 when the ranking contains no relevant
    row at all (this case used to raise ZeroDivisionError).
    """
    precisionSum = 0
    relevantSeen = 0
    for i in range(len(rank)):
        if isRelevant(rank.iloc[i], query):
            relevantSeen += 1
            # Precision at position i + 1: relevant rows so far divided by rows seen so far.
            precisionSum += relevantSeen/(i + 1)
    if relevantSeen == 0:
        return 0.0
    return precisionSum/relevantSeen

# Get a full ranking with a soft class. The mean of all the predicted categories according to their probabilities.
def getAFullRankingFromPredictionsSoftClass(data, predictionsDataFrame):
    """Combine the predictions with the ids and true categories into one frame for ranking.

    The rows of `data` and `predictionsDataFrame` are matched by position. `predictionsDataFrame`
    needs the columns 'predict' and 'p0' ... 'p5'; `data` needs 'id' and 'category'. The soft class
    of a row is the expected category: the sum of every category number times its probability.

    Returns a frame with the columns id, softCat, hardCat and trueCat, indexed like
    `predictionsDataFrame`.
    """
    rankFrame = dict()
    # enumerate supplies the row position for data.iloc. The index label of the predictions was used
    # as a position before, which is only correct when that index happens to be 0, 1, 2, ...
    for position, (i, row) in enumerate(predictionsDataFrame.iterrows()):
        softclass = (row['p0']*0) + (row['p1']*1)+(row['p2']*2) + (row['p3']*3)+(row['p4']*4) + (row['p5']*5)
        source = data.iloc[position]
        rankFrame[i] = [source["id"], softclass, row["predict"], source["category"]]
    return pd.DataFrame.from_dict(rankFrame, orient='index', columns=["id","softCat", "hardCat", "trueCat"])

# Get a full ranking with latent variables.
def getAFullRankingFromPredictionsLatent(data, predictionsDataFrame, coefs):
    """Combine a latent score per row with the predictions, ids and true categories.

    The latent score of a row of `data` is the dot product of all its values except the last two
    (which are expected to be 'category' and 'id') with `coefs`. The rows of `data` and
    `predictionsDataFrame` are matched by position.

    Returns a frame with the columns id, softCat (the latent score), hardCat and trueCat, indexed
    like `data`.
    """
    rankFrame = dict()
    # The coefficients do not change per row, so they are converted only once.
    weights = np.array(coefs.tolist())
    # enumerate supplies the row position for predictionsDataFrame.iloc. The index label of `data`
    # was used as a position before, which is only correct when that index happens to be 0, 1, 2, ...
    for position, (i, row) in enumerate(data.iterrows()):
        latent = np.dot(np.array(row.iloc[:-2].tolist()), weights)
        rankFrame[i] = [row["id"], latent, predictionsDataFrame.iloc[position]["predict"], row["category"]]
    return pd.DataFrame.from_dict(rankFrame, orient='index', columns=["id","softCat", "hardCat", "trueCat"])

# Get one segment per song by selecting the highest placed segment.
def getRankWithBestSegments(rank):
    """Keep one row per song: the segment with the lowest softCat, i.e. the highest placed one.

    `rank` needs an 'id' column of the form "<year>_<country><segment character>" and a 'softCat'
    column. Returns a frame with the columns of `rank`, indexed by "<year>_<country>" and sorted
    ascending on softCat. An empty `rank` gives an empty frame with the same columns (the column
    names used to be taken from the last loop row, which does not exist for empty input).
    """
    mins = dict()
    for _, row in rank.iterrows():
        year, segment = row["id"].split("_")
        # The id of the song is the segment id without its trailing segment character.
        idd = year+"_"+segment[:-1]
        # Keep the first segment seen for a song, or replace it by a strictly better placed one.
        if idd not in mins or mins[idd]["softCat"] > row["softCat"]:
            mins[idd] = row.to_dict()

    columnNames = list(rank.columns)
    if not mins:
        return pd.DataFrame(columns=columnNames)

    # Prepare the data to be put in a frame, then sort to a full ranking.
    forFrame = {songId: list(best.values()) for songId, best in mins.items()}
    minFrame = pd.DataFrame.from_dict(forFrame, orient='index', columns=columnNames)
    minFrame.sort_values(["softCat"], inplace=True)
    return minFrame

# Make a confusion matrix for predicitions of model.
def confusionMatrix(predictions, test, nClasses=6):
    """Build the confusion matrix of the predicted against the true categories.

    The rows of `predictions` (column 'predict') and `test` (column 'category') are matched by
    position. matrix[t][p] counts the rows with true category t that were predicted as p.
    `nClasses` is the number of categories; the default of 6 matches categories 0-5.

    Returns (matrix, row sums, column sums): the matrix as a list of lists, the number of rows per
    true category and the number of rows per predicted category.
    """
    matrix = [[0 for _ in range(nClasses)] for _ in range(nClasses)]
    trueCategories = test['category']
    # enumerate supplies the row position for .iloc. The index label of the predictions was used as
    # a position before, which is only correct when that index happens to be 0, 1, 2, ...
    for position, (_, row) in enumerate(predictions.iterrows()):
        t = int(trueCategories.iloc[position])
        p = int(row['predict'])
        matrix[t][p] += 1
    return matrix, np.sum(matrix, axis=1), np.sum(matrix, axis=0)

In [31]:
# Get the ordinal model for predicting categories.
def getOrdinalModel(train, valid, test, audioFeatures):
    """Train ordinal GLMs over a grid of alpha values and return the first model of the sorted grid.

    A 'target' column (the category as a factor) is added to all three frames, which changes the
    frames that were passed in. The grid is trained on `train` with `valid` as validation frame and
    `audioFeatures` as predictors; `test` only receives the target column here.
    """
    # The category value becomes the target to predict.
    for frame in (train, valid, test):
        frame['target'] = frame['category'].asfactor()

    # Values to try for alpha: 0 to 1 in steps of 0.1.
    alphaGrid = np.arange(0,1.1, 0.1).tolist()

    ordinalGlm = H2OGeneralizedLinearEstimator(standardize = True,
                                               family='ordinal',
                                               solver='GRADIENT_DESCENT_SQERR',
                                               nfolds=4)
    search = H2OGridSearch(model=ordinalGlm, hyper_params={'alpha': alphaGrid})

    # Train with the train and validation data, then take the first model of the grid sorted on MSE.
    search.train(x=audioFeatures, y="target", training_frame=train, validation_frame=valid)
    return search.get_grid(sort_by="MSE")[0]

In [32]:
# Pairs of [from, to] indices into yearsInOrder; each pair selects the years a model is built from
# (used as yearsInOrder[from:to]). According to the author's notes: frto[0] covers all years except
# 1956, with 2019 as the last year of the sequence. frto[1:7] are blocks of roughly ten years,
# starting with 1957; not all of them span exactly ten years. frto[7:] are meant as musical eras:
# the 60's, 70's, 80's, 90's, 00's and 10's.
frto = [[1,64],[1,12],[12,22],[22,32],[32,42],[42,52],[52,63],[4,13],[14,23],[24,33],[34,43],[44,53],[54,63]]

In [16]:
# Connect to an H2O server; as the output below shows, a local server is started when none is running.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.6" 2020-01-14; OpenJDK Runtime Environment (build 11.0.6+10-post-Ubuntu-1ubuntu118.04.1); OpenJDK 64-Bit Server VM (build 11.0.6+10-post-Ubuntu-1ubuntu118.04.1, mixed mode, sharing)
  Starting server from /home/hester/.local/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpw0osbebg
  JVM stdout: /tmp/tmpw0osbebg/h2o_hester_started_from_python.out
  JVM stderr: /tmp/tmpw0osbebg/h2o_hester_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,03 secs
H2O cluster timezone:,Europe/Amsterdam
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.1
H2O cluster version age:,1 month and 13 days
H2O cluster name:,H2O_from_python_hester_b7gxwt
H2O cluster total nodes:,1
H2O cluster free memory:,3.848 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [35]:
# Predicting categories.

# Column names for the data frame: all audio features, followed by the category and the id.
flatColumns = [item for sublist in columns for item in sublist] + ["category", "id"]

# Loop over the from/to pairs, but only take the first pair of the list: all years of the song contest.
for fr, to in frto[:1]:
    # Get the audio feature values for the years that make the model.

    # Uncomment when using jury or tele votes
    # dataList = doAConvertForMultiYears(yearsInOrder[60:])

    dataList = doAConvertForMultiYears(yearsInOrder[fr:to])
    dataH2OFrame = h2o.H2OFrame(pd.DataFrame(dataList, columns=flatColumns))

    # Divide the data into train, valid and test with 69%, 16% and 15%. seed=1 is for reproduction.
    # NOTE(review): the rows are segments, so segments of one song can presumably end up in different
    # splits - confirm whether that is intended.
    train, valid, test = dataH2OFrame.split_frame(ratios=[0.69,0.16], seed=1)
    model = getOrdinalModel(train, valid, test, audioFeatures)

    # The coefficients table of the model: the first row is printed as the intercepts, the remaining
    # rows are treated as the audio features.
    coefTable = model._model_json['output']['coefficients_table'].as_data_frame()

    # ==============================================================================================
    # Intercepts
    #     Print the standardized intercepts of the model.
    # ==============================================================================================
    print("Intercepts:")
    interceptColumns = ['std_coefs_class_0', 'std_coefs_class_1', 'std_coefs_class_2',
                        'std_coefs_class_3', 'std_coefs_class_4']
    print(coefTable.iloc[0][interceptColumns])
    print("\n")

    # ==============================================================================================
    # Selected coefficients
    #     Print the standardized coefficients of the audio features, largest absolute value first.
    # ==============================================================================================
    print("Coefficients:")
    selectedCoefs = coefTable[['names', 'std_coefs_class_0']].iloc[1:]
    # Sort on the absolute value, but print the signed value, because the sign is important.
    magnitudes = [(label, abs(r['std_coefs_class_0'])) for label, r in selectedCoefs.iterrows()]
    sortedSelection = sorted(magnitudes, key=itemgetter(1))
    sortedSelection.reverse()
    for label, _ in sortedSelection:
        # The labels of selectedCoefs start at 1 (the intercept row was cut off), hence label-1.
        coefRow = selectedCoefs.iloc[label-1]
        print(coefRow['names'], " = ", coefRow['std_coefs_class_0'])
    print("\n")
    # ==============================================================================================

    # Predict the test rows with the fetched model, to calculate accuracies. The last two columns
    # are left out of the frame that is passed to the model.
    predictions = model.predict(test[:,:-2])
    predictieDataFrame = predictions.as_data_frame()
    dataFrameToPred = test.as_data_frame()


Parse progress: |█████████████████████████████████████████████████████████| 100%
glm Grid Build progress: |████████████████████████████████████████████████| 100%
Intercepts:
std_coefs_class_0    -2.21064
std_coefs_class_1    -1.61883
std_coefs_class_2   -0.876736
std_coefs_class_3    -0.36469
std_coefs_class_4      1.0866
Name: 0, dtype: object


Coefficients:
spectral_energyband_low  =  -0.30647996044650283
erbbands_flatness_db  =  0.2534743172719919
beats_loudness  =  -0.23396771786553702
spectral_energy  =  -0.23355569713604543
tuning_frequency  =  0.22583105006581333
dissonance  =  -0.2086690181434792
spectral_flux  =  -0.20381295779746236
tuning_nontempered_energy_ratio  =  -0.20369624229378405
erbbands_spread  =  -0.2003986989565729
tuning_equal_tempered_deviation  =  -0.19994003809776292
barkbands_spread  =  -0.19281414630345603
spectral_entropy  =  -0.19076559654562864
spectral_rms  =  -0.18562488585098963
melbands_skewness  =  -0.18339130443879448
spectral_kurtosis  =  0.18024

In [36]:
# Predictions for the model

# ==============================================================================================
#  Confusion Matrix
#     Print the confusion matrix (rows: true category, columns: predicted category) followed by
#     the totals per true category and per predicted category. (is not pprint)
# ==============================================================================================
matrix, totalsTrues, totalsPredicts = confusionMatrix(predictieDataFrame, dataFrameToPred)
print(matrix, totalsTrues, totalsPredicts)
# ==============================================================================================
#  Accuracy, Precision, Recall
#     For all categories that matter, compute accuracy, precision and recall and print.
# ==============================================================================================
accs = [0 for _ in range(4)]
precisions = [0 for _ in range(4)]
recalls = [0 for _ in range(4)]

# Problem p: is a song in category p or better (categories 0..p)?
for p in range(4):
    upTo = p + 1
    # Rows that are in categories 0..p both according to the truth and according to the prediction.
    hits = sum(matrix[i][j] for i in range(upTo) for j in range(upTo))
    recalls[p] = precisions[p] = accs[p] = hits
    trueTotal = sum(totalsTrues[:upTo])
    predictedTotal = sum(totalsPredicts[:upTo])

    print("Problem:", p)
    print("recalls:",  recalls[p], "/", trueTotal, recalls[p]/  trueTotal)
    print("precisions:", precisions[p], "/", predictedTotal,
          precisions[p]/  predictedTotal)
    # The value reported as accuracy is hits / (predicted + true - hits).
    print("accuracies:", accs[p], "/", predictedTotal+trueTotal - accs[p],
          accs[p]/(predictedTotal+trueTotal - accs[p]))
    
# ==============================================================================================
#  Mean Average Precision Score
#     Compute MAP for full rankings and print.
# ==============================================================================================

# APs[p] collects the average precision for query p of every year.
APs = [[] for _ in range(4)]
# Loop over all years, skip 1956.
for y in yearsInOrder[1:]:
    # Get the data of this year and put it in an h2o frame.
    dataList = convertDataPerYear(y)
    dataH2OFrame = h2o.H2OFrame(pd.DataFrame(dataList, columns=flatColumns))

    # Get predictions for the whole year. (These are different from the test-split predictions.)
    # NOTE(review): predictieDataFrame and dataFrameToPred are overwritten here, so running the
    # confusion matrix code again afterwards would use the data of the last year - confirm.
    predictions = model.predict(dataH2OFrame[:,:-2])
    predictieDataFrame = predictions.as_data_frame()
    dataFrameToPred = dataH2OFrame.as_data_frame()

    # Get the full predicted rank, with a soft class or with the latent variable.
    # If the latent variable shows really weird values, I advise using the soft class one.
    # rank = getAFullRankingFromPredictionsLatent(dataFrameToPred, predictieDataFrame,
    #                                             selectedCoefs["std_coefs_class_0"])
    rank = getAFullRankingFromPredictionsSoftClass(dataFrameToPred, predictieDataFrame)
    filterRank = getRankWithBestSegments(rank)

    # Uncomment this if you want to see all the full ranking predictions.
    # print(filterRank["id"])

    # Average precision of this year for the categories that matter, categories 0 up to and including 3.
    for p, apsForQuery in enumerate(APs):
        apsForQuery.append(AP(filterRank, p))

# The mean of the APs over the years gives 4 MAPs.
print("MAPS:", np.mean(APs, axis=1))

[[15, 5, 7, 8, 12, 13], [20, 13, 16, 6, 18, 18], [32, 13, 10, 2, 11, 24], [73, 24, 28, 25, 28, 63], [108, 50, 78, 42, 124, 165], [14, 4, 7, 7, 25, 141]] [ 60  91  92 241 567 198] [262 109 146  90 218 424]
recalls: 15 / 60 0.25
precisions: 15 / 262 0.05725190839694656
accuracies: 15 / 307 0.048859934853420196
recalls: 53 / 151 0.3509933774834437
precisions: 53 / 371 0.14285714285714285
accuracies: 53 / 469 0.11300639658848614
recalls: 131 / 243 0.5390946502057613
precisions: 131 / 517 0.25338491295938104
accuracies: 131 / 629 0.2082670906200318
recalls: 297 / 484 0.6136363636363636
precisions: 297 / 607 0.48929159802306427
accuracies: 297 / 794 0.37405541561712846
Parse progress: |█████████████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%
Pars

Parse progress: |█████████████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%
Parse progress: |███████████