In [198]:
# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np


In [3]:
# import data
df = pd.read_pickle('./data/clean_with_lan_and_words.pkl')


In [227]:
def normalize(df, new_col_name, col_to_norm):
    '''
    ref: https://en.wikipedia.org/wiki/Normalization_(statistics)
    '''
    df = df.copy()
    max = df[col_to_norm].max()
    min = df[col_to_norm].min()

    df[new_col_name] = df[col_to_norm].apply(lambda val: (val-min)/(max-min))
    return df

def _count_words(words):
    try:
        return len(words.split())
    except:
        return 0 #TODO: better error handling, maybe not return 0

def word_count(df, new_col_name, col_with_lyrics):
    df = df.copy()
    df[new_col_name] = df[col_with_lyrics].apply(lambda words: _count_words(words))
    return df

def remove_outliers(df, col_to_process, low=.05, high=.95):
    df = df.copy()
    min, max = df[col_to_process].quantile([low,high])
    df = df[(df[col_to_process] >= min) & (df[col_to_process] <= max)]
    return df.reset_index(drop=True)

def prepare_data(df, data_cols, label_col, training_size=1000, test_size=250):
    labels = df_cp[label_col].value_counts().keys().tolist()
    train_data, train_labels, test_data, test_labels = [], [], [], []
    
    for label in labels:
        data = df[df[label_col] == label]
        # kun hvis der er nok eksempler, ift. training_size og test_size, ud fra den pågældende label
        if len(data) > training_size + test_size:
            data = data.reset_index(drop=True)
            train_data += data[data_cols][0:training_size].values.tolist()
            train_labels += data[label_col][0:training_size].values.tolist()
            test_data += data[data_cols][training_size:training_size+test_size].values.tolist()
            test_labels += data[label_col][training_size:training_size+test_size].values.tolist()
    
    # da modellen kun kan trænes med numpy arrays, så skal listerne lige konverteres
    train_data = np.asarray(train_data)
    train_labels = np.asarray(train_labels)
    test_data = np.asarray(test_data)
    test_labels = np.asarray(test_labels)
    
    return (train_data, train_labels), (test_data, test_labels)
        
        

In [None]:
### TILFØJNING AF DATA ###
df_cp = df.copy()
# tilføj kategoriske numeriske værdier for genre
df_cp.genre = pd.Categorical(df_cp.genre)
df_cp['genre_code'] = df_cp.genre.cat.codes
# optæl ord i sangtekst
df_cp = word_count(df_cp, 'num_words', 'lyrics')
# normaliser antal ord i sangtekst
df_cp = normalize(df_cp, 'num_words_nm', 'num_words')

### FJERNELSE A DATA ###
# fjern outliers ud fra antal ord i sangtekst
df_cp = remove_outliers(df_cp, 'num_words')
# fjern col 'index'
df_cp.drop(['index'], axis=1, inplace=True)

### --> evt. gem dataset her ###

(train_data, train_labels), (test_data, test_labels) = prepare_data(df_cp, ['num_words_nm'], 'genre_code')

### Setup netværk lag

- `input_nodes` er antallet af inputs parametrer/features
- `hidden_nodes` anbefalet antal er svarende til et tal mellem input og output nodes
- `output_nodes` er antallet af "labels" kategorier man forsøger at klassificerer for

In [229]:
input_nodes = 1
hidden_nodes = 4
output_nodes = 12

model = keras.Sequential([
    keras.layers.Dense(input_nodes),
    keras.layers.Dense(hidden_nodes, activation=tf.nn.sigmoid),
    keras.layers.Dense(output_nodes, activation=tf.nn.sigmoid)
])


### Compile modellen
Før modellen er klar til træning, mangler den nogle flere indstillinger. Disse er tilføjet under compiling:

- Loss function — Denne måler hvor præcis modellen er under træning. Vi vil minimerer denne funktion til, at "styre" modellen i den rigtige retning.
- Optimizer — Denne afgører hvordan modellen er opdateret, baseret på det data den ser 
- Metrics — Brugt til at monitorerer under træningen og testing trin.

In [230]:
model.compile(optimizer=tf.train.AdamOptimizer(), 
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

### Træning af modellen

In [231]:
model.fit(train_data, train_labels, epochs=50)

<class 'numpy.ndarray'>
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1c38f0feb00>

### Evaluerer præcisionen

In [233]:
test_loss, test_acc = model.evaluate(test_data, test_labels)

print('Test accuracy:', test_acc)

Test accuracy: 0.11566666666666667


### Predictions

In [236]:
input = np.asarray([0.12])
prediction = model.predict(input)

print('klassificeringer')
print(prediction)

print(f'model gætter på: {np.argmax(prediction)} som er {df_cp[df_cp.genre_code == np.argmax(prediction)].genre[0]}')

klassificeringer
[[0.0338759  0.17233364 0.02673569 0.96144205 0.04695294 0.03703632
  0.0153113  0.11624471 0.09467284 0.36417633 0.1673174  0.08110837]]
model gætter på: 3 som er Hip-Hop
