In [4]:
# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
from textblob import TextBlob


In [None]:
# import data
df = pd.read_pickle('./data/clean_with_lan_and_words.pkl')


In [5]:
def normalize(df, new_col_name, col_to_norm):
    '''
    ref: https://en.wikipedia.org/wiki/Normalization_(statistics)
    '''
    df = df.copy()
    max = df[col_to_norm].max()
    min = df[col_to_norm].min()

    df[new_col_name] = df[col_to_norm].apply(lambda val: (val-min)/(max-min))
    return df

def _count_words(words):
    try:
        return len(words.split())
    except:
        return 0 #TODO: better error handling, maybe not return 0

def word_count(df, new_col_name, col_with_lyrics):
    df = df.copy()
    df[new_col_name] = df[col_with_lyrics].apply(lambda words: _count_words(words))
    return df

def remove_outliers(df, col_to_process, low=.05, high=.95):
    df = df.copy()
    min, max = df[col_to_process].quantile([low,high])
    df = df[(df[col_to_process] >= min) & (df[col_to_process] <= max)]
    return df.reset_index(drop=True)

def analyze_sentiment(df):
    df = df.copy()
    res = df['lyrics'].apply(lambda txt : TextBlob(txt).sentiment)
    df['polarity'] = res.apply(lambda x: x[0])
    df['subjectivity'] = res.apply(lambda x: x[1])
    return df

def analyze_word_class(df):
    tqdm.pandas(desc="Preparing Text class analysis...")
    blobs = df['lyrics'].progress_apply(lambda txt : TextBlob(txt).tags)

    tqdm.pandas(desc="Analyzing classes...")
    df['nouns'] = blobs.progress_apply(lambda word_list: _count_word_class(word_list, 'NN'))
    df['adverbs'] = blobs.progress_apply(lambda word_list: _count_word_class(word_list, 'RB'))
    df['verbs'] = blobs.progress_apply(lambda word_list: _count_word_class(word_list, 'VB'))
    
    return df

def _count_word_class(words, word_class):
    count = 0
    for w in words:
        if w[1] == word_class:
            count = count + 1
    return count / 100

def prepare_data(df, data_cols, label_col, training_size=1000, test_size=250):
    labels = df_cp[label_col].value_counts().keys().tolist()
    train_data, train_labels, test_data, test_labels = [], [], [], []
    
    # shuffle dataset
    df = df.copy().sample(frac=1).reset_index(drop=True)
    
    for label in labels:
        data = df[df[label_col] == label]
        # kun hvis der er nok eksempler, ift. training_size og test_size, ud fra den pågældende label
        if len(data) > training_size + test_size:
            data = data.reset_index(drop=True)
            train_data += data[data_cols][0:training_size].values.tolist()
            train_labels += data[label_col][0:training_size].values.tolist()
            test_data += data[data_cols][training_size:training_size+test_size].values.tolist()
            test_labels += data[label_col][training_size:training_size+test_size].values.tolist()
    
    # da modellen kun kan trænes med numpy arrays, så skal listerne lige konverteres
    train_data = np.asarray(train_data)
    train_labels = np.asarray(train_labels)
    test_data = np.asarray(test_data)
    test_labels = np.asarray(test_labels)
    
    return (train_data, train_labels), (test_data, test_labels)
        
        

In [None]:
##
# kør dette step hvis processed-data ikke er tilgængeligt 
# og det først skal udvindes fra raw
##

df_cp = df.copy()
# tilføj kategoriske numeriske værdier for genre
df_cp.genre = pd.Categorical(df_cp.genre)
df_cp['genre_code'] = df_cp.genre.cat.codes
# optæl ord i sangtekst
df_cp = word_count(df_cp, 'num_words', 'lyrics')
# normaliser antal ord i sangtekst
df_cp = normalize(df_cp, 'num_words_nm', 'num_words')
df_cp = analyze_sentiment(df_cp)
# fjern col 'index'
df_cp.drop(['index'], axis=1, inplace=True)


In [6]:
##
# kør dette step hvis processed-data allerede er tilgængeligt
##

df_cp = pd.read_pickle('./data/clean_with_lan_and_words_and_sent.pkl')

In [7]:
### FJERNELSE AF UNØDVENDIG DATA ###
# fjern nan
df_cp.dropna()
# fjern uønskede genre (Not Available & Other)
df_cp = df_cp[(df_cp.genre != 'Not Available') & (df_cp.genre != 'Other')]
# fjern outliers ud fra antal ord i sangtekst
df_cp = remove_outliers(df_cp, 'num_words')
# reset index
df_cp = df_cp.reset_index(drop=True)


### --> evt. gem dataset her ###

In [8]:
# klargør data til model
(train_data, train_labels), (test_data, test_labels) = prepare_data(df_cp, ['num_words_nm', 'subjectivity', 'polarity'], 'genre_code', 10000, 400)

# Vis genre ud fra kategori kode
for code in np.unique(test_labels):
    print(code, df_cp[df_cp.genre_code == code].genre[0])


0 Country
6 Metal
9 Pop
11 Rock


### Setup netværk lag

- `input_nodes` er antallet af inputs parametrer/features
- `hidden_nodes` anbefalet antal er svarende til et tal mellem input og output nodes
- `output_nodes` er antallet af "labels" kategorier man forsøger at klassificerer for

In [9]:
input_nodes = 3
hidden_nodes = 4
output_nodes = 12

model = keras.Sequential([
    keras.layers.Dense(input_nodes),
    keras.layers.Dense(hidden_nodes, activation=tf.nn.sigmoid),
    keras.layers.Dense(output_nodes, activation=tf.nn.sigmoid)
])


### Compile modellen
Før modellen er klar til træning, mangler den nogle flere indstillinger. Disse er tilføjet under compiling:

- Loss function — Denne måler hvor præcis modellen er under træning. Vi vil minimerer denne funktion til, at "styre" modellen i den rigtige retning.
- Optimizer — Denne afgører hvordan modellen er opdateret, baseret på det data den ser 
- Metrics — Brugt til at monitorerer under træningen og testing trin.

In [10]:
model.compile(optimizer=tf.train.AdamOptimizer(), 
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

### Træning af modellen

In [11]:
model.fit(train_data, train_labels, epochs=40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<tensorflow.python.keras.callbacks.History at 0x1a25a03ac8>

### Evaluerer præcisionen

In [12]:
test_loss, test_acc = model.evaluate(test_data, test_labels)

print('Test accuracy:', test_acc)

Test accuracy: 0.361875


### Predictions

In [102]:
input = np.asarray([0.0])
prediction = model.predict(input)

print('klassificeringer')
print(prediction)

print(f'model gætter på: {np.argmax(prediction)} som er {df_cp[df_cp.genre_code == np.argmax(prediction)].genre[0]}')

InvalidArgumentError: Matrix size-incompatible: In[0]: [1,1], In[1]: [3,3]
	 [[{{node MatMul_27}} = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_input_1_9_0_0, MatMul_27/ReadVariableOp)]]