In [1]:
from matplotlib import pyplot as plt
import numpy as np
import math,random
from tqdm import tqdm
from random import shuffle
import pprint

import pandas as pd
from src.features.build_features import word_count, sentence_avg_word_length, normalize
from src.features.text_blob_analysis import analyze_sentiment
from src.data.make_dataset import create_dataset
from src.data.util import unzip_file

def step_function(x):
    """Step activation: return 1 when x is non-negative, otherwise 0."""
    if x >= 0:
        return 1
    return 0


def perceptron_output(weights, bias, x):
    """Return 1 if the perceptron 'fires' for input x, 0 if it does not.

    The activation is the dot product of weights and x plus bias, passed
    through step_function.
    """
    activation = np.dot(weights, x) + bias
    return step_function(activation)

def sigmoid(t):
    """Logistic function 1 / (1 + e^-t), evaluated without overflow.

    Args:
        t: a real number.

    Returns:
        A float in [0, 1].

    The naive form calls math.exp(-t), which raises OverflowError once -t
    exceeds roughly 709.  Each branch below only ever exponentiates a
    non-positive number, so the worst case is a harmless underflow to 0.0.
    """
    if t >= 0:
        return 1 / (1 + math.exp(-t))
    # t < 0: use the algebraically equivalent form e^t / (1 + e^t)
    exp_t = math.exp(t)
    return exp_t / (1 + exp_t)

def neuron_output(weights, inputs):
    """Return the sigmoid of the dot product of weights and inputs."""
    weighted_sum = np.dot(weights, inputs)
    return sigmoid(weighted_sum)

def predict(input, network):
    """Feed input through network and return the outputs of its last layer."""
    layer_outputs = feed_forward(network, input)
    return layer_outputs[-1]

def feed_forward(neural_network, input_vector):
    """Forward-propagate input_vector through neural_network.

    neural_network is a list of layers; each layer is a list of neurons and
    each neuron is a list of weights whose last entry multiplies the constant
    bias input.  Returns a list with one entry per layer, each entry being the
    list of that layer's neuron outputs (the last entry is the network output).
    """
    layer_outputs = []
    current = input_vector

    for layer in neural_network:
        # append the constant bias input, then evaluate every neuron in the layer
        biased = current + [1]
        current = [neuron_output(neuron, biased) for neuron in layer]
        # this layer's output is recorded and becomes the next layer's input
        layer_outputs.append(current)

    return layer_outputs
    
def backpropagate(network, input_vector, targets, learning_rate=1.0):
    """Run one gradient-descent step on a two-layer network, in place.

    Args:
        network: [hidden_layer, output_layer]; each layer is a list of
            neurons and each neuron a mutable list of weights whose last
            entry is the bias weight.  Exactly two layers are expected,
            because the result of feed_forward is unpacked into two values.
        input_vector: list of input values for one training example.
        targets: desired output values, one per output neuron.
        learning_rate: factor applied to every weight update.  The default
            of 1.0 reproduces the original un-scaled update.

    Returns:
        None.  The weight lists inside network are modified in place.
    """
    hidden_outputs, outputs = feed_forward(network, input_vector)

    # Take both layers from the `network` argument rather than from notebook
    # globals, so the function works for any network that is passed in.
    hidden_layer = network[0]
    output_layer = network[-1]

    # the output * (1 - output) is from the derivative of sigmoid
    output_deltas = [output * (1 - output) * (output - target)
                     for output, target in zip(outputs, targets)]

    # adjust weights for output layer, one neuron at a time
    for i, output_neuron in enumerate(output_layer):
        # focus on the ith output layer neuron
        for j, hidden_output in enumerate(hidden_outputs + [1]):
            # adjust the jth weight based on both
            # this neuron's delta and its jth input
            output_neuron[j] -= learning_rate * output_deltas[i] * hidden_output

    # back-propagate errors to hidden layer
    # NOTE(review): the output weights were already updated above, so the
    # hidden deltas are computed from the updated weights.  The order is kept
    # as in the original so training results do not change.
    hidden_deltas = [hidden_output * (1 - hidden_output) *
                     np.dot(output_deltas, [n[i] for n in output_layer])
                     for i, hidden_output in enumerate(hidden_outputs)]

    # adjust weights for hidden layer, one neuron at a time
    for i, hidden_neuron in enumerate(hidden_layer):
        for j, input_value in enumerate(input_vector + [1]):
            hidden_neuron[j] -= learning_rate * hidden_deltas[i] * input_value

In [3]:
# Load the data. create_dataset() returns two values: the first is bound to
# `df` (the frame the targets and features below are built from), the second
# to `test_df`.
#training_df, df = create_dataset()
df, test_df = create_dataset()

# Sanity check: number of rows in df
print(len(df))
#print(len(testdata_df))


Creating missing paths...
Skipping unzip...
Skipping data filtering...
12000


In [4]:
# --- Targets: one-hot encode every row's genre ---
series = df['genre'].value_counts()
genre_labels = series.keys()  # distinct genre labels, in value_counts() order
targets = [
    [1 if label == row_genre else 0 for label in genre_labels]
    for row_genre in df['genre']
]

# --- Features ---
# Each helper takes the frame and returns it; presumably it adds the named
# column(s) read further down -- TODO confirm against src.features.
df = sentence_avg_word_length(df, "avg_word_len", 'lyrics')
df = word_count(df, "word_count", 'lyrics')
df = normalize(df, 'word_count_nm', 'word_count')
df = analyze_sentiment(df)

polarity = df['polarity']
subjectivity = df['subjectivity']

# One feature row per song: [avg_word_len, word_count_nm, polarity, subjectivity]
inputs = [
    list(row)
    for row in zip(df['avg_word_len'], df["word_count_nm"], polarity, subjectivity)
]
print(inputs[:10])

# NOTE: `inputs` and `targets` are aligned by position, so shuffling or
# slicing only `inputs` (as in the disabled lines below) would break the pairing.
#shuffle(inputs)
#inputs = inputs[0:500]

Analyzing sentiment...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12000/12000 [00:17<00:00, 692.77it/s]
Analyzing sentiment...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 12000/12000 [00:00<00:00, 923093.04it/s]
Analyzing sentiment...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 12000/12000 [00:00<00:00, 748938.28it/s]


[[3.443418013856813, 0.05289518690447105, 0.14826839826839824, 0.38809523809523816], [3.7906976744186047, 0.03151722452968483, 0.11572871572871572, 0.500808682058682], [3.8588235294117648, 0.02076716344979233, 0.00472027972027972, 0.6147435897435898], [3.425287356321839, 0.06376740776936232, 0.390422077922078, 0.49743867243867235], [3.823717948717949, 0.038113852919618864, -0.17083333333333328, 0.44666666666666677], [3.3819444444444446, 0.01759100903982409, 0.33611111111111114, 0.5222222222222223], [3.7220338983050847, 0.03603713657463963, 0.231081081081081, 0.46940154440154436], [4.3860103626943, 0.047153677009528466, 0.23476946334089188, 0.36307634164777014], [3.7795454545454548, 0.053750305399462496, 0.11762820512820509, 0.4822649572649574], [3.563049853372434, 0.04165648668458344, 0.27885304659498217, 0.6448028673835126]]


In [None]:
########### Training the model ###########

###########
# Neural network setup
###########
random.seed(0) # to get repeatable results

# Layer sizes are derived from the data built in the previous cell, so the
# network cannot silently get out of step with the feature rows or with the
# one-hot targets (zip() in backpropagate would otherwise truncate quietly).
input_size = len(inputs[0])      # number of input nodes (same as the number of features)
num_hidden = 1                   # number of hidden nodes
output_size = len(genre_labels)  # number of output nodes (in our case, genres)

# each hidden neuron has one weight per input, plus a bias weight
hidden_layer = [[random.random() for _weight in range(input_size + 1)]
                for _neuron in range(num_hidden)]

# each output neuron has one weight per hidden neuron, plus a bias weight
output_layer = [[random.random() for _weight in range(num_hidden + 1)]
                for _neuron in range(output_size)]

# the network starts out with random weights
network = [hidden_layer, output_layer]

###########
# Training iterations
###########
NUM_EPOCHS = 5000
# epochs after which the current weights are printed for inspection
CHECKPOINT_EPOCHS = {1000, 2000, 3000, 3500, 4500}

print(network)
# `num` is the 1-based epoch number.  It is the loop variable itself, instead
# of a hand-maintained counter next to a loop over `__`, which would overwrite
# IPython's `__` output-history variable.
for num in tqdm(range(1, NUM_EPOCHS + 1)):
    if num in CHECKPOINT_EPOCHS:
        print(network)
    for input_vector, target_vector in zip(inputs, targets):
        backpropagate(network, input_vector, target_vector)
print(network)

[[[0.8444218515250481, 0.7579544029403025, 0.420571580830845, 0.25891675029296335, 0.5112747213686085]], [[0.4049341374504143, 0.7837985890347726], [0.30331272607892745, 0.4765969541523558], [0.5833820394550312, 0.9081128851953352]]]


 20%|███████████████████████████▎                                                                                                             | 999/5000 [03:43<14:57,  4.46it/s]

[[[2.0387877817010223, 0.732161286772175, 0.5289428928839273, 0.3002674497036609, 0.8083068105857345]], [[-2.9661594215978972, -2.199464743708133], [2.408609657595663, 2.39510674082735], [-2.470888190377237, -2.336408736678068]]]


 24%|████████████████████████████████▍                                                                                                       | 1193/5000 [04:27<14:10,  4.48it/s]

In [12]:
# Genre names, for reference:
# 'Rock', 'Pop', 'Hip-Hop', 'Not Available', 'Metal', 'Country', 'Jazz', 'Electronic', 'Other', 'R&B', 'Indie', 'Folk'

# Feature vector to classify: [avg_word_len, word_count_nm, polarity, subjectivity]
sample_features = [
    4.052631578947368,
    0.24311023622047245,
    0.05758928571428573,
    0.43883928571428577,
]

print(inputs[-300:])

print('##########################')
print(genre_labels)  # the one-hot targets, and so the outputs below, follow this label order
res = predict(sample_features, network)
print(res)


[[4.052631578947368, 0.24311023622047245, 0.05758928571428573, 0.43883928571428577], [3.8746518105849583, 0.35334645669291337, 0.3206481481481481, 0.6279629629629632], [4.033816425120773, 0.20374015748031496, 0.03693528693528693, 0.3634310134310134], [3.7358490566037736, 0.2608267716535433, 0.37916666666666665, 0.4861111111111111], [3.8395061728395063, 0.23917322834645668, 0.07230269730269731, 0.19276556776556777], [3.6372549019607843, 0.30118110236220474, 0.3290404040404041, 0.41342171717171705], [3.6272189349112427, 0.16633858267716536, 0.3333333333333333, 0.6416666666666666], [3.985327313769752, 0.8720472440944882, -0.047883597883597896, 0.6019620811287479], [4.121212121212121, 0.12992125984251968, 0.1623280423280423, 0.5745767195767196], [3.782608695652174, 0.27165354330708663, -0.10012019230769231, 0.5116185897435898], [3.5316455696202533, 0.3110236220472441, 0.24827586206896546, 0.5662835249042144], [3.901639344262295, 0.3001968503937008, 0.2346480679814013, 0.3878747795414462], 


##########################
Index(['Hip-Hop', 'Pop', 'Rock'], dtype='object')
[0.03523980899119838, 0.02390106220683287, 0.9641036424767987]
