In [1]:
from matplotlib import pyplot as plt
import numpy as np
import math,random
from tqdm import tqdm
from random import shuffle
import pprint

import pandas as pd
from src.features.build_features import word_count, sentence_avg_word_length, normalize
from src.features.text_blob_analysis import analyze_sentiment, analyze_word_class
from src.data.make_dataset import create_dataset
from src.data.util import unzip_file

def step_function(x):
    """Threshold activation: return 1 when x is non-negative, otherwise 0."""
    if x >= 0:
        return 1
    return 0


def perceptron_output(weights, bias, x):
    """Return 1 if the perceptron 'fires' for input x, 0 if it does not.

    The activation is the dot product of `weights` and `x` plus `bias`,
    passed through `step_function`.
    """
    activation = np.dot(weights, x) + bias
    return step_function(activation)

def sigmoid(t):
    """Logistic function 1 / (1 + e**-t), evaluated without overflow.

    Args:
        t: a real number (any magnitude).

    Returns:
        A float in [0, 1].

    The textbook form calls math.exp(-t), which raises OverflowError once
    t drops below roughly -709. Splitting on the sign of t keeps the
    argument to math.exp non-positive, so it can only underflow to 0.0.
    """
    if t >= 0:
        return 1 / (1 + math.exp(-t))
    # Algebraically identical to the branch above, but exponentiates a
    # negative number instead of a (possibly huge) positive one.
    exp_t = math.exp(t)
    return exp_t / (1 + exp_t)

def neuron_output(weights, inputs):
    """Return the sigmoid of the dot product of `weights` and `inputs`.

    Both arguments must have the same length. No bias is added here;
    feed_forward appends a constant 1 to the inputs so that the last
    weight acts as the bias.
    """
    weighted_sum = np.dot(weights, inputs)
    return sigmoid(weighted_sum)

def predict(input, network):
    """Forward-propagate `input` through `network` and return only the
    outputs of the last layer."""
    layer_outputs = feed_forward(network, input)
    return layer_outputs[-1]

def feed_forward(neural_network, input_vector):
    """Forward-propagate an input through the network.

    Args:
        neural_network: list of layers; each layer is a list of neurons and
            each neuron is a list of weights whose last entry is the bias
            weight.
        input_vector: any iterable of numbers (list, tuple, array, ...),
            one value per input node.

    Returns:
        A list with one entry per layer, each entry being the list of that
        layer's neuron outputs. The last entry is the network's output.
    """
    outputs = []

    # Copy into a plain list so that `+ [1]` below always concatenates.
    # With a tuple it would raise TypeError, and with a NumPy array or
    # pandas Series it would add 1 to every element instead of appending.
    layer_input = list(input_vector)

    for layer in neural_network:
        input_with_bias = layer_input + [1]              # add a bias input
        output = [neuron_output(neuron, input_with_bias)
                  for neuron in layer]
        outputs.append(output)

        # the input to the next layer is the output of this one
        layer_input = output

    return outputs
    
def backpropagate(network, input_vector, targets):
    """Run one gradient-descent step on a two-layer network, in place.

    Args:
        network: [hidden_layer, output_layer]; each layer is a list of
            neurons, each neuron a mutable list of weights whose last entry
            is the bias weight. feed_forward must return exactly two layers
            of outputs for this network.
        input_vector: the training example's feature values.
        targets: the desired output value for each output neuron.

    Returns:
        None. The weights inside `network` are modified directly, with an
        implicit step size of 1.
    """
    hidden_outputs, outputs = feed_forward(network, input_vector)

    # Always take the output layer from the network that was passed in,
    # never from a module-level variable of the same name.
    output_layer = network[-1]

    # the output * (1 - output) is from the derivative of sigmoid
    output_deltas = [output * (1 - output) * (output - target)
                     for output, target in zip(outputs, targets)]

    # adjust weights for output layer, one neuron at a time
    for i, output_neuron in enumerate(output_layer):
        # adjust the jth weight based on both this neuron's delta
        # and its jth input (the trailing 1 is the bias input)
        for j, hidden_output in enumerate(hidden_outputs + [1]):
            output_neuron[j] -= output_deltas[i] * hidden_output

    # back-propagate errors to hidden layer; note this reads the output
    # weights after the update above, matching the original ordering
    hidden_deltas = [hidden_output * (1 - hidden_output) *
                     np.dot(output_deltas, [n[i] for n in output_layer])
                     for i, hidden_output in enumerate(hidden_outputs)]

    # adjust weights for hidden layer, one neuron at a time
    for i, hidden_neuron in enumerate(network[0]):
        for j, input_value in enumerate(list(input_vector) + [1]):
            hidden_neuron[j] -= hidden_deltas[i] * input_value

In [2]:
# Load the data: create_dataset() returns the test frame first and the
# training frame second.
test_df, df = create_dataset()

print(len(df))

# Shuffle the training rows. The fixed random_state makes the row order
# (and therefore the order in which examples are fed to training) the same
# on every run; without it the results change between kernel restarts.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)
print(df.tail())

Creating missing paths...
Skipping unzip...
Skipping data filtering...
750
       genre                                             lyrics
745  Hip-Hop  peace this is spike lee aka shelton jackson le...
746  Hip-Hop  you out there louder well clap your hands to w...
747  Hip-Hop  silk smooth how i move but i'm gonna do do wha...
748      Pop  swore i count  of the dead of the night a love...
749     Rock  got your pride and your prose tucked just like...


In [3]:
# Targets: one-hot encode each row's genre. The column order is the order
# in which value_counts() reports the genre labels.
series = df['genre'].value_counts()
genre_labels = series.index
targets = [[int(label == genre) for label in genre_labels] for genre in df['genre']]

# Features: add the derived columns to the training frame, step by step.
df = sentence_avg_word_length(df, "avg_word_len", 'lyrics')
df = normalize(df, 'avg_word_len_nm', 'avg_word_len')
df = word_count(df, "word_count", 'lyrics')
df = normalize(df, 'word_count_nm', 'word_count')
df = analyze_sentiment(df)
df = analyze_word_class(df)

# Pull out the seven feature columns used as network inputs.
words = df["word_count_nm"]
polarity = df['polarity']
subjectivity = df['subjectivity']
nouns = df['nouns']
adverbs = df['adverbs']
verbs = df['verbs']
avg_word_len = df['avg_word_len_nm']

# One feature row per song, in the column order listed in the zip call.
inputs = [list(row) for row in zip(words, polarity, subjectivity, nouns, adverbs, verbs, avg_word_len)]
print(inputs[:10])

Analyzing sentiment...: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 750/750 [00:04<00:00, 171.97it/s]
Analyzing sentiment...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 750/750 [00:00<00:00, 125302.85it/s]
Analyzing sentiment...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 750/750 [00:00<00:00, 150426.93it/s]
Preparing Text class analysis...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 750/750 [00:35<00:00, 21.05it/s]
Analyzing classes...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 750/750 [00:00<00:00, 9283.95it/s]
Analyzing classes...: 100%|████████████████████████████████████████████████████████████████████████████████████████

[[0.13513513513513514, 0.2845588235294118, 0.5355392156862745, 0.34, 0.09, 0.11, 0.03217586697768667], [0.5218295218295218, 0.08303589834202083, 0.46652899254940083, 1.49, 0.31, 0.39, 0.0030785883347231708], [0.5758835758835759, 0.14672430743859316, 0.49861521698256395, 1.18, 0.43, 0.5, 0.002118704857859705], [0.32536382536382535, 0.031563786008230454, 0.4622633744855966, 0.42, 0.34, 0.3, 0.00691362556108521], [0.23076923076923078, 0.19999999999999996, 0.354, 0.61, 0.11, 0.05, 0.015621890672178227], [0.20686070686070687, 0.22566844919786108, 0.5039215686274511, 0.32, 0.74, 0.36, 0.020415656343342473], [0.2515592515592516, 0.1736111111111111, 0.33472222222222225, 0.57, 0.36, 0.04, 0.012722854082660263], [0.2588357588357588, 0.1618055555555556, 0.37986111111111104, 0.72, 0.21, 0.31, 0.013170316844755645], [0.23284823284823286, 0.13604166666666667, 0.5635416666666667, 0.5, 0.31, 0.36, 0.014332772674146092], [0.16735966735966737, 0.07910466269841267, 0.5955977182539683, 0.23, 0.27, 0.25, 0

In [4]:
########### Training the model ###########

###########
# Neural network setup
###########
random.seed(0)   # fixed seed so the initial weights are repeatable
input_size = 7   # number of input nodes (same as the number of features)
num_hidden = 4   # number of hidden nodes
output_size = 3  # number of output nodes (in our case, genres)


def _random_layer(neuron_count, inputs_per_neuron):
    """Build a layer of `neuron_count` neurons, each holding one random
    weight per input plus one extra bias weight."""
    return [[random.random() for _ in range(inputs_per_neuron + 1)]
            for _ in range(neuron_count)]


# The hidden layer is drawn first and the output layer second, so the
# sequence of random numbers consumed is fixed by the seed above.
hidden_layer = _random_layer(num_hidden, input_size)
output_layer = _random_layer(output_size, num_hidden)

# the network starts out with random weights
network = [hidden_layer, output_layer]

# Training: 3000 passes over the data, one backpropagation step per example.
# The weights are printed before and after for comparison.
print(network)
for _epoch in tqdm(range(3000)):
    for input_vector, target_vector in zip(inputs, targets):
        backpropagate(network, input_vector, target_vector)
print(network)

[[[0.8444218515250481, 0.7579544029403025, 0.420571580830845, 0.25891675029296335, 0.5112747213686085, 0.4049341374504143, 0.7837985890347726, 0.30331272607892745], [0.4765969541523558, 0.5833820394550312, 0.9081128851953352, 0.5046868558173903, 0.28183784439970383, 0.7558042041572239, 0.6183689966753316, 0.25050634136244054], [0.9097462559682401, 0.9827854760376531, 0.8102172359965896, 0.9021659504395827, 0.3101475693193326, 0.7298317482601286, 0.8988382879679935, 0.6839839319154413], [0.47214271545271336, 0.1007012080683658, 0.4341718354537837, 0.6108869734438016, 0.9130110532378982, 0.9666063677707588, 0.47700977655271704, 0.8653099277716401]], [[0.2604923103919594, 0.8050278270130223, 0.5486993038355893, 0.014041700164018955, 0.7197046864039541], [0.39882354222426875, 0.824844977148233, 0.6681532012318508, 0.0011428193144282783, 0.49357786646532464], [0.8676027754927809, 0.24391087688713198, 0.32520436274739006, 0.8704712321086546, 0.19106709150239054]]]


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3000/3000 [04:24<00:00, 13.10it/s]


[[[1.8921955236372352, 1.6370103020745497, 3.6393070110217303, 2.9949735415783114, 1.3482331298476364, 1.0059215930087673, 1.1700806297125212, 6.914125488321733], [18.25204404129903, -0.7397557130003489, 8.746428003305947, 6.3017191213388415, 2.004629640052655, 6.959149379069717, -3.4629212576147053, -17.758807253057824], [44.875308925584925, 3.429502114818368, -5.954117415654864, 13.05925553107093, -3.1788143467360634, 20.100944492311054, -1.4873809571092396, -1.940439311291881], [1.285495122598479, 0.7258172302060236, 3.588997249614815, 2.942499214109537, 1.5269409275839434, 1.2509410291557457, 0.7815409634791093, 7.501375265809378]], [[-3.2425932317620396, 5.266694447572339, 11.819605918775345, -4.841237207302153, -6.017787761345706], [-4.638950292456452, -3.169899516221693, 13.924113235810754, -4.576276885433242, -4.222090455480682], [2.8537139904863693, -3.8252488788625953, -12.30179119905892, 3.627960921889604, 4.649104947523585]]]


In [5]:
# Test features: draw 100 rows from the test frame and run them through
# the same feature steps as the training frame.
# The fixed random_state makes the evaluation subset identical on every
# run, so accuracy figures are comparable between runs.
test_df = test_df.copy().sample(100, random_state=0)

# NOTE(review): normalize() is applied to the test frame on its own here.
# If it scales by the frame's own min/max, train and test features end up
# on different scales -- confirm against src.features.build_features.
test_df = sentence_avg_word_length(test_df, "avg_word_len", 'lyrics')
test_df = normalize(test_df, 'avg_word_len_nm', 'avg_word_len')

test_df = word_count(test_df, "word_count", 'lyrics')
test_df = normalize(test_df, 'word_count_nm', 'word_count')
test_df = analyze_sentiment(test_df)
test_df = analyze_word_class(test_df)

avg_word_len = test_df['avg_word_len_nm']
words = test_df["word_count_nm"]
polarity = test_df['polarity']
subjectivity = test_df['subjectivity']
nouns = test_df['nouns']
adverbs = test_df['adverbs']
verbs = test_df['verbs']

# Create feature list: one row per song, same column order as in training.
test_features = [list(row) for row in zip(words, polarity, subjectivity, nouns, adverbs, verbs, avg_word_len)]

Analyzing sentiment...: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 231.03it/s]
Analyzing sentiment...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 97723.77it/s]
Analyzing sentiment...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 101606.20it/s]
Preparing Text class analysis...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 25.31it/s]
Analyzing classes...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 7156.78it/s]
Analyzing classes...: 100%|████████████████████████████████████████████████████████████████████████████████████████

In [6]:
# Genre names, kept for reference:
# 'Rock', 'Pop', 'Hip-Hop', 'Not Available', 'Metal', 'Country', 'Jazz', 'Electronic', 'Other', 'R&B', 'Indie', 'Folk'

# A single hand-written feature vector (seven values, one per input node).
sample_features = [0.71148825065274152, 0.22561965811965812, 0.129914529914531,
                   0.2506896551724138, 0.0513455968010067, 1, 1]

print('##########################')
print(genre_labels)

# One score per output node, in the same order as genre_labels.
res = predict(sample_features, network)
print(res)


##########################
Index(['Hip-Hop', 'Pop', 'Rock'], dtype='object')
[0.8724030346200379, 0.11469677164895586, 0.01441877759447464]


In [7]:
# Count the test rows whose highest-scoring output node corresponds to the
# true genre. For every hit, the raw output vector and the genre are printed.
count = 0
for test, t in zip(test_features, test_df['genre']):
    test_res = predict(test, network)

    # index of the strongest output node (first one wins on ties)
    maxnum = test_res.index(max(test_res))
    if genre_labels[maxnum] != t:
        continue

    count += 1
    print(test_res)
    print(t)

print(count)
    

[2.679547085332456e-06, 6.529792045482793e-06, 0.9999449485642234]
Rock
[0.9517397089627953, 0.0640862255092327, 0.006732310938257045]
Hip-Hop
[0.004409017027710538, 0.038805523669949146, 0.8903921015258296]
Rock
[0.11669870011492328, 0.5821526221912914, 0.20450706951153577]
Pop
[0.9518384619632565, 0.06400867514017809, 0.006721866026586713]
Hip-Hop
[0.09309094123328249, 0.6176367686113597, 0.23607213121452403]
Pop
[0.09326266786720518, 0.6183022638513951, 0.2356785232524625]
Pop
[0.10386063458453655, 0.6012086424837763, 0.22041759543066838]
Pop
[0.08360565399959452, 0.587636602794291, 0.2584555908596763]
Pop
[0.7473252471538561, 0.1766330123466279, 0.026184154459558365]
Hip-Hop
[0.95187699589741, 0.06397805831580307, 0.006717791061717151]
Hip-Hop
[0.09738600764309358, 0.6114915799671357, 0.22947909621681592]
Pop
[0.9063096874848148, 0.09512860987831084, 0.011242640694851683]
Hip-Hop
[0.35049168112452905, 0.3739325142456354, 0.08467842978499289]
Pop
[0.9518746306341697, 0.0639800571551