In [1]:
from matplotlib import pyplot as plt
import numpy as np
import math,random
from tqdm import tqdm
from random import shuffle
import pprint

import pandas as pd
from src.features.build_features import word_count, sentence_avg_word_length, normalize
from src.features.text_blob_analysis import analyze_sentiment, analyze_word_class
from src.data.make_dataset import create_dataset
from src.data.util import unzip_file

def step_function(x):
    """Heaviside step: return 1 when ``x`` is greater than or equal to 0, else 0."""
    if x >= 0:
        return 1
    return 0


def perceptron_output(weights, bias, x):
    """Return 1 if the perceptron 'fires' for input ``x``, 0 if it does not.

    The activation is the dot product of ``weights`` and ``x`` plus ``bias``;
    the result is thresholded by ``step_function``.
    """
    activation = np.dot(weights, x) + bias
    return step_function(activation)

def sigmoid(t):
    """Logistic function ``1 / (1 + e^-t)``, evaluated without overflow.

    Args:
        t: a real number (Python or NumPy scalar).

    Returns:
        A float in the closed interval [0, 1].
    """
    if t >= 0:
        # exp(-t) <= 1 here, so the textbook form is safe.
        return 1 / (1 + math.exp(-t))
    # For strongly negative t, math.exp(-t) exceeds the float range and raises
    # OverflowError (roughly t < -709). Use the algebraically equivalent form
    # e^t / (1 + e^t), whose exponential never exceeds 1.
    exp_t = math.exp(t)
    return exp_t / (1 + exp_t)

def neuron_output(weights, inputs):
    """Activation of one neuron: ``sigmoid`` of the dot product of ``weights`` and ``inputs``."""
    weighted_sum = np.dot(weights, inputs)
    return sigmoid(weighted_sum)

def predict(input, network):
    """Feed ``input`` through ``network`` and return the outputs of its last layer."""
    per_layer_outputs = feed_forward(network, input)
    final_layer_outputs = per_layer_outputs[-1]
    return final_layer_outputs

def feed_forward(neural_network, input_vector):
    """Forward-propagate ``input_vector`` through ``neural_network``.

    Args:
        neural_network: list of layers; each layer is a list of neurons and
            each neuron is a list of weights. A constant bias input of 1 is
            appended to every layer's input, so the last weight of each neuron
            acts as its bias weight.
        input_vector: iterable of numbers (list, tuple, NumPy array, ...).

    Returns:
        A list with one entry per layer; each entry is the list of that
        layer's neuron outputs. The last entry is the network's output.
    """
    outputs = []

    # Work on a plain list so that ``+ [1]`` below is always list
    # concatenation: a tuple would raise TypeError, and a NumPy array would
    # silently add 1 to every element instead of appending the bias input.
    layer_input = list(input_vector)

    for layer in neural_network:
        input_with_bias = layer_input + [1]              # add a bias input
        output = [neuron_output(neuron, input_with_bias) # compute the output
                  for neuron in layer]                   # for this layer
        outputs.append(output)                           # and remember it

        # the input to the next layer is the output of this one
        layer_input = output

    return outputs
    
def backpropagate(network, input_vector, targets, learning_rate=1.0):
    """Run one gradient-descent step on ``network`` for a single example.

    The weights inside ``network`` are modified in place; nothing is returned.

    Args:
        network: two-layer network ``[hidden_layer, output_layer]`` in the
            format consumed by ``feed_forward`` (the two-value unpacking below
            requires exactly two layers).
        input_vector: list of input values for this example.
        targets: desired output values, one per output neuron.
        learning_rate: factor applied to every weight adjustment. The default
            of 1.0 applies the raw gradient, so existing three-argument calls
            behave as before.
    """
    hidden_outputs, outputs = feed_forward(network, input_vector)

    # Take both layers from the ``network`` argument. Reading a module-level
    # ``output_layer`` instead would tie this function to whichever network
    # happened to be built in the notebook's global namespace.
    hidden_neurons = network[0]
    output_neurons = network[-1]

    # the output * (1 - output) is from the derivative of sigmoid
    output_deltas = [output * (1 - output) * (output - target)
                     for output, target in zip(outputs, targets)]

    # adjust weights for output layer, one neuron at a time
    for i, output_neuron in enumerate(output_neurons):
        # focus on the ith output layer neuron
        for j, hidden_output in enumerate(hidden_outputs + [1]):
            # adjust the jth weight based on both
            # this neuron's delta and its jth input
            output_neuron[j] -= learning_rate * output_deltas[i] * hidden_output

    # back-propagate errors to hidden layer
    # NOTE: this reads the output weights after the adjustment above, which is
    # the same ordering the original implementation used.
    hidden_deltas = [hidden_output * (1 - hidden_output)
                     * np.dot(output_deltas, [n[i] for n in output_neurons])
                     for i, hidden_output in enumerate(hidden_outputs)]

    # adjust weights for hidden layer, one neuron at a time
    for i, hidden_neuron in enumerate(hidden_neurons):
        for j, input_value in enumerate(input_vector + [1]):
            hidden_neuron[j] -= learning_rate * hidden_deltas[i] * input_value

In [2]:
# Importing data: create_dataset() returns two frames, bound here as the
# test frame and the training frame.
test_df, df = create_dataset()

print(len(df))

# Shuffle the training rows. The fixed random_state keeps the row order (and
# therefore the order of per-example weight updates during training) the same
# on every "Restart & Run All".
df = df.sample(frac=1, random_state=0).reset_index(drop=True)
print(df.tail())

Creating missing paths...
Skipping unzip...
Skipping data filtering...
750
       genre                                             lyrics
745     Rock  she's an angel working the back streets with t...
746  Hip-Hop  gerald what the fuck is wrong man cheer the fu...
747      Pop  i wish you well i hope you survive i hope you ...
748      Pop  that's no lie no dã©jã  vu oh you trying tryin...
749     Rock  here i am caught in the moment seems to be fro...


In [3]:
# --- targets: one 0/1 indicator vector per row, one slot per genre label ---
series = df['genre'].value_counts()
genre_labels = series.keys()  # distinct genre labels, in value_counts order
targets = [
    [1 if label == genre else 0 for label in genre_labels]
    for genre in df['genre']
]

# --- features: each helper returns the frame that is used from then on ---
df = sentence_avg_word_length(df, "avg_word_len", 'lyrics')
df = normalize(df, 'avg_word_len_nm', 'avg_word_len')
df = word_count(df, "word_count", 'lyrics')
df = normalize(df, 'word_count_nm', 'word_count')
df = analyze_sentiment(df)
df = analyze_word_class(df)

# Pull out the individual feature columns.
words = df["word_count_nm"]
avg_word_len = df['avg_word_len_nm']
polarity = df['polarity']
subjectivity = df['subjectivity']
nouns = df['nouns']
adverbs = df['adverbs']
verbs = df['verbs']

# Create feature list: one [word count, polarity, subjectivity, nouns,
# average word length] row per lyric.
feature_columns = (words, polarity, subjectivity, nouns, avg_word_len)
inputs = [list(row) for row in zip(*feature_columns)]
print(inputs[0:10])

Analyzing sentiment...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 750/750 [00:01<00:00, 694.51it/s]
Analyzing sentiment...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 750/750 [00:00<00:00, 376283.25it/s]
Analyzing sentiment...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 750/750 [00:00<00:00, 751308.34it/s]
Preparing Text class analysis...: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 750/750 [00:12<00:00, 62.39it/s]
Analyzing classes...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 750/750 [00:00<00:00, 51714.28it/s]
Analyzing classes...: 100%|███████████████████████████████████████████████████████████████████████████████████

[[0.27460629921259844, 0.24558823529411763, 0.5848039215686275, 0.41, 0.01500957673798135], [0.35236220472440943, 0.21553497942386834, 0.6005144032921811, 0.41, 0.008782588831888925], [0.14566929133858267, 0.18159171075837743, 0.5539902998236331, 0.4, 0.05509497861733378], [0.15846456692913385, 0.18600000000000003, 0.425, 0.27, 0.04960288596984633], [0.14468503937007873, 0.10288461538461541, 0.6230769230769231, 0.24, 0.05086261567666097], [0.14468503937007873, 0.15206611570247938, 0.309504132231405, 0.26, 0.05145866195412185], [0.34153543307086615, -0.0063063063063063095, 0.1362612612612612, 0.6, 0.012023623619058054], [0.18700787401574803, 0.25341880341880346, 0.5918803418803418, 0.24, 0.03303272297620463], [0.31200787401574803, -0.17053782505910162, 0.5594562647754134, 0.61, 0.012085448662303074], [0.2992125984251969, 0.0011724386724386708, 0.578697691197691, 0.3, 0.01279333308054365]]


In [4]:
########### Model training ###########

###########
# Neural network setup
###########
random.seed(0)  # to get repeatable results

# Layer sizes are derived from the data rather than hard-coded, so they cannot
# drift out of step with the feature list or the set of genres (a mismatch in
# the number of outputs would otherwise be silently truncated by zip() in
# backpropagate).
input_size = len(inputs[0])      # number of input nodes (one per feature)
num_hidden = 4                   # number of hidden nodes
output_size = len(genre_labels)  # number of output nodes (one per genre)

# each hidden neuron has one weight per input, plus a bias weight
hidden_layer = [[random.random() for __ in range(input_size + 1)] for __ in range(num_hidden)]

# each output neuron has one weight per hidden neuron, plus a bias weight
output_layer = [[random.random() for __ in range(num_hidden + 1)] for __ in range(output_size)]

# the network starts out with random weights
network = [hidden_layer, output_layer]

# Training iterations: every pass presents each training example once.
print(network)
for __ in tqdm(range(3000)):
    for input_vector, target_vector in zip(inputs, targets):
        backpropagate(network, input_vector, target_vector)
print(network)

[[[0.8444218515250481, 0.7579544029403025, 0.420571580830845, 0.25891675029296335, 0.5112747213686085, 0.4049341374504143], [0.7837985890347726, 0.30331272607892745, 0.4765969541523558, 0.5833820394550312, 0.9081128851953352, 0.5046868558173903], [0.28183784439970383, 0.7558042041572239, 0.6183689966753316, 0.25050634136244054, 0.9097462559682401, 0.9827854760376531], [0.8102172359965896, 0.9021659504395827, 0.3101475693193326, 0.7298317482601286, 0.8988382879679935, 0.6839839319154413]], [[0.47214271545271336, 0.1007012080683658, 0.4341718354537837, 0.6108869734438016, 0.9130110532378982], [0.9666063677707588, 0.47700977655271704, 0.8653099277716401, 0.2604923103919594, 0.8050278270130223], [0.5486993038355893, 0.014041700164018955, 0.7197046864039541, 0.39882354222426875, 0.824844977148233]]]


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3000/3000 [01:41<00:00, 29.43it/s]


[[[57.286505693316414, -12.774122316302577, 12.694009882754317, 11.559131848657026, -1.7253483057151033, -35.95209942739075], [63.12405865312613, 3.483886971017617, -2.3551765477171207, -5.542653050281869, 11.830975315221577, -7.246366809306477], [1.5515753268162322, 1.0728126223425583, 3.7708470240228382, 2.870095969409604, 0.8974522610755592, 7.435355569382166], [2.161855149772, 1.1807899996278868, 3.583357134145531, 3.1675621423936198, 0.8927136405122937, 7.260198648802185]], [[-4.227418534400823, 0.6551644744076558, -0.7608088237749622, -0.8758655653531142, 0.8492137004373576], [4.9327001993739135, 29.919064101108567, -10.126946495433812, -10.12317884196534, -12.009698437331336], [-4.570433778002583, -1.959823099325856, 0.7215040239868218, 0.07514034154518234, 1.1280019522940319]]]


In [5]:
# features
test_df = test_df.copy()
# Evaluate on at most 100 rows. min() avoids the ValueError that sample()
# raises when asked for more rows than exist, and the fixed random_state makes
# the evaluated subset the same on every run.
test_df = test_df.sample(min(100, len(test_df)), random_state=0)

# NOTE(review): normalize() is defined elsewhere. If it scales using statistics
# of the frame it is given, the test features are scaled differently from the
# training features -- confirm against its implementation.
test_df = sentence_avg_word_length(test_df,"avg_word_len", 'lyrics')
test_df = normalize(test_df, 'avg_word_len_nm', 'avg_word_len')

test_df = word_count(test_df,"word_count", 'lyrics')
test_df = normalize(test_df, 'word_count_nm', 'word_count')
test_df = analyze_sentiment(test_df)
test_df = analyze_word_class(test_df)

# These names are rebound from the training columns to the test columns.
avg_word_len = test_df['avg_word_len_nm']
words = test_df["word_count_nm"]
polarity = test_df['polarity']
subjectivity = test_df['subjectivity']
nouns = test_df['nouns']
adverbs = test_df['adverbs']
verbs = test_df['verbs']

# Create feature list, in the same column order as the training inputs:
# [word count, polarity, subjectivity, nouns, average word length]
test_features = [[f, p, s, n, wl] for f, p, s, n, wl in zip(words, polarity, subjectivity, nouns, avg_word_len)]

Analyzing sentiment...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 677.29it/s]
Analyzing sentiment...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 100126.62it/s]
Analyzing sentiment...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<?, ?it/s]
Preparing Text class analysis...: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 66.43it/s]
Analyzing classes...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 33407.44it/s]
Analyzing classes...: 100%|███████████████████████████████████████████████████████████████████████████████████

In [6]:
# Genre names listed for reference:
# 'Rock', 'Pop', 'Hip-Hop', 'Not Available', 'Metal', 'Country', 'Jazz', 'Electronic', 'Other', 'R&B', 'Indie', 'Folk'

# One hand-written feature vector with five values, matching the width of the
# training inputs.
sample_features = [
    0.0152699731248473,
    0.12666666666666668,
    0.4533333333333333,
    0.10,
    1,
]

print('##########################')
print(genre_labels)
res = predict(sample_features, network)
print(res)


##########################
Index(['Pop', 'Hip-Hop', 'Rock'], dtype='object')
[0.4649457776520458, 0.062111201115895576, 0.4973342294653449]


In [7]:
# Count the test rows whose true genre is the label at the position of the
# network's highest output. Every output vector is printed; the true genre is
# printed only for correct predictions.
count = 0
for feature_row, true_genre in zip(test_features, test_df['genre']):
    scores = predict(feature_row, network)
    print(scores)

    top_score = max(scores)
    predicted_genre = genre_labels[scores.index(top_score)]
    if predicted_genre != true_genre:
        continue

    count += 1
    print(true_genre)

print(count)
    

[0.46656593951398706, 0.08775896575743128, 0.4908152812984493]
[0.4668944755860076, 0.08782859660781835, 0.4911209403392755]
[0.39749362841461283, 0.11814651719882446, 0.4153741379319867]
Rock
[0.46687452790792083, 0.08625671883431085, 0.49152773930542615]
Rock
[0.012728824952831836, 0.9296861956920817, 0.009988592950533172]
[0.4664721360278689, 0.0841756715986282, 0.4917157299662053]
[0.012620727121119427, 0.9303418865068384, 0.009896668265772535]
Hip-Hop
[0.10922373784761137, 0.488449526212845, 0.10329476952355612]
Hip-Hop
[0.4663270775047924, 0.07868531640056096, 0.4931667580013182]
Rock
[0.11727152669237277, 0.4573448298111727, 0.11187363184670654]
[0.46691112223796927, 0.08715907842104091, 0.4913179951322046]
Rock
[0.012620735332050208, 0.9303423418675209, 0.009896666015071939]
Hip-Hop
[0.4663633813508276, 0.07908395087444368, 0.4930798969987304]
Rock
[0.42978523582990086, 0.10292614063240962, 0.45070394682413994]
Rock
[0.46542536939711815, 0.06750938694565568, 0.4958718560409935]