In [1]:
from matplotlib import pyplot as plt
import numpy as np
import math,random
from tqdm import tqdm
from random import shuffle
import pprint

import pandas as pd
from src.features.build_features import word_count, sentence_avg_word_length, normalize
from src.features.text_blob_analysis import analyze_sentiment, analyze_word_class
from src.data.make_dataset import create_dataset
from src.data.util import unzip_file

def step_function(x):
    return 1 if x >= 0 else 0


def perceptron_output(weights, bias, x):
    '''Returns 1 if the perceptrion 'fires', 0 if not '''
    return step_function(np.dot(weights, x) + bias)

def sigmoid(t):
    return 1 / (1 + math.exp(-t))

def neuron_output(weights, inputs):
    return sigmoid(np.dot(weights, inputs))

def predict(input, network):
    return feed_forward(network, input)[-1]

def feed_forward(neural_network, input_vector):
    """takes in a neural network (represented as a list of lists of lists of weights)
    and returns the output from forward-propagating the input"""

    outputs = []

    for layer in neural_network:

        input_with_bias = input_vector + [1]             # add a bias input
        output = [neuron_output(neuron, input_with_bias) # compute the output
                  for neuron in layer]                   # for this layer
        outputs.append(output)                           # and remember it

        # the input to the next layer is the output of this one
        input_vector = output

    return outputs
    
def backpropagate(network, input_vector, targets):
    hidden_outputs, outputs = feed_forward(network, input_vector)
    
  
    # the output * (1 - output) is from the derivative of sigmoid
    output_deltas = [output * (1 - output) * (output - target) for output, target in zip(outputs, targets)]
        # adjust weights for output layer, one neuron at a time
    for i, output_neuron in enumerate(network[-1]):
    # focus on the ith output layer neuron
        for j, hidden_output in enumerate(hidden_outputs + [1]):
            # adjust the jth weight based on both
            # this neuron's delta and its jth input
            output_neuron[j] -= output_deltas[i] * hidden_output
    # back-propagate errors to hidden layer
    hidden_deltas = [hidden_output * (1 - hidden_output) * np.dot(output_deltas, [n[i] for n in output_layer])for i, hidden_output in enumerate(hidden_outputs)]
        
    # adjust weights for hidden layer, one neuron at a time
    for i, hidden_neuron in enumerate(network[0]):
        for j, input in enumerate(input_vector + [1]):
            hidden_neuron[j] -= hidden_deltas[i] * input

In [2]:
# Importing data
test_df, df = create_dataset()
#test_df, df = create_dataset()

print(len(df))

#print(len(testdata_df))
df = df.sample(frac=1).reset_index(drop=True)#
print(df.tail())

Creating missing paths...
Skipping unzip...
Skipping data filtering...
750
       genre                                             lyrics
745  Hip-Hop  chorus) (flo-rida) this for my ballas ayyyy(4)...
746      Pop  you're lying - baby why are you breaking my he...
747  Hip-Hop  10 bad bitches in a mansion wrist on milly roc...
748     Rock  if i was a hero a superhero i'd know what to d...
749     Rock  say you're gonna take me up and down your trac...


In [3]:
# targets
series = df['genre'].value_counts()
genre_labels = series.keys() # getting genre labels
targets = [[1 if i == j else 0 for i in genre_labels] for j in df['genre']]

# features
df = sentence_avg_word_length(df,"avg_word_len", 'lyrics')
df = normalize(df, 'avg_word_len_nm', 'avg_word_len')
df = word_count(df,"word_count", 'lyrics')
df = normalize(df, 'word_count_nm', 'word_count')
df = analyze_sentiment(df)
df = analyze_word_class(df)

avg_word_len = df['avg_word_len_nm']
words = df["word_count_nm"]
polarity = df['polarity']
subjectivity = df['subjectivity']
nouns = df['nouns']
adverbs = df['adverbs']
verbs = df['verbs']

# Create feature list
inputs = [[f, p, s, n, a, v, wl] for f, p, s, n, a, v, wl in zip(words, polarity, subjectivity, nouns, adverbs, verbs, avg_word_len)]
print(inputs[0:10])

#shuffle(inputs)
#inputs = inputs[0:500]

Analyzing sentiment...: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 750/750 [00:01<00:00, 700.95it/s]
Analyzing sentiment...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 750/750 [00:00<00:00, 744374.82it/s]
Analyzing sentiment...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 750/750 [00:00<00:00, 750770.41it/s]
Preparing Text class analysis...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 750/750 [00:13<00:00, 56.63it/s]
Analyzing classes...: 100%|█████████████████████████

[[0.3381201044386423, -0.03369483525733524, 0.5218915343915345, 0.19927536231884058, 0.06521739130434782, 0.09420289855072464, 0.1278980245834284], [0.7872062663185379, 0.22907657657657654, 0.38162162162162144, 0.25696594427244585, 0.06501547987616099, 0.05108359133126935, 0.02155606468909279], [0.2950391644908616, 0.010858585858585857, 0.25555555555555554, 0.167420814479638, 0.06334841628959276, 0.04524886877828054, 0.13530119302755017], [0.3289817232375979, 0.20859022556390977, 0.5764411027568922, 0.1625441696113074, 0.06713780918727916, 0.0706713780918728, 0.15239559231217706], [0.195822454308094, 0.10493827160493825, 0.7098765432098767, 0.2289156626506024, 0.060240963855421686, 0.12048192771084337, 0.3567591352145797], [0.2193211488250653, 0.2857142857142857, 0.375, 0.17391304347826086, 0.043478260869565216, 0.010869565217391304, 0.29144219937990945], [0.6005221932114883, 0.4466666666666667, 0.7569047619047617, 0.1392931392931393, 0.031185031185031187, 0.09355509355509356, 0.037677

In [4]:
########### Træning af model ###########

###########
# Opsætning af Neural Network
###########
random.seed(0) # to get repeatable results
input_size = 7 # antal af input noder (samme antal som feautures)
num_hidden = 4 # antal af hidden noder
output_size = 3 # antal af output noder (i vores tilfælde, genres)

# each hidden neuron has one weight per input, plus a bias weight
hidden_layer = [[random.random() for __ in range(input_size + 1)] for __ in range(num_hidden)]

# each output neuron has one weight per hidden neuron, plus a bias weight
output_layer = [[random.random() for __ in range(num_hidden + 1)] for __ in range(output_size)]

# the network starts out with random weights
network = [hidden_layer, output_layer]

# Iteration of training
#num = 0
print(network)
for __ in  tqdm(range(3000)):
    #num = num +1
    #if num == 200 or num == 1000 or num == 1500 or num == 2000 or num == 3500:
     #   print(network)
    for input_vector, target_vector in zip(inputs, targets):
        backpropagate(network, input_vector, target_vector)
print(network)

[[[0.8444218515250481, 0.7579544029403025, 0.420571580830845, 0.25891675029296335, 0.5112747213686085, 0.4049341374504143, 0.7837985890347726, 0.30331272607892745], [0.4765969541523558, 0.5833820394550312, 0.9081128851953352, 0.5046868558173903, 0.28183784439970383, 0.7558042041572239, 0.6183689966753316, 0.25050634136244054], [0.9097462559682401, 0.9827854760376531, 0.8102172359965896, 0.9021659504395827, 0.3101475693193326, 0.7298317482601286, 0.8988382879679935, 0.6839839319154413], [0.47214271545271336, 0.1007012080683658, 0.4341718354537837, 0.6108869734438016, 0.9130110532378982, 0.9666063677707588, 0.47700977655271704, 0.8653099277716401]], [[0.2604923103919594, 0.8050278270130223, 0.5486993038355893, 0.014041700164018955, 0.7197046864039541], [0.39882354222426875, 0.824844977148233, 0.6681532012318508, 0.0011428193144282783, 0.49357786646532464], [0.8676027754927809, 0.24391087688713198, 0.32520436274739006, 0.8704712321086546, 0.19106709150239054]]]


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3000/3000 [01:44<00:00, 28.69it/s]


[[[4.321692884082499, 0.48088974720196015, 3.0865386449069896, 2.113619561667293, 0.5895282193356174, 0.678931824416741, 1.7760446769311464, 6.854479160172247], [-15.779719765068341, 7.414034633773398, -6.622941556253754, -52.83118998589471, -2.0988931517753886, -28.165085422406587, 45.35116127839483, 19.17324715306261], [2.3625059102195447, 1.5915594081474354, 3.232227564663867, 1.863603794092083, 0.4860829727554986, 0.7518658596692722, 2.7395887283726967, 5.551944562019436], [3.8094687669109675, 0.14852104267588645, 2.685584695808479, 2.249465654425198, 1.0569617355745249, 1.3964431068863523, 0.3474753387741677, 7.376136560529309]], [[-1.6237818854742148, 3.382856447746776, -1.9633300220178764, -0.9603849697060493, -0.24522634274240404], [-0.9937410258385413, 4.223823544817371, 1.6475932404755835, -2.2040382398958704, -1.3485029134262285], [2.224679420500646, -5.678605195127497, -0.48243489643763676, 1.6228495314549711, -0.8786300046126648]]]


In [36]:
# features
test_df = test_df.copy()
test_df = test_df.sample(100)

test_df = sentence_avg_word_length(test_df,"avg_word_len", 'lyrics')
test_df = normalize(test_df, 'avg_word_len_nm', 'avg_word_len')

test_df = word_count(test_df,"word_count", 'lyrics')
test_df = normalize(test_df, 'word_count_nm', 'word_count')
test_df = analyze_sentiment(test_df)
test_df = analyze_word_class(test_df)

avg_word_len = test_df['avg_word_len_nm']
words = test_df["word_count_nm"]
polarity = test_df['polarity']
subjectivity = test_df['subjectivity']
nouns = test_df['nouns']
adverbs = test_df['adverbs']
verbs = test_df['verbs']

# Create feature list
test_features = [[f, p, s, n, a, v, wl] for f, p, s, n, a, v, wl in zip(words, polarity, subjectivity, nouns, adverbs, verbs, avg_word_len)]

Analyzing sentiment...: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 588.84it/s]
Analyzing sentiment...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<?, ?it/s]
Analyzing sentiment...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<?, ?it/s]
Preparing Text class analysis...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 51.27it/s]
Analyzing classes...: 100%|█████████████████████████

In [26]:
# 'Rock', 'Pop', 'Hip-Hop', 'Not Available', 'Metal', 'Country', 'Jazz', 'Electronic', 'Other', 'R&B', 'Indie', 'Folk'


print('##########################')
print(genre_labels)
res = predict([0.71148825065274152, 0.22561965811965812, 0.129914529914531, 0.2506896551724138, 0.0513455968010067], network)
print(res)


##########################
Index(['Pop', 'Hip-Hop', 'Rock'], dtype='object')
[0.025855813416636387, 0.9084535626079527, 0.024967467864379127]


In [37]:
count = 0
for test, t in zip(test_features, test_df['genre']):
    test_res = predict(test,network)
    
    maxnum = test_res.index(max(test_res))
    if genre_labels[maxnum] == t:
        count = count +1
        print(test_res)
        print(t)
        
print(count)
    

[0.012961425073751892, 0.9764636333711986, 0.04056135580320553]
Hip-Hop
[0.012962007373608959, 0.9764615647060653, 0.04056391448290603]
Hip-Hop
[0.1951133053613148, 0.1039346880564449, 0.7405371322215246]
Rock
[0.01639833843067727, 0.9625394670249662, 0.05615749737002833]
Hip-Hop
[0.013109331395704391, 0.9759277090775832, 0.04120616330953853]
Hip-Hop
[0.01296227205994509, 0.9764604180189842, 0.040565196462025055]
Hip-Hop
[0.19555590038430215, 0.10388896475355759, 0.739924290562173]
Rock
[0.3603777394627713, 0.09366796341187664, 0.5150659618474498]
Rock
[0.19412729033702533, 0.10402671479795148, 0.741904474798971]
Rock
[0.19413474504791695, 0.10402507194060963, 0.7418934725421044]
Rock
[0.013071535637744313, 0.9760799009489268, 0.04100745818542954]
Hip-Hop
[0.01308504687039503, 0.9760131248864603, 0.04110758968814852]
Hip-Hop
[0.19412814559509117, 0.10402379738577529, 0.7419029150522146]
Rock
[0.01297050264553924, 0.9764306033842381, 0.04060146779778115]
Hip-Hop
[0.19413768941151768, 0.