In [118]:
from matplotlib import pyplot as plt
import numpy as np
import math,random
from tqdm import tqdm
from random import shuffle
import pprint

import pandas as pd
from src.features.build_features import word_count, sentence_avg_word_length, normalize
from src.features.text_blob_analysis import analyze_sentiment
from src.data.make_dataset import create_dataset
from src.data.util import unzip_file

def step_function(x):
    return 1 if x >= 0 else 0


def perceptron_output(weights, bias, x):
    '''Returns 1 if the perceptrion 'fires', 0 if not '''
    return step_function(np.dot(weights, x) + bias)

def sigmoid(t):
    return 1 / (1 + math.exp(-t))

def neuron_output(weights, inputs):
    return sigmoid(np.dot(weights, inputs))

def predict(input, network):
    return feed_forward(network, input)[-1]

def feed_forward(neural_network, input_vector):
    """takes in a neural network (represented as a list of lists of lists of weights)
    and returns the output from forward-propagating the input"""

    outputs = []

    for layer in neural_network:

        input_with_bias = input_vector + [1]             # add a bias input
        output = [neuron_output(neuron, input_with_bias) # compute the output
                  for neuron in layer]                   # for this layer
        outputs.append(output)                           # and remember it

        # the input to the next layer is the output of this one
        input_vector = output

    return outputs
    
def backpropagate(network, input_vector, targets):
    hidden_outputs, outputs = feed_forward(network, input_vector)
    
  
    # the output * (1 - output) is from the derivative of sigmoid
    output_deltas = [output * (1 - output) * (output - target) for output, target in zip(outputs, targets)]
        # adjust weights for output layer, one neuron at a time
    for i, output_neuron in enumerate(network[-1]):
    # focus on the ith output layer neuron
        for j, hidden_output in enumerate(hidden_outputs + [1]):
            # adjust the jth weight based on both
            # this neuron's delta and its jth input
            output_neuron[j] -= output_deltas[i] * hidden_output
    # back-propagate errors to hidden layer
    hidden_deltas = [hidden_output * (1 - hidden_output) * np.dot(output_deltas, [n[i] for n in output_layer])for i, hidden_output in enumerate(hidden_outputs)]
        
    # adjust weights for hidden layer, one neuron at a time
    for i, hidden_neuron in enumerate(network[0]):
        for j, input in enumerate(input_vector + [1]):
            hidden_neuron[j] -= hidden_deltas[i] * input

In [154]:
# Importing data
df, test_df = create_dataset()
#test_df, df = create_dataset()

print(len(df))
#print(len(testdata_df))
df = df.sample(frac=1).reset_index(drop=True)
print(df.tail())

Creating missing paths...
Skipping unzip...
Skipping data filtering...
12000
         genre                                             lyrics
11995      Pop  hey somebody's giving me grief i'm having one ...
11996  Hip-Hop  i know i mess around cos' of all i did before ...
11997      Pop  you do no rest to join the best with your smar...
11998      Pop  my my lagan love my lagan love where lagan str...
11999  Hip-Hop  [intro] just another story in the game but eve...


In [155]:
# targets
series = df['genre'].value_counts()
genre_labels = series.keys() # getting genre labels
targets = [[1 if i == j else 0 for i in genre_labels] for j in df['genre']]

# features
df = sentence_avg_word_length(df,"avg_word_len", 'lyrics')
df = word_count(df,"word_count", 'lyrics')
df = normalize(df, 'word_count_nm', 'word_count')
df = analyze_sentiment(df)

polarity = df['polarity']
subjectivity = df['subjectivity']

# Create feature list
inputs = [[f, p, s] for f, p, s in zip(df["word_count_nm"], polarity, subjectivity)]
print(inputs[0:10])

#shuffle(inputs)
#inputs = inputs[0:500]

Analyzing sentiment...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12000/12000 [00:18<00:00, 644.34it/s]
Analyzing sentiment...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 12000/12000 [00:00<00:00, 857394.82it/s]
Analyzing sentiment...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 12000/12000 [00:00<00:00, 859385.80it/s]


[[0.01746884925482531, 0.03333333333333331, 0.53], [0.019301246029806988, -0.19473684210526318, 0.7298245614035087], [0.02907402882970926, 0.1887179487179487, 0.49461538461538457], [0.07341803078426581, -0.023272946859903394, 0.5464354727398205], [0.02589787441974102, -0.3384615384615385, 0.5942307692307692], [0.01673589054483264, 0.08392857142857144, 0.5002976190476192], [0.06511116540434889, -0.06238425925925926, 0.5479745370370371], [0.015147813339848522, -0.041666666666666664, 0.17500000000000004], [0.02455411678475446, 0.03888888888888888, 0.5062091503267974], [0.0769606645492304, -0.008293799047223714, 0.454799462333709]]


In [156]:
########### Træning af model ###########

###########
# Opsætning af Neural Network
###########
random.seed(0) # to get repeatable results
input_size = 3 # antal af input noder (samme antal som feautures)
num_hidden = 3 # antal af hidden noder
output_size = 3 # antal af output noder (i vores tilfælde, genres)

# each hidden neuron has one weight per input, plus a bias weight
hidden_layer = [[random.random() for __ in range(input_size + 1)] for __ in range(num_hidden)]

# each output neuron has one weight per hidden neuron, plus a bias weight
output_layer = [[random.random() for __ in range(num_hidden + 1)] for __ in range(output_size)]

# the network starts out with random weights
network = [hidden_layer, output_layer]

# Iteration of training
#num = 0
print(network)
for __ in  tqdm(range(1000)):
    #num = num +1
    #if num == 200 or num == 1000 or num == 1500 or num == 2000 or num == 3500:
     #   print(network)
    for input_vector, target_vector in zip(inputs, targets):
        backpropagate(network, input_vector, target_vector)
print(network)

[[[0.8444218515250481, 0.7579544029403025, 0.420571580830845, 0.25891675029296335], [0.5112747213686085, 0.4049341374504143, 0.7837985890347726, 0.30331272607892745], [0.4765969541523558, 0.5833820394550312, 0.9081128851953352, 0.5046868558173903]], [[0.28183784439970383, 0.7558042041572239, 0.6183689966753316, 0.25050634136244054], [0.9097462559682401, 0.9827854760376531, 0.8102172359965896, 0.9021659504395827], [0.3101475693193326, 0.7298317482601286, 0.8988382879679935, 0.6839839319154413]]]


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [06:59<00:00,  2.40it/s]


[[[-156.3244782888326, 0.5136304929310919, -0.7632816783231973, 6.78787527019377], [-3.8395423824885797, 33.11712430522236, 1.5847863103177058, -4.320203612051829], [1.077695044521928, 2.0407635238620743, 4.797040296367694, 8.70372003342421]], [[0.7213965542478487, 1.8049169296521521, 0.4750195975897849, -2.6491334428648527], [6.382905253213949, -1.0205907383693082, -4.045609893381644, -2.293956494509389], [-4.74813071784525, -2.260290622281765, 1.6955597024283608, 1.30070786060303]]]


In [161]:
# features
test_df = test_df.copy()
test_df = test_df.sample(100)
print(test_df.tail())
test_df = sentence_avg_word_length(test_df,"avg_word_len", 'lyrics')
test_df = word_count(test_df,"word_count", 'lyrics')
test_df = normalize(test_df, 'word_count_nm', 'word_count')
test_df = analyze_sentiment(test_df)

polarity = test_df['polarity']
subjectivity = test_df['subjectivity']

# Create feature list
test_features = [[f, p, s] for f, p, s in zip(test_df["word_count_nm"], polarity, subjectivity)]

         genre                                             lyrics  \
68513  Hip-Hop  yeah man jah will be waiting there we a shout ...   
17076     Rock  every morning i put it on i walk outside and i...   
69671  Hip-Hop  i'm gonna stay to myself where there's no one ...   
17540     Rock  i've broken every rule i swore to keep about y...   
17836     Rock  d davidson baby baby love for sale you can't b...   

       avg_word_len  word_count  word_count_nm  polarity  subjectivity  
68513      4.193878         686       0.675197 -0.030041      0.445575  
17076      3.952756         127       0.125000 -0.095833      0.335417  
69671      3.944805         308       0.303150  0.250318      0.443277  
17540      4.036530         219       0.215551  0.100740      0.516089  
17836      3.785714         112       0.110236  0.022593      0.610741  


Analyzing sentiment...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 705.59it/s]
Analyzing sentiment...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<?, ?it/s]
Analyzing sentiment...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 100366.21it/s]


In [158]:
# 'Rock', 'Pop', 'Hip-Hop', 'Not Available', 'Metal', 'Country', 'Jazz', 'Electronic', 'Other', 'R&B', 'Indie', 'Folk'


print('##########################')
print(genre_labels)
res = predict([0.0152699731248473, 0.12666666666666668, 0.4533333333333333], network)
print(res)


##########################
Index(['Pop', 'Rock', 'Hip-Hop'], dtype='object')
[0.41915941929678396, 0.3313358731175177, 0.043060621653171205]


In [163]:
count = 0
for test, t in zip(test_features, test_df['genre']):
    test_res = predict(test,network)
    print(test_res)
    maxnum = test_res.index(max(test_res))
    if genre_labels[maxnum] == t:
        count = count +1
    print(t)
        
print(count)
    

[0.10650227632997333, 0.0017157227156242001, 0.9496564188956718]
Rock
[0.1021005522292767, 0.0017620031316483993, 0.9524035043171104]
Pop
[0.1021293299606386, 0.0017616847463560927, 0.9523857812611275]
Pop
[0.1021416840130401, 0.0017615804383262353, 0.9523776666648818]
Rock
[0.10210808978596103, 0.0017619223185976081, 0.9523988218886777]
Hip-Hop
[0.3298992309266476, 0.0007701362752963722, 0.761520458953296]
Rock
[0.10209868412766368, 0.0017620786114126966, 0.9524038070611617]
Hip-Hop
[0.2575612956131854, 0.0009385199809766504, 0.8319352370321833]
Pop
[0.40871084813428915, 0.0006357394130124783, 0.6761410741358075]
Pop
[0.10394222905864789, 0.001742399716388469, 0.9512574978294298]
Pop
[0.11399177338797249, 0.0016433377308179731, 0.9448815846750993]
Hip-Hop
[0.4056992034918534, 0.0006402440041652518, 0.6795515215386836]
Pop
[0.10209875198762342, 0.0017620438738841302, 0.9524042875665292]
Rock
[0.1097243071139345, 0.0016837215678446415, 0.9476165466935772]
Rock
[0.10704785074788646, 0.00