In [1]:
from matplotlib import pyplot as plt
import numpy as np
import math,random
from tqdm import tqdm
from random import shuffle
import pprint

import pandas as pd
from src.features.build_features import word_count, sentence_avg_word_length, normalize
from src.features.text_blob_analysis import analyze_sentiment, analyze_word_class
from src.data.make_dataset import create_dataset
from src.data.util import unzip_file

def step_function(x):
    return 1 if x >= 0 else 0


def perceptron_output(weights, bias, x):
    '''Returns 1 if the perceptrion 'fires', 0 if not '''
    return step_function(np.dot(weights, x) + bias)

def sigmoid(t):
    return 1 / (1 + math.exp(-t))

def neuron_output(weights, inputs):
    return sigmoid(np.dot(weights, inputs))

def predict(input, network):
    return feed_forward(network, input)[-1]

def feed_forward(neural_network, input_vector):
    """takes in a neural network (represented as a list of lists of lists of weights)
    and returns the output from forward-propagating the input"""

    outputs = []

    for layer in neural_network:

        input_with_bias = input_vector + [1]             # add a bias input
        output = [neuron_output(neuron, input_with_bias) # compute the output
                  for neuron in layer]                   # for this layer
        outputs.append(output)                           # and remember it

        # the input to the next layer is the output of this one
        input_vector = output

    return outputs
    
def backpropagate(network, input_vector, targets):
    hidden_outputs, outputs = feed_forward(network, input_vector)
    
  
    # the output * (1 - output) is from the derivative of sigmoid
    output_deltas = [output * (1 - output) * (output - target) for output, target in zip(outputs, targets)]
        # adjust weights for output layer, one neuron at a time
    for i, output_neuron in enumerate(network[-1]):
    # focus on the ith output layer neuron
        for j, hidden_output in enumerate(hidden_outputs + [1]):
            # adjust the jth weight based on both
            # this neuron's delta and its jth input
            output_neuron[j] -= output_deltas[i] * hidden_output
    # back-propagate errors to hidden layer
    hidden_deltas = [hidden_output * (1 - hidden_output) * np.dot(output_deltas, [n[i] for n in output_layer])for i, hidden_output in enumerate(hidden_outputs)]
        
    # adjust weights for hidden layer, one neuron at a time
    for i, hidden_neuron in enumerate(network[0]):
        for j, input in enumerate(input_vector + [1]):
            hidden_neuron[j] -= hidden_deltas[i] * input

In [2]:
# Importing data
test_df, df = create_dataset()
#test_df, df = create_dataset()

print(len(df))
#print(len(testdata_df))
df = df.sample(frac=1).reset_index(drop=True)
print(df.tail())

Creating missing paths...
Skipping unzip...
Skipping data filtering...
750
       genre                                             lyrics
745     Rock  back to back worlds apart in this endless figh...
746     Rock  here i am caught in the moment seems to be fro...
747      Pop  kiss me deep in the devil's rain when i'm danc...
748  Hip-Hop  vocabulary spills remix consequence's verse yo...
749  Hip-Hop  huh i couldn't be nobody but myself you know t...


In [3]:
# targets
series = df['genre'].value_counts()
genre_labels = series.keys() # getting genre labels
targets = [[1 if i == j else 0 for i in genre_labels] for j in df['genre']]

# features
df = sentence_avg_word_length(df,"avg_word_len", 'lyrics')
df = word_count(df,"word_count", 'lyrics')
df = normalize(df, 'word_count_nm', 'word_count')
df = analyze_sentiment(df)
df = analyze_word_class(df)

polarity = df['polarity']
subjectivity = df['subjectivity']
nouns = df['nouns']
adverbs = df['adverbs']
verbs = df['verbs']

# Create feature list
inputs = [[f, p, s, n] for f, p, s, n in zip(df["word_count_nm"], polarity, subjectivity, nouns)]
print(inputs[0:10])

#shuffle(inputs)
#inputs = inputs[0:500]

Analyzing sentiment...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 750/750 [00:01<00:00, 743.33it/s]
Analyzing sentiment...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 750/750 [00:00<00:00, 375878.60it/s]
Analyzing sentiment...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 750/750 [00:00<00:00, 375654.17it/s]
Preparing Text class analysis...: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 750/750 [00:10<00:00, 70.21it/s]
Analyzing classes...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 750/750 [00:00<00:00, 53712.53it/s]
Analyzing classes...: 100%|███████████████████████████████████████████████████████████████████████████████████

[[0.3838582677165354, 0.20958994708994708, 0.39804894179894174, 0.52], [0.25688976377952755, 0.08583333333333332, 0.7641666666666667, 0.31], [0.3828740157480315, 0.2467171717171717, 0.6143939393939394, 0.96], [0.7933070866141733, 0.09164887966971305, 0.4731040564373897, 1.42], [0.3828740157480315, 0.3010714285714285, 0.5139285714285714, 0.55], [0.6909448818897638, -0.02892581274934217, 0.44462481962481976, 1.32], [0.10826771653543307, 0.009523809523809518, 0.4385361552028219, 0.19], [0.1968503937007874, -0.06499999999999999, 0.4149999999999999, 0.18], [0.2529527559055118, 0.03792249417249417, 0.3697698135198135, 0.56], [0.1279527559055118, -0.015625000000000007, 0.40625, 0.31]]


In [4]:
########### Træning af model ###########

###########
# Opsætning af Neural Network
###########
random.seed(0) # to get repeatable results
input_size = 4 # antal af input noder (samme antal som feautures)
num_hidden = 4 # antal af hidden noder
output_size = 3 # antal af output noder (i vores tilfælde, genres)

# each hidden neuron has one weight per input, plus a bias weight
hidden_layer = [[random.random() for __ in range(input_size + 1)] for __ in range(num_hidden)]

# each output neuron has one weight per hidden neuron, plus a bias weight
output_layer = [[random.random() for __ in range(num_hidden + 1)] for __ in range(output_size)]

# the network starts out with random weights
network = [hidden_layer, output_layer]

# Iteration of training
#num = 0
print(network)
for __ in  tqdm(range(3000)):
    #num = num +1
    #if num == 200 or num == 1000 or num == 1500 or num == 2000 or num == 3500:
     #   print(network)
    for input_vector, target_vector in zip(inputs, targets):
        backpropagate(network, input_vector, target_vector)
print(network)

[[[0.8444218515250481, 0.7579544029403025, 0.420571580830845, 0.25891675029296335, 0.5112747213686085], [0.4049341374504143, 0.7837985890347726, 0.30331272607892745, 0.4765969541523558, 0.5833820394550312], [0.9081128851953352, 0.5046868558173903, 0.28183784439970383, 0.7558042041572239, 0.6183689966753316], [0.25050634136244054, 0.9097462559682401, 0.9827854760376531, 0.8102172359965896, 0.9021659504395827]], [[0.3101475693193326, 0.7298317482601286, 0.8988382879679935, 0.6839839319154413, 0.47214271545271336], [0.1007012080683658, 0.4341718354537837, 0.6108869734438016, 0.9130110532378982, 0.9666063677707588], [0.47700977655271704, 0.8653099277716401, 0.2604923103919594, 0.8050278270130223, 0.5486993038355893]]]


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3000/3000 [01:30<00:00, 32.88it/s]


[[[2.5603100507885683, 1.1529064487573306, 3.4897169607266645, 3.2093810513129433, 7.657452006603896], [56.33939127800425, 2.488708266753539, -6.623449443347104, 9.364473788168844, -6.009975096194004], [45.67873940129307, -0.7659575923763904, 11.305505981930416, 6.557174267731038, -27.00770917950639], [1.450482394939248, 0.9430368589386668, 3.833261584236663, 3.1178722525099642, 7.863923239053654]], [[-14.848063151451253, 47.13353627144578, 5.026361411341153, -15.567933952265761, -17.785110087164316], [-0.9574856810900833, -2.4057102111377513, -3.3975668889105983, 0.5790131678886916, 1.6810849994840737], [-1.04321524116235, 1.2705829565263909, -4.073540140330175, -0.651479848006928, -0.4617129695327657]]]


In [5]:
# features
test_df = test_df.copy()
test_df = test_df.sample(100)
print(test_df.tail())
test_df = sentence_avg_word_length(test_df,"avg_word_len", 'lyrics')
test_df = word_count(test_df,"word_count", 'lyrics')
test_df = normalize(test_df, 'word_count_nm', 'word_count')
test_df = analyze_sentiment(test_df)

polarity = test_df['polarity']
subjectivity = test_df['subjectivity']
nouns = df['nouns']
adverbs = df['adverbs']
verbs = df['verbs']

# Create feature list
test_features = [[f, p, s, n] for f, p, s, n in zip(test_df["word_count_nm"], polarity, subjectivity, nouns)]


      genre                                             lyrics
4575   Rock  you have always lived like this the ice blue c...
5040   Rock  is their anyone out there earth calling is the...
32999   Pop  i've always been the kinda girl that hid my fa...
11862   Pop  i wanted to talk to you - pick up the phone i ...
12724   Pop  save your point of view for the manic charm th...


Analyzing sentiment...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 742.68it/s]
Analyzing sentiment...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<?, ?it/s]
Analyzing sentiment...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 100342.20it/s]


In [6]:
# 'Rock', 'Pop', 'Hip-Hop', 'Not Available', 'Metal', 'Country', 'Jazz', 'Electronic', 'Other', 'R&B', 'Indie', 'Folk'


print('##########################')
print(genre_labels)
res = predict([0.0152699731248473, 0.12666666666666668, 0.4533333333333333,0.10], network)
print(res)


##########################
Index(['Hip-Hop', 'Rock', 'Pop'], dtype='object')
[1.2240209566865162e-21, 0.7858725398881213, 0.10386113703750448]


In [7]:
count = 0
for test, t in zip(test_features, test_df['genre']):
    test_res = predict(test,network)
    print(test_res)
    maxnum = test_res.index(max(test_res))
    if genre_labels[maxnum] == t:
        count = count +1
        print(t)
        
print(count)
    

[0.975338374365715, 0.01324970870901831, 0.0087380774460874]
Hip-Hop
[0.9812736909388093, 0.01097972135162718, 0.006968527709036214]
Hip-Hop
[0.24739986130119335, 0.24955377831298206, 0.291667797013161]
[0.5975073350439479, 0.10989811722611213, 0.1119260483719831]
[0.25704184068003777, 0.2482716999009484, 0.29077562896102005]
Pop
[0.9812711499039566, 0.010979849745903098, 0.006968580513997374]
Hip-Hop
[0.3294239951051432, 0.20681205478822898, 0.23595880882514422]
Hip-Hop
[6.769376283711332e-12, 0.5387112832029507, 0.17503995900794012]
Rock
[0.2712830102275121, 0.23926621737998696, 0.27886525664183803]
[0.00011561554933262887, 0.33294921728540894, 0.24947574387307447]
[0.19449003400363646, 0.2520193642662699, 0.2892267962010165]
[0.25963149201702934, 0.24668573866076612, 0.2886940891949235]
Pop
[0.2517006274259506, 0.24933278290610023, 0.29179168080866835]
[2.4323908389817964e-05, 0.35086333711413054, 0.24169648905227897]
Rock
[0.3577093689370917, 0.19325166312498926, 0.2181317635661987