In [1]:
from matplotlib import pyplot as plt
import numpy as np
import math,random
from tqdm import tqdm
from random import shuffle
import pprint

import pandas as pd
from src.features.build_features import word_count, sentence_avg_word_length, normalize
from src.features.text_blob_analysis import analyze_sentiment, analyze_word_class
from src.data.make_dataset import create_dataset
from src.data.util import unzip_file

def step_function(x):
    """Threshold activation: 1 when ``x`` is greater than or equal to 0, else 0."""
    if x >= 0:
        return 1
    return 0


def perceptron_output(weights, bias, x):
    """Return 1 if the perceptron 'fires' for input ``x``, 0 if it does not.

    The perceptron fires when the dot product of ``weights`` and ``x`` plus
    ``bias`` is non-negative.
    """
    weighted_sum = np.dot(weights, x)
    return step_function(weighted_sum + bias)

def sigmoid(t):
    """Logistic function ``1 / (1 + e**-t)``, evaluated without overflow.

    t -- a real number (any magnitude)

    Returns a float in the closed interval [0, 1].
    """
    if t >= 0:
        # exp(-t) is at most 1 here, so the textbook form is safe
        return 1 / (1 + math.exp(-t))
    # For negative t, exp(-t) can exceed the float range and math.exp would
    # raise OverflowError; the algebraically equal form exp(t) / (1 + exp(t))
    # only ever exponentiates a negative number, which underflows to 0.0.
    exp_t = math.exp(t)
    return exp_t / (1 + exp_t)

def neuron_output(weights, inputs):
    """Sigmoid activation of the dot product of ``weights`` and ``inputs``."""
    weighted_sum = np.dot(weights, inputs)
    return sigmoid(weighted_sum)

def predict(input, network):
    """Feed ``input`` through ``network`` and return the last layer's outputs."""
    layer_outputs = feed_forward(network, input)
    return layer_outputs[-1]

def feed_forward(neural_network, input_vector):
    """Forward-propagate ``input_vector`` through ``neural_network``.

    The network is a list of layers, each layer a list of neurons, and each
    neuron a list of weights. Returns a list holding the output list of every
    layer, in layer order.
    """
    layer_outputs = []
    current_input = input_vector

    for layer in neural_network:
        # every neuron sees the layer's input extended by a constant bias input
        biased_input = current_input + [1]
        activations = [neuron_output(neuron, biased_input) for neuron in layer]
        layer_outputs.append(activations)

        # this layer's output feeds the next layer
        current_input = activations

    return layer_outputs
    
def backpropagate(network, input_vector, targets):
    """Adjust the weights of a two-layer network in place for one example.

    network      -- [hidden_layer, output_layer]; each layer is a list of
                    neurons and each neuron a mutable list of weights whose
                    last entry is the bias weight
    input_vector -- list of input values (without the bias input)
    targets      -- desired output values, one per output neuron

    Returns None; the weight lists inside ``network`` are modified directly.
    """
    # feed_forward returns one output list per layer; unpacking into two
    # names means the network must have exactly two layers
    hidden_outputs, outputs = feed_forward(network, input_vector)
    hidden_neurons, output_neurons = network[0], network[-1]

    # the output * (1 - output) is from the derivative of sigmoid
    output_deltas = [output * (1 - output) * (output - target)
                     for output, target in zip(outputs, targets)]

    # adjust weights for output layer, one neuron at a time
    for i, output_neuron in enumerate(output_neurons):
        # the jth weight moves according to this neuron's delta and its jth
        # input (the hidden outputs plus the constant bias input)
        for j, hidden_output in enumerate(hidden_outputs + [1]):
            output_neuron[j] -= output_deltas[i] * hidden_output

    # back-propagate errors to hidden layer, reading the output layer from
    # the ``network`` argument rather than from a module-level variable.
    # As before, this uses the output weights that were just updated above.
    hidden_deltas = [hidden_output * (1 - hidden_output) *
                     np.dot(output_deltas, [n[i] for n in output_neurons])
                     for i, hidden_output in enumerate(hidden_outputs)]

    # adjust weights for hidden layer, one neuron at a time
    for i, hidden_neuron in enumerate(hidden_neurons):
        for j, input_value in enumerate(input_vector + [1]):
            hidden_neuron[j] -= hidden_deltas[i] * input_value

In [2]:
# Importing data: create_dataset() is unpacked as (test frame, training frame)
test_df, df = create_dataset()

print(len(df))

# Shuffle the training rows. The fixed random_state keeps the row order, and
# therefore the order in which examples are presented during training, the
# same on every run; random.seed() does not affect pandas sampling.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)
print(df.tail())

Creating missing paths...
Skipping unzip...
Skipping data filtering...
750
       genre                                             lyrics
745     Rock  my strange uncles from abroad yes i never met'...
746  Hip-Hop  heyah heyah heyah heyah hey i'm thinking about...
747  Hip-Hop  ladies and gentlemen we got de la up in the ho...
748  Hip-Hop  this one's for all the leaders leader lets all...
749      Pop  blue painted sea watercolors in my dreams a pu...


In [3]:
# Targets: one one-hot row per song, with one column per entry of genre_labels
series = df['genre'].value_counts()
genre_labels = series.keys()  # genre names, in value_counts() order
targets = [[int(label == genre) for label in genre_labels]
           for genre in df['genre']]

# Features: each helper returns the frame with its new column(s) added
df = sentence_avg_word_length(df, "avg_word_len", 'lyrics')
df = normalize(df, 'avg_word_len_nm', 'avg_word_len')
df = word_count(df, "word_count", 'lyrics')
df = normalize(df, 'word_count_nm', 'word_count')
df = analyze_sentiment(df)
df = analyze_word_class(df)

avg_word_len = df['avg_word_len_nm']
words = df["word_count_nm"]
polarity = df['polarity']
subjectivity = df['subjectivity']
nouns = df['nouns']
adverbs = df['adverbs']
verbs = df['verbs']

# One 7-value feature list per song, in this fixed column order
inputs = [list(row) for row in zip(words, polarity, subjectivity,
                                   nouns, adverbs, verbs, avg_word_len)]
print(inputs[:10])

Analyzing sentiment...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 750/750 [00:01<00:00, 672.26it/s]
Analyzing sentiment...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 750/750 [00:00<00:00, 752746.59it/s]
Analyzing sentiment...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 750/750 [00:00<00:00, 719023.54it/s]
Preparing Text class analysis...: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 750/750 [00:11<00:00, 65.47it/s]
Analyzing classes...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 750/750 [00:00<00:00, 46997.46it/s]
Analyzing classes...: 100%|███████████████████████████████████████████████████████████████████████████████████

[[0.2079002079002079, 0.07499999999999998, 0.31250000000000006, 0.63, 0.09, 0.09, 0.01888758920124565], [0.4178794178794179, -0.023524305555555555, 0.5927951388888889, 0.73, 0.42, 0.32, 0.004637961089191585], [0.2702702702702703, -0.00788690476190476, 0.5146825396825395, 0.67, 0.35, 0.14, 0.011110226424631933], [0.2182952182952183, -0.1270408163265306, 0.464030612244898, 0.52, 0.11, 0.18, 0.01489337311827869], [0.07588357588357589, -0.09583333333333333, 0.3354166666666667, 0.25, 0.13, 0.11, 0.07613477230145324], [0.19542619542619544, 0.08166666666666668, 0.5711111111111111, 0.3, 0.52, 0.17, 0.019701193180852266], [0.31496881496881496, 0.2128384687208217, 0.6376750700280113, 0.5, 0.09, 0.38, 0.007963665384534248], [0.6018711018711018, 0.006659512803580602, 0.540883104654291, 1.28, 0.48, 0.39, 0.0017418571090835314], [0.632016632016632, 0.05679563492063492, 0.6968253968253967, 1.41, 0.54, 0.56, 0.001552089144493869], [0.17255717255717257, 0.484589947089947, 0.8312334656084657, 0.41, 0.11

In [4]:
########### Training the model ###########

###########
# Neural network setup
###########
random.seed(0)   # to get repeatable results
input_size = 7   # number of input nodes (same as the number of features)
num_hidden = 5   # number of hidden nodes
output_size = 3  # number of output nodes (in our case, genres)

# each hidden neuron has one weight per input, plus a bias weight
hidden_layer = [
    [random.random() for _ in range(input_size + 1)]
    for _ in range(num_hidden)
]

# each output neuron has one weight per hidden neuron, plus a bias weight
output_layer = [
    [random.random() for _ in range(num_hidden + 1)]
    for _ in range(output_size)
]

# the network starts out with random weights
network = [hidden_layer, output_layer]

# Show the weights before and after training; every pass presents each
# training example to backpropagate once.
print(network)
for _epoch in tqdm(range(3000)):
    for input_vector, target_vector in zip(inputs, targets):
        backpropagate(network, input_vector, target_vector)
print(network)

[[[0.8444218515250481, 0.7579544029403025, 0.420571580830845, 0.25891675029296335, 0.5112747213686085, 0.4049341374504143, 0.7837985890347726, 0.30331272607892745], [0.4765969541523558, 0.5833820394550312, 0.9081128851953352, 0.5046868558173903, 0.28183784439970383, 0.7558042041572239, 0.6183689966753316, 0.25050634136244054], [0.9097462559682401, 0.9827854760376531, 0.8102172359965896, 0.9021659504395827, 0.3101475693193326, 0.7298317482601286, 0.8988382879679935, 0.6839839319154413], [0.47214271545271336, 0.1007012080683658, 0.4341718354537837, 0.6108869734438016, 0.9130110532378982, 0.9666063677707588, 0.47700977655271704, 0.8653099277716401]], [[0.2604923103919594, 0.8050278270130223, 0.5486993038355893, 0.014041700164018955, 0.7197046864039541], [0.39882354222426875, 0.824844977148233, 0.6681532012318508, 0.0011428193144282783, 0.49357786646532464], [0.8676027754927809, 0.24391087688713198, 0.32520436274739006, 0.8704712321086546, 0.19106709150239054]]]


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3000/3000 [01:47<00:00, 27.63it/s]


[[[5.819527295055575, 13.47195739914964, -8.188822924429266, 2.282260580460534, -1.9005190928476696, 56.954003819523074, 8.725476067147865, -7.7758977010204005], [34.849244904472386, -9.557263082789964, 15.070479644357885, 17.81977973344491, 9.522707597360547, 11.164769835371091, -3.5180353999144116, -37.07220984472131], [1.8174484737894494, 1.5455668051905904, 3.4917062595367256, 2.9981912227959406, 1.136720369942305, 1.537555416139526, 1.3246029700856543, 6.549482051024189], [1.4867933235235047, 0.5952650581860128, 3.1222006053525964, 2.7370807758618336, 1.9985487350227926, 2.1676136946917177, 0.2624101417984257, 6.8687145432113335]], [[2.4213116099043233, 4.905516833285936, 0.31311947439683835, -0.9535007223950023, -2.2087211236803905], [-2.0685677666289344, -9.56013215333961, -0.41410408463893367, -0.994844437792153, 1.3483688468865092], [0.9130941852792996, -4.032689357566527, -0.5905138616558943, 0.7327804302833754, -0.42990576414057435]]]


In [13]:
# Test features: draw a fixed sample of 100 test songs. The random_state
# makes the evaluation subset, and so the reported hit count, repeatable.
test_df = test_df.copy()
test_df = test_df.sample(100, random_state=0)

# Same feature pipeline as for the training frame.
# NOTE(review): normalize() is applied to the test frame on its own; if it
# scales by the frame's own min/max, test features are on a different scale
# than the training features -- confirm against normalize()'s implementation.
test_df = sentence_avg_word_length(test_df, "avg_word_len", 'lyrics')
test_df = normalize(test_df, 'avg_word_len_nm', 'avg_word_len')

test_df = word_count(test_df, "word_count", 'lyrics')
test_df = normalize(test_df, 'word_count_nm', 'word_count')
test_df = analyze_sentiment(test_df)
test_df = analyze_word_class(test_df)

avg_word_len = test_df['avg_word_len_nm']
words = test_df["word_count_nm"]
polarity = test_df['polarity']
subjectivity = test_df['subjectivity']
nouns = test_df['nouns']
adverbs = test_df['adverbs']
verbs = test_df['verbs']

# One 7-value feature list per test song, in the same column order as training
test_features = [list(row) for row in zip(words, polarity, subjectivity,
                                          nouns, adverbs, verbs, avg_word_len)]

Analyzing sentiment...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 642.58it/s]
Analyzing sentiment...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 100150.53it/s]
Analyzing sentiment...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 100246.27it/s]
Preparing Text class analysis...: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 56.95it/s]
Analyzing classes...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 25079.55it/s]
Analyzing classes...: 100%|███████████████████████████████████████████████████████████████████████████████████

In [14]:
# Genre names: 'Rock', 'Pop', 'Hip-Hop', 'Not Available', 'Metal', 'Country',
# 'Jazz', 'Electronic', 'Other', 'R&B', 'Indie', 'Folk'
# (presumably the full list in the raw data; the network's outputs follow
# genre_labels, printed below -- TODO confirm)

# Spot check: run one hand-written 7-value feature vector through the network
print('##########################')
print(genre_labels)
res = predict(
    [0.71148825065274152, 0.22561965811965812, 0.129914529914531,
     0.2506896551724138, 0.0513455968010067, 1, 1],
    network,
)
print(res)


##########################
Index(['Hip-Hop', 'Rock', 'Pop'], dtype='object')
[0.8984977102605137, 0.000736710633906047, 0.17962886371791117]


In [15]:
# Count the test songs whose highest-scoring output matches their true genre,
# printing the output vector and the genre for every hit.
count = 0
for features, true_genre in zip(test_features, test_df['genre']):
    test_res = predict(features, network)

    # position of the largest output (the first one, if there is a tie)
    maxnum = test_res.index(max(test_res))
    if genre_labels[maxnum] != true_genre:
        continue

    count += 1
    print(test_res)
    print(true_genre)

print(count)
    

[0.9887683688235016, 8.383140592861208e-06, 0.032070136192982455]
Hip-Hop
[0.057575119204955186, 0.4734168569009032, 0.43354696297176304]
Rock
[0.9887682853089413, 8.383276229282182e-06, 0.03207033485992916]
Hip-Hop
[0.7545102810303407, 0.005743451466870997, 0.34244281456066056]
Hip-Hop
[0.9886684602261593, 8.447537088030916e-06, 0.031965448645637246]
Hip-Hop
[0.703743109530164, 0.009463532802088077, 0.39186419449494675]
Hip-Hop
[0.9882326234917497, 8.72803955266045e-06, 0.031524359125590276]
Hip-Hop
[0.0644486101772123, 0.44797509315179945, 0.44469294046636576]
Rock
[0.39015063597799915, 0.10784223602879794, 0.6498366341930782]
Pop
[0.9887683703520171, 8.383143263889552e-06, 0.0320701354758099]
Hip-Hop
[0.3946498805657231, 0.10630168623088776, 0.6514564222275997]
Pop
[0.9887683700426303, 8.383143367056846e-06, 0.032070136746813034]
Hip-Hop
[0.9887497384030194, 8.410582297225196e-06, 0.03211284745121219]
Hip-Hop
[0.9887679241982827, 8.383829815995546e-06, 0.03207117689085404]
Hip-Hop
[