In [1]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from typing import List

In [2]:
from collections import Counter

class Indexer(object):
    def __init__(self):
        self.objs_to_ints = {}
        self.ints_to_objs = {}

    def __repr__(self):
        return str([str(self.get_object(i)) for i in range(0, len(self))])

    def __str__(self):
        return self.__repr__()

    def __len__(self):
        return len(self.objs_to_ints)

    def get_object(self, index):
        if (index not in self.ints_to_objs):
            return None
        else:
            return self.ints_to_objs[index]

    def contains(self, object):
        return self.index_of(object) != -1

    def index_of(self, object):
        if (object not in self.objs_to_ints):
            return -1
        else:
            return self.objs_to_ints[object]

    def add_and_get_index(self, object, add=True):
        if not add:
            return self.index_of(object)
        if (object not in self.objs_to_ints):
            new_idx = len(self.objs_to_ints)
            self.objs_to_ints[object] = new_idx
            self.ints_to_objs[new_idx] = object
        return self.objs_to_ints[object]

class UnigramFeatureExtractor():
    def __init__(self, indexer: Indexer):
        self._indexer = indexer

    def get_indexer(self) -> Indexer:
        return self._indexer

    def extract_features(self, sentence: List[str], add_to_indexer: bool = False) -> Counter:

        unigram_list: List[str] = []  # Will contain non-unique list of all unigrams parsed from the sentence

        for unigram in sentence:

            index = self._indexer.add_and_get_index(unigram.lower(), add_to_indexer)

            if (index != -1):  # If unigram didn't get added to the indexer (this occurs during the testing phase)
                unigram_list.append(unigram.lower())

        return Counter(unigram_list)
    
# Helper function to create feature vector from feature counter
def get_feature_vector(feature_counter: Counter, feature_extractor) -> np.ndarray:

    feature_vector = np.zeros(len(feature_extractor.get_indexer()) + 1)

    for feature in feature_counter:
        feature_idx = feature_extractor.get_indexer().index_of(feature)
        feature_vector[feature_idx] = feature_counter[feature]

    feature_vector[-1] = 1

    return feature_vector

In [3]:
indexer = Indexer()
uni_fv = UnigramFeatureExtractor(indexer)

In [4]:
# load lyric data
data = pd.read_csv('lyric_data.csv')
data

Unnamed: 0,track_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,lyrics
0,52xJxFP6TqMuO4Yt0eOkMz,0.577,0.450,0,-8.516,0,0.0834,0.357000,0.000000,0.1110,0.8300,205.863,216120,4,"We don't talk about Bruno, no, no, no\nWe don'..."
1,3XOalgusokruzA5ZBA2Qcb,0.773,0.422,1,-4.572,0,0.1870,0.007830,0.006930,0.1290,0.4880,77.502,136267,1,"Wheezy outta here\nPushin' P\nYeah, pushin' P,..."
2,02MWAaffLxlfxAUY7c5dvx,0.761,0.525,11,-6.900,1,0.0944,0.440000,0.000007,0.0921,0.5310,80.870,238805,4,"(Don't stop, baby, you can go on through\n(Don..."
3,5Z9KJZvQzH6PFmb8SNkxuk,0.741,0.691,10,-7.395,0,0.0672,0.022100,0.000000,0.0476,0.8920,150.087,212353,4,"Baby back, ayy\nCouple racks, ayy\nCouple Gram..."
4,1r9xUipOqoNwggBpENDsvJ,0.728,0.783,11,-4.424,0,0.2660,0.237000,0.000000,0.4340,0.5550,77.011,173381,4,Look out for yourself\n\nI wake up to the soun...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,4xkOaSrkexMciUUogZKVTS,0.548,0.847,1,-3.237,1,0.1860,0.062200,0.000000,0.0816,0.1000,171.447,297787,4,
98,42Xba467wgGvYrR2EE6s0i,0.597,0.615,2,-6.346,1,0.0421,0.030300,0.000000,0.3780,0.4670,160.014,227527,4,"Nay-nay-nay, nah, oh, oh\n\nSmoke cigarettes, ..."
99,3KyKxJ4P3pVCgaZwaq2rUC,0.728,0.741,6,-7.075,0,0.0473,0.000582,0.002060,0.3300,0.3100,123.006,212166,4,Let's burn it fucking down\nYeah\n\nBack from ...
100,161DnLWsx1i3u1JT05lzqU,0.498,0.590,1,-4.721,0,0.0320,0.511000,0.000000,0.1070,0.0784,145.867,217867,4,I know you're somewhere out there\nSomewhere f...


In [5]:
data.loc[data['track_id'] == '7rglLriMNBPAyuJOMGwi39']['lyrics']

8    Şarkı Sözleri\n\nDinle\n\nAna Sonuçlar\n\nIt's...
Name: lyrics, dtype: object

In [6]:
# preprocess data
data.dropna(subset = ['lyrics'], inplace=True)
data = data.astype({"lyrics": str}, errors='raise') 

# filter lyrics
pre_filter_lyrics = data['lyrics'].tolist()
lyrics = []

for song in pre_filter_lyrics:
    song = song.replace('\n', ' ')
    song = song.replace('.', ' ')
    song = song.replace(',', ' ')
    song = song.replace('(', ' ')
    song = song.replace(')', ' ')
    lyrics.append(song)

In [7]:
# get feature vectors
feature_matrix = np.zeros((len(lyrics), len(uni_fv.get_indexer()) + 1))
for i, song in enumerate(lyrics):
    words = song.split(' ')
    counter = uni_fv.extract_features(words, True)
    
    feature_vector = get_feature_vector(counter, uni_fv)
    feature_vector = np.where(feature_vector > 0, 1, 0)
    feature_matrix[i] = feature_vector

feature_matrix

ValueError: could not broadcast input array from shape (101) into shape (1)

In [None]:
# # bag of words
# CountVec = CountVectorizer(ngram_range=(1,1), # to use bigrams ngram_range=(2,2)
#                            stop_words='english')
# #transform
# Count_data = CountVec.fit_transform(lyrics)
 
# #create dataframe
# cv_dataframe=pd.DataFrame(Count_data.toarray(),columns=CountVec.get_feature_names())
# print(cv_dataframe)

In [None]:
# danceability model
X = feature_matrix
y = data[['danceability']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=44)

dance_reg = LinearRegression().fit(X_train, y_train)
diff = dance_reg.predict(X_test) - y_test
print(diff)
print(diff.sum(axis=0)**2)

In [None]:
# energy model
X = feature_matrix
y = data[['energy']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=44)

energy_reg = LinearRegression().fit(X_train, y_train)
diff = energy_reg.predict(X_test) - y_test
print(diff)
print(diff.sum(axis=0)**2)

In [None]:
# valence model
X = feature_matrix
y = data[['valence']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=44)

valence_reg = LinearRegression().fit(X_train, y_train)
diff = valence_reg.predict(X_test) - y_test
print(diff)
print(diff.sum(axis=0)**2)

In [None]:
sentence = "no way bro what is up bro"
sentence = sentence.replace('\n', ' ')
sentence = sentence.replace('.', ' ')
sentence = sentence.replace(',', ' ')
sentence = sentence.replace('(', ' ')
sentence = sentence.replace(')', ' ')

words = sentence.split(' ')
counter = uni_fv.extract_features(words, False)

feature_vector = get_feature_vector(counter, uni_fv)
feature_vector = np.where(feature_vector > 0, 1, 0)
feature_vector = np.reshape(feature_vector, (1, feature_vector.shape[0]))
print(dance_reg.predict(feature_vector), energy_reg.predict(feature_vector), valence_reg.predict(feature_vector))