In [5]:
import os
import re
import argparse
import pickle

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import warnings
warnings.simplefilter("ignore")

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import svm

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score
from sklearn.metrics import classification_report

data_queen = pickle.load(open('pickle_files/data_queen.pkl', 'rb'))
data_pixies = pickle.load(open('pickle_files/data_pixies.pkl', 'rb'))

In [6]:
data_queen

Unnamed: 0,lyrics,artist
0,one two three four ooh ooh while the sun hangs...,Queen
1,oh oh you don t fool me you don t fool me you ...,Queen
2,woh everything i do i do for you ow oh yeah we...,"Queen, Brian May"
3,can anybody find me somebody to love ooh each ...,Queen
4,yesterday my life was in ruin now today i know...,Queen
...,...,...
95,take off i was told a million times of all the...,Queen
96,yesterday my life was in ruin now today i know...,Queen
97,take off i was told a million times of all the...,Queen
98,i want you woman tried to be a son and daughte...,Queen


In [7]:
data_to_concat = [data_queen, data_pixies]
data = pd.concat(data_to_concat)

In [8]:
data

Unnamed: 0,lyrics,artist
0,one two three four ooh ooh while the sun hangs...,Queen
1,oh oh you don t fool me you don t fool me you ...,Queen
2,woh everything i do i do for you ow oh yeah we...,"Queen, Brian May"
3,can anybody find me somebody to love ooh each ...,Queen
4,yesterday my life was in ruin now today i know...,Queen
...,...,...
95,as soon as i get my head around you i come aro...,Pixies
96,i smell smoke that comes from a gun named exti...,Pixies
97,i ve got something against you i ve got someth...,Pixies
98,hermanita ven conmigo hermanita ven conmigo ha...,Pixies


In [9]:
data['artist'] = data['artist'].str.replace('Queen, Brian May','1').str.replace('David Bowie, Queen', '1').str.replace(
                'Smile','1').str.replace('Freddie Mercury, Queen','1')

In [10]:
data['artist'] = data['artist'].str.replace('Queen','1').str.replace('Pixies', '0')
data[40:60]

Unnamed: 0,lyrics,artist
40,mmm num ba de dum bum ba be doo buh dum ba beh...,1
41,oh yes i m the great pretender ooh ooh pretend...,1
42,aah aah in the land where horses born with eag...,1
43,take off i was told a million times of all the...,1
44,yesterday my life was in ruin now today i know...,1
45,i have sinned dear father father i have sinned...,1
46,well she s gone dear gone this morning see wha...,1
47,take off i was told a million times of all the...,1
48,i have sinned dear father father i have sinned...,1
49,i want you woman tried to be a son and daughte...,1


In [11]:
pickle.dump(data, open('data.pkl', 'wb'))

In [12]:
data = pickle.load(open('data.pkl', 'rb'))

In [13]:
data

Unnamed: 0,lyrics,artist
0,one two three four ooh ooh while the sun hangs...,1
1,oh oh you don t fool me you don t fool me you ...,1
2,woh everything i do i do for you ow oh yeah we...,1
3,can anybody find me somebody to love ooh each ...,1
4,yesterday my life was in ruin now today i know...,1
...,...,...
95,as soon as i get my head around you i come aro...,0
96,i smell smoke that comes from a gun named exti...,0
97,i ve got something against you i ve got someth...,0
98,hermanita ven conmigo hermanita ven conmigo ha...,0


##### Choose input features and split the data.

In [14]:
X = data['lyrics']
y = data['artist']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=0)
X_train.shape

(150,)

#### Start the NLP with Vectorization.

In [16]:
cv = CountVectorizer()
vectorized_lyrics = cv.fit_transform(X_train)
vectorized_lyrics
vectorized_lyrics.todense()

matrix([[1, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [17]:
cv.get_feature_names()[:15]

['aaaaaaaarghh',
 'aaargh',
 'aah',
 'abandoned',
 'aberdeen',
 'able',
 'about',
 'above',
 'absolutely',
 'according',
 'ache',
 'aching',
 'action',
 'add',
 'address']

In [18]:
readable_lyrics = pd.DataFrame(vectorized_lyrics.todense(), columns=cv.get_feature_names(), index=y_train)
readable_lyrics.sum()

aaaaaaaarghh      4
aaargh            4
aah              16
abandoned         1
aberdeen          1
               ... 
your            266
yourself         99
yup               1
zen               1
zip               1
Length: 2320, dtype: int64

In [19]:
readable_lyrics.shape

(150, 2320)

#### Apply TDFIF to inspect the lyrics before the processing. A new Tfidf object will be instantiated after the preprocess (with the spacy library)of the data - lemmatization, removing stop words etc- for the models.

In [20]:
from spacy.lang.en.stop_words import STOP_WORDS as en_stop
stopwords_list = list(en_stop)

#the regex pattern used in the parameters keeps only the letters no numbers.
tf = TfidfVectorizer(
    token_pattern='(?u)\\b[a-zA-Z]+\\b',stop_words=stopwords_list, use_idf=True
)
vector_lyrics = tf.fit_transform(X_train)
readable_tf_lyrics = pd.DataFrame(vector_lyrics.todense().round(2), 
                                   columns=tf.get_feature_names())

readable_tf_lyrics.shape

(150, 2103)

<b> Text Normalization after Vectorization. </b>

Stemming and Lemmatization helps us to achieve the root forms (sometimes called synonyms in search context) of inflected (derived) words. Stemming is different to Lemmatization in the approach it uses to produce root forms of words and the word produced.

Vocabulary constructed by the unique words in a text, with TF-IDF and spacy form the vocabulary with the Top K Frequent Words and replace the rare words in training data with unknown tokens (UNK). Use Lemmatization instead of Semmatization

In general is recommended to bring the size down.

In [21]:
import spacy
model = spacy.load('en_core_web_md')

for string in X_train:
    results = model(string)
    
results[0], results[0].is_stop, results[0].is_punct, results[0].lemma_ ,results[0].like_num
print(string)

in heaven everything is fine in heaven everything is fine in heaven everything is fine you got your good thing and i ve got mine in heaven everything is fine in heaven everything is fine in heaven everything is fine you got your good thing and you ve got mine in heaven everything is fine in heaven in heaven everything is fine you ve got your good thing and you ve got mine in heaven everything is fine


In [22]:
import spacy
def clean_my_string(string):
    """use spacy to tokenize, lemmatize and remove stop words"""
    clean_string = []
    token_string = model(string)
    for token in token_string:
        if not token.is_stop and not token.is_punct:
            clean_string.append(token.lemma_)
    return ' '.join(clean_string)

In [23]:
strings_list = []
for string in X_train:
    cleaned_string = clean_my_string(string)
    strings_list.append(cleaned_string)
    
strings_list[:2]

['fa fa fa fa fa fa fa fa fa faa time old man tell fable piper go soup cold table black crow fly find new destination sign come tonight come ogre sight come ogre battle fight give great big cry swallow ocean mighty tongue catch fly palm hand incredible size great big eye focus direction battle yeah yeah yeah come tonight come ogre sight come ogre battle fight fa fa fa fa faa hoooa ogre man inside way mirror mountain get ta right sight t ooh look ogre man come way mirror mountain run come t east cause get ta south aaargh aaaaaaaarghh ogre man go home great big fight bugle blow let trumpet cry ogre battle live oh oh oh come come come ogre battle fa fa fa fa faa',
 've get ve get ve get ve get ve get ve get ve get oh yeah happy prick']

#### Lemmas.

In [24]:
import spacy
nlp = spacy.load('en_core_web_sm')

lemma_list = []
for string in strings_list:
    sth= nlp(string)
    for token in sth:
        lemma_list.append(token.lemma_) 
lemma_list[:15]

['fa',
 'fa',
 'fa',
 'fa',
 'fa',
 'fa',
 'fa',
 'fa',
 'fa',
 'faa',
 'time',
 'old',
 'man',
 'tell',
 'fable']

Visualize a dependency parse or named entities in a browser or a Jupyter notebook.


In [None]:
from spacy import displacy
visual_string=strings_list[1]
visual_doc = nlp(visual_string)
displacy.serve(visual_doc , style='dep')


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



 Try it first on the X_train data before so as to compare with the ones from the strings_list.

In [None]:
pattern = '|'.join([f'(?i){word}' for word in lemma_list])
X_train.str.contains(pattern)

In [None]:
pattern[:100]

##### Convert the strings_list to a pandas Series.

In [None]:
new_X = pd.Series(strings_list)

In [None]:
pattern = '|'.join([f'(?i){word}' for word in lemma_list])
pattern
new_X.str.contains(pattern)

#### Parts taken from the documentation.

In [None]:
from collections import Counter
for string in strings_list:
    complete_doc = nlp(string)

    words = [token.text for token in complete_doc
         if not token.is_stop and not token.is_punct]

word_freq = Counter(words)
common_words = word_freq.most_common(150)
print(common_words)

#### Find unique words. Brings results with the X data but not with the X_train.

In [None]:
unique_words = [word for (word, freq) in word_freq.items() if freq == 1]

<b> Vectorize the result (TDIDF). </b>

As tf_1 i use the data i manually worked on. As tf_2 the X_train data as given after the splitting.


In [None]:
new_X.shape
type(new_X)
new_X.shape

In [None]:
X_train.shape

#### Playing with the parameters. 

The 'min_df' and 'max_df' ignore words etc. I'm still figuring out how it works. The n-gram is a contiguous sequence of n items from a given sample of text or speech. The items can be phonemes, syllables, letters, words or base pairs according to the application. The n-grams typically are collected from a text or speech corpus.


In [None]:
from spacy.lang.en.stop_words import STOP_WORDS as en_stop
stopwords_list = list(en_stop)

tf_1 = TfidfVectorizer(token_pattern='(?u)\\b[a-zA-Z]+\\b', stop_words=stopwords_list
                       #stop_words = 'english'
                    #,vocabulary= common_words
                     ,smooth_idf=True,use_idf=True
                     #,max_df=2.0
                     ,min_df=18
                     ,ngram_range=(1, 3)
                                         )
vector_lyrics_1 = tf_1.fit_transform(new_X)
readable_tf_lyrics_1 = pd.DataFrame(vector_lyrics_1.todense().round(2), columns=tf_1.get_feature_names())

readable_tf_lyrics_1[:10]

#### Thanks to Daria for the use of the function transform() here!

In [None]:
vector_lyrics_test_1 = tf_1.transform(X_test).toarray()
vector_lyrics_test_1.shape

#### Do the same for the X_train data.

In [None]:
tf_2 = TfidfVectorizer(token_pattern='(?u)\\b[a-zA-Z]+\\b'
                       #, stop_words = 'english'
                    #,vocabulary= common_words
                     #,smooth_idf=True,use_idf=True
                     #,min_df=10
                     #,ngram_range=(1, 3)
                                         )
vector_lyrics_2 = tf_2.fit_transform(X_train)
readable_tf2_lyrics = pd.DataFrame(vector_lyrics_2.todense().round(2), columns=tf_2.get_feature_names())

readable_tf2_lyrics[:10]

In [None]:
vector_lyrics_test_2 = tf_2.transform(X_test).toarray()
vector_lyrics_test_2.shape

#### Save the engineered -with Tdifd- data, with pickle for later use.

In [None]:
vector_lyrics_1#new_X
vector_lyrics_test_1

vector_lyrics_2#X_train
vector_lyrics_test_2

In [None]:
pickle.dump(vector_lyrics_1, open('vector_lyrics_1.pkl', 'wb'))

In [None]:
pickle.dump(vector_lyrics_test_1, open('vector_lyrics_test_1.pkl', 'wb'))

In [None]:
pickle.dump(vector_lyrics_2, open('vector_lyrics_2.pkl', 'wb'))

In [None]:
pickle.dump(vector_lyrics_test_2, open('vector_lyrics_test_2.pkl', 'wb'))

In [None]:
pickle.dump(y_train, open('y_train.pkl', 'wb'))

In [None]:
pickle.dump(y_test, open('y_test.pkl', 'wb'))

In [None]:
pickle.dump(tf_1, open('tf_1.pkl', 'wb'))

In [None]:
pickle.dump(tf_2, open('tf_2.pkl', 'wb'))