In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import KFold
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import matplotlib
from matplotlib import pyplot as plt

## Coletando músicas do CSV

In [2]:
df = pd.read_csv("/home/lmafra/Documentos/UnB/MT/Projeto_Final/lyrics-info/lyrics.csv",encoding='utf-8-sig')
df.head()

Unnamed: 0,music_title,lyrics,year,album
0,All I Need (All I Have),"[Intro]\nNiggas can't be serious, man\nYo, yo\...",,World Record Holders
1,Message In A Bottle Freestyle,"Uh, uh, uh\nYeah, yeah, yeah\nPut your hands i...",2007-01-01,
2,Intro (I’m Good),"[John Legend]\nOh oh oh oh, I'm good, so good\...",,I’m Good
3,Stronger,"[Produced by Kanye West, Mike Dean, and Timbal...",2007-07-31,Graduation
4,On Bonnaroo Music Festival,"Last time I got here, before I even arrived, t...",2014-06-13,Kanye West’s Visionary Streams of Consciousness


In [3]:
lyrics = df['lyrics']
lyrics = lyrics[pd.notnull(lyrics)]
print(lyrics.loc[0])

[Intro]
Niggas can't be serious, man
Yo, yo

[Verse 1: Kanye West]
Yo I'm the reason metal detectors go off the meter
'Cause I'll run through the airport with the heater
Gets the fag at his plane, and if I miss him
Put two in his brain at the baggage claim
Give him... heavy sluggage, tryna get his luggage
Send him to the crossroads, we way too thuggish
Shit, we at odds 'til we even up
'Til you leave on a stretcher or I leave in cuffs
Niggas wanna live, they done breathed enough
And I bet you next time they'll believe in us
It don't take much brains to know we bust things
My gat must bang, through your Mustang
Ma ma se, ma ma sa, ma ma ku sa
Niggas seen Kanye, red dotted them
Hurry up, stash the guns, the cops gon' come
And we gon' say we don't know who shot at 'em
Come on

[Hook]
All I have is my real niggas (What?)
Get this cash with my real niggas (Come on)
Give my last to my real niggas
Cock back, blast, for my real niggas
All I have is my real niggas
Get this cash with my real nigg

## Limpando Textos

### Removendo headers ([Chorus],[Intro], [Verse]...)

In [4]:
df['lyrics_final'] = lyrics.str.replace(r"([\[]().*?[\]])" , "")

lyrics_final = df['lyrics_final']

In [5]:
print(lyrics_final.loc[0])


Niggas can't be serious, man
Yo, yo


Yo I'm the reason metal detectors go off the meter
'Cause I'll run through the airport with the heater
Gets the fag at his plane, and if I miss him
Put two in his brain at the baggage claim
Give him... heavy sluggage, tryna get his luggage
Send him to the crossroads, we way too thuggish
Shit, we at odds 'til we even up
'Til you leave on a stretcher or I leave in cuffs
Niggas wanna live, they done breathed enough
And I bet you next time they'll believe in us
It don't take much brains to know we bust things
My gat must bang, through your Mustang
Ma ma se, ma ma sa, ma ma ku sa
Niggas seen Kanye, red dotted them
Hurry up, stash the guns, the cops gon' come
And we gon' say we don't know who shot at 'em
Come on


All I have is my real niggas (What?)
Get this cash with my real niggas (Come on)
Give my last to my real niggas
Cock back, blast, for my real niggas
All I have is my real niggas
Get this cash with my real niggas (Come on)
Give my last to my re

In [6]:
df.head()

Unnamed: 0,music_title,lyrics,year,album,lyrics_final
0,All I Need (All I Have),"[Intro]\nNiggas can't be serious, man\nYo, yo\...",,World Record Holders,"\nNiggas can't be serious, man\nYo, yo\n\n\nYo..."
1,Message In A Bottle Freestyle,"Uh, uh, uh\nYeah, yeah, yeah\nPut your hands i...",2007-01-01,,"Uh, uh, uh\nYeah, yeah, yeah\nPut your hands i..."
2,Intro (I’m Good),"[John Legend]\nOh oh oh oh, I'm good, so good\...",,I’m Good,"\nOh oh oh oh, I'm good, so good\nEven better ..."
3,Stronger,"[Produced by Kanye West, Mike Dean, and Timbal...",2007-07-31,Graduation,"\n\n\nWork it, make it, do it, makes us\nHarde..."
4,On Bonnaroo Music Festival,"Last time I got here, before I even arrived, t...",2014-06-13,Kanye West’s Visionary Streams of Consciousness,"Last time I got here, before I even arrived, t..."


### Eliminando Nulls

In [21]:
df.isnull().sum()

music_title       0
lyrics           12
year            215
album           233
lyrics_final     12
dtype: int64

In [38]:
print("Original:", df.shape)
lyrics_without_duplicates = df.drop_duplicates()
lyrics_wd = lyrics_without_duplicates.reset_index(drop=True)
print("Drop Duplicates:", lyrics_wd.shape)
lyrics_wd_dn = lyrics_wd.dropna(subset=['lyrics_final'])
df = lyrics_wd_dn.reset_index(drop=True)
print("Drop Nulls:", df.shape)

Original: (503, 5)
Drop Duplicates: (503, 5)
Drop Nulls: (491, 5)


In [39]:
lyrics_null.head()

Unnamed: 0,index,music_title,lyrics,year,album,lyrics_final
0,0,All I Need (All I Have),"[Intro]\nNiggas can't be serious, man\nYo, yo\...",,World Record Holders,"\nNiggas can't be serious, man\nYo, yo\n\n\nYo..."
1,1,Message In A Bottle Freestyle,"Uh, uh, uh\nYeah, yeah, yeah\nPut your hands i...",2007-01-01,,"Uh, uh, uh\nYeah, yeah, yeah\nPut your hands i..."
2,2,Intro (I’m Good),"[John Legend]\nOh oh oh oh, I'm good, so good\...",,I’m Good,"\nOh oh oh oh, I'm good, so good\nEven better ..."
3,3,Stronger,"[Produced by Kanye West, Mike Dean, and Timbal...",2007-07-31,Graduation,"\n\n\nWork it, make it, do it, makes us\nHarde..."
4,4,On Bonnaroo Music Festival,"Last time I got here, before I even arrived, t...",2014-06-13,Kanye West’s Visionary Streams of Consciousness,"Last time I got here, before I even arrived, t..."


### Tokenização

In [41]:
from nltk.tokenize import word_tokenize
tokens = word_tokenize(df.lyrics_final[0])
print(tokens)

['Niggas', 'ca', "n't", 'be', 'serious', ',', 'man', 'Yo', ',', 'yo', 'Yo', 'I', "'m", 'the', 'reason', 'metal', 'detectors', 'go', 'off', 'the', 'meter', "'Cause", 'I', "'ll", 'run', 'through', 'the', 'airport', 'with', 'the', 'heater', 'Gets', 'the', 'fag', 'at', 'his', 'plane', ',', 'and', 'if', 'I', 'miss', 'him', 'Put', 'two', 'in', 'his', 'brain', 'at', 'the', 'baggage', 'claim', 'Give', 'him', '...', 'heavy', 'sluggage', ',', 'tryna', 'get', 'his', 'luggage', 'Send', 'him', 'to', 'the', 'crossroads', ',', 'we', 'way', 'too', 'thuggish', 'Shit', ',', 'we', 'at', 'odds', "'til", 'we', 'even', 'up', "'Til", 'you', 'leave', 'on', 'a', 'stretcher', 'or', 'I', 'leave', 'in', 'cuffs', 'Niggas', 'wan', 'na', 'live', ',', 'they', 'done', 'breathed', 'enough', 'And', 'I', 'bet', 'you', 'next', 'time', 'they', "'ll", 'believe', 'in', 'us', 'It', 'do', "n't", 'take', 'much', 'brains', 'to', 'know', 'we', 'bust', 'things', 'My', 'gat', 'must', 'bang', ',', 'through', 'your', 'Mustang', 'Ma',

### Stopwords

In [43]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

print( [i for i in tokens if i not in stop_words])

['Niggas', 'ca', "n't", 'serious', ',', 'man', 'Yo', ',', 'yo', 'Yo', 'I', "'m", 'reason', 'metal', 'detectors', 'go', 'meter', "'Cause", 'I', "'ll", 'run', 'airport', 'heater', 'Gets', 'fag', 'plane', ',', 'I', 'miss', 'Put', 'two', 'brain', 'baggage', 'claim', 'Give', '...', 'heavy', 'sluggage', ',', 'tryna', 'get', 'luggage', 'Send', 'crossroads', ',', 'way', 'thuggish', 'Shit', ',', 'odds', "'til", 'even', "'Til", 'leave', 'stretcher', 'I', 'leave', 'cuffs', 'Niggas', 'wan', 'na', 'live', ',', 'done', 'breathed', 'enough', 'And', 'I', 'bet', 'next', 'time', "'ll", 'believe', 'us', 'It', "n't", 'take', 'much', 'brains', 'know', 'bust', 'things', 'My', 'gat', 'must', 'bang', ',', 'Mustang', 'Ma', 'se', ',', 'sa', ',', 'ku', 'sa', 'Niggas', 'seen', 'Kanye', ',', 'red', 'dotted', 'Hurry', ',', 'stash', 'guns', ',', 'cops', 'gon', "'", 'come', 'And', 'gon', "'", 'say', "n't", 'know', 'shot', "'em", 'Come', 'All', 'I', 'real', 'niggas', '(', 'What', '?', ')', 'Get', 'cash', 'real', 'nigg

## Normalização das músicas

### Conversão para caixa baixa

In [44]:
df.lyrics_final[0]

'\nNiggas can\'t be serious, man\nYo, yo\n\n\nYo I\'m the reason metal detectors go off the meter\n\'Cause I\'ll run through the airport with the heater\nGets the fag at his plane, and if I miss him\nPut two in his brain at the baggage claim\nGive him... heavy sluggage, tryna get his luggage\nSend him to the crossroads, we way too thuggish\nShit, we at odds \'til we even up\n\'Til you leave on a stretcher or I leave in cuffs\nNiggas wanna live, they done breathed enough\nAnd I bet you next time they\'ll believe in us\nIt don\'t take much brains to know we bust things\nMy gat must bang, through your Mustang\nMa ma se, ma ma sa, ma ma ku sa\nNiggas seen Kanye, red dotted them\nHurry up, stash the guns, the cops gon\' come\nAnd we gon\' say we don\'t know who shot at \'em\nCome on\n\n\nAll I have is my real niggas (What?)\nGet this cash with my real niggas (Come on)\nGive my last to my real niggas\nCock back, blast, for my real niggas\nAll I have is my real niggas\nGet this cash with my r

### Conversão de apostrofes

### Removendo caracteres especiais e numéricos

## Lemmatização

## Separando dados para treinamento e teste

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
y=lyrics_null
x=lyrics_null.drop('lyrics_final',axis=1) 

In [30]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)
x_train.head()

Unnamed: 0,index,music_title,lyrics,year,album
67,67,The Morning,"[Produced by !llmind, co-produced by Kanye Wes...",2012-09-14,Kanye West Presents Good Music Cruel Summer
146,149,“We’ll Find a Way” (Unreleased/Sunday Service),[Verse 1]\nThere's been a lot of change in my ...,,
348,355,My Way,[Hook]\nI've lived\nYes I've lived a life that...,,Get Well Soon...
166,169,MTV VMAs 2015: Vanguard Acceptance Speech,"Bro. Bro. Listen to the kids. Jeremy, I need, ...",2015-08-30,
340,347,On Achievement and Creativity,"Don’t let nobody tell you what you can do, wha...",2014-07-04,Kanye West’s Visionary Streams of Consciousness


In [31]:
x_train.shape

(392, 5)

In [32]:
y_train.shape

(392, 6)

In [33]:
x_test.head()

Unnamed: 0,index,music_title,lyrics,year,album
220,224,I Don’t Want to Be Liked (Freestyle),"I ride for this shit, and I'll die for this sh...",2016-04-10,
431,441,Power - SNL (Saturday Night Live) Version,[Kanye West - Alternative Second Verse]\nThe b...,2010-10-02,
9,9,Home,[Hook: John Legend]\nGo ahead roll it up and p...,,Get Well Soon...
285,292,Power (Remix),"[Intro: JAY-Z]\nIs this thing on?\nOh, I thoug...",2010-08-20,G.O.O.D. Fridays
365,372,On Michael Jordan,[Kanye West]\nWe should’ve never ever let Mich...,2013-12-18,Kanye West’s Visionary Streams of Consciousness


In [34]:
x_test.shape

(99, 5)

## Pré-processamento dos textos:

In [36]:
lyrics_final = y_train.str.lower()
stopWords = stopwords.words('english')
stopWords.extend(['-','(',')','{','}',';','...','!','--','.','?',',','/'"''"])

AttributeError: 'DataFrame' object has no attribute 'str'

In [14]:
def remove_punctuation(text):
    '''a function for removing punctuation'''
    import string
    # replacing the punctuations with no space, 
    # which in effect deletes the punctuation marks 
    translator = str.maketrans('', '', string.punctuation)
    # return the text stripped of punctuation marks
    return text.translate(translator)

lyrics_final = lyrics_final.apply(remove_punctuation)

In [15]:
def stopwords(text):
    '''a function for removing the stopword'''
    # removing the stop words and lowercasing the selected words
    text = [word.lower() for word in text.split() if word.lower() not in stopWords]
    # joining the list of words with space separato
    return " ".join(text)

lyrics_final = lyrics_final.apply(stopwords)

In [16]:
print(lyrics_final.loc[471])

yeezy season approachin fuck whatever yall hearin fuck fuck whatever yall wearin monster come alive soon pull park benz get bitch shaking like parkinsons take number lock indian hair moccasins many hoes house sin real nigga back house black timbs couch black dick spouse know like chocolate men got niggas cochran hah sight sight much give fuck let show right fore give much give fuck let show right fore give oh hell give us need may want baby girl tryna get nut girl tryna give chopped em dont judge em joe brown one last announcement sports bra lets keep bouncin everybody wanna live top mountain took bleau tried sip fountain david grutman kicked got back put dick mouth sight sight uhhuh uhhuh uhhuh uhhuh right need right right need need right right need need right


In [17]:
lyrics_final = lyrics_final[pd.notnull(lyrics_final)]
y_train = lyrics_final

In [18]:
y_train.head()

339    aint question want need feel slowly drifting a...
428    kanye verse friend showed pictures kids could ...
275    ayy ya heard good news yall sleeping huh good ...
266    drive slow homie drive slow homie never know h...
448    doesnt get much better nah unfair unfair advan...
Name: lyrics_final, dtype: object

## Caracterização dos textos

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [20]:
x_train =  x_train[pd.notnull(x_train)]
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=1)

ValueError: Found input variables with inconsistent numbers of samples: [402, 391]

In [24]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))


ValueError: could not convert string to float: 'On Sight'