# Pre-Processing

Description: This code takes in our Translated_Lyrics.csv and produces our Core tables we will use to analyze our data. Due to the vast amount of songs we took a sample of 500 songs per Genre

### Output

1. VOCAB.csv
2. TOKEN.csv
3. LIB.csv
3. TFIDF.csv

In [1]:
import os
import csv
import pandas as pd
import re
import nltk
import numpy as np
from sklearn.decomposition import PCA
from scipy.linalg import norm
from scipy.linalg import eigh as eig
import plotly_express as px
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
from gensim.models import word2vec
from sklearn.manifold import TSNE
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()
from matplotlib.pyplot import figure
figure(figsize=(8, 6), dpi=80)
import warnings
warnings.filterwarnings('ignore')

<Figure size 640x480 with 0 Axes>

In [2]:
All_Lyrics = "Translated_Lyrics.csv"
Artists_File = "artists-data.csv"
All_Lyrics_df = pd.read_csv (All_Lyrics)
artists_df = pd.read_csv(Artists_File)

### Build out LIB and DOC Tables

In [3]:
OHCO = ['Genre', 'Artist',  'SName','Language', 'sent_num', 'token_num']
All_Lyrics_df = All_Lyrics_df.drop(columns=['Unnamed: 0'])
All_Lyrics_df = All_Lyrics_df.rename(columns={'ALink': 'Link'})
All_Lyrics_df = All_Lyrics_df.rename(columns={'Idiom': 'Language'})
result = pd.merge(All_Lyrics_df, artists_df, how="left", on=["Link"])
Lang_agg = result.groupby("Language").size()
DOC = result[['Genre', 'Artist','Language','SName', 'Translated_Lyrics']]

DOC= DOC.drop_duplicates()


LIB = DOC[['Genre', 'Artist','SName']]

LIB.Genre = pd.Categorical(LIB.Genre)
LIB['Genre_Index'] = LIB.Genre.cat.codes


LIB.Artist = pd.Categorical(LIB.Artist)
LIB['Artist_Index'] = LIB.Artist.cat.codes


LIB.SName = pd.Categorical(LIB.SName)
LIB['SName_Index'] = LIB.SName.cat.codes


DOC = DOC.groupby("Genre").sample(n=500, random_state=113)
DOC = DOC.set_index(['Genre', 'Artist',  'SName','Language'])



### All Languages Present Before Reduction in Size

In [4]:
Lang_agg.nlargest()


Language
ENGLISH       129396
PORTUGUESE     94892
SPANISH         5211
ITALIAN          640
FRENCH           482
dtype: int64

### Even Distribution of Each Genre

In [5]:
DOC.groupby("Genre").size()


Genre
Funk Carioca    500
Hip Hop         500
Pop             500
Rock            500
Samba           500
Sertanejo       500
dtype: int64

In [6]:
LIB.head()

Unnamed: 0,Genre,Artist,SName,Genre_Index,Artist_Index,SName_Index
0,Rock,311,Summer Of Love - Traducao,3,13,103488
2,Rock,4SERES,Morada,3,18,69044
3,Rock,A Corte Animal,À Deriva,3,20,126890
4,Rock,A Corte Animal,À Flor da Pele Moderna,3,20,126896
5,Rock,A Corte Animal,"De Brasília, Com Amor",3,20,26208


### Build out TOKEN Table

In [7]:
def tokenize(doc_df, OHCO=OHCO, remove_pos_tuple=False, ws=False):
    

    df = doc_df.Translated_Lyrics\
        .apply(lambda x: pd.Series(nltk.sent_tokenize(x)))\
        .stack()\
        .to_frame()\
        .rename(columns={0:'sent_str'})
    
    def word_tokenize(x):
        if ws:
            s = pd.Series(nltk.pos_tag(nltk.WhitespaceTokenizer().tokenize(x)))
        else:
            s = pd.Series(nltk.pos_tag(nltk.word_tokenize(x))) # Discards stuff in between
        return s
            
    df = df.sent_str\
        .apply(word_tokenize)\
        .stack()\
        .to_frame()\
        .rename(columns={0:'pos_tuple'})
    
    # Grab info from tuple
    df['pos'] = df.pos_tuple.apply(lambda x: x[1])
    df['token_str'] = df.pos_tuple.apply(lambda x: x[0])
    if remove_pos_tuple:
        df = df.drop('pos_tuple', 1)
    
    # Add index
    df.index.names = OHCO
    
    return df

In [8]:
%%time
TOKEN = tokenize(DOC, ws=False)

Wall time: 2min 46s


### Build out VOCAB Table, Give POS, Zips Features, and Rank

In [9]:
TOKEN = TOKEN.dropna()
TOKEN['term_str'] = TOKEN['token_str'].str.lower().str.replace('[\W_]', '')
TOKEN = TOKEN[TOKEN['term_str'] != ""]
VOCAB = TOKEN.term_str.value_counts().to_frame()\
    .rename(columns={'index':'term_str', 'term_str':'n'})\
    .sort_index().reset_index().rename(columns={'index':'term_str'})
VOCAB.index.name = 'term_id'
VOCAB['num'] = VOCAB.term_str.str.match("\d+").astype('int')


TOKEN['term_id'] = TOKEN.term_str.map(VOCAB.reset_index().set_index('term_str').term_id)
VOCAB['pos_max'] = TOKEN.groupby(['term_id', 'pos']).count().iloc[:,0].unstack().idxmax(1)
VOCAB = VOCAB.sort_values('n', ascending=False).reset_index()
VOCAB.index.name = 'term_rank'
VOCAB = VOCAB.reset_index()
VOCAB = VOCAB.set_index('term_id')
VOCAB['term_rank'] = VOCAB['term_rank'] + 1
new_rank = VOCAB.n.value_counts()\
    .sort_index(ascending=False).reset_index().reset_index()\
    .rename(columns={'level_0':'term_rank2', 'index':'n', 'n':'nn'})\
    .set_index('n')
VOCAB['p'] = VOCAB.n / TOKEN.shape[0]
VOCAB['term_rank2'] = VOCAB.n.map(new_rank.term_rank2) + 1
VOCAB['zipf_k'] = VOCAB.n * VOCAB.term_rank
VOCAB['zipf_k2'] = VOCAB.n * VOCAB.term_rank2
VOCAB['zipf_k3'] = VOCAB.p * VOCAB.term_rank2
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()
VOCAB = VOCAB.dropna()

In [10]:
TOKEN.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,pos_tuple,pos,token_str,term_str,term_id
Genre,Artist,SName,Language,sent_num,token_num,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Funk Carioca,Marcos e Fernando,Papo de Jacaré,PORTUGUESE,0,0,"(Im, NNP)",NNP,Im,im,10645
Funk Carioca,Marcos e Fernando,Papo de Jacaré,PORTUGUESE,0,1,"(riding, VBG)",VBG,riding,riding,18060
Funk Carioca,Marcos e Fernando,Papo de Jacaré,PORTUGUESE,0,2,"(this, DT)",DT,this,this,21659
Funk Carioca,Marcos e Fernando,Papo de Jacaré,PORTUGUESE,0,3,"(girls, NNS)",NNS,girls,girls,8975
Funk Carioca,Marcos e Fernando,Papo de Jacaré,PORTUGUESE,0,4,"(wave, VBP)",VBP,wave,wave,23504


In [11]:
VOCAB.sample(10)

Unnamed: 0_level_0,term_rank,term_str,n,num,pos_max,p,term_rank2,zipf_k,zipf_k2,zipf_k3
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
13142,21375,manguin,1,0,NNP,1e-06,526,21375,526,0.000723
10245,7756,hoodie,4,0,NN,6e-06,523,31024,2092,0.002877
2723,2659,bouncing,18,0,VBG,2.5e-05,509,47862,9162,0.0126
7852,17669,felly,1,0,RB,1e-06,526,17669,526,0.000723
15718,4902,password,7,0,NN,1e-05,520,34314,3640,0.005006
9888,22501,heavyweight,1,0,JJ,1e-06,526,22501,526,0.000723
18226,635,rolling,116,0,VBG,0.00016,412,73660,47792,0.065723
14187,20899,mufuckas,1,0,NN,1e-06,526,20899,526,0.000723
6802,18138,e55,1,0,NN,1e-06,526,18138,526,0.000723
16538,21810,portela,1,0,NN,1e-06,526,21810,526,0.000723


In [12]:
sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1
VOCAB['stop'] = VOCAB.term_str.map(sw.dummy)
VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')

In [13]:
VOCAB[VOCAB.stop == 1].sample(10)

Unnamed: 0_level_0,term_rank,term_str,n,num,pos_max,p,term_rank2,zipf_k,zipf_k2,zipf_k3,stop
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
14327,9,my,9993,0,PRP$,0.013742,9,89937,89937,0.123681,1
15373,1898,ourselves,28,0,PRP,3.9e-05,499,53144,13972,0.019214,1
15372,1043,ours,60,0,NNS,8.3e-05,467,62580,28020,0.038533,1
4991,7458,couldn,4,0,NNP,6e-06,523,29832,2092,0.002877,1
9973,75,her,1455,0,PRP$,0.002001,75,109125,109125,0.150068,1
21599,10770,theirs,2,0,NNS,3e-06,525,21540,1050,0.001444,1
21572,177,than,542,0,IN,0.000745,168,95934,91056,0.12522,1
21604,125,then,834,0,RB,0.001147,123,104250,102582,0.14107,1
14982,527,o,156,0,NNP,0.000215,374,82212,58344,0.080234,1
937,117,an,917,0,DT,0.001261,115,107289,105455,0.145021,1


### Build Out Stemmers

In [14]:
from nltk.stem.porter import PorterStemmer
stemmer1 = PorterStemmer()
VOCAB['stem_porter'] = VOCAB.term_str.apply(stemmer1.stem)

from nltk.stem.snowball import SnowballStemmer
stemmer2 = SnowballStemmer("english")
VOCAB['stem_snowball'] = VOCAB.term_str.apply(stemmer2.stem)

from nltk.stem.lancaster import LancasterStemmer
stemmer3 = LancasterStemmer()
VOCAB['stem_lancaster'] = VOCAB.term_str.apply(stemmer3.stem)

In [15]:
VOCAB.sample(10)

Unnamed: 0_level_0,term_rank,term_str,n,num,pos_max,p,term_rank2,zipf_k,zipf_k2,zipf_k3,stop,stem_porter,stem_snowball,stem_lancaster
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
11645,13185,kaka,2,0,NNP,3e-06,525,26370,1050,0.001444,0,kaka,kaka,kak
18406,10675,runny,2,0,JJ,3e-06,525,21350,1050,0.001444,0,runni,runni,runny
3754,8144,cest,3,0,NNP,4e-06,524,24432,1572,0.002162,0,cest,cest,cest
14945,22078,nunmullodo,1,0,JJ,1e-06,526,22078,526,0.000723,0,nunmullodo,nunmullodo,nunmullodo
5902,4692,deserts,8,0,NNS,1.1e-05,519,37536,4152,0.00571,0,desert,desert,desert
15075,5447,officer,6,0,NN,8e-06,521,32682,3126,0.004299,0,offic,offic,off
15857,21547,peeyimp,1,0,NN,1e-06,526,21547,526,0.000723,0,peeyimp,peeyimp,peeyimp
12490,10208,lightens,2,0,NNS,3e-06,525,20416,1050,0.001444,0,lighten,lighten,light
19653,12732,skeet,2,0,NN,3e-06,525,25464,1050,0.001444,0,skeet,skeet,skeet
23814,16063,windmills,1,0,NNS,1e-06,526,16063,526,0.000723,0,windmil,windmil,windmil


### TFIDF Function

In [16]:
def TFIDF_FUNCTION(TOKEN, bag, count_type, tf_method, idf_method):
    TOKEN = TOKEN[~TOKEN.term_str.isna()]
    #TOKEN['term_id'] = TOKEN.term_str.map(VOCAB.reset_index().set_index('term_str').term_id)
    #TOKEN.reset_index().set_index('term_str').term_id
    BOW = TOKEN.groupby(bag+['term_id']).term_id.count()\
    .to_frame().rename(columns={'term_id':'n'})
    BOW['c'] = BOW.n.astype('bool').astype('int')
    DTCM = BOW[count_type].unstack().fillna(0).astype('int')
    
    if tf_method == 'sum':
        TF = DTCM.T / DTCM.T.sum()

    elif tf_method == 'max':
        TF = DTCM.T / DTCM.T.max()

    elif tf_method == 'log':
        TF = np.log10(1 + DTCM.T)

    elif tf_method == 'raw':
        TF = DTCM.T

    elif tf_method == 'double_norm':
        TF = DTCM.T / DTCM.T.max()
        TF = tf_norm_k + (1 - tf_norm_k) * TF[TF > 0] # EXPLAIN; may defeat purpose of norming

    elif tf_method == 'binary':
        TF = DTCM.T.astype('bool').astype('int')
    TF = TF.T
    N = DTCM.shape[0]
    DF = DTCM[DTCM > 0].count()
    
    if idf_method == 'standard':
        IDF = np.log10(N / DF)

    elif idf_method == 'max':
        IDF = np.log10(DF.max() / DF) 

    elif idf_method == 'smooth':
        IDF = np.log10((1 + N) / (1 + DF)) + 1 # Correct?
    
    TFIDF = TF * IDF
    
    return TFIDF

In [17]:
LANG = OHCO[:4]
SONG = OHCO[:3]
ARTIST = OHCO[:2]
GENRE = OHCO[:1]


### We Will Make A TFIDF Table Bagged By Language

This is our lowest level bagging

In [18]:
count_type = 'n'
tf_method = 'sum' 
tf_norm_k = .5 
idf_method = 'standard' 
TFIDF = TFIDF_FUNCTION(TOKEN, LANG, count_type, tf_method, idf_method )
TFIDF.head(5)

#Add Results to VOCAB TAble
VOCAB['tfidf_mean'] = TFIDF[TFIDF > 0].mean().fillna(0) 
VOCAB['tfidf_sum'] = TFIDF.sum()
VOCAB['tfidf_median'] = TFIDF[TFIDF > 0].median().fillna(0) # EXPLAIN
VOCAB['tfidf_max'] = TFIDF.max()


In [19]:
VOCAB.to_csv('VOCAB.csv')
LIB.to_csv('LIB.csv')
TOKEN.to_csv('TOKEN.csv')
TFIDF.to_csv('TFIDF.csv')