In [37]:
import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from pathlib import Path
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from sklearn.linear_model import LogisticRegression

In [16]:
filepath = Path("../song_lyrics.csv")
music = pd.read_csv(filepath)
music = music.sample(n=999999)

In [17]:
music.head()

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
4431139,Pulse Queen,rock,We Are PIGS,2020,70,{},[Verse 1]\nHey fool\nHey newcomer\nTalk big\nN...,6725402,en,en,en
2416258,Your Love Is Wonderful Second Pressing,pop,Hattie Littles,2005,41,{},[Chorus]\nYour love is so wonderful\nYour love...,3644419,en,en,en
3181656,Sauce Anthem,rap,Kay Young,2019,11,{},[INTRO]\nToo much sauce got me feeling my self...,4812776,en,en,en
144682,Hope N Pray,rap,Wyclef Jean,2013,308,"{""Young Chop""}",[Intro:]\nShe put a choppa to my ear and then ...,156320,en,en,en
1272283,Post,pop,Post Regiment,2015,15,{},"Coś, co już znasz i co zawsze pozostaje\nTo do...",1613126,pl,pl,pl


In [18]:
music = music[['title', 'tag', 'artist', 'year', 'views', 'lyrics', 'id', 'language']]
music 

Unnamed: 0,title,tag,artist,year,views,lyrics,id,language
4431139,Pulse Queen,rock,We Are PIGS,2020,70,[Verse 1]\nHey fool\nHey newcomer\nTalk big\nN...,6725402,en
2416258,Your Love Is Wonderful Second Pressing,pop,Hattie Littles,2005,41,[Chorus]\nYour love is so wonderful\nYour love...,3644419,en
3181656,Sauce Anthem,rap,Kay Young,2019,11,[INTRO]\nToo much sauce got me feeling my self...,4812776,en
144682,Hope N Pray,rap,Wyclef Jean,2013,308,[Intro:]\nShe put a choppa to my ear and then ...,156320,en
1272283,Post,pop,Post Regiment,2015,15,"Coś, co już znasz i co zawsze pozostaje\nTo do...",1613126,pl
...,...,...,...,...,...,...,...,...
499699,Where Is My Power?,pop,Antony and the Johnsons,2014,847,Where is my power?\nIs it in the air around me...,797908,en
1675981,This Light Between Us Orchestral Version,pop,Armin van Buuren,2011,84,Can you see this light between us?\nKeeps me b...,2234006,en
1210950,Jupiter Island,rock,Porcupine Tree,1992,1306,Jupiter Island is full of pleasures\nGlowing g...,1548294,en
462164,Yesterdays Heroes,pop,The 4-Skins,1982,813,From the cradle to the grave\nThe Oi Boy tryin...,758100,en


In [19]:
# how many different languages are there?
music['language'].nunique()

81

In [21]:
# how many songs are there of ecah languagae? we wanted to do this to see if it was safe to remove non-english languages. it looks like we're good.
music['language'].value_counts()

en    657163
es     53992
fr     36571
pt     32673
ru     32288
       ...  
gu         1
kn         1
sw         1
ps         1
mt         1
Name: language, Length: 81, dtype: int64

In [22]:
# drop any languages that aren't english
music = music[music['language'] == 'en']
music

Unnamed: 0,title,tag,artist,year,views,lyrics,id,language
4431139,Pulse Queen,rock,We Are PIGS,2020,70,[Verse 1]\nHey fool\nHey newcomer\nTalk big\nN...,6725402,en
2416258,Your Love Is Wonderful Second Pressing,pop,Hattie Littles,2005,41,[Chorus]\nYour love is so wonderful\nYour love...,3644419,en
3181656,Sauce Anthem,rap,Kay Young,2019,11,[INTRO]\nToo much sauce got me feeling my self...,4812776,en
144682,Hope N Pray,rap,Wyclef Jean,2013,308,[Intro:]\nShe put a choppa to my ear and then ...,156320,en
323243,Imagination Is A Powerful Deceiver,rock,Elvis Costello,1993,543,So you're trying to make connection\nYou heard...,457398,en
...,...,...,...,...,...,...,...,...
499699,Where Is My Power?,pop,Antony and the Johnsons,2014,847,Where is my power?\nIs it in the air around me...,797908,en
1675981,This Light Between Us Orchestral Version,pop,Armin van Buuren,2011,84,Can you see this light between us?\nKeeps me b...,2234006,en
1210950,Jupiter Island,rock,Porcupine Tree,1992,1306,Jupiter Island is full of pleasures\nGlowing g...,1548294,en
462164,Yesterdays Heroes,pop,The 4-Skins,1982,813,From the cradle to the grave\nThe Oi Boy tryin...,758100,en


In [26]:
# what different genres are we dealing with
music['tag'].value_counts()

pop        270995
rap        188201
rock       123138
rb          30194
misc        27573
country     17062
Name: tag, dtype: int64

In [31]:
# how many nulls?
music.isnull().sum()

title       24
tag          0
artist       0
year         0
views        0
lyrics       0
id           0
language     0
dtype: int64

In [33]:
#the dataset looks pretty clean, let's just drop those 24 null title songs
music = music.dropna()

In [35]:
# set the index to ID
music.set_index(['id'], inplace=True)

In [36]:
music

Unnamed: 0_level_0,title,tag,artist,year,views,lyrics,language
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
6725402,Pulse Queen,rock,We Are PIGS,2020,70,[Verse 1]\nHey fool\nHey newcomer\nTalk big\nN...,en
3644419,Your Love Is Wonderful Second Pressing,pop,Hattie Littles,2005,41,[Chorus]\nYour love is so wonderful\nYour love...,en
4812776,Sauce Anthem,rap,Kay Young,2019,11,[INTRO]\nToo much sauce got me feeling my self...,en
156320,Hope N Pray,rap,Wyclef Jean,2013,308,[Intro:]\nShe put a choppa to my ear and then ...,en
457398,Imagination Is A Powerful Deceiver,rock,Elvis Costello,1993,543,So you're trying to make connection\nYou heard...,en
...,...,...,...,...,...,...,...
797908,Where Is My Power?,pop,Antony and the Johnsons,2014,847,Where is my power?\nIs it in the air around me...,en
2234006,This Light Between Us Orchestral Version,pop,Armin van Buuren,2011,84,Can you see this light between us?\nKeeps me b...,en
1548294,Jupiter Island,rock,Porcupine Tree,1992,1306,Jupiter Island is full of pleasures\nGlowing g...,en
758100,Yesterdays Heroes,pop,The 4-Skins,1982,813,From the cradle to the grave\nThe Oi Boy tryin...,en


In [64]:
#import the model we plan on using
from transformers import pipeline

distilled_student_sentiment_classifier = pipeline(
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student", 
    return_all_scores=True, truncation=True
)

In [65]:
scores = distilled_student_sentiment_classifier(music.loc[797908, 'lyrics'])

In [66]:
scores

[[{'label': 'positive', 'score': 0.5367258191108704},
  {'label': 'neutral', 'score': 0.1370551586151123},
  {'label': 'negative', 'score': 0.32621899247169495}]]

In [67]:
music_sample = music.sample(n=10)

In [71]:

def sentiment(lyrics):
    scores = distilled_student_sentiment_classifier(lyrics)
    pos_score = scores[0][0]['score']
    neu_score = scores[0][1]['score']
    neg_score = scores[0][2]['score']
    return pos_score, neg_score, neu_score
music['Positive'],  music['Negative'], music['Neutral'] = zip(*music['lyrics'].apply(sentiment))

KeyboardInterrupt: 

In [None]:
music