In [2]:
import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from pathlib import Path
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from sklearn.linear_model import LogisticRegression

In [3]:
#! pip install pyarrow
# ! pip install fastparquet
filepath = Path("./song_lyrics.csv")
music = pd.read_csv(filepath)

In [5]:
song = music.loc[music['id'] == .iloc[0]
music.head()

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,en,en,en
1,Can I Live,rap,JAY-Z,1996,468624,{},"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3,en,en,en
2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin\nAnd these bastards fien...,4,en,en,en
3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\n\n[...,5,en,en,en
4,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6,en,en,en


In [4]:
music = music[['title', 'tag', 'artist', 'year', 'views', 'lyrics', 'id']]
music 

Unnamed: 0,title,tag,artist,year,views,lyrics,id
0,Killa Cam,rap,Cam'ron,2004,173166,"[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1
1,Can I Live,rap,JAY-Z,1996,468624,"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3
2,Forgive Me Father,rap,Fabolous,2003,4743,Maybe cause I'm eatin\nAnd these bastards fien...,4
3,Down and Out,rap,Cam'ron,2004,144404,[Produced by Kanye West and Brian Miller]\n\n[...,5
4,Fly In,rap,Lil Wayne,2005,78271,"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6
...,...,...,...,...,...,...,...
5134851,Ocean,pop,Effemar,2022,3,[Verse 1]\nDance for me now\nKeeping yourself ...,7882842
5134852,64 Bars,rap,Rapido,2022,4,"[Intro]\n\nJa, ja\n\n[Part 1]\n\nR-A-H, Merhab...",7882843
5134853,Raise Our Hands,pop,"Culture Code, Pag & Mylo",2016,3,[Verse 1]\nHere our purpose feels alive\nWe ar...,7882845
5134854,CEO,rap,Antropolita,2022,5,Jestem CEO w tym\nTo jara twoją bitch\nNikt na...,7882846


In [7]:
# what different genres are we dealing with
music['tag'].value_counts()

tag
pop        32
rap        28
rock       19
misc       12
country     5
rb          4
Name: count, dtype: int64

In [5]:
# how many nulls?
music.isnull().sum()

title     188
tag         0
artist      0
year        0
views       0
lyrics      0
id          0
dtype: int64

In [6]:
#the dataset looks pretty clean, let's just drop those 24 null title songs
music = music.dropna()

In [7]:
# set the index to ID
music.set_index(['id'], inplace=True)

In [11]:
music

Unnamed: 0_level_0,title,tag,artist,year,views,lyrics,language
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
7318468,Little Mix,rap,T.Scam,2021,3396,[Intro]\n(Mikey)\nSmokin' on 'em\nSuwoo\n(Gang...,en
993107,See That My Grave Is Kept Clean,pop,B.B. King,2008,903,"Well, there's one kind favor I'll ask of you\n...",en
5248486,Mimosa,rap,Avatar Frost,2020,23,"[Verse 1]\nHere comes the monster, the beast, ...",en
1596001,Mistakes Are Mine,pop,Should,2014,113,I stayed awake that night and swore You’d be i...,en
5341666,LUV HURTS,rap,Jusanity,2020,19,[Intro]\nLUV HURTS\nYeah\n\n[Hook]\nWhy you al...,en
...,...,...,...,...,...,...,...
2476248,Train to Eternity,rock,Tiger Army,2016,1625,[Verse 1]\nI left the station many years ago\n...,en
5977558,FICTIONAL,rap,Lil Joof,2020,85,"[Hook: Lil Joof]\nI like my girls fictional, Y...",en
670981,EssentialSkillz and 4see Ltd release UKATA Cer...,misc,Greg Micyk,2015,8,Global eLearning provider EssentialSkillz and ...,en
3249893,Rope Burn,misc,Damaged Bug,2014,345,I'll drive a spike into your vein\nYou feel yo...,en


In [8]:
import torch

torch.cuda.is_available()
device = 0 if torch.cuda.is_available() else -1

In [9]:
#import the model we plan on using
#! pip install transformers
from transformers import pipeline

distilled_student_sentiment_classifier = pipeline(
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student", 
    return_all_scores=True, truncation=True, device=device
)



In [29]:
music_sample = music

In [23]:

def sentiment(lyrics):
    scores = distilled_student_sentiment_classifier(lyrics)
    pos_score = scores[0][0]['score']
    neu_score = scores[0][1]['score']
    neg_score = scores[0][2]['score']
    return pos_score, neg_score, neu_score
music['Positive'],  music['Negative'], music['Neutral'] = zip(*music['lyrics'].apply(sentiment))



In [32]:
music

Unnamed: 0_level_0,title,tag,artist,year,views,lyrics,language,Positive,Negative,Neutral
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
7318468,Little Mix,rap,T.Scam,2021,3396,[Intro]\n(Mikey)\nSmokin' on 'em\nSuwoo\n(Gang...,en,0.164948,0.656215,0.178837
993107,See That My Grave Is Kept Clean,pop,B.B. King,2008,903,"Well, there's one kind favor I'll ask of you\n...",en,0.262436,0.624318,0.113246
5248486,Mimosa,rap,Avatar Frost,2020,23,"[Verse 1]\nHere comes the monster, the beast, ...",en,0.275367,0.576875,0.147758
1596001,Mistakes Are Mine,pop,Should,2014,113,I stayed awake that night and swore You’d be i...,en,0.081622,0.763085,0.155293
5341666,LUV HURTS,rap,Jusanity,2020,19,[Intro]\nLUV HURTS\nYeah\n\n[Hook]\nWhy you al...,en,0.328744,0.515121,0.156135
...,...,...,...,...,...,...,...,...,...,...
2476248,Train to Eternity,rock,Tiger Army,2016,1625,[Verse 1]\nI left the station many years ago\n...,en,0.584043,0.269613,0.146344
5977558,FICTIONAL,rap,Lil Joof,2020,85,"[Hook: Lil Joof]\nI like my girls fictional, Y...",en,0.101416,0.826224,0.072360
670981,EssentialSkillz and 4see Ltd release UKATA Cer...,misc,Greg Micyk,2015,8,Global eLearning provider EssentialSkillz and ...,en,0.863000,0.053423,0.083577
3249893,Rope Burn,misc,Damaged Bug,2014,345,I'll drive a spike into your vein\nYou feel yo...,en,0.345537,0.475109,0.179354


In [11]:
music = music[music['views'] > 1000]
len(music)

839869

In [12]:
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
from datasets import Dataset
import pandas as pd
from tqdm.auto import tqdm  # Import tqdm for the progress bar

# Assuming distilled_student_sentiment_classifier is already defined and using GPU (device=0)
pipe = distilled_student_sentiment_classifier

# Convert your DataFrame column to a dataset
dataset = Dataset.from_pandas(music)

lyrics_dataset = KeyDataset(dataset, 'lyrics')

# # # Process the data in batches
results = []
for out in tqdm(pipe(lyrics_dataset, batch_size=8, truncation=True), total=len(lyrics_dataset)//8):
    pos_score = out[0]['score']
    neu_score = out[1]['score']
    neg_score = out[2]['score']
    results.append((pos_score, neg_score, neu_score))
# 
# # Unpack results into the DataFrame
music['Positive'], music['Negative'], music['Neutral'] = zip(*results)
music.to_csv('output.csv', index=False)

  0%|          | 0/104983 [00:00<?, ?it/s]

In [14]:
most_positive = music.loc[music['Negative'].idxmax()]
most_positive

title                                    Shite Being Scottish
tag                                                      misc
artist                                            Danny Boyle
year                                                     1996
views                                                   18886
lyrics      [Rent-Boy]\nIt's shite being Scottish!\nWe're ...
Positive                                             0.006705
Negative                                              0.98341
Neutral                                              0.009885
Name: 230502, dtype: object