In [287]:
import pandas as pd
import spacy
from transformers import pipeline
import torch
import json
from datasets import Dataset
from concurrent.futures import ThreadPoolExecutor
import torch.multiprocessing as mp

In [281]:
mp.set_start_method('spawn', force=True)

print(torch.cuda.is_available())
print(torch.cuda.current_device())

True
0


In [3]:
nlp = spacy.load("en_core_web_sm")

In [158]:
with open('../datasets/rank_1/lyrics.json', 'r') as file:
    lyrics_json = json.load(file)
lyrics = [lyrics_json[i]['lyrics'] for i in lyrics_json]
track_ids = [lyrics_json[i]['track_id'] for i in lyrics_json]

In [159]:
def tokenize(text):
    doc = nlp(text)
    tokens = [token.text.lower() for token in doc if not token.is_stop and not token.is_punct]
    return ' '.join(tokens)

In [162]:
tokenized_lyrics = [tokenize(lyric) for lyric in lyrics]
dataset = Dataset.from_dict({'lyrics': tokenized_lyrics})

In [17]:
with open('../datasets/topic_labels.json', 'r') as file:
    labels_json = json.load(file)
label_pairs = labels_json['labels']
label_pairs

[['OPTIMISM', 'PESSIMISM'],
 ['LOVE', 'HATE'],
 ['JOY', 'SORROW'],
 ['HOPE', 'DESPAIR'],
 ['STRENGTH', 'WEAKNESS'],
 ['BRAVERY', 'FEAR'],
 ['SUCCESS', 'FAILURE'],
 ['GRATITUDE', 'ENTITLEMENT'],
 ['FORGIVENESS', 'RESENTMENT'],
 ['PEACE', 'CHAOS']]

In [34]:
classifier = pipeline('zero-shot-classification', model='roberta-large-mnli', device=0)

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [84]:
def classify_pair(lyric, label_pair):
    if not lyric.strip():
        return {label_pair[0]: 0.0, label_pair[1]: 0.0}
    result = classifier(lyric, label_pair)
    return {label_pair[0]: result['scores'][0], label_pair[1]: result['scores'][1]}

In [237]:
%%time
results = []
for label_pair in label_pairs:
    result = dataset.map(lambda x: {'results': classify_pair(x['lyrics'], label_pair)}, batched=False)
    results.append(result)

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

CPU times: user 22.2 s, sys: 9.68 s, total: 31.9 s
Wall time: 32.1 s


## Too slow, must find another way...

In [282]:
def classify_parallel(lyrics, label_pairs, num_workers=4):
    
    all_results = []
    
    for lyric in lyrics:
        
        with ThreadPoolExecutor(max_workers=num_workers) as executor:
            
            futures = [executor.submit(partial(classify_pair, lyric, label_pair)) for label_pair in label_pairs]
            results = [future.result() for future in futures]
            
        all_results.append(results)
        
    return all_results

In [283]:
%%time
results = classify_parallel(tokenized_lyrics, label_pairs)

CPU times: user 16.4 s, sys: 1.2 s, total: 17.6 s
Wall time: 11.9 s


# 2x Faster!

In [285]:
flattened_data = []
for i, result in enumerate(results):
    track_data = {'track_id': track_ids[i]}
    for category in result:
        track_data.update(category)
    flattened_data.append(track_data)

df = pd.DataFrame(flattened_data)
df.head()

Unnamed: 0,track_id,OPTIMISM,PESSIMISM,LOVE,HATE,JOY,SORROW,HOPE,DESPAIR,STRENGTH,...,BRAVERY,FEAR,SUCCESS,FAILURE,GRATITUDE,ENTITLEMENT,FORGIVENESS,RESENTMENT,PEACE,CHAOS
0,5ayybTSXNwcarDtxQKqvWX,0.509144,0.490856,0.801051,0.198949,0.845799,0.154201,0.71491,0.28509,0.64375,...,0.637215,0.362785,0.71794,0.28206,0.622349,0.377651,0.510038,0.489962,0.682343,0.317656
1,006Ndmw2hHxvnLbJsBFnPx,0.539714,0.460286,0.590999,0.409001,0.543967,0.456033,0.625337,0.374663,0.546861,...,0.716674,0.283326,0.578891,0.421109,0.543178,0.456822,0.589031,0.410969,0.718144,0.281856
2,6xupOaBWORbDmakCdQwMRG,0.582897,0.417102,0.954446,0.045554,0.652956,0.347044,0.679851,0.320149,0.665997,...,0.608168,0.391832,0.700274,0.299726,0.556012,0.443988,0.580708,0.419292,0.543781,0.456219
3,2tvt5K7y1gndmCgtIoLo1f,0.702907,0.297093,0.912151,0.087849,0.729678,0.270322,0.816118,0.183882,0.593635,...,0.958914,0.041086,0.966054,0.033946,0.622359,0.377641,0.763382,0.236618,0.662791,0.337209
4,1xVOttVNT27FBTD8iHjOfU,0.741109,0.25889,0.919729,0.080271,0.62669,0.37331,0.873529,0.126471,0.569743,...,0.675482,0.324518,0.848694,0.151306,0.528214,0.471786,0.561546,0.438454,0.567714,0.432286


In [290]:
df.to_csv('../datasets/rank_1/lyric_sentiment.csv', index=False)