# Set up Google Drive

In [1]:
# Mount Google Drive at /content/gdrive; later cells read the dataset archive
# from it and copy the resulting CSV back to it.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!unzip -q '/content/gdrive/MyDrive/Colab Notebooks/INMCA_2021/PMEmo2019.zip'

In [3]:
# Directory whose entries are read as lyrics files by the main loop below.
dir_path = '/content/PMEmo2019/lyrics'

# Setup

In [None]:
!pip install -q flair
!pip install -q text2emotion

In [None]:
import csv
import os
import re

import flair
import nltk
import numpy as np
import pandas as pd
import text2emotion as te
import tqdm
import tqdm.notebook  # `import tqdm` alone does not load the notebook submodule used by the main loop
from textblob import TextBlob

In [6]:
# Set up the NLTK VADER model for sentiment analysis.
# The lexicon is downloaded before the analyzer is built; the analyzer
# presumably loads it on construction, so keep this order.
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Set up the Flair text classifier ('en-sentiment' model; the cell output shows
# it being downloaded to the local Flair cache when it is not cached yet).
flair_sentiment = flair.models.TextClassifier.load('en-sentiment')

# Set up the output file: (re)create the CSV so it holds only the header row.
# `columns` fixes the column order for every row appended afterwards.
csv_file = "text_features.csv"
columns = ['id', 'neg', 'neu', 'pos', 'compound', 'polarity', 'subjectivity', 'flair_sentiment', 'Happy', 'Angry', 'Surprise', 'Sad', 'Fear']
# newline='' hands line-ending control to the csv module, as its documentation
# requires for file objects passed to a writer (avoids "\r\r\n" on platforms
# that translate newlines).
with open(csv_file, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, columns)
    writer.writeheader()

# Main loop: append one row of sentiment/emotion features per lyrics file.

# A timestamped LRC line looks like "[mm:ss.xx]lyric text"; group 4 is the text.
# Raw string so the backslashes reach the regex engine unchanged (the pattern
# value is the same as before), compiled once instead of once per file.
REGEX = r"^\[(\d{2})\:(\d{2})\.(\d{2})\](.+)"
lrc_line = re.compile(REGEX)

dirname = dir_path
# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the CSV reproducible between runs.
for filename in tqdm.notebook.tqdm(sorted(os.listdir(dirname))):

    lrc_file = os.path.join(dirname, filename)
    # Skip anything that is not a regular file (e.g. a checkpoint folder
    # created by the notebook UI); open() would raise on a directory.
    if not os.path.isfile(lrc_file):
        continue

    # Retrieve text from lyrics: keep the text of every timestamped line,
    # joined with single spaces. Lines that do not match are ignored.
    with open(lrc_file, encoding="UTF-8") as lrc:
        matches = (lrc_line.search(line) for line in lrc)
        sentence = " ".join(m.group(4).strip() for m in matches if m).strip()
    if sentence == "":
        continue  # no usable lyric text -> no row for this file

    # Use lyrics name as ID, following PMEmo dataset convention
    features = {"id": filename.split('.')[0]}

    # NLTK VADER features (dict returned by polarity_scores)
    features.update(sid.polarity_scores(sentence))

    # TextBlob features; (x + 1) / 2 rescales polarity from [-1, 1] to [0, 1]
    textblob_features = dict(TextBlob(sentence).sentiment._asdict())
    textblob_features['polarity'] = (textblob_features['polarity'] + 1) / 2
    features.update(textblob_features)

    # Flair feature: take the first predicted label, make its score negative
    # when the label is NEGATIVE, then apply the same (x + 1) / 2 rescaling.
    s = flair.data.Sentence(sentence)
    flair_sentiment.predict(s)
    top_label = s.labels[0]
    flair_feature = -top_label.score if top_label.value == 'NEGATIVE' else top_label.score
    features["flair_sentiment"] = (flair_feature + 1) / 2

    # Text2emotion features (dict returned by get_emotion)
    features.update(te.get_emotion(sentence))

    # Write row to csv. The header's `columns` list is used as fieldnames so
    # every value lands under its own column regardless of dict key order, and
    # an unexpected key raises instead of silently shifting columns.
    # Appending per row keeps already-written rows on disk if the run stops.
    with open(csv_file, 'a', newline='') as csvfile:
        csv.DictWriter(csvfile, columns).writerow(features)

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...




2021-11-18 11:51:57,512 https://nlp.informatik.hu-berlin.de/resources/models/sentiment-curated-distilbert/sentiment-en-mix-distillbert_4.pt not found in cache, downloading to /tmp/tmpj_jj__7b


100%|██████████| 265512723/265512723 [00:15<00:00, 17212606.19B/s]

2021-11-18 11:52:13,444 copying /tmp/tmpj_jj__7b to cache at /root/.flair/models/sentiment-en-mix-distillbert_4.pt





2021-11-18 11:52:16,740 removing temp file /tmp/tmpj_jj__7b
2021-11-18 11:52:16,783 loading file /root/.flair/models/sentiment-en-mix-distillbert_4.pt


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

  0%|          | 0/629 [00:00<?, ?it/s]

In [7]:
# Save to GDrive
!cp -r text_features.csv '/content/gdrive/MyDrive/Colab Notebooks/INMCA_2021/CSV/text_features.csv'