<a href="https://colab.research.google.com/github/Kidara/INMCA_2021/blob/main/text_nlp_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup GDrive

In [4]:
# Attach Google Drive to the Colab VM; the dataset archive and the exported
# CSV both live under this mount point.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [5]:
!unzip -q '/content/gdrive/MyDrive/Colab Notebooks/INMCA_2021/PMEmo2019.zip'

In [6]:
# Folder with the extracted lyric files; the main loop lists and reads every file in it.
dir_path = "/content/PMEmo2019/lyrics"

# Setup and feature extraction

In [9]:
import numpy as np
import pandas as pd
import os

import tqdm
import csv
import re
import nltk
from textblob import TextBlob
import flair
import text2emotion as te

In [2]:
!pip install -q flair
!pip install -q text2emotion

[K     |████████████████████████████████| 319 kB 2.8 MB/s 
[K     |████████████████████████████████| 56 kB 4.2 MB/s 
[K     |████████████████████████████████| 788 kB 42.4 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 981 kB 31.3 MB/s 
[K     |████████████████████████████████| 2.9 MB 29.6 MB/s 
[K     |████████████████████████████████| 48 kB 4.3 MB/s 
[K     |████████████████████████████████| 64 kB 2.2 MB/s 
[K     |████████████████████████████████| 1.2 MB 44.8 MB/s 
[K     |████████████████████████████████| 19.7 MB 10.5 MB/s 
[K     |████████████████████████████████| 62 kB 555 kB/s 
[K     |████████████████████████████████| 3.3 MB 20.3 MB/s 
[K     |████████████████████████████████| 596 kB 55.6 MB/s 
[K     |████████████████████████████████| 895 kB 68.6 MB/s 
[?25h  Building wheel for gdown (PEP 517) ... 

In [11]:
# Import the progress-bar submodule explicitly: a bare `import tqdm` does not
# by itself guarantee that `tqdm.auto` / `tqdm.notebook` is available as an attribute.
import tqdm.auto

# Setup NLTK VADER model for sentiment analysis
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Setup Flair Text Classifier
flair_sentiment = flair.models.TextClassifier.load('en-sentiment')

# Setup output file
csv_file = "text_features.csv"
columns = ['id', 'neg', 'neu', 'pos', 'compound', 'polarity', 'subjectivity', 'flair_sentiment', 'Happy', 'Angry', 'Surprise', 'Sad', 'Fear']

# Setup text REGEX: a "[dd:dd.dd]" tag at the very start of a line, followed by
# the lyric text (group 4). Written as a raw string so the backslashes reach
# `re` unchanged without invalid-escape warnings, and compiled once instead of
# being rebuilt for every file.
REGEX = r"^\[(\d{2})\:(\d{2})\.(\d{2})\](.+)"
LRC_LINE = re.compile(REGEX)


def read_lyrics_text(lrc_path):
    """Return the lyric text of one lyrics file as a single string.

    Only lines matching REGEX contribute; their text part is stripped and the
    pieces are joined with single spaces. Lines that do not match (blank lines,
    anything without a leading tag) are ignored.

    Args:
        lrc_path: path of the lyrics file to read.

    Returns:
        The joined text, or "" when no line of the file matches.
    """
    fragments = []
    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading byte-order
    # mark, which would otherwise keep the first tagged line from matching.
    with open(lrc_path, encoding="utf-8-sig") as lrc:
        for line in lrc:
            m = LRC_LINE.search(line)
            if m:
                fragments.append(m.group(4).strip())
    return " ".join(fragments).strip()


def extract_features(sentence):
    """Compute the sentiment/emotion features of one lyric text.

    Combines, in this order, the outputs of NLTK VADER (`sid`), TextBlob,
    the Flair classifier (`flair_sentiment`) and text2emotion.

    Args:
        sentence: non-empty lyric text.

    Returns:
        A dict of feature name -> value. The keys are expected to be the
        non-'id' entries of `columns`; the CSV writer enforces this.
    """
    features = {}

    # Calculate NLTK features
    features.update(sid.polarity_scores(sentence))

    # Calculate TextBlob features
    textblob_features = dict(TextBlob(sentence).sentiment._asdict())
    # Normalization: (x + 1) / 2 maps a value from [-1, 1] onto [0, 1]
    textblob_features['polarity'] = (textblob_features['polarity'] + 1) / 2
    features.update(textblob_features)

    # Calculate Flair features: score of the first predicted label, negated
    # when that label is NEGATIVE, then shifted with the same (x + 1) / 2 step.
    s = flair.data.Sentence(sentence)
    flair_sentiment.predict(s)
    top_label = s.labels[0]
    flair_feature = top_label.score
    if top_label.value == 'NEGATIVE':
        flair_feature = -flair_feature
    features['flair_sentiment'] = (flair_feature + 1) / 2

    # Calculate Text2emotion features
    features.update(te.get_emotion(sentence))

    return features


# Main loop
dirname = dir_path
# The file is opened once, with newline='' as the csv module requires, and the
# header and every row go through the same writer. Using `columns` as the field
# order keeps each value under its own header regardless of the key order of the
# feature dicts; DictWriter raises ValueError if a row has a key not in `columns`.
with open(csv_file, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=columns)
    writer.writeheader()

    # sorted(): os.listdir returns entries in arbitrary order; sorting makes
    # the row order of the CSV reproducible between runs.
    for filename in tqdm.auto.tqdm(sorted(os.listdir(dirname))):

        # Retrieve text from lyrics; files without any tagged line are skipped
        sentence = read_lyrics_text(os.path.join(dirname, filename))
        if sentence == "":
            continue

        # Use lyrics name as ID, following PMEmo dataset convention
        features = {"id": filename.split('.')[0]}
        features.update(extract_features(sentence))

        # Write row to csv; flush so finished rows survive an interrupted run
        writer.writerow(features)
        csvfile.flush()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
2021-10-14 11:05:49,589 loading file /root/.flair/models/sentiment-en-mix-distillbert_4.pt


  0%|          | 0/629 [00:00<?, ?it/s]

In [14]:
# Save to GDrive
!cp -r text_features.csv '/content/gdrive/MyDrive/Colab Notebooks/INMCA_2021/CSV/text_features.csv'