# Comment features

Creation of new features for each comment, covering both grammatical properties and sentiment analysis/emotion detection.

In [None]:
import csv
import json
import pandas as pd

#emoji analysis
import emoji
import emojis

#time analysis
from datetime import datetime

#text cleaning
from html import unescape
import re
import string
import contractions

#POS tagging
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

#emotions extracted with NRCLex
from nrclex import NRCLex

#readability tests
from readability import Readability

#TextBlob polarity
from textblob import TextBlob



## JSON files

In [None]:
# Names of the input files to load; each file is read line by line and every
# line is parsed as one JSON document. "..." looks like a placeholder -- replace
# it with the real file names before running.
json_files = ["..."] 
# Path prefix that is concatenated in front of each entry of json_files.
folder = "../"

In [None]:

# Load every record of the JSON Lines input files into one flat list of dicts.
# The files are opened explicitly as UTF-8 so that non-ASCII content (the
# comments contain emojis) does not depend on the platform default encoding,
# and blank lines are skipped instead of crashing json.loads.
lines_def = []
for json_name in json_files:
    with open(folder + json_name, 'r', encoding='utf-8') as infile:
        lines_def.extend(json.loads(line) for line in infile if line.strip())

# emoji analysis

In [None]:
def extract_emojis(listLine):
    """Add emoji statistics to every record and decode the emojis in its text.

    Each dict in ``listLine`` is modified in place:

    * ``emoji_count``  -- value returned by ``emoji.emoji_count`` for the text
    * ``emoji_unique`` -- the distinct emojis of the text joined into one string
    * ``emoji_list``   -- every emoji occurrence of the text joined into one
      string (empty string when there is none)
    * ``text``         -- replaced by ``emojis.decode(text)``; according to the
      original author's note this turns e.g. the smiley face into ``:smile:``

    All statistics are computed on the text as it was before decoding.
    """
    for record in listLine:
        original_text = record["text"]

        total = emoji.emoji_count(original_text)
        distinct = ''.join(emoji.distinct_emoji_list(original_text))
        # Joining an empty result already yields '', so no special case is needed.
        occurrences = ''.join(hit['emoji'] for hit in emoji.emoji_list(original_text))
        decoded_text = emojis.decode(original_text)

        record.update(
            text=decoded_text,
            emoji_count=total,
            emoji_unique=distinct,
            emoji_list=occurrences,
        )


# time handling

In [None]:
def convert_time(listLine):
    """Replace the ``time`` key of every record with ``date`` and ``time_of_day``.

    ``time`` is passed to ``datetime.fromtimestamp``; the resulting moment is
    stored as ``date`` (``YYYY-MM-DD``) and ``time_of_day`` (``HH:MM:SS``), and
    the ``time`` key is then removed. Records are modified in place.

    NOTE(review): ``fromtimestamp`` is called without a timezone, so the values
    are expressed in the local timezone of the machine running the code --
    confirm this is intended.
    """
    for record in listLine:
        moment = datetime.fromtimestamp(record["time"])
        record["date"] = f"{moment:%Y-%m-%d}"
        record["time_of_day"] = f"{moment:%H:%M:%S}"
        # Drop the raw timestamp only after both derived fields are stored.
        record.pop("time")

# text cleaning

In [None]:
def clean_single_text(listLine):
    """Clean the ``text`` of every record in ``listLine`` in place.

    Despite its name the function processes a whole list of records. For each
    record the text goes through these steps, in order:

    1. HTML entities are decoded (``&amp;`` -> ``&``).
    2. URLs (``http://``, ``https://`` or ``www.`` prefixes) are removed.
    3. Every character that is not an ASCII letter, a digit, whitespace or an
       ASCII punctuation mark is removed.
    4. Each run of whitespace is collapsed to a single space. This also turns
       newlines and tabs into spaces, so no separate pass is needed for them.
    5. Leading and trailing spaces are stripped.
    """
    # Build the patterns once instead of once per record.
    url_re = re.compile(r'https?://\S+|www\.\S+')
    disallowed_re = re.compile(r'[^a-zA-Z0-9\s' + re.escape(string.punctuation) + ']')
    whitespace_re = re.compile(r'\s+')

    for record in listLine:
        cleaned = unescape(record["text"])
        cleaned = url_re.sub('', cleaned)
        cleaned = disallowed_re.sub('', cleaned)
        cleaned = whitespace_re.sub(' ', cleaned)
        record["text"] = cleaned.strip()
    

In [None]:
def expand_contractions(listLine):
    """Expand contractions (it's => it is) in the ``text`` of every record.

    ``contractions.fix`` is called with ``slang=True`` because slang words can
    appear in Reddit posts. Records are modified in place.
    """
    for record in listLine:
        expanded = contractions.fix(record["text"], slang=True)
        record["text"] = expanded

In [None]:
def unique_upper_words(listLine):
    """Store word-level counts for the ``text`` of every record, in place.

    * ``num_unique_words`` -- number of distinct whitespace-separated words,
      compared case-insensitively
    * ``num_words_upper``  -- number of whitespace-separated words for which
      ``str.isupper()`` is true

    The text is converted with ``str()`` first, so non-string values are
    counted on their string representation.
    """
    for record in listLine:
        text = str(record["text"])
        distinct_words = set(text.lower().split())
        record["num_unique_words"] = len(distinct_words)
        record["num_words_upper"] = sum(1 for word in text.split() if word.isupper())

In [None]:
# Optional step: remove punctuation and transform the text to lowercase.
# Characters that remove_punct replaces with a space. ':' appears twice, which
# makes no difference once the characters are placed in a regex character class.
my_punct = list('"$%&\'()*+,/:;<=>@[\\]^_`{|}~»«“”#!?.:')

# One character class matching any single character of my_punct.
punct_pattern = re.compile("[" + re.escape("".join(my_punct)) + "]")


def remove_punct(listLine):
    """Replace punctuation with spaces and lowercase the ``text`` of every record.

    Every character listed in ``my_punct`` becomes a single space (runs of
    spaces are not collapsed), then the text is lowercased. Records are
    modified in place.
    """
    for record in listLine:
        record["text"] = punct_pattern.sub(' ', record["text"]).lower()

# PoS tagging

PoS tagging is used to identify nouns, adjectives and verbs.

In [None]:
def posTag(listLine):
    """Count the "full words" (adjectives, nouns, verbs) in every record's text.

    The text is tokenized with ``word_tokenize`` and tagged with
    ``nltk.pos_tag``; tokens whose tag starts with ``JJ``, ``NN`` or ``VB`` are
    counted and the total is stored under ``number_full_words``. Records are
    modified in place.
    """
    full_word_prefixes = ("JJ", "NN", "VB")
    for record in listLine:
        tagged_tokens = nltk.pos_tag(word_tokenize(record["text"]))
        full_word_tags = [tag for _, tag in tagged_tokens if tag.startswith(full_word_prefixes)]
        record["number_full_words"] = len(full_word_tags)


# NRC LEX

Library used to define the values of 8 emotions

In [None]:
def get_emotion_vector(listLine):
    """Copy the NRCLex affect frequencies of every record's text into the record.

    For each record an ``NRCLex`` object is built from ``text``; every entry of
    its ``affect_frequencies`` mapping is rounded to two decimals and stored in
    the record under the affect's own name. Records are modified in place.
    """
    for record in listLine:
        frequencies = NRCLex(record["text"]).affect_frequencies
        record.update(
            {affect: round(value, 2) for affect, value in frequencies.items()}
        )

# TextBlob

TextBlob's polarity

In [None]:
def get_polarity(listLine):
    """Store the TextBlob sentiment polarity of every record's text, in place.

    The value of ``TextBlob(text).sentiment.polarity`` is written to the
    ``polarity`` key. This is a best-effort step: if the polarity cannot be
    computed for a record, ``polarity`` is set to 0 and processing continues.
    """
    for record in listLine:
        try:
            record["polarity"] = TextBlob(record["text"]).sentiment.polarity
        except Exception:
            # Catch Exception rather than using a bare ``except`` so that
            # KeyboardInterrupt / SystemExit can still stop a long run.
            record["polarity"] = 0

# VAD lexicon

Extraction of the values of Arousal, Dominance and Valence according to VAD lexicon

In [None]:
# Load the NRC VAD lexicon (tab-separated, read without a header row) and turn
# it into a nested dictionary: word -> {"valence": ..., "arousal": ..., "dominance": ...}.
# NOTE(review): read_csv applies its default NA handling here, so lexicon words
# such as "null" or "nan" would presumably be parsed as NaN keys -- verify
# against the lexicon file.
VAD = pd.read_csv('../NRC/NRC-VAD-Lexicon.txt', sep="\t", header=None)
# The file has no header, so the four columns are named explicitly.
VAD.columns = ["word", "valence", "arousal", "dominance"]
# Index by word, transpose so each word becomes a column, then export as dict.
VAD_dict = VAD.set_index('word').T.to_dict()

In [None]:
def emotion_VAD(text, dim):
    """Return the average ``dim`` score of the words in ``text``.

    ``text`` is split on whitespace and each word is looked up in ``VAD_dict``.
    Words missing from the lexicon score 0 but still count towards the
    average. A text without words yields 0.0.

    ``dim`` is the key read from each lexicon entry: ``'valence'``,
    ``'arousal'`` or ``'dominance'``.
    """
    tokens = text.split()
    scores = (VAD_dict[token][dim] if token in VAD_dict else 0 for token in tokens)
    # ``or 1`` avoids dividing by zero when the text has no words.
    return sum(scores) / (len(tokens) or 1)


def analyze_valence(text):
    """Return the average valence score of ``text`` (see ``emotion_VAD``)."""
    return emotion_VAD(text, 'valence')


def analyze_arousal(text):
    """Return the average arousal score of ``text`` (see ``emotion_VAD``)."""
    return emotion_VAD(text, 'arousal')


def analyze_dominance(text):
    """Return the average dominance score of ``text`` (see ``emotion_VAD``)."""
    return emotion_VAD(text, 'dominance')

In [None]:
def get_VAD(listLine):
    """Store the average valence, arousal and dominance of every record's text.

    The three scores are computed with ``emotion_VAD`` and written to the
    ``valence``, ``arousal`` and ``dominance`` keys, in that order. Records are
    modified in place.
    """
    for record in listLine:
        for dimension in ("valence", "arousal", "dominance"):
            record[dimension] = emotion_VAD(record["text"], dimension)

# Readability

Values of readability according to 9 tests

In [None]:
def readability_tests(listLine):
    """Store nine readability scores for the text of every record, in place.

    The keys ``flesch_kincaid``, ``flesch``, ``fog``, ``coleman_liau``,
    ``dale_chall``, ``ari``, ``linsear_write``, ``smog`` and ``spache`` are
    always created with the value 0. They are replaced by real scores only when
    the text has at least 100 whitespace-separated words; ``smog`` additionally
    requires at least 30 sentences.

    This is a best-effort step: if a test fails for a record, the scores
    computed so far are kept, the remaining ones stay 0 and processing moves on
    to the next record.

    The sentence count comes from NLTK's sentence tokenizer, so the text should
    still contain its sentence-ending punctuation when this function is called.
    """
    score_keys = (
        "flesch_kincaid", "flesch", "fog", "coleman_liau", "dale_chall",
        "ari", "linsear_write", "smog", "spache",
    )
    for record in listLine:
        # No-op for a plain string; also accepts an iterable of text fragments.
        text_str = ''.join(record["text"])
        record.update(dict.fromkeys(score_keys, 0))

        # Texts shorter than 100 words are not scored.
        if len(text_str.split()) < 100:
            continue

        try:
            r = Readability(text_str)
            # ``sent_tokenize`` is reached through the ``nltk`` module that the
            # file imports; the bare name was never imported, which made every
            # call fail and left all scores at 0.
            num_sentences = len(nltk.sent_tokenize(text_str))

            record["flesch_kincaid"] = r.flesch_kincaid().score
            record["flesch"] = r.flesch().score
            record["fog"] = r.gunning_fog().score
            record["coleman_liau"] = r.coleman_liau().score
            record["dale_chall"] = r.dale_chall().score
            record["ari"] = r.ari().score
            record["linsear_write"] = r.linsear_write().score
            if num_sentences >= 30:
                record["smog"] = r.smog().score
            record["spache"] = r.spache().score
        except Exception:
            # Catch Exception rather than using a bare ``except`` so that
            # KeyboardInterrupt / SystemExit can still stop a long run.
            continue
        

# Apply functions

In [None]:
# Run the feature pipeline: every step modifies the records of lines_def in
# place, and its label is printed once the step has finished.
# readability_tests runs BEFORE remove_punct: remove_punct replaces '.', '!'
# and '?' with spaces, and without them the sentence-based readability metrics
# (and the sentence count used for SMOG) have no sentence boundaries to work on.
# As a consequence the readability columns are added before the PoS/emotion ones.
pipeline = (
    (extract_emojis, "femoji"),
    (convert_time, "time"),
    (clean_single_text, "clean"),
    (expand_contractions, "contr"),
    (unique_upper_words, "upper"),
    (readability_tests, "readability"),
    (remove_punct, "punct"),
    (posTag, "POS"),
    (get_emotion_vector, "emotion"),
    (get_polarity, "polarity"),
    (get_VAD, "VAD"),
)
for step, label in pipeline:
    step(lines_def)
    print(label)

# Turn into CSV dataset

In [None]:
# Build a table from the list of record dicts (dict keys become the columns)
# and write it to disk as CSV without the index column.
output_csv = '../file_csv/PIANO_comments.csv'
df = pd.DataFrame(lines_def)
df.to_csv(output_csv, index=False)