## Imports

In [None]:
import ast
import csv
import gzip
import math
import os
import re
import string
import sys
import unicodedata

import emoji
import enchant
import language_tool_python
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import spacy
import spacy_udpipe
import splitter
import treetaggerwrapper
import wordninja

from itertools import groupby
from typing import List, Set

from nltk.corpus import stopwords, wordnet, sentiwordnet
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer
from tokenizer import *

from ekphrasis.dicts.emoticons import emoticons
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer

from textblob import TextBlob

# Locate the repository root: the path up to and including the
# 'MyHaSpeeDe-1' component of the current working directory.
# list.index raises ValueError when that component is not in the path.
dir_parts = os.getcwd().split(os.path.sep)
root_index = dir_parts.index('MyHaSpeeDe-1')
root_path = os.path.sep.join(dir_parts[:root_index + 1])

# Load the Italian model
nlp = spacy.load("it_core_news_sm")
# Load NLTK data
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('sentiwordnet')

## Path

In [None]:
# Directories
fb_dir = root_path + '/data/facebook/'
tw_dir = root_path + '/data/twitter/'
preprocessed_dir = 'preprocessed/'
w2v_dir = root_path + '/data/word2vec/'

# Filepaths (Facebook dataset)
fb_dev_path = fb_dir + 'dev/' + 'fb_dev.csv'
fb_test_path = fb_dir + 'test/' + 'fb_test.csv'

fb_dev_preprocessed_path = fb_dir + 'dev/' + preprocessed_dir + 'fb_dev_preprocessed.csv'
fb_test_preprocessed_path = fb_dir + 'test/' + preprocessed_dir + 'fb_test_preprocessed.csv'

# Filepaths (Twitter dataset)
tw_dev_path = tw_dir + 'dev/' + 'tw_dev.csv'
tw_test_path = tw_dir + 'test/' + 'tw_test.csv'

tw_dev_preprocessed_path = tw_dir + 'dev/' + preprocessed_dir + 'tw_dev_preprocessed.csv'
tw_test_preprocessed_path = tw_dir + 'test/' + preprocessed_dir + 'tw_test_preprocessed.csv'

# Corpus + Lexicon
dictionary_path = root_path + '/data/italian_words.txt' # vocabulary
bad_words_path = root_path + '/data/italian_bad_words.txt' # bad words
polarity_lexicon_path = root_path + '/data/DPL-IT_lrec2016.txt' # polarity lexicon (pos-neg-neutral)

## Data

In [None]:
pd.set_option("display.max_colwidth", None)

In [None]:
# Load Facebook dev/test dataset
# Each file is opened in a context manager so its handle is closed as soon
# as pandas has finished reading it.
with open(fb_dev_path, encoding='utf-8') as fb_dev_inf:
    fb_dev = pd.read_csv(fb_dev_inf, sep=',', header=0)

with open(fb_test_path, encoding='utf-8') as fb_test_inf:
    fb_test = pd.read_csv(fb_test_inf, sep=',', header=0)

In [None]:
# Load Twitter dev/test dataset
# Each file is opened in a context manager so its handle is closed as soon
# as pandas has finished reading it.
with open(tw_dev_path, encoding='utf-8') as tw_dev_inf:
    tw_dev = pd.read_csv(tw_dev_inf, sep=',', header=0)

with open(tw_test_path, encoding='utf-8') as tw_test_inf:
    tw_test = pd.read_csv(tw_test_inf, sep=',', header=0)

"""# Load Facebook dev/test dataset
fb_dev_inf = open(fb_dev_preprocessed_path, encoding='utf-8')
fb_dev = pd.read_csv(fb_dev_inf, sep=',', header=0)

fb_test_inf = open(fb_test_preprocessed_path, encoding='utf-8')
fb_test = pd.read_csv(fb_test_inf, sep=',', header=0)

# Load Twitter dev/test dataset
tw_dev_inf = open(tw_dev_preprocessed_path, encoding='utf-8')
tw_dev = pd.read_csv(tw_dev_inf, sep=',', header=0)

tw_test_inf = open(tw_test_preprocessed_path, encoding='utf-8')
tw_test = pd.read_csv(tw_test_inf, sep=',', header=0)"""

# Preprocessing
The pre-processing phase is performed according to the *Preprocessing* section of [[1]](https://ceur-ws.org/Vol-2263/paper043.pdf) by Bianchini et al.

Below is a comprehensive list of the pre-processing steps:
- Extraction of the first feature: length of the comment;
- Extraction of the second feature: percentage of words written in CAPS-LOCK inside a tweet;
- Remove mentions and URLs;
- Handling special characters and newlines;
- Conversion of disguised bad words;
- Hashtag splitting;
- Removal of nearby redundant vowels and/or consonants;
- Extraction of the third feature: number of sentences;
- Extraction of the fourth feature: number of ‘?’ and ‘!’;
- Extraction of the fifth feature: number of ‘.’ and ‘,’;
- Punctuation removal;
- Translation of emoticons;
- Replacement of abbreviations with the respective words;
- Replacement of acronyms with the respective words;
- Removal of articles, pronouns, prepositions, conjunctions and numbers;
- Removal of the laughs;
- Replacement of accented characters with the respective unaccented characters;
- Tokenization;
- Lemmatization;
- Extraction of the sixth feature: percentage of spelling errors;
- Replacement of spelling errors;
- Extraction of the seventh feature: number of bad words;
- Extraction of the eighth feature: percentage of bad words;
- Extraction of the ninth feature: polarity of the message using SentiWordNet;
- Extraction of the tenth feature: polarity TextBlob;
- Extraction of the eleventh feature: subjectivity TextBlob;
- Extraction of the final feature: polarity with the Italian lexicon;
- Part of Speech (PoS) tagging.

## Extraction of the first feature: length of the comment
Length of the comment.

In [None]:
def first_feature(text: str) -> int:
    """Return the length of ``text`` in characters."""
    n_chars = len(text)
    return n_chars

In [None]:
# Facebook
fb_dev['text_len'] = fb_dev['text'].apply(first_feature)
fb_test['text_len'] = fb_test['text'].apply(first_feature)

In [None]:
# Twitter
tw_dev['text_len'] = tw_dev['text'].apply(first_feature)
tw_test['text_len'] = tw_test['text'].apply(first_feature)

## Extraction of the second feature: percentage of CAPS-LOCK words
Percentage of words written in CAPS-LOCK inside the tweet.

In [None]:
def second_feature(text: str) -> int:
    """Return the percentage of upper-case words in ``text``.

    Words are the whitespace-separated chunks of ``text``; a word counts as
    upper-case when ``str.isupper`` is true for it. The percentage is
    truncated to an integer.

    Returns 0 for empty or whitespace-only text.
    """
    words = text.split()
    if not words:
        # No words at all: avoid dividing by zero.
        return 0

    count_caps = sum(w.isupper() for w in words)
    return (count_caps * 100) // len(words)

In [None]:
# Facebook
fb_dev['n_caps_words'] = fb_dev['text'].apply(second_feature)
fb_test['n_caps_words'] = fb_test['text'].apply(second_feature)

In [None]:
# Twitter
tw_dev['n_caps_words'] = tw_dev['text'].apply(second_feature)
tw_test['n_caps_words'] = tw_test['text'].apply(second_feature)

## Replacing Mentions
Note: HaSpeeDe-1 organizers replaced mentions with anonymized placeholders, using a different format for each dataset, specifically:
- Facebook: *\<PERSONA_i\>* --> MENZ;
- Twitter: *\<MENTION_i\>* --> MENZ.

In [None]:
def remove_fb_mentions(text: str) -> str:
    """Replace every Facebook ``<PERSONA_i>`` placeholder with ``MENZ``."""
    persona_placeholder = re.compile(r'\<PERSONA_\d+\>')
    return persona_placeholder.sub('MENZ', text)

In [None]:
def remove_tw_mentions(text: str) -> str:
    """Replace every Twitter ``<MENTION_i>`` placeholder with ``MENZ``."""
    mention_placeholder = re.compile(r'\<MENTION_\d+\>')
    return mention_placeholder.sub('MENZ', text)

In [None]:
# Facebook
fb_dev['text'] = fb_dev['text'].apply(remove_fb_mentions)
fb_test['text'] = fb_test['text'].apply(remove_fb_mentions)

In [None]:
# Twitter
tw_dev['text'] = tw_dev['text'].apply(remove_tw_mentions)
tw_test['text'] = tw_test['text'].apply(remove_tw_mentions)

## Replacing URLs
Note: HaSpeeDe-1 organizers replaced URLs with the placeholder *\<URL\>* --> URL

In [None]:
def remove_urls(text: str) -> str:
    """Replace every ``<URL>`` placeholder with the bare token ``URL``."""
    url_placeholder = re.compile(r'\<URL\>')
    return url_placeholder.sub('URL', text)

In [None]:
# Facebook
fb_dev['text'] = fb_dev['text'].apply(remove_urls)
fb_test['text'] = fb_test['text'].apply(remove_urls)

In [None]:
# Twitter
tw_dev['text'] = tw_dev['text'].apply(remove_urls)
tw_test['text'] = tw_test['text'].apply(remove_urls)

## Handling special characters and newlines
- Replace characters ‘&’, ‘@’ with ‘e’, ‘a’ respectively.
- Remove newlines ‘\n’

In [None]:
def replace_special_chars(text: str) -> str:
    """Return ``text`` with every '&' turned into 'e' and every '@' into 'a'."""
    substitutions = str.maketrans({'&': 'e', '@': 'a'})
    return text.translate(substitutions)
    
def remove_newlines(text: str) -> str:
    """Return ``text`` with each newline character replaced by one space."""
    newline = re.compile(r'\n')
    return newline.sub(' ', text)

In [None]:
# Facebook
fb_dev['text'] = fb_dev['text'].apply(replace_special_chars)
fb_dev['text'] = fb_dev['text'].apply(remove_newlines)
fb_test['text'] = fb_test['text'].apply(replace_special_chars)
fb_test['text'] = fb_test['text'].apply(remove_newlines)

In [None]:
# Twitter
tw_dev['text'] = tw_dev['text'].apply(replace_special_chars)
tw_dev['text'] = tw_dev['text'].apply(remove_newlines)
tw_test['text'] = tw_test['text'].apply(replace_special_chars)
tw_test['text'] = tw_test['text'].apply(remove_newlines)

## Hashtag splitting
Hashtag splitting is a crucial and complex phase in preprocessing our Twitter dataset. Since hashtags are often used to compose sentences, we want to normalise them into words.

The process, according to the mentioned papers, can be summarized as follows:
- Identify single words within the hashtag;
- Ignore words with only two characters for efficiency (except for digits);
- Reconstruct the original hashtag into a normalized sentence.

For simplicity and flexibility, before performing hashtag splitting, we identify and save the original hashtags in a dedicated column.

In [None]:
def save_hashtags(text: str) -> List[str]:
    """Return every hashtag in ``text`` ('#' followed by word characters), in order."""
    hashtag_pattern = re.compile(r'#\w+')
    return [found.group(0) for found in hashtag_pattern.finditer(text)]

In [None]:
# Facebook
fb_dev['hashtags'] = fb_dev['text'].apply(save_hashtags)
fb_test['hashtags'] = fb_test['text'].apply(save_hashtags)

In [None]:
# Twitter
tw_dev['hashtags'] = tw_dev['text'].apply(save_hashtags)
tw_test['hashtags'] = tw_test['text'].apply(save_hashtags)

Before performing hashtag splitting, we also need to load our italian dictionary (https://www.sketchengine.eu/italian-word-list/).

In [None]:
def load_dictionary(file: str) -> Set[str]:
    """Read a UTF-8 word list (one entry per line) into a set.

    Each line is stripped of surrounding whitespace before being stored.
    """
    with open(file, 'r', encoding='utf-8') as handle:
        return {entry.strip() for entry in handle}

In [None]:
word_dict = load_dictionary(dictionary_path)

In [None]:
def split_hashtags(text):
    """Replace each hashtag in ``text`` with its space-separated component words.

    Every ``#\\w+`` match is lower-cased, passed to ``splitter.split`` with
    the ``'it_IT'`` language code, and the pieces are joined with spaces.
    When the splitter yields nothing, the hashtag text (without the '#') is
    kept unchanged.

    Each hashtag is substituted at its own match position. The previous
    ``str.replace`` approach also rewrote longer hashtags that shared a
    prefix with a shorter one (``#italia`` corrupted ``#italiani``).

    Returns the text padded with one leading and one trailing space, as the
    original implementation did.
    """
    def _expand(match):
        hashtag_content = match.group(0)[1:]  # strip the leading '#'
        new_word = " ".join(splitter.split(hashtag_content.lower(), 'it_IT'))
        # Fall back to the raw hashtag text when the split result is blank
        return new_word if new_word.strip() else hashtag_content

    return re.sub(r'#\w+', _expand, f' {text} ')

In [None]:
# Twitter
tw_dev['text'] = tw_dev['text'].apply(split_hashtags)
tw_test['text'] = tw_test['text'].apply(split_hashtags)

## Removing duplicate vowels and consonants
Removing nearby equal vowels and/or nearby equal consonants if they are more than $2$.

In [None]:
def unique_words_with_consecutive_triples(df) -> Set[str]:
    """Collect the lower-cased words of ``df['text']`` that repeat a character 3+ times in a row.

    Words are the ``\\b\\w+\\b`` matches of every entry; words made only of
    digits are ignored.
    """
    repeated_run = re.compile(r"(.)\1{2,}")
    digits_only = re.compile(r"^\d+$")
    word_pattern = re.compile(r'\b\w+\b')

    return {
        candidate.lower()
        for tweet in df['text']
        for candidate in word_pattern.findall(tweet)
        if not digits_only.match(candidate) and repeated_run.search(candidate)
    }

In [None]:
unique_words = unique_words_with_consecutive_triples(tw_dev)
unique_words

In [None]:
vowels = ['a','e','i','o','u','y']

def delete_words_with_redundant_characters(text: str) -> str:
    """Shorten runs of repeated characters in words not found in ``word_dict``.

    Words whose lower-cased form is in ``word_dict`` are kept untouched. In
    every other word, each run of one repeated character is reduced:
    a vowel run (either case) becomes a single character, any other run is
    capped at two characters.

    Vowels are now recognised case-insensitively; previously an upper-case
    run such as 'OOO' was handled as a consonant and left as 'OO'.

    Returns the processed words joined by single spaces.
    """
    cleaned_words = []
    for word in text.split():
        if word.lower() in word_dict:
            cleaned_words.append(word)
            continue

        pieces = []
        for char, run in groupby(word):
            run_length = sum(1 for _ in run)
            if char.lower() in vowels:
                pieces.append(char)
            else:
                pieces.append(char * min(2, run_length))
        cleaned_words.append("".join(pieces))

    return " ".join(cleaned_words)

In [None]:
# Facebook
fb_dev['text'] = fb_dev['text'].apply(delete_words_with_redundant_characters)
fb_test['text'] = fb_test['text'].apply(delete_words_with_redundant_characters)

In [None]:
# Twitter
tw_dev['text'] = tw_dev['text'].apply(delete_words_with_redundant_characters)
tw_test['text'] = tw_test['text'].apply(delete_words_with_redundant_characters)

## Conversion of disguised bad words
Recognition and conversion of censored or disguised bad words, i.e. bad words where some of the middle letters have been replaced by special characters so that they are recognizable by humans but not by automated systems.

In [None]:
def load_bad_words(file: str) -> List[str]:
    """Read a bad-word list (one entry per line) into a list.

    The file is decoded as UTF-8, like the vocabulary file, so accented
    entries are read the same way on every platform. Lines are stripped of
    surrounding whitespace; file order and duplicates are preserved.
    """
    with open(file, 'r', encoding='utf-8') as inf:
        return [line.strip() for line in inf]

In [None]:
bad_words = load_bad_words(bad_words_path)

In [None]:
def convert_disguised_bad_words(text: str) -> str:
    """Replace censored bad words in ``text`` with an entry of ``bad_words``.

    A word is treated as disguised when its first and last characters are
    letters and everything in between consists only of the characters
    ``. x * @ % # $ ^``. It is replaced by the first entry of ``bad_words``
    that starts and ends with the same letters.

    The letter comparison ignores case, so upper-case disguised words are
    also converted (the text has not been lower-cased at this stage).

    Returns the words joined by single spaces.
    """
    words = text.split()

    for i, word in enumerate(words):
        # Both outer characters must be letters
        if not (word[0].isalpha() and word[-1].isalpha()):
            continue

        # The middle part must be only special characters or 'x'
        if not re.match(r'^[.x*@%#$^]+$', word[1:-1]):
            continue

        initial, final = word[0].lower(), word[-1].lower()
        for bad_word in bad_words:
            candidate = bad_word.lower()
            if candidate.startswith(initial) and candidate.endswith(final):
                words[i] = bad_word
                break

    return ' '.join(words)

In [None]:
# Facebook
fb_dev['text'] = fb_dev['text'].apply(convert_disguised_bad_words)
fb_test['text'] = fb_test['text'].apply(convert_disguised_bad_words)

In [None]:
# Twitter
tw_dev['text'] = tw_dev['text'].apply(convert_disguised_bad_words)
tw_test['text'] = tw_test['text'].apply(convert_disguised_bad_words)

## Extraction of the third feature: number of sentences
Number of sentences inside the text, i.e. sequences of words that end with ‘.’, ‘!’ or ‘?’.

In [None]:
def third_feature(text: str) -> int:
    """Count the chunks of ``text`` that end with '.', '!' or '?'.

    Each terminator closes one chunk, so '!!!' counts three times and text
    without any terminator counts zero.
    """
    sentence_pattern = re.compile(r'[^.!?]*[.!?]')
    return sum(1 for _ in sentence_pattern.finditer(text))

In [None]:
# Facebook
fb_dev['#sentences'] = fb_dev['text'].apply(third_feature)
fb_test['#sentences'] = fb_test['text'].apply(third_feature)

In [None]:
# Twitter
tw_dev['#sentences'] = tw_dev['text'].apply(third_feature)
tw_test['#sentences'] = tw_test['text'].apply(third_feature)

## Extraction of the fourth feature: number of question/exclamation marks
Number of ‘?’ or ‘!’ inside the text.

In [None]:
def fourth_feature(text: str) -> int:
    """Return how many '?' and '!' characters ``text`` contains."""
    return sum(text.count(mark) for mark in ('?', '!'))

In [None]:
# Facebook
fb_dev['#?!'] = fb_dev['text'].apply(fourth_feature)
fb_test['#?!'] = fb_test['text'].apply(fourth_feature)

In [None]:
# Twitter
tw_dev['#?!'] = tw_dev['text'].apply(fourth_feature)
tw_test['#?!'] = tw_test['text'].apply(fourth_feature)

## Extraction of the fifth feature: number of punctuation symbols
Number of ‘.’ or ‘,’ inside the text.

In [None]:
def fifth_feature(text: str) -> int:
    """Return how many '.' and ',' characters ``text`` contains."""
    return sum(text.count(mark) for mark in ('.', ','))

In [None]:
# Facebook
fb_dev['#.,'] = fb_dev['text'].apply(fifth_feature)
fb_test['#.,'] = fb_test['text'].apply(fifth_feature)

In [None]:
# Twitter
tw_dev['#.,'] = tw_dev['text'].apply(fifth_feature)
tw_test['#.,'] = tw_test['text'].apply(fifth_feature)

## Removing punctuation

In [None]:
def remove_punctuation(text: str) -> str:
    """Return ``text`` without any character listed in ``string.punctuation``."""
    # Mapping a code point to None makes str.translate delete it
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)

In [None]:
# Facebook
fb_dev['text'] = fb_dev['text'].apply(remove_punctuation)
fb_test['text'] = fb_test['text'].apply(remove_punctuation)

In [None]:
# Twitter
tw_dev['text'] = tw_dev['text'].apply(remove_punctuation)
tw_test['text'] = tw_test['text'].apply(remove_punctuation)

## Translating and/or removing emoticons
Considering the large presence of emojis in tweets, we translate them with the respective italian translation.

In [None]:
def translate_emoticons(text: str) -> str:
    """Run ``emoji.demojize`` with ``language='it'`` and blank out ':' and '_'.

    Every ':' and '_' left in the demojized text is replaced by a space.
    """
    described = emoji.demojize(text, language='it')
    for separator in (':', '_'):
        described = described.replace(separator, ' ')
    return described

In [None]:
# Facebook
fb_dev['text'] = fb_dev['text'].apply(translate_emoticons)
fb_test['text'] = fb_test['text'].apply(translate_emoticons)

In [None]:
# Twitter
tw_dev['text'] = tw_dev['text'].apply(translate_emoticons)
tw_test['text'] = tw_test['text'].apply(translate_emoticons)

## Replacing abbreviations with respective words

In [None]:
abbreviations_to_words = {
    '6': 'sei',
    '€': 'euro',
    'mld': 'miliardi',
    'mln': 'milioni',
    'anke': 'anche',
    'cm': 'come',
    'cmq': 'comunque',
    'cs': 'cosa',
    'dlla': 'della',
    'dlle': 'delle',
    'dv': 'dove',
    'dx': 'destra',
    'fb': 'facebook',
    'gov': 'governo',
    'grz': 'grazie',
    'ita': 'italia',
    'ke': 'che',
    'ki': 'chi',
    'msg': 'messaggio',
    'nn': 'non',
    'pkè': 'perchè',
    'qdo': 'quando',
    'qnd': 'quando',
    'qlcs': 'qualcosa',
    'qst': 'questo',
    'sn': 'sono',
    'sx': 'sinistra',
    'tv': 'televisore',
    'tvb': 'ti voglio bene',
    'tw': 'twitter',
    'x': 'per',
    'xchè': 'perchè',
    'xkè': 'perchè',
    'xò': 'però'
}

In [None]:
def replace_abbreviations(text: str) -> str:
    """Lower-case every word of ``text`` and expand known abbreviations.

    A word is expanded when its lower-cased form is a key of
    ``abbreviations_to_words``. The result is joined by single spaces.
    """
    expanded = []
    for token in text.split():
        lowered = token.lower()
        expanded.append(abbreviations_to_words.get(lowered, lowered))
    return ' '.join(expanded)

In [None]:
# Facebook
fb_dev['text'] = fb_dev['text'].apply(replace_abbreviations)
fb_test['text'] = fb_test['text'].apply(replace_abbreviations)

In [None]:
# Twitter
tw_dev['text'] = tw_dev['text'].apply(replace_abbreviations)
tw_test['text'] = tw_test['text'].apply(replace_abbreviations)

## Replacing acronyms with respective words

In [None]:
acronyms_to_words = {
    'ama': 'azienda municipale ambiente',
    'anpi': 'associazione nazionale partigiani italia',
    'cdm': 'consiglio dei ministri',
    'cgil': 'confederazione generale lavoro',
    'cnel': 'consiglio nazionale dell economia e del lavoro',
    'ddl': 'disegno di legge',
    'def': 'documento di economia e finanza',
    'eu': 'unione europea',
    'fdi': 'fratelli di italia',
    'ffoo': 'forze dell ordine',
    'gc': 'guardia costiera',
    'inps': 'istituto nazionale previdenza sociale',
    'lgbt': 'lesbiche gay bisessuali transgender',
    'lgbtq': 'lesbiche gay bisessuali transgender queer',
    'm5s': 'movimento cinque stelle',
    'nwo': 'nuovo ordine mondiale',
    'ong': 'organizzazione non governativa',
    'pd': 'partito democratico',
    'pil': 'prodotto interno lordo',
    'rai': 'radiotelevisione italiana',
    'rdc': 'reddito di cittadinanza',
    'sprar': 'sistema protezione richiedenti asilo',
    'tav': 'treno alta velocita',
    'tg': 'telegiornale',
    'ue': 'unione europea',
    'usa': 'stati uniti di america',
}

In [None]:
def replace_acronyms(text: str) -> str:
    """Lower-case every word of ``text`` and expand known acronyms.

    A word is expanded when its lower-cased form is a key of
    ``acronyms_to_words``. The result is joined by single spaces.
    """
    expanded = []
    for token in text.split():
        lowered = token.lower()
        expanded.append(acronyms_to_words.get(lowered, lowered))
    return ' '.join(expanded)

In [None]:
# Facebook
fb_dev['text'] = fb_dev['text'].apply(replace_acronyms)
fb_test['text'] = fb_test['text'].apply(replace_acronyms)

In [None]:
# Twitter
tw_dev['text'] = tw_dev['text'].apply(replace_acronyms)
tw_test['text'] = tw_test['text'].apply(replace_acronyms)

## Removing articles, pronouns, prepositions, conjunctions and numbers

In [None]:
articles = [
    "il", "lo", "la", "l'", "i", "gli", "le", 
    "un", "uno", "una", "un'", 
    "del", "dello", "della", "dei", "degli", "delle", 
    "al", "allo", "alla", "ai", "agli", "alle", 
    "dal", "dallo", "dalla", "dai", "dagli", "dalle", 
    "nel", "nello", "nella", "nei", "negli", "nelle", 
    "sul", "sullo", "sulla", "sui", "sugli", "sulle"
]

pronouns = [
    "io", "tu", "lui", "lei", "noi", "voi", "loro", 
    "mio", "mia", "miei", "mie", "tuo", "tua", "tuoi", "tue", "suo", "sua", "suoi", "sue", 
    "nostro", "nostra", "nostri", "nostre", "vostro", "vostra", "vostri", "vostre",
    "esso", "essa", "essi", "esse", 
    "chi", "cui", "che", 
    "questo", "questa", "questi", "queste", 
    "quello", "quella", "quelli", "quelle", 
    "ci", "vi", "si", "ne", "se", 
    "me", "te", "lui", "lei", "noi", "voi", "loro", "li", "le", 
    "qualcuno", "qualcosa", "nessuno", "niente", "alcuni", "altro"
]

prepositions = [
    "di", "a", "da", "in", "con", "su", "per", "tra", "fra", 
    "sopra", "sotto", "avanti", "dietro", "intorno", "attraverso", "verso", 
    "durante", "mediante", "entro", "senza", "vicino", "presso", 
    "fino", "dopo", "contro", "tra", "fra"
]

conjunctions = [
    "e", "anche", "ma", "o", "se", "perché", "quindi", "né", "che", 
    "come", "dunque", "mentre", "oppure", "però", "tuttavia", 
    "anche se", "benche", "quantunque", "sebbene", "affinché", "così", 
    "quando", "nonostante", "malgrado", "benché", "finché", "purché"
]

In [None]:
def remove_function_words(text: str) -> str:
    """Drop articles, pronouns, prepositions, conjunctions and numeric words.

    A word is removed when its lower-cased form is in one of the four
    module-level lists, or when ``str.isnumeric`` is true for it. The
    remaining words are joined by single spaces.
    """
    unwanted_words = {*articles, *pronouns, *prepositions, *conjunctions}

    kept = []
    for token in text.split():
        if token.isnumeric() or token.lower() in unwanted_words:
            continue
        kept.append(token)

    return ' '.join(kept)

In [None]:
# Facebook
fb_dev['text'] = fb_dev['text'].apply(remove_function_words)
fb_test['text'] = fb_test['text'].apply(remove_function_words)

In [None]:
# Twitter
tw_dev['text'] = tw_dev['text'].apply(remove_function_words)
tw_test['text'] = tw_test['text'].apply(remove_function_words)

## Removing the laughs

In [None]:
laughters = ['ha', 'ah', 'he', 'eh', 'hi', 'ih']

def remove_laughters(text: str) -> str:
    """Remove every word of ``text`` made only of letters used in ``laughters``.

    A word is removed when all of its characters, ignoring case, appear in
    some entry of ``laughters`` (e.g. 'ahahah', 'HEHE', 'ihih'). The
    surrounding spaces are left in place; only the ends of the result are
    stripped.

    This matches the same words as the earlier pattern
    ``(?:[ha]+|[ah]+|...)+`` because that alternation accepts any mix of the
    listed letters. The single character class avoids the exponential
    backtracking the nested quantifier caused on a long laugh immediately
    followed by another word character.
    """
    laugh_chars = ''.join(sorted({char for laughter in laughters for char in laughter}))
    if not laugh_chars:
        # Nothing to match against: only the final strip applies
        return text.strip()

    pattern = r'\b[' + re.escape(laugh_chars) + r']+\b'

    # Replace the laughter words with an empty string
    no_laughs = re.sub(pattern, '', text, flags=re.IGNORECASE)

    return no_laughs.strip()

In [None]:
# Facebook
fb_dev['text'] = fb_dev['text'].apply(remove_laughters)
fb_test['text'] = fb_test['text'].apply(remove_laughters)

In [None]:
# Twitter
tw_dev['text'] = tw_dev['text'].apply(remove_laughters)
tw_test['text'] = tw_test['text'].apply(remove_laughters)

## Replacing accented characters
Replacement of accented characters with their unaccented counterpart.

In [None]:
def remove_accents(text: str) -> str:
    """Return ``text`` NFKD-normalized with all combining characters dropped."""
    decomposed = unicodedata.normalize('NFKD', text)
    return ''.join(ch for ch in decomposed if unicodedata.combining(ch) == 0)

In [None]:
# Facebook
fb_dev['text'] = fb_dev['text'].apply(remove_accents)
fb_test['text'] = fb_test['text'].apply(remove_accents)

In [None]:
# Twitter
tw_dev['text'] = tw_dev['text'].apply(remove_accents)
tw_test['text'] = tw_test['text'].apply(remove_accents)

## Tokenization

In [None]:
def tokenization(text: str) -> List[str]:
    """Tokenize ``text`` with a ``SocialTokenizer`` created with ``lowercase=False``."""
    social_tokenizer = SocialTokenizer(lowercase=False)
    tokens = social_tokenizer.tokenize(text)
    return tokens

In [None]:
# Facebook
fb_dev['tokens'] = fb_dev['text'].apply(tokenization)
fb_test['tokens'] = fb_test['text'].apply(tokenization)

In [None]:
# Twitter
tw_dev['tokens'] = tw_dev['text'].apply(tokenization)
tw_test['tokens'] = tw_test['text'].apply(tokenization)

## Lemmatization

In [None]:
def lemmatization(text: str) -> List[str]:
    """Run the spaCy pipeline ``nlp`` on ``text`` and return each token's lemma."""
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return lemmas

In [None]:
# Facebook
fb_dev['lemmas'] = fb_dev['text'].apply(lemmatization)
fb_test['lemmas'] = fb_test['text'].apply(lemmatization)

In [None]:
# Twitter
tw_dev['lemmas'] = tw_dev['text'].apply(lemmatization)
tw_test['lemmas'] = tw_test['text'].apply(lemmatization)

## Extraction of the sixth feature: percentage of spelling errors
Spelling errors in the tweet. Each word is compared against an Italian vocabulary and, if it is not present, it is counted as a spelling error. Note: the implementation below stores the raw count of such words, not a percentage.

In [None]:
def sixth_feature(text: str) -> int:
    """Count the words of ``text`` whose lower-cased form is not in ``word_dict``."""
    unknown = [token for token in text.split() if token.lower() not in word_dict]
    return len(unknown)

In [None]:
# Facebook
fb_dev['#wrong_spellings'] = fb_dev['text'].apply(sixth_feature)
fb_test['#wrong_spellings'] = fb_test['text'].apply(sixth_feature)

In [None]:
# Twitter
tw_dev['#wrong_spellings'] = tw_dev['text'].apply(sixth_feature)
tw_test['#wrong_spellings'] = tw_test['text'].apply(sixth_feature)

## Extraction of the seventh feature: number of bad words
Number of bad words in the tweet, using an Italian bad-word list.

In [None]:
def seventh_feature(text: str) -> int:
    """Count the words of ``text`` whose lower-cased form is in ``bad_words``."""
    hits = 0
    for token in text.split():
        if token.lower() in bad_words:
            hits += 1
    return hits

In [None]:
# Facebook
fb_dev['#bad_words'] = fb_dev['text'].apply(seventh_feature)
fb_test['#bad_words'] = fb_test['text'].apply(seventh_feature)

In [None]:
# Twitter
tw_dev['#bad_words'] = tw_dev['text'].apply(seventh_feature)
tw_test['#bad_words'] = tw_test['text'].apply(seventh_feature)

## Extraction of the eighth feature: percentage of bad words
Percentage of bad words in the tweet.

In [None]:
def eighth_feature(text: str) -> int:
    """Return the integer percentage of words of ``text`` found in ``bad_words``.

    Words are compared lower-cased. Text without words yields 0.
    """
    tokens = text.split()
    if not tokens:
        return 0

    flagged = [token for token in tokens if token.lower() in bad_words]
    return (len(flagged) * 100) // len(tokens)

In [None]:
# Facebook
fb_dev['%bad_words'] = fb_dev['text'].apply(eighth_feature)
fb_test['%bad_words'] = fb_test['text'].apply(eighth_feature)

In [None]:
# Twitter
tw_dev['%bad_words'] = tw_dev['text'].apply(eighth_feature)
tw_test['%bad_words'] = tw_test['text'].apply(eighth_feature)

## Extraction of the ninth feature: polarity SentiWordNet
Polarity of the message using SentiWordNet. Specifically, each message is translated using TextBlob and then the polarity is computed, since SentiWordNet was created to find polarity in English sentences.

*Note: the polarity of a sentence is computed as the worst polarity score across words.*

Firstly, we store the english translation of each sentence. Then, the polarity is computed w.r.t. the english translations.

In [None]:
def ita_to_eng(lemmas: List[str]) -> List[str]:
    """Translate each lemma from Italian to English with ``TextBlob.translate``.

    When the translation of a lemma raises any exception, the original lemma
    is kept in its place, so the output has the same length and order as
    ``lemmas``.
    """
    english = []
    for item in lemmas:
        blob = TextBlob(item)
        try:
            rendered = str(blob.translate(from_lang='it', to='en'))
        except Exception:
            # Best effort: fall back to the untranslated lemma
            rendered = str(item)
        english.append(rendered)

    return english

In [None]:
# Facebook
#fb_dev['lemmas_en'] = fb_dev['lemmas'].apply(ita_to_eng)
#fb_test['lemmas_en'] = fb_test['lemmas'].apply(ita_to_eng)

In [None]:
# Twitter
#tw_dev['lemmas_en'] = tw_dev['lemmas'].apply(ita_to_eng)
#tw_test['lemmas_en'] = tw_test['lemmas'].apply(ita_to_eng)

In [None]:
def get_polarity_sentiwordnet(text: str) -> float:
    """Return the lowest SentiWordNet polarity among the words of ``text``.

    Each word produced by ``TextBlob(text).words`` is looked up with
    ``sentiwordnet.senti_synsets``; only the first synset returned is
    scored, as ``pos_score() - neg_score()``. The minimum over all scored
    words is returned.

    Returns 0.0 when ``text`` is empty, when no word has a synset, or when
    any exception is raised (the error is printed).
    """
    try:
        if not text:
            return 0.0

        tokens = TextBlob(text).words
        polarity = float('inf')

        for token in tokens:
            synsets = list(sentiwordnet.senti_synsets(token))

            if synsets:
                # Only the first listed sense is scored (no averaging)
                synset = synsets[0]
                polarity = min(polarity, synset.pos_score() - synset.neg_score())

        # Still at the sentinel: no word had a synset
        if polarity == float('inf'):
            return 0.0

        return polarity
    except Exception as e:
        print(f'Error computing polarity {e} - {text}')
        return 0.0  # Return a neutral polarity in case of errors

In [None]:
# Facebook
#fb_dev['polaritySentiWordNet'] = fb_dev['text_en'].apply(get_polarity_sentiwordnet)
#fb_test['polaritySentiWordNet'] = fb_test['text_en'].apply(get_polarity_sentiwordnet)

In [None]:
# Twitter
#tw_dev['polaritySentiWordNet'] = tw_dev['text_en'].apply(get_polarity_sentiwordnet)
#tw_test['polaritySentiWordNet'] = tw_test['text_en'].apply(get_polarity_sentiwordnet)

## Extraction of tenth feature: polarity TextBlob
Polarity TextBlob, where the polarity value is computed using a TextBlob function. As before, the message is first translated to english.

*Note: unlike the SentiWordNet feature, this polarity is the single score TextBlob computes for the whole text.*

In [None]:
def get_polarity_textblob(text: str) -> float:
    """Return ``TextBlob(text).sentiment.polarity``.

    On any exception the error is printed and 0.0 is returned.
    """
    try:
        return TextBlob(text).sentiment.polarity
    except Exception as err:
        print(f'Error computing polarity {err} - {text}')
        return 0.0  # neutral fallback

In [None]:
# Facebook
#fb_dev['polarityTextBlob'] = fb_dev['text_en'].apply(get_polarity_textblob)
#fb_test['polarityTextBlob'] = fb_test['text_en'].apply(get_polarity_textblob)

In [None]:
# Twitter
#tw_dev['polarityTextBlob'] = tw_dev['text_en'].apply(get_polarity_textblob)
#tw_test['polarityTextBlob'] = tw_test['text_en'].apply(get_polarity_textblob)

## Extraction of the eleventh feature: subjectivity TextBlob
Subjectivity TextBlob, another value computed using a TextBlob function.

In [None]:
def get_subjectivity_textblob(text: str) -> float:
    """Return ``TextBlob(text).sentiment.subjectivity``, or 0.0 on any exception."""
    try:
        return TextBlob(text).sentiment.subjectivity
    except Exception:
        return 0.0  # fallback value in case of errors

In [None]:
# Facebook
# Use the 'text' column, as the Twitter cell does: no 'text_en' column is
# created while the translation cells are commented out.
# NOTE(review): this scores Italian text with TextBlob; switch to the English
# translations if the translation step is re-enabled.
fb_dev['subjectivityTextBlob'] = fb_dev['text'].apply(get_subjectivity_textblob)
fb_test['subjectivityTextBlob'] = fb_test['text'].apply(get_subjectivity_textblob)

In [None]:
# Twitter
tw_dev['subjectivityTextBlob'] = tw_dev['text'].apply(get_subjectivity_textblob)
tw_test['subjectivityTextBlob'] = tw_test['text'].apply(get_subjectivity_textblob)

## Extraction of the final feature: polarity with Italian lexicon
Polarity based on the *Distributional Polarity Lexicon (IT)* (Castellucci et al., 2016). It consists of polarity scores for positivity, negativity and neutrality.

*Note: the polarity of a sentence is computed by taking the highest 'negativity' value across words.*

In [None]:
def load_negativity_lexicon(lexicon_file):
    """Read a polarity lexicon and return a ``{word: negativity score}`` dict.

    Each usable line has two tab-separated fields: the word and three
    comma-separated floats (positive, negative, neutral). Lines that do not
    split into exactly two fields are skipped; only the negative score is
    kept.
    """
    scores_by_word = {}
    with open(lexicon_file, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            if len(fields) != 2:
                continue

            entry, raw_scores = fields
            _, negativity, _ = [float(value) for value in raw_scores.split(',')]
            scores_by_word[entry] = negativity

    return scores_by_word

In [None]:
negativity_lexicon = load_negativity_lexicon(polarity_lexicon_path)

In [None]:
def get_polarity_DPL(tokens: List[str], lexicon=negativity_lexicon) -> float:
    """Return the highest negativity score among the tokens found in ``lexicon``.

    The result is never below 0.0, which is also returned when no token is
    in the lexicon.
    """
    known_scores = (lexicon[token] for token in tokens if token in lexicon)
    # The leading 0.0 is the floor and wins ties
    return max([0.0, *known_scores])

In [None]:
# Facebook
fb_dev['polarityDPL'] = fb_dev['tokens'].apply(get_polarity_DPL)
fb_test['polarityDPL'] = fb_test['tokens'].apply(get_polarity_DPL)

In [None]:
# Twitter
tw_dev['polarityDPL'] = tw_dev['tokens'].apply(get_polarity_DPL)
tw_test['polarityDPL'] = tw_test['tokens'].apply(get_polarity_DPL)

## PoS
Part of Speech (PoS) tagging aims at grouping words by their grammatical class (e.g. noun, verb, adjective, etc.).
PoS tagging can provide useful information about the role of a word, and of its neighbours, in a sentence.

In [None]:
def PoS_tagging(text: str) -> List[str]:
    """Run the spaCy pipeline ``nlp`` on ``text`` and return each token's ``pos_`` tag."""
    return [token.pos_ for token in nlp(text)]

In [None]:
# Facebook
fb_dev['PoS'] = fb_dev['text'].apply(PoS_tagging)
fb_test['PoS'] = fb_test['text'].apply(PoS_tagging)

In [None]:
# Twitter
tw_dev['PoS'] = tw_dev['text'].apply(PoS_tagging)
tw_test['PoS'] = tw_test['text'].apply(PoS_tagging)

##

## Store pre-processed dataset

In [None]:
# Facebook dataset
# Write to the path constants defined in the "Path" section (the same
# locations as before) and create the target folders if they are missing.
for frame, out_path in (
    (fb_dev, fb_dev_preprocessed_path),
    (fb_test, fb_test_preprocessed_path),
):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    frame.to_csv(out_path, index=False)

In [None]:
# Twitter dataset
# Write to the path constants defined in the "Path" section (the same
# locations as before) and create the target folders if they are missing.
for frame, out_path in (
    (tw_dev, tw_dev_preprocessed_path),
    (tw_test, tw_test_preprocessed_path),
):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    frame.to_csv(out_path, index=False)