# Description

This notebook contains the preprocessing pipeline tried (not much success) and the dataset translation functionality, it was implemented as an experiment checking whether translating would fix
some of the typos and yield better results (not used because of bad performance).

In [21]:
import numpy as np
import pandas as pd
import re
import unicodedata
from spellchecker import SpellChecker
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from nltk import SnowballStemmer
import spacy
from nltk.tokenize import TweetTokenizer  # Import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, f1_score
from pathlib import Path
import os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from joblib import Parallel, delayed
import json
# Download necessary NLTK data
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")

try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")

try:
    nltk.data.find("wordnet")
except LookupError:
    nltk.download("wordnet")

# Load spaCy Spanish model
try:
    nlp = spacy.load("es_core_news_sm")
except OSError:
    print("Downloading es_core_news_sm model for spaCy...")
    spacy.cli.download("es_core_news_sm")
    nlp = spacy.load("es_core_news_sm")


# --- DATA CLEANING FUNCTIONS ---
def clean_keywords(keyword):
    cleaned = re.sub(r'%20', ' ', keyword)
    return cleaned

def remove_accents(keyword):
    normalized = unicodedata.normalize('NFD', keyword)
    cleaned = ''.join([char for char in normalized if unicodedata.category(char) != 'Mn'])
    return cleaned

def remove_punctuation(keyword):
    cleaned = re.sub(r"[!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n -' ]"," ",keyword)
    return cleaned

def normalize_money(keyword):
    money_symbols = [r'\$', r'€', r'£', r'¥', r'₹', r'₣']
    cleaned = keyword 
    for symbol in money_symbols:
        cleaned = re.sub(symbol, "dinero", cleaned)
    return cleaned

def normalize_pct(keyword):
    cleaned = re.sub(r"%", " porcentaje", keyword)
    return cleaned

def remove_jaja(keyword):
    jaja_set = ("jaja", "jeje", "jiji", "jojo", "juju")
    return " ".join([m for m in keyword.split() if not any(ja_word in m for ja_word in jaja_set)])

def is_exaggeration(word):
    for c in word:
        if len(c) == 1:
            return False
        if word.count(c) / len(word) > 0.6:
            return True
    return False

def remove_exaggerations(keyword):
    return " ".join([m for m in keyword.split() if not is_exaggeration(m)])

def remove_hashtag(keyword):
    cleaned = re.sub(r'INI_HASHTAG.*?END_HASHTAG', '', keyword)   
    return cleaned

def remove_unknown(keyword):
    return " ".join([m for m in keyword.split() if not "unknown" in m])

def replace_multiplicador(input_string):
    cleaned = re.sub(r'x(\d+)', r'multiplicador \1', input_string)
    return cleaned

def remove_user(input_string):
    cleaned = re.sub(r'\buser\b', '', input_string)
    return cleaned


def remove_extra_whitespaces(input_string):
    cleaned = re.sub(r'\s+', ' ', input_string).strip()
    return cleaned

def remove_numbers(keyword):
    cleaned = re.sub(r'\d+', '', keyword)
    
    return cleaned

def remove_emojis(keyword):
    return re.sub(r"[^\w\s,!?@#áéíóúÁÉÍÓÚñÑ]", "", keyword)

def remove_single_chars(keyword):
    return " ".join([m for m in keyword.split() if len(m) > 2])

def remove_consecutive_duplicates(word):
    cleaned = re.sub(r"(.)\1+", r"\1", word)
    return cleaned

# --- SPELL CHECKER FUNCTIONS ---

spell = SpellChecker(language='es')
word_cache = {}  # Initialize the word cache

def correct_spelling(keyword):
    corrected_words = []
    for word in keyword.split():
        # Check if the word is already in the cache
        if word in word_cache:
            correction = word_cache[word]
        else:
            # Correct the word if not in the cache
            correction = spell.correction(word)
            if correction is None:  # Handle unknown words
                correction = word  # Keep the original word
            word_cache[word] = correction  # Store in the cache

        corrected_words.append(correction)
    return " ".join(corrected_words)

def correct_spelling_parallel(keywords, n_jobs=-1):
    results = Parallel(n_jobs=n_jobs)(delayed(correct_spelling)(keyword) for keyword in keywords)
    return results

# --- PIPELINE FUNCTIONS ---
def clean_pipeline(input_string):
    input_string = input_string.lower()
    input_string = clean_keywords(input_string)
    input_string = remove_emojis(input_string)
    input_string = remove_hashtag(input_string)
    input_string = normalize_money(input_string)
    input_string = normalize_pct(input_string)
    input_string = remove_jaja(input_string)
    input_string = remove_consecutive_duplicates(input_string)
    input_string = remove_unknown(input_string)
    input_string = replace_multiplicador(input_string)
    input_string = remove_punctuation(input_string)
    input_string = remove_user(input_string)
    input_string = remove_numbers(input_string)
    input_string = remove_single_chars(input_string)
    input_string = remove_extra_whitespaces(input_string)
    input_string = remove_accents(input_string)
    
    return input_string

def lemmatize_spanish(text):
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc]
    return " ".join(lemmas)

# --- DATA LOADING FUNCTIONS ---

def get_users_data(data_path: str, proccess_data: bool = True) -> tuple:
    path = Path(data_path)
    all_users_data = []
    users_order = []
    tokenizer = TweetTokenizer()  # Initialize TweetTokenizer
    stop_words = set(stopwords.words('spanish'))  # Load Spanish stopwords
    lemmatizer = WordNetLemmatizer()

    for element in path.iterdir():
        if element.is_file():
            try:
                user_id = re.findall(r"[0-9]+", element.name)[0]
                users_order.append(user_id)
            except IndexError:
                print(f"Warning: Could not extract user ID from filename: {element.name}")
                continue

            try:
                with open(path / element, 'r', encoding='utf-8') as json_file:  # Use Path object
                    json_data = json.load(json_file)
                    messages = [str(record['message']) for record in json_data]
                    user_document = " ".join(messages)  # Combine messages into a single document

                    if proccess_data:
                        user_document = clean_pipeline(user_document)
                        # user_document = correct_spelling(user_document)  # Spelling correction
                        user_document = lemmatize_spanish(user_document)
                        tokens = tokenizer.tokenize(user_document)  # Tokenize the document
                        tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]  # Lemmatize & remove stopwords
                        user_document = " ".join(tokens)  # Rejoin the tokens

                    all_users_data.append(user_document)

            except Exception as e:
                print(f"Error processing file {element.name}: {e}")
                continue
    return all_users_data, users_order

def tokenize_and_vectorize(data_path: str, ngram_range=(1, 1)) -> tuple:
    """ Tokenizes and vectorizes messages with TF-IDF. """
    vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    all_users_data, users_order = get_users_data(data_path)

    X = vectorizer.fit_transform(all_users_data)
    df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
    return df, users_order


def analyze_sentiment(text: str) -> int:
    """
    Analyzes the sentiment of the given text using VADER and discretizes it into:
    -1: Negative
     0: Neutral
     1: Positive
    """
    analyzer = SentimentIntensityAnalyzer()
    vs = analyzer.polarity_scores(text)
    compound_score = vs['compound']

    if compound_score >= 0.05:
        return 1  # Positive
    elif compound_score <= -0.05:
        return -1  # Negative
    else:
        return 0

def tokenize_and_vectorize_with_sentiment(data_path: str, ngram_range=(1, 1)) -> tuple:
    """ Tokenizes, vectorizes with TF-IDF, and adds sentiment analysis. """
    vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    all_users_data, users_order = get_users_data(data_path)

    X = vectorizer.fit_transform(all_users_data)
    df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

    sentiment_scores = [analyze_sentiment(text) for text in all_users_data]
    df['sentiment'] = sentiment_scores

    return df, users_order

# --- MAIN EXECUTION ---
if __name__ == "__main__":
    # Data loading
    data_path = os.getcwd() + '/data/task1/train/subjects/'
    df_vectorized, users_order = tokenize_and_vectorize_with_sentiment(data_path=data_path, ngram_range=(1, 2)) # Use bigrams

    # Load target variable
    target_path = os.getcwd() + '/data/task1/train/gold_task1.txt'
    target_col = pd.read_csv(filepath_or_buffer=target_path, delimiter=',').to_numpy()
    users_tags = {re.findall(r"[0-9]+", target_col[i][0])[0]: target_col[i][1] for i in range(len(target_col))}

    # Create supervised dataframe
    df_supervised = df_vectorized.copy()
    target = np.array([users_tags[user_id] for user_id in users_order], np.int8)
    df_supervised.insert(loc=len(df_supervised.columns), column='Target', value=target)

    # --- MODEL TRAINING AND EVALUATION ---

    def train_and_evaluate(seed, C=1.0): # Added C parameter
        X, y = df_supervised.drop(columns=['Target'], axis=1), df_supervised['Target'].to_numpy()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed, stratify=y, shuffle=True)

        clf = LinearSVC(dual=False, max_iter=10000, C=C, class_weight='balanced')  # Tune C, handle imbalance
        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_true=y_test, y_pred=y_pred)
        f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted') # Calculate F1 score

        return acc, f1

    # Hyperparameter tuning using GridSearchCV
    X, y = df_supervised.drop(columns=['Target'], axis=1), df_supervised['Target'].to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y, shuffle=True)

    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
    grid_search = GridSearchCV(LinearSVC(dual=False, max_iter=10000, class_weight='balanced'), 
                                param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    print("Best parameters found by GridSearchCV:", grid_search.best_params_)
    best_C = grid_search.best_params_['C']

    # Train and evaluate the model with the best C parameter
    num_seeds = 5 # Reduce number of seeds
    seeds = [42, 123, 56, 789, 10]

    accuracies = []
    f1_scores = []
    for seed in seeds:
        acc, f1 = train_and_evaluate(seed, C=best_C)
        accuracies.append(acc)
        f1_scores.append(f1)

    mean_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)
    mean_f1 = np.mean(f1_scores)
    std_f1 = np.std(f1_scores)

    print(f"Mean SVM accuracy over {num_seeds} seeds: {mean_accuracy:.4f}")
    print(f"Standard deviation of SVM accuracy over {num_seeds} seeds: {std_accuracy:.4f}")
    print(f"Mean SVM F1 score over {num_seeds} seeds: {mean_f1:.4f}")
    print(f"Standard deviation of SVM F1 score over {num_seeds} seeds: {std_f1:.4f}")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maxi.rodriguez\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Best parameters found by GridSearchCV: {'C': 10}
Mean SVM accuracy over 5 seeds: 0.6543
Standard deviation of SVM accuracy over 5 seeds: 0.0246
Mean SVM F1 score over 5 seeds: 0.6520
Standard deviation of SVM F1 score over 5 seeds: 0.0265


In [None]:
from deep_translator import GoogleTranslator
from concurrent.futures import ThreadPoolExecutor, as_completed

def parallel_translate_documents(documents, target_lang="en", source_lang="auto", max_workers=8, verbose=False):
    """Translate a list of documents in parallel using deep-translator and return them in the same order."""
    
    def translate_single(index, text):
        try:
            translation = GoogleTranslator(source=source_lang, target=target_lang).translate(text)
            return index, translation
        except Exception as e:
            if verbose:
                print(f"Translation failed for index {index}: {e}")
            return index, text  # Return original text if translation fails

    translated_docs = [None] * len(documents)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(translate_single, i, doc) for i, doc in enumerate(documents)]

        for future in as_completed(futures):
            index, translation = future.result()
            translated_docs[index] = translation

    return translated_docs
