In [79]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
import joblib
from pathlib import Path
from sklearn.preprocessing import LabelEncoder

In [80]:
# CSV that provides the "label" and "topic" columns read in the following cells.
csv_aggressive_path = "../data/topic/topics_aggressive.csv"
# Disabled: alternative "light" topics file. Forward slashes are used because the
# previous backslash form contained "\t", which Python would read as a tab.
# csv_light_path = "../data/topic/topics_light.csv"

df_aggresive = pd.read_csv(csv_aggressive_path)
# df_light = pd.read_csv(csv_light_path)

In [81]:
# Raw string labels, one per row of the topics CSV.
y = df_aggresive["label"].values

# Integer-encode the labels. Per the recorded output below, the mapping is
# 'negative' -> 0, 'neutral' -> 1, 'positive' -> 2; use
# label_encoder.inverse_transform to map codes back to names.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # codes: 0='negative', 1='neutral', 2='positive'

# Quick sanity check of the raw labels and their dtype.
print(y[:5])
print(y.dtype)

# Same check for the encoded labels.
print("\n",y_encoded[:5])
print(y_encoded.dtype)

['neutral' 'neutral' 'negative' 'positive' 'positive']
object

 [1 1 0 2 2]
int32


In [82]:
# Topic id of every row as a NumPy array (int64 in the recorded output); later
# passed to load_embeddings_with_optional_topic to build one-hot topic features.
aggressive_topics  = df_aggresive["topic"].values

# Preview the ids and confirm their dtype.
print(aggressive_topics)
print(aggressive_topics.dtype)

[ 5  4 51 ... 99  8  8]
int64


In [83]:
def check_files_exist(*file_paths):
    """Verify that every given path exists on disk.

    Paths are checked in the order given; the check stops at the first one
    that is missing.

    Args:
        *file_paths: Any number of path strings (or path-like objects).

    Raises:
        FileNotFoundError: For the first path that does not exist.
    """
    for candidate in map(Path, file_paths):
        if candidate.exists():
            continue
        raise FileNotFoundError(f"Il file '{candidate}' non esiste.")

In [84]:
# Directory that holds the pre-computed embedding matrices (one .npy file each).
embeddings_dir_path = "../data/embeddings/"

embeddings_bert_path = os.path.join(embeddings_dir_path, "embeddings_bert.npy")
embeddings_sbert_path = os.path.join(embeddings_dir_path, "embeddings_sbert.npy")
embeddings_finbert_path = os.path.join(embeddings_dir_path, "embeddings_finbert.npy")
embeddings_word2vec_path = os.path.join(embeddings_dir_path, "embeddings_word2vec.npy")
# Backward-compatible alias: this variable was originally misspelled (missing the
# leading "e"); keep the old name so any cell still referring to it keeps working.
mbeddings_word2vec_path = embeddings_word2vec_path

# Fail fast, before any model code runs, if one of the embedding files is missing.
check_files_exist(
    embeddings_bert_path, embeddings_sbert_path, embeddings_finbert_path, embeddings_word2vec_path
)

print("File embeddings presenti")

File embeddings presenti


In [85]:
def load_embeddings_with_optional_topic(embedding_path, topic_ids=None):
    """Load an embedding matrix from a ``.npy`` file, optionally adding topic features.

    Parameters
    ----------
    embedding_path : str or path-like
        Location of the ``.npy`` file; passed straight to ``np.load``.
    topic_ids : array-like of shape (n_samples,), optional
        One topic id per embedding row. Any array-like is accepted (NumPy
        array, list, pandas Series). When ``None`` (the default) the loaded
        embeddings are returned unchanged.

    Returns
    -------
    numpy.ndarray
        The loaded array, or, when ``topic_ids`` is given, the loaded array
        with the one-hot encoded topic columns appended on the right.

    Raises
    ------
    ValueError
        If the number of topic ids differs from the number of embedding rows.
    """
    X = np.load(embedding_path)

    if topic_ids is None:
        return X

    # np.asarray lets callers pass lists or Series, which have no usable .reshape.
    topic_column = np.asarray(topic_ids).reshape(-1, 1)

    # Validate before fitting the encoder so a misaligned input gives a clear
    # message instead of an opaque shape error from np.hstack.
    if topic_column.shape[0] != X.shape[0]:
        raise ValueError(
            f"topic_ids has {topic_column.shape[0]} entries but the embeddings "
            f"have {X.shape[0]} rows."
        )

    encoder = OneHotEncoder(sparse_output=False)
    topic_features = encoder.fit_transform(topic_column)
    return np.hstack([X, topic_features])


In [102]:
def split_data(X, y, test_size=0.1, val_size=0.2, random_state=42):
    """Split ``X`` and ``y`` into stratified train, validation and test subsets.

    ``test_size`` and ``val_size`` are both fractions of the *full* dataset;
    whatever is left becomes the training set. Both splits are stratified on
    the labels and use the same ``random_state``.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Labels aligned with ``X``; used for stratification.
    test_size : float, default 0.1
        Fraction of all samples assigned to the test set, strictly in (0, 1).
    val_size : float, default 0.2
        Fraction of all samples assigned to the validation set, strictly in (0, 1).
    random_state : int, default 42
        Seed forwarded to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.

    Raises
    ------
    ValueError
        If either size is not a fraction in (0, 1), or if together they leave
        no data for the training set.
    """
    # Validate up front: otherwise bad sizes only fail inside scikit-learn,
    # after the first split, with a message about a derived ratio.
    if not (0 < test_size < 1 and 0 < val_size < 1):
        raise ValueError(
            "test_size and val_size must be fractions strictly between 0 and 1; "
            f"got test_size={test_size!r}, val_size={val_size!r}."
        )
    if test_size + val_size >= 1:
        raise ValueError(
            "test_size + val_size must be smaller than 1 so that training data remains; "
            f"got {test_size!r} + {val_size!r}."
        )

    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    # Re-express the validation fraction relative to what remains after the
    # test set has been removed.
    val_ratio = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=val_ratio, stratify=y_temp, random_state=random_state
    )
    return X_train, X_val, X_test, y_train, y_val, y_test


In [104]:
# Plain BERT embeddings, loaded as stored on disk.
embeddings_bert = load_embeddings_with_optional_topic(embeddings_bert_path)
# Same embeddings with the one-hot encoded topic ids appended as extra columns.
embeddings_bert2 = load_embeddings_with_optional_topic(embeddings_bert_path, aggressive_topics)

# Compare shapes: the row count should match, and the topic-augmented matrix
# should be wider (768 vs 873 columns in the recorded output).
print(embeddings_bert.shape)
print(embeddings_bert2.shape)

(4821, 768)
(4821, 873)


In [90]:
def show_label_distribution(y_train, y_val, y_test):
    """Summarise how the labels are distributed across the three splits.

    Args:
        y_train: Labels of the training set.
        y_val: Labels of the validation set.
        y_test: Labels of the test set.

    Returns:
        pandas.DataFrame indexed by label with columns 'Train', 'Validation'
        and 'Test'. Each cell is a string of the form "<count> (<percent>%)",
        where the percentage is the label's share within that split, rounded
        to two decimals. Labels absent from a split are reported with count 0.
    """
    splits = {'Train': y_train, 'Validation': y_val, 'Test': y_test}

    # Absolute counts per label, one column per split.
    counts = pd.DataFrame({
        split_name: pd.Series(labels).value_counts().sort_index()
        for split_name, labels in splits.items()
    })
    counts = counts.fillna(0).astype(int)

    # Share of each label within its own split, as text with a trailing '%'.
    column_totals = counts.sum(axis=0)
    shares = (counts.div(column_totals, axis=1) * 100).round(2)
    shares_text = shares.astype(str) + '%'

    # Combine counts and shares into a single "count (share%)" cell.
    return counts.astype(str) + ' (' + shares_text + ')'


In [103]:
# Split configuration: both values are passed to split_data, which treats them
# as fractions of the full dataset.
test_size = 0.1
val_size = 0.1
# Features used for this experiment: plain BERT embeddings (no topic columns).
X = embeddings_bert

# Note: the raw string labels `y` (not `y_encoded`) are split here, so the
# distribution table below is indexed by label name.
X_train, X_val, X_test, y_train, y_val, y_test = split_data(
    X, y, test_size=test_size, val_size = val_size
)

# Report the size of each subset.
print(f"Train Set Size: {len(X_train)}")
print(f"Validation Set Size: {len(X_val)}")
print(f"Test Set Size: {len(X_test)}")

# Check that stratification kept the class proportions similar across subsets.
distribuzione = show_label_distribution(y_train, y_val, y_test)
print("\n", distribuzione)

Train Set Size: 3855
Validation Set Size: 483
Test Set Size: 483

                   Train    Validation          Test
negative    482 (12.5%)   61 (12.63%)   60 (12.42%)
neutral   2284 (59.25%)  286 (59.21%)  286 (59.21%)
positive  1089 (28.25%)  136 (28.16%)  137 (28.36%)
