### Libraries installation

In [None]:
!pip install joblib
import os
import re

import joblib
import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from imblearn.over_sampling import RandomOverSampler
from keras import layers
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras.layers import Embedding, Layer, Dense, Dropout, MultiHeadAttention, LayerNormalization, Input, GlobalAveragePooling1D, LSTM, Bidirectional
from keras.models import Sequential, Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from langdetect import detect
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC


### Functions definition

In [1]:
def convertir_a_meses(fecha_str):
    """Convert a Spanish relative-time string (e.g. "hace 2 años") to months.

    The first integer found in the string is used as the quantity; when the
    string has no digits but contains "un" (as in "un mes" / "una semana"),
    the quantity is 1. Matching is case-insensitive.

    Args:
        fecha_str: Text containing a quantity and one of the units
            "año", "mes", "semana" or "día".

    Returns:
        The duration in months (int for years/months, float for weeks/days),
        or None when the input is not a string, has no quantity, or has no
        recognised unit.
    """
    if not isinstance(fecha_str, str):
        return None

    # Lower-case once so the quantity word and the unit are matched alike.
    texto = fecha_str.lower()

    numeros = re.findall(r'\d+', texto)
    if numeros:
        numero = int(numeros[0])  # Take the first number found
    elif "un" in texto:
        numero = 1  # "un"/"una" stands for a quantity of one
    else:
        return None

    if 'año' in texto:
        return numero * 12
    if 'mes' in texto:
        return numero
    if 'semana' in texto:
        return numero / 4  # Approximating a month as 4 weeks
    if 'día' in texto or 'dia' in texto:
        return numero / 30  # Approximating a month as 30 days
    return None
    
def top_words(text, N):
    """Return the N most frequent whitespace-separated words across `text`.

    Args:
        text: Iterable of strings (one per document).
        N: Maximum number of words to return.

    Returns:
        List of words ordered from most to least frequent.
    """
    # Local import: a plain Counter provides most_common() without relying on
    # nltk, which this file only imports in a later cell.
    from collections import Counter

    all_words = ' '.join(text).split()
    word_counts = Counter(all_words)
    return [word for word, _ in word_counts.most_common(N)]
def es_espanol(texto):
    """Return True when `texto` is detected as Spanish, False otherwise.

    Non-string or blank input, and any detection failure, yield False so the
    function can be used directly as a row filter.

    NOTE(review): langdetect results may vary between runs unless its
    detector seed is fixed — confirm whether reproducible filtering is needed.
    """
    # Missing values (None/NaN) and empty text cannot be classified.
    if not isinstance(texto, str) or not texto.strip():
        return False
    try:
        return detect(texto) == 'es'
    except Exception:
        # Best-effort filter: treat undetectable text as "not Spanish", but do
        # not swallow KeyboardInterrupt/SystemExit like a bare `except` would.
        return False

### Download and preprocessing of the dataset

In [None]:
# Load the raw reviews and keep only the rows detected as Spanish.
df = pd.read_excel('TODAS_ESTACIONES.xlsx')
df = df[df['Review'].apply(es_espanol)]

# Remove duplicated reviews, then rows missing the text or the label.
df = df.drop_duplicates(subset=['Review'])
df = df.dropna(subset=['Review', 'Real'])

# `resultados` is the cleaned dataset used by every experiment below.
# `df` is already de-duplicated and NA-free, so repeating both steps would
# change nothing; an independent copy is enough.
resultados = df.copy()



In [None]:
import nltk
from sklearn.model_selection import StratifiedKFold

# Download the list of Spanish stop words from NLTK.
# The corpus must be downloaded before `stopwords.words` can read it,
# so the statement order below matters.
nltk.download('stopwords')
from nltk.corpus import stopwords
# Spanish stop words; passed to TfidfVectorizer(stop_words=...) in the
# SVM and Random Forest experiments.
spanish_stopwords = stopwords.words('spanish')

## Experiments with SVM classifier 

In [None]:
# Stratified 5-fold cross-validation of a TF-IDF + linear SVM classifier.
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold results (one entry appended per fold)
precision_by_category = []
recall_by_category = []
f1_score_by_category = []
roc_auc_score_by_category = []
confusion_matrices = []
X = resultados['Review'].tolist()
y = resultados['Real'].tolist()

# Perform cross-validation
for train_index, test_index in stratified_kfold.split(X, y):
    X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
    y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]

    # Pipeline: TF-IDF feature extraction followed by the SVM classifier
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=spanish_stopwords)),  # TF-IDF vectorizer with Spanish stop words
        ('clf', SVC(kernel='linear'))  # Linear kernel so the coefficients can be interpreted
    ])

    # Oversample the training fold only, so the test fold keeps its original
    # class distribution. The sampler needs a 2-D array, hence the reshape.
    oversampler = RandomOverSampler(random_state=321)
    X_train_resampled, y_train_resampled = oversampler.fit_resample(np.array(X_train).reshape(-1, 1), y_train)

    # Train the classifier with the resampled training dataset
    pipeline.fit(X_train_resampled.ravel(), y_train_resampled)

    # Hard class predictions for precision/recall/F1 and the confusion matrix
    predictions = pipeline.predict(X_test)
    # Continuous decision scores for ROC AUC: ranking hard labels would
    # collapse the ROC curve to a single operating point.
    # Assumes 'Real' is a binary label — TODO confirm.
    decision_scores = pipeline.decision_function(X_test)

    # Calculate and store metrics per category
    precision_by_category.append(precision_score(y_test, predictions, average=None))
    recall_by_category.append(recall_score(y_test, predictions, average=None))
    f1_score_by_category.append(f1_score(y_test, predictions, average=None))
    roc_auc_score_by_category.append(roc_auc_score(y_test, decision_scores))

    # Calculate confusion matrix for this fold
    confusion_matrices.append(confusion_matrix(y_test, predictions))

# Calculate mean metrics per category
mean_precision = np.mean(precision_by_category, axis=0)
mean_recall = np.mean(recall_by_category, axis=0)
mean_f1_score = np.mean(f1_score_by_category, axis=0)
mean_roc_auc_score = np.mean(roc_auc_score_by_category, axis=0)

# Print mean metrics per category
print("Mean Precision:", mean_precision)
print("Mean Recall:", mean_recall)
print("Mean F1 Score:", mean_f1_score)
print("Mean ROC AUC Score:", mean_roc_auc_score)

# Calculate mean confusion matrix
mean_confusion_matrix = np.mean(confusion_matrices, axis=0)

# Print mean confusion matrix
print("Mean Confusion Matrix:\n", mean_confusion_matrix)

# Save the model
# NOTE(review): this persists the pipeline fitted on the LAST fold only, not
# a model trained on the full dataset — confirm that is what is intended.
joblib.dump(pipeline, 'modelo_svm_all.pkl')

## Experiments with Random Forest

In [None]:
# Stratified 5-fold cross-validation of a TF-IDF + Random Forest classifier.
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold results (one entry appended per fold)
precision_by_category = []
recall_by_category = []
f1_score_by_category = []
roc_auc_score_by_category = []
confusion_matrices = []
X = resultados['Review'].tolist()
y = resultados['Real'].tolist()

# Perform cross-validation
for train_index, test_index in stratified_kfold.split(X, y):
    X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
    y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]

    # Pipeline: TF-IDF feature extraction followed by the Random Forest
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=spanish_stopwords)),  # TF-IDF vectorizer with Spanish stop words
        ('clf', RandomForestClassifier(n_estimators=1000, random_state=2345))  # 1000-tree forest with a fixed seed
    ])

    # Oversample the training fold only, so the test fold keeps its original
    # class distribution. The sampler needs a 2-D array, hence the reshape.
    oversampler = RandomOverSampler(random_state=321)
    X_train_resampled, y_train_resampled = oversampler.fit_resample(np.array(X_train).reshape(-1, 1), y_train)

    # Train the classifier with the resampled training dataset
    pipeline.fit(X_train_resampled.ravel(), y_train_resampled)

    # Hard class predictions for precision/recall/F1 and the confusion matrix
    predictions = pipeline.predict(X_test)
    # Probability of the second class (classes_[1]) for ROC AUC: ranking hard
    # labels would collapse the ROC curve to a single operating point.
    # Assumes 'Real' is a binary label — TODO confirm.
    positive_scores = pipeline.predict_proba(X_test)[:, 1]

    # Calculate and store metrics per category
    precision_by_category.append(precision_score(y_test, predictions, average=None))
    recall_by_category.append(recall_score(y_test, predictions, average=None))
    f1_score_by_category.append(f1_score(y_test, predictions, average=None))
    roc_auc_score_by_category.append(roc_auc_score(y_test, positive_scores))

    # Calculate confusion matrix for this fold
    confusion_matrices.append(confusion_matrix(y_test, predictions))

# Calculate mean metrics per category
mean_precision = np.mean(precision_by_category, axis=0)
mean_recall = np.mean(recall_by_category, axis=0)
mean_f1_score = np.mean(f1_score_by_category, axis=0)
mean_roc_auc_score = np.mean(roc_auc_score_by_category, axis=0)

# Print mean metrics per category
print("Mean Precision:", mean_precision)
print("Mean Recall:", mean_recall)
print("Mean F1 Score:", mean_f1_score)
print("Mean ROC AUC Score:", mean_roc_auc_score)

# Calculate mean confusion matrix
mean_confusion_matrix = np.mean(confusion_matrices, axis=0)

# Print mean confusion matrix
print("Mean Confusion Matrix:\n", mean_confusion_matrix)

# Save the model
# NOTE(review): this persists the pipeline fitted on the LAST fold only, not
# a model trained on the full dataset — confirm that is what is intended.
joblib.dump(pipeline, 'modelo_rf_all.pkl')


## Experiments with Transformer Networks

In [None]:
# Features (review text) and target labels taken from the cleaned dataset
X, y = resultados['Review'], resultados['Real']

# Hold-out split: 75% training / 25% testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
## Tokenizer and padding settings for the Transformer network
max_len = 20             # Sequences are padded/truncated to this many tokens
oov_token = '00_V'       # Placeholder token for words not seen during fitting
padding_type = 'post'    # Pad at the end of short sequences
trunc_type = 'post'      # Truncate the end of long sequences

# Pass the OOV token to the tokenizer; without it, unseen words are silently
# dropped from the sequences and `oov_token` above is never used.
tokenizer = Tokenizer(oov_token=oov_token)
tokenizer.fit_on_texts(X_train)
# +1 because word indices start at 1; index 0 is what pad_sequences fills with
vocab_size = len(tokenizer.word_index) + 1
print("Vocab Size: ",vocab_size)

In [None]:
# Encode each review as a sequence of word indices, then pad/truncate every
# sequence to `max_len`. Note that X_train / X_test are overwritten with the
# padded integer arrays.
pad_options = dict(maxlen=max_len, padding=padding_type, truncating=trunc_type)

X_train_list = X_train.tolist()
train_sequences = tokenizer.texts_to_sequences(X_train_list)
X_train = pad_sequences(train_sequences, **pad_options)

X_test_list = X_test.tolist()
test_sequences = tokenizer.texts_to_sequences(X_test_list)
X_test = pad_sequences(test_sequences, **pad_options)

In [None]:
## Define classes required
class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Each of the two sub-layers is followed by dropout, a residual addition
    with its own input, and layer normalization.

    Args:
        embed_dim: Size of the token embeddings; also used as the attention
            `key_dim` and as the output width of the feed-forward network.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        feed_forward_layers = [
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ]
        self.ffn = keras.Sequential(feed_forward_layers)
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        # Self-attention sub-layer: query and value are both `inputs`
        attended = self.dropout1(self.att(inputs, inputs))
        normalized = self.layernorm1(inputs + attended)
        # Position-wise feed-forward sub-layer
        transformed = self.dropout2(self.ffn(normalized))
        return self.layernorm2(normalized + transformed)
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    This is the single definition of the layer. An earlier duplicate that
    relied on an `ops` name never imported in this file has been removed,
    because this tf-based version replaced it on definition anyway.

    Args:
        maxlen: Number of positions the position-embedding table holds.
        vocab_size: Number of entries in the token-embedding table.
        embed_dim: Width of both embeddings.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        # Sequence length is read from the last axis of the incoming tensor
        maxlen = tf.shape(x)[-1]
        # Position indices 0 .. maxlen-1, looked up in the position table
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        # The position embeddings are added to the token embeddings
        return x + positions

In [None]:
## Develop Transformer model
embed_dim = 256  # Embedding size for each token
num_heads = 4  # Number of attention heads
ff_dim = 64  # Hidden layer size in feed forward network inside transformer

# The input length must match the padding length used by pad_sequences, so
# reuse `max_len` instead of repeating the literal 20.
inputs = layers.Input(shape=(max_len,))
embedding_layer = TokenAndPositionEmbedding(max_len, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
# Average over the sequence axis to get one vector per review
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
# Two-way softmax output, trained with integer labels (sparse loss below)
outputs = layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])



In [None]:
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import numpy as np

# Stratified 5-fold cross-validation of the Transformer classifier.
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold results (one entry appended per fold)
precision_by_category = []
recall_by_category = []
f1_score_by_category = []
roc_auc_score_by_category = []
confusion_matrices = []
X = resultados['Review']
y = resultados['Real']
# Encode the labels as integers, as required by the sparse categorical loss
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Model hyperparameters (identical for every fold, so set once)
embed_dim = 256  # Embedding size for each token
num_heads = 4  # Number of attention heads
ff_dim = 64  # Hidden layer size in feed forward network inside transformer

# NOTE(review): `tokenizer` and `vocab_size` come from the earlier hold-out
# split, not from each fold's training data — confirm this is acceptable.

# Perform cross-validation
for train_index, test_index in stratified_kfold.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Convert text data to numerical sequences and apply padding
    X_train_list = X_train.tolist()
    X_test_list = X_test.tolist()
    train_sequences = tokenizer.texts_to_sequences(X_train_list)
    X_train_padded = pad_sequences(train_sequences, maxlen=max_len, padding=padding_type, truncating=trunc_type)
    test_sequences = tokenizer.texts_to_sequences(X_test_list)
    X_test_padded = pad_sequences(test_sequences, maxlen=max_len, padding=padding_type, truncating=trunc_type)

    # Build a fresh model for every fold so no weights carry over between
    # folds. The input length follows `max_len`, the padding length above.
    inputs = layers.Input(shape=(max_len,))
    embedding_layer = TokenAndPositionEmbedding(max_len, vocab_size, embed_dim)
    x = embedding_layer(inputs)
    transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
    x = transformer_block(x)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(0.1)(x)
    x = layers.Dense(20, activation="relu")(x)
    x = layers.Dropout(0.1)(x)
    outputs = layers.Dense(2, activation="softmax")(x)

    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

    # Train the model on the current fold
    model.fit(X_train_padded, y_train, batch_size=32, epochs=100, verbose=1)

    # Class probabilities for the test fold, one column per class
    predictions = model.predict(X_test_padded)

    # Convert probability predictions to classes (0 or 1)
    predicted_classes = np.argmax(predictions, axis=1)

    # Calculate and store metrics per category
    precision_by_category.append(precision_score(y_test, predicted_classes, average=None))
    recall_by_category.append(recall_score(y_test, predicted_classes, average=None))
    f1_score_by_category.append(f1_score(y_test, predicted_classes, average=None))
    # ROC AUC uses the predicted probability of class 1: ranking argmax
    # labels would collapse the ROC curve to a single operating point.
    roc_auc_score_by_category.append(roc_auc_score(y_test, predictions[:, 1]))

    # Calculate confusion matrix for this fold
    confusion_matrices.append(confusion_matrix(y_test, predicted_classes))

# Calculate mean metrics per category
mean_precision = np.mean(precision_by_category, axis=0)
mean_recall = np.mean(recall_by_category, axis=0)
mean_f1_score = np.mean(f1_score_by_category, axis=0)
mean_roc_auc_score = np.mean(roc_auc_score_by_category, axis=0)

# Print mean metrics per category
print("Mean Precision:", mean_precision)
print("Mean Recall:", mean_recall)
print("Mean F1 Score:", mean_f1_score)
print("Mean ROC AUC Score:", mean_roc_auc_score)

# Calculate mean confusion matrix
mean_confusion_matrix = np.mean(confusion_matrices, axis=0)

# Print mean confusion matrix
print("Mean Confusion Matrix:\n", mean_confusion_matrix)
