Presentacion y generalidades:



##### Modulo 1: Utilizar el dataset Sentiment140. Este dataset también debe de ser pre-procesado de acuerdo al artículo (ver Sección 3.1)


In [8]:
import pandas as pd
import numpy as np
import nltk
import skfuzzy as fuzz
import time
import re

def preprocess_text(text):
    # Eliminar URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Eliminar menciones (@usernames)
    text = re.sub(r'@\w+', '', text)
    
    # Eliminar el símbolo de hashtag (#) pero conservar la palabra
    text = re.sub(r'#', '', text)
    
    # Reemplazar contracciones comunes
    contractions = {
        "can't": "cannot", "cant": "cannot",
        "won't": "will not", "wont": "will not",
        "I'm": "I am", "Im": "I am",
        "it's": "it is", "its": "it is",
        "don't": "do not", "dont": "do not",
        "you're": "you are", "youre": "you are",
        "he's": "he is", "hes": "he is",
        "she's": "she is", "shes": "she is",
        "they're": "they are", "theyre": "they are",
        "that's": "that is", "thats": "that is",
        "what's": "what is", "whats": "what is",
        "where's": "where is", "wheres": "where is",
        "who's": "who is", "whos": "who is",
        "let's": "let us", "lets": "let us",
        "I've": "I have", "Ive": "I have",
        "you've": "you have", "youve": "you have",
        "we've": "we have", "weve": "we have",
        "they've": "they have", "theyve": "they have",
        "would've": "would have", "wouldve": "would have",
        "could've": "could have", "couldve": "could have",
        "should've": "should have", "shouldve": "should have",
        "might've": "might have", "mightve": "might have",
        "must've": "must have", "mustve": "must have"
    }
    for contraction, replacement in contractions.items():
        text = text.replace(contraction, replacement)
    # Eliminar caracteres especiales y numeros 
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Eliminar espacios adicionales
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Convertir a minúsculas
    text = text.lower()
    
    # Reemplazar letras repetidas más de 3 veces por una sola aparición
    text = re.sub(r'(.)\1{2,}', r'\1', text)
    
    return text


# Cargar el dataset Sentiment140
def preprosses_dataset(archivo):
    # Cargar el dataset
    dataset = pd.read_csv(archivo, encoding='latin1', names=['Original_text', 'sentiment'], skiprows=1)
    
    # Crear una copia de la columna Original_text en la columna text
    dataset['text'] = dataset['Original_text']
    
    # Convertir valores no string a string vacíos y preprocesar la columna de texto
    dataset['text'] = dataset['text'].astype(str).apply(preprocess_text)
    
    return dataset

data = preprosses_dataset('test_data.csv')

print('Dataset preprocesado')
print(data)

Dataset preprocesado
                                         Original_text  sentiment  \
0    i loooooooovvvvvveee my kindle not that the dx...          1   
1    reading my kindle love it lee childs is good read          1   
2    ok first assesment of the kindle it fucking rocks          1   
3    you ll love your kindle i ve had mine for a fe...          1   
4    fair enough but i have the kindle and i think ...          1   
..                                                 ...        ...   
354  after using latex a lot any other typeset math...          1   
355  on that note i hate word i hate pages i hate l...          0   
356  ahhh back in a real text editing environment i...          1   
357  trouble in iran i see hmm iran iran so far awa...          0   
358  reading the tweets coming out of iran the whol...          0   

                                                  text  
0    i love my kindle not that the dx is cool but t...  
1    reading my kindle love it lee c

##### Modulo 2: Implementar un módulo que utilice un lexicón de sentimientos de la librería NLTK, y calcule el puntaje positivo y negativo de cada registro en tu dataset siguiendo las instrucciones de la Sección 3.2 del artículo. Al utilizar un analizador de sentimientos de NLTK, solo usar los valores positivos y negativos que retorne. Estos puntajes se deben incluir al dataset como dos columnas nuevas: puntaje positivo y puntaje negativo.


In [9]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Descargar los recursos necesarios de NLTK
nltk.download('vader_lexicon')

# Crear una instancia del analizador de sentimientos
sia = SentimentIntensityAnalyzer()

# Función para calcular los puntajes de sentimiento
def calculate_sentiment_scores(text):
    scores = sia.polarity_scores(text)
    return scores['pos'], scores['neg']

# Aplicar la función a cada registro del dataset
data['positive_score'], data['negative_score'] = zip(*data['text'].apply(calculate_sentiment_scores))

print('Dataset con puntajes de sentimiento:')
print(data)

Dataset con puntajes de sentimiento:
                                         Original_text  sentiment  \
0    i loooooooovvvvvveee my kindle not that the dx...          1   
1    reading my kindle love it lee childs is good read          1   
2    ok first assesment of the kindle it fucking rocks          1   
3    you ll love your kindle i ve had mine for a fe...          1   
4    fair enough but i have the kindle and i think ...          1   
..                                                 ...        ...   
354  after using latex a lot any other typeset math...          1   
355  on that note i hate word i hate pages i hate l...          0   
356  ahhh back in a real text editing environment i...          1   
357  trouble in iran i see hmm iran iran so far awa...          0   
358  reading the tweets coming out of iran the whol...          0   

                                                  text  positive_score  \
0    i love my kindle not that the dx is cool but t...      

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ger13\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


##### Modulo 3: Fuzzificación de los puntajes de sentimiento. Siguiendo las instrucciones de la Sección 3.3 del artículo, se deben crear conjuntos difusos para los puntajes positivos y negativos. Utilizar la librería `scikit-fuzzy` para este propósito.


In [10]:
import numpy as np
import matplotlib.pyplot as plt

# Define Membership Function

def triangular_membership(x, d, e, f):
    """
    Calculate the triangular membership value for a given x.
    
    Parameters:
    x (float): The input value.
    d (float): The lower bound of the triangular function.
    e (float): The middle value of the triangular function.
    f (float): The upper bound of the triangular function.
    
    Returns:
    float: The membership value.
    """
    if x <= d:
        return 0
    elif d < x <= e:
        return (x - d) / (e - d)
    elif e < x < f:
        return (f - x) / (f - e)
    else:
        return 0


# Calculate Membership Values

# Define the parameters for the fuzzy sets
min_val = data[['positive_score', 'negative_score']].min().min()
mid_val = data[['positive_score', 'negative_score']].mean().mean()
max_val = data[['positive_score', 'negative_score']].max().max()
# Define the fuzzy sets for positive, negative, and output variables
fuzzy_sets = {
    "Low": (min_val, min_val, mid_val),
    "Medium": (min_val, mid_val, max_val),
    "High": (mid_val, max_val, max_val)
}

# Function to calculate membership values for a given input
def calculate_membership_values(x, fuzzy_sets):
    """
    Calculate the membership values for a given input x using the defined fuzzy sets.
    
    Parameters:
    x (float): The input value.
    fuzzy_sets (dict): A dictionary containing the fuzzy sets with their parameters.
    
    Returns:
    dict: A dictionary with the membership values for each fuzzy set.
    """
    membership_values = {}
    for set_name, (d, e, f) in fuzzy_sets.items():
        membership_values[set_name] = triangular_membership(x, d, e, f)
    return membership_values


    # Apply fuzzification to the positive and negative scores
positive_fuzzy = []
negative_fuzzy = []

for index, row in data.iterrows():
    positive_fuzzy.append(calculate_membership_values(row['positive_score'], fuzzy_sets))
    negative_fuzzy.append(calculate_membership_values(row['negative_score'], fuzzy_sets))

data['positive_fuzzy'] = positive_fuzzy
data['negative_fuzzy'] = negative_fuzzy

print('Dataset con valores difusos:')
print(data[['positive_score', 'positive_fuzzy', 'negative_score', 'negative_fuzzy']])


Dataset con valores difusos:
     positive_score                                     positive_fuzzy  \
0             0.446  {'Low': 0, 'Medium': 0.5768945211191685, 'High...   
1             0.470  {'Low': 0, 'Medium': 0.5418427021144595, 'High...   
2             0.216  {'Low': 0, 'Medium': 0.9128077865809628, 'High...   
3             0.204  {'Low': 0, 'Medium': 0.9303336960833174, 'High...   
4             0.456  {'Low': 0, 'Medium': 0.562289596533873, 'High'...   
..              ...                                                ...   
354           0.000                 {'Low': 0, 'Medium': 0, 'High': 0}   
355           0.000                 {'Low': 0, 'Medium': 0, 'High': 0}   
356           0.000                 {'Low': 0, 'Medium': 0, 'High': 0}   
357           0.000                 {'Low': 0, 'Medium': 0, 'High': 0}   
358           0.000                 {'Low': 0, 'Medium': 0, 'High': 0}   

     negative_score                                     negative_fuzzy  
0        

##### Modulo 4: Base de reglas. Utilizar skfuzzy para la creación de la base de reglas siguiendo las secciones 3.3.2 y 3.3.3.

In [11]:
import numpy as np


# Definir las funciones de membresía para las puntuaciones positivas y negativas
def pos_low(x, min_val, mid_val):
    return triangular_membership(x, min_val, min_val, mid_val)

def pos_med(x, min_val, mid_val, max_val):
    return triangular_membership(x, min_val, mid_val, max_val)

def pos_high(x, mid_val, max_val):
    return triangular_membership(x, mid_val, max_val, max_val)

def neg_low(x, min_val, mid_val):
    return triangular_membership(x, min_val, min_val, mid_val)

def neg_med(x, min_val, mid_val, max_val):
    return triangular_membership(x, min_val, mid_val, max_val)

def neg_high(x, mid_val, max_val):
    return triangular_membership(x, mid_val, max_val, max_val)

def op_neg(x):
    return triangular_membership(x, min_val, min_val, mid_val)

def op_neu(x):
    return triangular_membership(x, min_val, mid_val, max_val)

def op_pos(x):
    return triangular_membership(x, mid_val, max_val, max_val)

# Definir las reglas de Mamdani
def mamdani_rules(pos_score, neg_score, min_val, mid_val, max_val):
    rules = {
        'R1': min(pos_low(pos_score, min_val, mid_val), neg_low(neg_score, min_val, mid_val)),
        'R2': min(pos_med(pos_score, min_val, mid_val, max_val), neg_low(neg_score, min_val, mid_val)),
        'R3': min(pos_high(pos_score, mid_val, max_val), neg_low(neg_score, min_val, mid_val)),
        'R4': min(pos_low(pos_score, min_val, mid_val), neg_med(neg_score, min_val, mid_val, max_val)),
        'R5': min(pos_med(pos_score, min_val, mid_val, max_val), neg_med(neg_score, min_val, mid_val, max_val)),
        'R6': min(pos_high(pos_score, mid_val, max_val), neg_med(neg_score, min_val, mid_val, max_val)),
        'R7': min(pos_low(pos_score, min_val, mid_val), neg_high(neg_score, mid_val, max_val)),
        'R8': min(pos_med(pos_score, min_val, mid_val, max_val), neg_high(neg_score, mid_val, max_val)),
        'R9': min(pos_high(pos_score, mid_val, max_val), neg_high(neg_score, mid_val, max_val)),
    }
    return rules

# Agregación de los resultados de las reglas
def aggregate_rules(rules):
    w_neg = max(rules['R4'], rules['R7'], rules['R8'])
    w_neu = max(rules['R1'], rules['R5'], rules['R9'])
    w_pos = max(rules['R2'], rules['R3'], rules['R6'])
    return w_neg, w_neu, w_pos

# Cálculo de las funciones de membresía de los consecuentes
def consequent_membership(w_neg, w_neu, w_pos, x):
    op_activation_low = min(w_neg, op_neg(x))
    op_activation_med = min(w_neu, op_neu(x))
    op_activation_high = min(w_pos, op_pos(x))
    return max(op_activation_low, op_activation_med, op_activation_high)


# Aplicar las reglas de Mamdani y la agregación a cada registro del dataset
def apply_fuzzy_logic(row):
    pos_score = row['positive_score']
    neg_score = row['negative_score']
    
    # Obtener las reglas de Mamdani
    rules = mamdani_rules(pos_score, neg_score, min_val, mid_val, max_val)
    
    # Agregar los resultados de las reglas
    w_neg, w_neu, w_pos = aggregate_rules(rules)
    
    # Calcular la membresía del consecuente
    output = consequent_membership(w_neg, w_neu, w_pos, pos_score - neg_score)
    
    return output

# Medir el tiempo inicial
start_time = time.time()

# Aplicar la lógica difusa a cada registro
data['fuzzy_output'] = data.apply(apply_fuzzy_logic, axis=1)

print('Dataset con salida difusa:')
print(data[['text', 'positive_score', 'negative_score', 'fuzzy_output']])

Dataset con salida difusa:
                                                  text  positive_score  \
0    i love my kindle not that the dx is cool but t...           0.446   
1    reading my kindle love it lee childs is good read           0.470   
2    ok first assesment of the kindle it fucking rocks           0.216   
3    you ll love your kindle i ve had mine for a fe...           0.204   
4    fair enough but i have the kindle and i think ...           0.456   
..                                                 ...             ...   
354  after using latex a lot any other typeset math...           0.000   
355  on that note i hate word i hate pages i hate l...           0.000   
356  ah back in a real text editing environment i l...           0.000   
357  trouble in iran i see hmm iran iran so far awa...           0.000   
358  reading the tweets coming out of iran the whol...           0.000   

     negative_score  fuzzy_output  
0             0.000       0.00000  
1           

##### Modulo 5: Implementar la defuzzificación siguiendo las instrucciones de la Sección 3.3.4. Como resultado agregar una tercera columna al dataset que será el puntaje del sentimiento para cada registro.

In [12]:
def defuzzify_centroid(row, output_mfs, output_range=(0, 10), num_samples=1000):
    z = np.linspace(output_range[0], output_range[1], num_samples)
    numerator = 0
    denominator = 0
    for zi in z:
        memberships = {
            "Negative": triangular_membership(zi, output_mfs["Negative"]["d"], output_mfs["Negative"]["e"], output_mfs["Negative"]["f"]) * max(row["negative_fuzzy"]["Low"], row["negative_fuzzy"]["Medium"], row["negative_fuzzy"]["High"]),
            "Neutral": triangular_membership(zi, output_mfs["Neutral"]["d"], output_mfs["Neutral"]["e"], output_mfs["Neutral"]["f"]) * row["positive_fuzzy"]["Medium"],
            "Positive": triangular_membership(zi, output_mfs["Positive"]["d"], output_mfs["Positive"]["e"], output_mfs["Positive"]["f"]) * max(row["positive_fuzzy"]["Medium"], row["positive_fuzzy"]["High"]),
        }
        max_membership = max(memberships.values())
        numerator += zi * max_membership
        denominator += max_membership
    return numerator / denominator if denominator != 0 else 5.0


def defuzzify_row(row, output_mfs):
    
    defuzzified_value = defuzzify_centroid(row, output_mfs)
    
    return defuzzified_value


def process_fuzzified_data(df, output_mfs):
    defuzzified_values = []
    for _, row in df.iterrows():
        defuzzified_value = defuzzify_row(row, output_mfs)
        defuzzified_values.append(defuzzified_value)

    df["defuzzified_value"] = defuzzified_values
    df["final_sentiment"] = df["defuzzified_value"].apply(
        lambda x: "Negativo" if x <= 3.3 else "Neutral" if x <= 6.7 else "Positivo"
    )
    return df


output_mfs = {
    "Negative": {"d": 0, "e": 0, "f": 5},
    "Neutral": {"d": 0, "e": 5, "f": 10},
    "Positive": {"d": 5, "e": 10, "f": 10},
}

# Procesar los datos difusos
data = process_fuzzified_data(data, output_mfs)

# Medir el tiempo final
end_time = time.time()

print('Dataset con valores defuzzificados:')
print(data[['text', 'positive_score', 'negative_score', 'fuzzy_output', 'defuzzified_value', 'final_sentiment']])

aux_structure = data.apply(lambda row: pd.Series({
    'Original_text': row['Original_text'],
    'sentiment': row['sentiment'],
    'positive_score': row['positive_score'],
    'negative_score': row['negative_score'],
    'positive_fuzzy_Low': row['positive_fuzzy']['Low'],
    'positive_fuzzy_Medium': row['positive_fuzzy']['Medium'],
    'positive_fuzzy_High': row['positive_fuzzy']['High'],
    'negative_fuzzy_Low': row['negative_fuzzy']['Low'],
    'negative_fuzzy_Medium': row['negative_fuzzy']['Medium'],
    'negative_fuzzy_High': row['negative_fuzzy']['High'],
    'fuzzy_output': row['fuzzy_output'],
      'defuzzified_value': row['defuzzified_value'],
    'final_sentiment': row['final_sentiment']
}), axis=1)

# Guardar la estructura auxiliar en un archivo CSV
aux_structure.to_csv('salida.csv', index=False)

Dataset con valores defuzzificados:
                                                  text  positive_score  \
0    i love my kindle not that the dx is cool but t...           0.446   
1    reading my kindle love it lee childs is good read           0.470   
2    ok first assesment of the kindle it fucking rocks           0.216   
3    you ll love your kindle i ve had mine for a fe...           0.204   
4    fair enough but i have the kindle and i think ...           0.456   
..                                                 ...             ...   
354  after using latex a lot any other typeset math...           0.000   
355  on that note i hate word i hate pages i hate l...           0.000   
356  ah back in a real text editing environment i l...           0.000   
357  trouble in iran i see hmm iran iran so far awa...           0.000   
358  reading the tweets coming out of iran the whol...           0.000   

     negative_score  fuzzy_output  defuzzified_value final_sentiment  
0   

#### Modulo 6 - Benchmarks

In [13]:
# Imprimir cantidad de tweets por sentimiento
total_positive = data[data['final_sentiment'] == 'Positivo'].shape[0]
total_neutral = data[data['final_sentiment'] == 'Neutral'].shape[0]
total_negative = data[data['final_sentiment'] == 'Negativo'].shape[0]

print(f'Total de tweets positivos: {total_positive}')
print(f'Total de tweets neutrales: {total_neutral}')
print(f'Total de tweets negativos: {total_negative}')

# Calcular el tiempo total de ejecución
total_execution_time = end_time - start_time

# Calcular el tiempo promedio de ejecución por tweet
average_execution_time = total_execution_time / len(data)

# Imprimir el tiempo promedio de ejecución por tweet y el tiempo total de ejecución
print(f'Tiempo promedio de ejecución por fila: {average_execution_time:.6f} segundos')
print(f'Tiempo total de ejecución: {total_execution_time:.6f} segundos')

Total de tweets positivos: 10
Total de tweets neutrales: 268
Total de tweets negativos: 81
Tiempo promedio de ejecución por fila: 0.009765 segundos
Tiempo total de ejecución: 3.505528 segundos


##### Gracias por su atencion <3