**Exploración Profunda de Emociones Textuales**

In [249]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import cross_val_score




**ADQUISICIÓN DE LOS DATOS**

In [250]:
#Descargar el dataset desde https://www.kaggle.com/datasets/kazanova/sentiment140/data
#Una vez que se descargue, se debe cambiar el nombre del archivo a train.csv

#Cargamos el dataset
dataset = pd.read_csv('train.csv', encoding='latin-1', sep=',')

# Como no hay nombres de columnas asignadas, se asignan los nombres de las columnas
dataset.columns = ['clasificacion', 'id', 'fecha', 'NO_QUERY', 'usuario', 'tweet']

# Se eliminan las columnas que no se utilizarán
dataset = dataset.drop(columns=['clasificacion','id','fecha', 'NO_QUERY', 'usuario'])

# Se eliminan los registros duplicados
dataset = dataset.drop_duplicates()

#Mostramos las primeras filas del dataset
print(dataset.head(20), '\n')

                                                tweet
0   is upset that he can't update his Facebook by ...
1   @Kenichan I dived many times for the ball. Man...
2     my whole body feels itchy and like its on fire 
3   @nationwideclass no, it's not behaving at all....
4                       @Kwesidei not the whole crew 
5                                         Need a hug 
6   @LOLTrish hey  long time no see! Yes.. Rains a...
7                @Tatiana_K nope they didn't have it 
8                           @twittera que me muera ? 
9         spring break in plain city... it's snowing 
10                         I just re-pierced my ears 
11  @caregiving I couldn't bear to watch it.  And ...
12  @octolinz16 It it counts, idk why I did either...
13  @smarrison i would've been the first, but i di...
14  @iamjazzyfizzle I wish I got to watch it with ...
15  Hollis' death scene will hurt me severely to w...
16                               about to file taxes 
17  @LettyA ahh ive always w

**PROCESAMIENTO DE LOS DATOS Y EXTRACCIÓN DE LAS CARACTERISTICAS**

In [251]:
# Creamos una función para clasificar los tweets según su contenido
def clasificar_tweet(tweet):
    if 'upset' in tweet.lower():
        return 'Negativo'
    elif 'happy' in tweet.lower():
        return 'Positivo'
    else:
        return 'Neutral'

# Aplicamos la función a la columna de tweets
dataset['clasificacion'] = dataset['tweet'].apply(clasificar_tweet)

# Mostramos las primeras filas del dataset
print(dataset.head(20), '\n')

                                                tweet clasificacion
0   is upset that he can't update his Facebook by ...      Negativo
1   @Kenichan I dived many times for the ball. Man...       Neutral
2     my whole body feels itchy and like its on fire        Neutral
3   @nationwideclass no, it's not behaving at all....       Neutral
4                       @Kwesidei not the whole crew        Neutral
5                                         Need a hug        Neutral
6   @LOLTrish hey  long time no see! Yes.. Rains a...       Neutral
7                @Tatiana_K nope they didn't have it        Neutral
8                           @twittera que me muera ?        Neutral
9         spring break in plain city... it's snowing        Neutral
10                         I just re-pierced my ears        Neutral
11  @caregiving I couldn't bear to watch it.  And ...       Neutral
12  @octolinz16 It it counts, idk why I did either...       Neutral
13  @smarrison i would've been the first, but i 

In [252]:
#Vamos a limpiar los datos que no necesitamos por ejemplo las menciones, los acentos, etc
#Para ello vamos a utilizar expresiones regulares
import re

#Función para limpiar los mensaje
def eliminar_dataset(mensaje):
    #Eliminamos las menciones
    mensaje = re.sub(r'@[A-Za-z0-9]+', '', mensaje)
    #Eliminamos los links
    mensaje = re.sub(r'https?://[A-Za-z0-9./]+', '', mensaje)
    #Eliminamos los hashtags
    mensaje = re.sub(r'#', '', mensaje)
    #Eliminamos los signos de puntuación
    mensaje = re.sub(r'[^\w\s]', '', mensaje)
    #Eliminamos los guin bajo
    mensaje = re.sub(r'_', '', mensaje)
    return mensaje

#Aplicamos la función a la columna de los mensajes
dataset['tweet'] = dataset['tweet'].apply(lambda x: eliminar_dataset(x))

# Eliminar la columna de sentimiento
dataset = dataset.drop(columns=['clasificacion'])

In [253]:
#Mostramos las primeras filas del dataset con el borrado
print(dataset.head(20), '\n')

                                                tweet
0   is upset that he cant update his Facebook by t...
1    I dived many times for the ball Managed to sa...
2     my whole body feels itchy and like its on fire 
3    no its not behaving at all im mad why am i he...
4                                 not the whole crew 
5                                         Need a hug 
6    hey  long time no see Yes Rains a bit only a ...
7                          K nope they didnt have it 
8                                      que me muera  
9             spring break in plain city its snowing 
10                          I just repierced my ears 
11   I couldnt bear to watch it  And I thought the...
12   It it counts idk why I did either you never t...
13   i wouldve been the first but i didnt have a g...
14   I wish I got to watch it with you I miss you ...
15  Hollis death scene will hurt me severely to wa...
16                               about to file taxes 
17   ahh ive always wanted t