# Preprocesamiento de Datos

## I. Importaciones

**Autor:** Pablo Spínola López

**Descripción:** Este notebook realiza la limpieza y preprocesamiento de los datos crudos. 

In [7]:
import sys
import os
import pandas as pd
import swifter
from nltk.corpus import stopwords
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

## II. Definición de stopwords para vectorización

In [8]:
# Cargar stopwords en español como lista
stop_words_es = list(stopwords.words('spanish'))
# Agregar stopwords adicionales específicas del dominio
stop_words_adicionales = [
    'rt', 'https', 'http', 'tco', 'twitter', 'com',  # URLs y términos de Twitter
    't', 'co',  # Partes de 't.co'
    'q', 'k', 'd', 'tb', 'tmb', 'pq', 'xq', 'dnd', 'kien', 'salu2', 'aki', 'tqm'  # Abreviaturas comunes
]
stop_words_es.extend(stop_words_adicionales)
# Eliminar duplicados y ordenar
stop_words_es = sorted(list(set(stop_words_es)))

## III. Importación de función extracción de características

In [9]:
sys.path.append(os.path.abspath("../funciones"))

from funcionesPreprocesamiento import (
    obtener_hashtags_frecuentes_individuales,
    extraer_caracteristicas,
)

# Pipeline principal

1. Cargar datos

In [10]:
print("Cargando datos...")
df_train = pd.read_csv("../../data/data_train.csv", encoding="latin1", header=0)
df_test = pd.read_csv("../../data/data_test_fold1.csv", encoding="latin1", header=0)
df_test_final = pd.read_csv("../../data/data_test_fold2.csv", encoding="latin1", header=0)

df = pd.concat([df_train, df_test, df_test_final], ignore_index=True)

print(df.shape)
df.head(5)

Cargando datos...
(2000, 4)


Unnamed: 0,user_id,tweet_id,tweet_text,class
0,user0001,0d3ed29586ce,Cheesecake saludable sin azÃºcar y sin lactosa...,control
1,user0002,c3cf897a495b,ser como ellas â¡â¡\n #HastaLosHuesos,anorexia
2,user0003,5041d85c45c6,"Comida Real o , la clave para estar mÃ¡s sana,...",control
3,user0004,d18285d3c7ec,Entre el cambio de hora y la bajada de las #te...,control
4,user0005,4d81892f3217,Hace mucho tiempo no sentÃ­a mi cuerpo tan frÃ­o,anorexia


2. Preprocesamiento inicial

In [11]:
df["tweet_text"] = df["tweet_text"].fillna("")
print("Preprocesamiento inicial exitoso :)")

Preprocesamiento inicial exitoso :)


3. Extraer características iniciales

In [12]:
print("Extrayendo características iniciales...")
features_df = df["tweet_text"].swifter.apply(extraer_caracteristicas).apply(pd.Series)

Extrayendo características iniciales...


Pandas Apply:   0%|          | 0/2000 [00:00<?, ?it/s]

In [13]:
print("Características iniciales:")
for caracteristica in features_df.columns:
    print(f'\t- {caracteristica}: {type(caracteristica)}')

Características iniciales:
	- tweet_text: <class 'str'>
	- hashtags: <class 'str'>
	- texto_completo: <class 'str'>
	- texto_bert: <class 'str'>
	- longitud_texto: <class 'str'>
	- num_palabras: <class 'str'>
	- comida: <class 'str'>
	- restriccion: <class 'str'>
	- purga: <class 'str'>
	- imagen_corporal: <class 'str'>
	- ejercicio: <class 'str'>
	- polaridad: <class 'str'>
	- subjetividad: <class 'str'>


In [14]:
# Reemplazar columnas duplicadas
df["tweet_text"] = features_df["tweet_text"]
df["hashtags"] = features_df["hashtags"]

# Agregar columnas nuevas
columnas_nuevas = features_df.drop(columns=["tweet_text", "hashtags"])
df = pd.concat([df, columnas_nuevas], axis=1)

print(f'Cantidad de columnas hasta este punto: {df.shape[1]}')
print("Columnas de nuestro dataset hasta este punto:")
print(f'--> ', end='')
for col in df.columns:
    print(f', {col}', end='')
print('\n')


Cantidad de columnas hasta este punto: 16
Columnas de nuestro dataset hasta este punto:
--> , user_id, tweet_id, tweet_text, class, hashtags, texto_completo, texto_bert, longitud_texto, num_palabras, comida, restriccion, purga, imagen_corporal, ejercicio, polaridad, subjetividad



4. Procesar hashtags

In [15]:
print("Procesando hashtags...")
mlb = MultiLabelBinarizer()
hashtags_df = pd.DataFrame(mlb.fit_transform(df["hashtags"]), 
                            columns=[f"tag_{tag}" for tag in mlb.classes_])

Procesando hashtags...


In [16]:
# Primera separación, para resguardar los hashtags de prueba y no influenciarlos con los vistos en entrenamiento:
hashtags_df_train = hashtags_df.iloc[:-500].copy()
hashtags_df_test = hashtags_df.iloc[-500:-250].copy()
hashtags_df_test_final = hashtags_df.iloc[-250:].copy()

print(f'Forma de hashtags: {hashtags_df.shape}')
print(f'Hashtags de entrenamiento: {hashtags_df_train.shape}')
print(f'Hashtags de prueba: {hashtags_df_test.shape}')
print(f'Hashtags de prueba final: {hashtags_df_test_final.shape}')

Forma de hashtags: (2000, 2438)
Hashtags de entrenamiento: (1500, 2438)
Hashtags de prueba: (250, 2438)
Hashtags de prueba final: (250, 2438)


In [17]:
# Filtrar hashtags frecuentes
umbral = 10
hashtags_frecuentes = hashtags_df_train.columns[hashtags_df_train.sum() >= umbral]
hashtags_frecuentes_df_train = hashtags_df_train[hashtags_frecuentes]
print(f'{len(list(hashtags_frecuentes_df_train))} hashtags frecuentes de entrenamiento: {list(hashtags_frecuentes_df_train)}')

print(type(hashtags_frecuentes))

59 hashtags frecuentes de entrenamiento: ['tag_Anorexia', 'tag_Bulimia', 'tag_ED', 'tag_RexyBill', 'tag_Salud', 'tag_TCA', 'tag_Thinspo', 'tag_adelgazar', 'tag_alimentacionsaludable', 'tag_ana', 'tag_anamia', 'tag_anorexia', 'tag_anorexiaeetclub', 'tag_anorexic', 'tag_bulimia', 'tag_bulimianervosa', 'tag_bulimic', 'tag_bulimicgirl', 'tag_comida', 'tag_comidasaludable', 'tag_comidasana', 'tag_deporte', 'tag_desayuno', 'tag_dieta', 'tag_eatclean', 'tag_ed', 'tag_ejercicio', 'tag_entrenamiento', 'tag_fat', 'tag_fit', 'tag_food', 'tag_foodie', 'tag_foodporn', 'tag_gorda', 'tag_gym', 'tag_healthy', 'tag_healthyfood', 'tag_instafood', 'tag_lifestyle', 'tag_mia', 'tag_motivaciÃ³n', 'tag_motivation', 'tag_nutricion', 'tag_nutricionista', 'tag_perdergrasa', 'tag_perderpeso', 'tag_proana', 'tag_promia', 'tag_recetas', 'tag_salud', 'tag_saludable', 'tag_skinny', 'tag_tca', 'tag_thin', 'tag_thinspiration', 'tag_thinspo', 'tag_training', 'tag_vegan', 'tag_workout']
<class 'pandas.core.indexes.base.

In [21]:
# Filtrarlos para el conjunto de prueba
hashtags_frecuentes_df_test = hashtags_df_test[hashtags_frecuentes]

print(f'{len(list(hashtags_frecuentes_df_test))} filtro aplicado para el conjunto de prueba: {list(hashtags_frecuentes_df_test)}')
print("Prueba:", list(hashtags_frecuentes_df_train) == list(hashtags_frecuentes_df_test))

59 filtro aplicado para el conjunto de prueba: ['tag_Anorexia', 'tag_Bulimia', 'tag_ED', 'tag_RexyBill', 'tag_Salud', 'tag_TCA', 'tag_Thinspo', 'tag_adelgazar', 'tag_alimentacionsaludable', 'tag_ana', 'tag_anamia', 'tag_anorexia', 'tag_anorexiaeetclub', 'tag_anorexic', 'tag_bulimia', 'tag_bulimianervosa', 'tag_bulimic', 'tag_bulimicgirl', 'tag_comida', 'tag_comidasaludable', 'tag_comidasana', 'tag_deporte', 'tag_desayuno', 'tag_dieta', 'tag_eatclean', 'tag_ed', 'tag_ejercicio', 'tag_entrenamiento', 'tag_fat', 'tag_fit', 'tag_food', 'tag_foodie', 'tag_foodporn', 'tag_gorda', 'tag_gym', 'tag_healthy', 'tag_healthyfood', 'tag_instafood', 'tag_lifestyle', 'tag_mia', 'tag_motivaciÃ³n', 'tag_motivation', 'tag_nutricion', 'tag_nutricionista', 'tag_perdergrasa', 'tag_perderpeso', 'tag_proana', 'tag_promia', 'tag_recetas', 'tag_salud', 'tag_saludable', 'tag_skinny', 'tag_tca', 'tag_thin', 'tag_thinspiration', 'tag_thinspo', 'tag_training', 'tag_vegan', 'tag_workout']
Prueba: True


In [28]:
# Filtrar hashtags frecuentes
hashtags_frecuentes_df_test_final = hashtags_df_test_final[hashtags_frecuentes]

print(f"{len(list(hashtags_frecuentes_df_test_final))} columnas aplicadas. Columnas de hashtags frequentes: {list(hashtags_frecuentes_df_test_final.columns)}")
print("Prueba:", list(hashtags_frecuentes_df_train) == list(hashtags_frecuentes_df_test_final))

59 columnas aplicadas. Columnas de hashtags frequentes: ['tag_Anorexia', 'tag_Bulimia', 'tag_ED', 'tag_RexyBill', 'tag_Salud', 'tag_TCA', 'tag_Thinspo', 'tag_adelgazar', 'tag_alimentacionsaludable', 'tag_ana', 'tag_anamia', 'tag_anorexia', 'tag_anorexiaeetclub', 'tag_anorexic', 'tag_bulimia', 'tag_bulimianervosa', 'tag_bulimic', 'tag_bulimicgirl', 'tag_comida', 'tag_comidasaludable', 'tag_comidasana', 'tag_deporte', 'tag_desayuno', 'tag_dieta', 'tag_eatclean', 'tag_ed', 'tag_ejercicio', 'tag_entrenamiento', 'tag_fat', 'tag_fit', 'tag_food', 'tag_foodie', 'tag_foodporn', 'tag_gorda', 'tag_gym', 'tag_healthy', 'tag_healthyfood', 'tag_instafood', 'tag_lifestyle', 'tag_mia', 'tag_motivaciÃ³n', 'tag_motivation', 'tag_nutricion', 'tag_nutricionista', 'tag_perdergrasa', 'tag_perderpeso', 'tag_proana', 'tag_promia', 'tag_recetas', 'tag_salud', 'tag_saludable', 'tag_skinny', 'tag_tca', 'tag_thin', 'tag_thinspiration', 'tag_thinspo', 'tag_training', 'tag_vegan', 'tag_workout']
Prueba: True


In [29]:
hashtags_frecuentes_df = pd.concat([hashtags_frecuentes_df_train, hashtags_frecuentes_df_test, hashtags_frecuentes_df_test_final], ignore_index=True)
print(hashtags_frecuentes_df.shape)
hashtags_frecuentes_df

(2000, 59)


Unnamed: 0,tag_Anorexia,tag_Bulimia,tag_ED,tag_RexyBill,tag_Salud,tag_TCA,tag_Thinspo,tag_adelgazar,tag_alimentacionsaludable,tag_ana,...,tag_salud,tag_saludable,tag_skinny,tag_tca,tag_thin,tag_thinspiration,tag_thinspo,tag_training,tag_vegan,tag_workout
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1998,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


5. Preparar texto completo

In [30]:
umbral_bajo = 5
hashtags_vectorizacion = hashtags_df.columns[hashtags_df.sum() >= umbral_bajo]
hashtags_validos = {col.replace('tag_', '') for col in hashtags_vectorizacion}
df["hashtags_frecuentes_bajos"] = df["hashtags"].apply(lambda h: obtener_hashtags_frecuentes_individuales(h, hashtags_validos))
df["texto_completo"] = df["tweet_text"] + " " + df["hashtags_frecuentes_bajos"]

6. Vectorización TF-IDF

In [31]:
print("Aplicando vectorización TF-IDF...")
vectorizer = TfidfVectorizer(
    max_features=1500,
    ngram_range=(1,3),
    stop_words=stop_words_es,
    min_df=2,
    max_df=0.85,
    sublinear_tf=True,
    norm='l2'
)
X_tfidf = vectorizer.fit_transform(df["texto_completo"])
y = df["class"]

Aplicando vectorización TF-IDF...


In [33]:
print("\nInformación sobre las características TF-IDF:")
print(f"Número total de características: {X_tfidf.shape[1]}")
print(f"Número de muestras: {X_tfidf.shape[0]}")
print("\nTop 10 términos más importantes:")
feature_names = vectorizer.get_feature_names_out()
idf_values = vectorizer.idf_
top_terms = sorted(zip(feature_names, idf_values), key=lambda x: x[1], reverse=True)[:10]
for term, idf in top_terms:
    print(f"{term}: {idf:.2f}")


Información sobre las características TF-IDF:
Número total de características: 1500
Número de muestras: 2000

Top 10 términos más importantes:
3x8: 7.50
be: 7.50
boedo: 7.50
cabra: 7.50
cacahuate: 7.50
calmar: 7.50
camarón: 7.50
cena sopa: 7.50
coco curry: 7.50
col: 7.50


7. Construcción de dataset final

In [34]:
print("Creando DataFrame final...")
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=[f"tfidf_{i}" for i in range(X_tfidf.shape[1])])

tfidf_df

Creando DataFrame final...


Unnamed: 0,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,tfidf_6,tfidf_7,tfidf_8,tfidf_9,...,tfidf_1490,tfidf_1491,tfidf_1492,tfidf_1493,tfidf_1494,tfidf_1495,tfidf_1496,tfidf_1497,tfidf_1498,tfidf_1499
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# Seleccionar columnas para el DataFrame final
columnas_base = ["tweet_id", "tweet_text", "texto_completo", "texto_bert"]
columnas_metricas = ["longitud_texto", "num_palabras"]
columnas_palabras_clave = ["comida", "restriccion", "purga", "imagen_corporal", "ejercicio"]
columnas_sentimiento = ["polaridad", "subjetividad"]

In [38]:
# Crear el DataFrame final sin la columna class
df_final = pd.concat([
    df[columnas_base],
    df[columnas_metricas],
    df[columnas_palabras_clave],
    df[columnas_sentimiento],
    hashtags_frecuentes_df,
    tfidf_df
], axis=1)

print(f'Número total de columnas finales: {df_final.shape[1]}')
df_final.head(5)

Número total de columnas finales: 1572


Unnamed: 0,tweet_id,tweet_text,texto_completo,texto_bert,longitud_texto,num_palabras,comida,restriccion,purga,imagen_corporal,...,tfidf_1490,tfidf_1491,tfidf_1492,tfidf_1493,tfidf_1494,tfidf_1495,tfidf_1496,tfidf_1497,tfidf_1498,tfidf_1499
0,0d3ed29586ce,cheesecakir saludable azucar lactosa mermerlad...,cheesecakir saludable azucar lactosa mermerlad...,cheesecake saludable sin azúcar y sin lactosa ...,59,7,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,c3cf897a495b,,,ser como ellas. Etiquetas: HastaLosHuesos,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5041d85c45c6,comida real clave sano delgado feliz,comida real clave sano delgado feliz,"comida real o , la clave para estar más sana, ...",36,6,3,2,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,d18285d3c7ec,cambio hora bajada destemplado recomendar plat...,cambio hora bajada destemplado recomendar plat...,entre el cambio de hora y la bajada de las est...,62,8,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4d81892f3217,tiempo sentia cuerpo frio,tiempo sentia cuerpo frio,hace mucho tiempo no sentía mi cuerpo tan frío,25,4,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


8. Normalización de características

In [39]:
columnas_a_escalar = ["longitud_texto", "num_palabras", "polaridad", "subjetividad"]
scaler = StandardScaler()
df_final[columnas_a_escalar] = scaler.fit_transform(df_final[columnas_a_escalar])

In [40]:
# Convertir etiquetas a valores numéricos y agregar al final
df_final['class'] = df['class'].map({'control': 0, 'anorexia': 1})

df_final_train = df_final.iloc[:-500].copy()
df_final_test = df_final.iloc[-500:-250].copy()
df_final_test_final = df_final.iloc[-250:].copy()

print(f'Tamaño de dataset de entrenamiento: {df_final_train.shape}')
df_final_train

Tamaño de dataset de entrenamiento: (1500, 1573)


Unnamed: 0,tweet_id,tweet_text,texto_completo,texto_bert,longitud_texto,num_palabras,comida,restriccion,purga,imagen_corporal,...,tfidf_1491,tfidf_1492,tfidf_1493,tfidf_1494,tfidf_1495,tfidf_1496,tfidf_1497,tfidf_1498,tfidf_1499,class
0,0d3ed29586ce,cheesecakir saludable azucar lactosa mermerlad...,cheesecakir saludable azucar lactosa mermerlad...,cheesecake saludable sin azúcar y sin lactosa ...,0.238798,-0.062255,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,c3cf897a495b,,,ser como ellas. Etiquetas: HastaLosHuesos,-1.536538,-1.607600,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,5041d85c45c6,comida real clave sano delgado feliz,comida real clave sano delgado feliz,"comida real o , la clave para estar más sana, ...",-0.453282,-0.283019,3,2,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,d18285d3c7ec,cambio hora bajada destemplado recomendar plat...,cambio hora bajada destemplado recomendar plat...,entre el cambio de hora y la bajada de las est...,0.329069,0.158508,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,4d81892f3217,tiempo sentia cuerpo frio,tiempo sentia cuerpo frio,hace mucho tiempo no sentía mi cuerpo tan frío,-0.784277,-0.724546,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,996e4fcb00b3,desayuno sano rico,desayuno sano rico comidasana,desayuno sano y rico!!!. Etiquetas: comidasana...,-0.994910,-0.945310,1,3,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1496,c84e209a708f,perfeccion sabado ana abandona,perfeccion sabado ana abandona Thinspo Thinspi...,perfección para este sábado ana y mía nunca la...,-0.633825,-0.724546,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1497,2aad68408fcd,comer devolver caloria,comer devolver caloria TengoUnDeseoYEs perfeccion,comer y devolver las calorías.. Etiquetas: Ten...,-0.874548,-0.945310,1,2,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1498,a495c0188dc6,entender sigo soltero cocino,entender sigo soltero cocino realfooder,"yo no entiendo cómo sigo soltero, con lo bien ...",-0.694006,-0.724546,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [41]:
print(f'Tamaño de dataset de prueba: {df_final_test.shape}')
df_final_test

Tamaño de dataset de prueba: (250, 1573)


Unnamed: 0,tweet_id,tweet_text,texto_completo,texto_bert,longitud_texto,num_palabras,comida,restriccion,purga,imagen_corporal,...,tfidf_1491,tfidf_1492,tfidf_1493,tfidf_1494,tfidf_1495,tfidf_1496,tfidf_1497,tfidf_1498,tfidf_1499,class
1500,a2ffa2b1425c,empezar anorexia bulimia facil acceder,empezar anorexia bulimia facil acceder,a ver cuando se empiezan con las de anorexia y...,-0.393101,-0.503783,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1501,bfdce07b546c,7 7 razón entrenar llegar domingo wow acabo se...,7 7 razón entrenar llegar domingo wow acabo se...,"¡7 días, 7 razones para entrenar! - llegamos a...",2.315037,2.366144,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1502,fdcb32f44ce7,bajar peso comer parar,bajar peso comer parar Tips,como bajar peso comiendo sin parar.. Etiquetas...,-0.874548,-0.724546,2,2,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1503,17203fc43352,importar platillo lucir biendeliciosoricoeso m...,importar platillo lucir biendeliciosoricoeso m...,"no importa si ese platillo luce bien,delicioso...",0.178617,-0.283019,1,3,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1504,3947b6578c8e,polo fruta casero frase patatar sano cuidar él...,polo fruta casero frase patatar sano cuidar él...,por supuesto el polo es de fruta y casero más ...,2.315037,3.249199,1,3,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1745,2b400f44f0ca,whatsapp escondite comunidad,whatsapp escondite comunidad anorexia bulimia,whatsapp el nuevo escondite para las comunidad...,-0.694006,-0.945310,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1746,7843bfa17219,querer cuerpo,querer cuerpo,": "": quisiera tener un cuerpo como este :'( """,-1.145362,-1.166073,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1747,b6c4ee703ad3,desayuno martes empezar acabar ing,desayuno martes empezar acabar ing adelgazar l...,"desayuno de martes, lo que bien empieza bien a...",-0.513463,-0.503783,2,3,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1748,910ab22d43f7,querer necesito desear,querer necesito desear Thinspiration,"quiero, necesito, deseo y sé que puedo.. Etiqu...",-0.874548,-0.945310,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [42]:
print(f'Tamaño de dataset de prueba: {df_final_test_final.shape}')
df_final_test_final

Tamaño de dataset de prueba: (250, 1573)


Unnamed: 0,tweet_id,tweet_text,texto_completo,texto_bert,longitud_texto,num_palabras,comida,restriccion,purga,imagen_corporal,...,tfidf_1491,tfidf_1492,tfidf_1493,tfidf_1494,tfidf_1495,tfidf_1496,tfidf_1497,tfidf_1498,tfidf_1499,class
1750,c1a446a05987,malo ayunar 20 mes continuo bomitar,malo ayunar 20 mes continuo bomitar,no es malo ayunar 20 dias al mes ... ya que no...,-0.483373,-0.283019,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1751,bfdce07b546c,7 7 razón entrenar llegar domingo wow acabo se...,7 7 razón entrenar llegar domingo wow acabo se...,"¡7 días, 7 razones para entrenar! - llegamos a...",2.315037,2.366144,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1752,860a2eced012,avena banán platano cacao super sano facilisir...,avena banán platano cacao super sano facilisir...,"de avena, banana (o plátano) y cacao súper san...",1.382234,1.703853,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1753,b745c4311c24,pesar 29,pesar 29,cuanto pesan ustedes yo 29,-1.295814,-1.166073,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1754,fdcb32f44ce7,bajar peso comer parar,bajar peso comer parar Tips,como bajar peso comiendo sin parar.. Etiquetas...,-0.874548,-0.724546,2,2,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,2f4f7d21b591,conocer alfa espiritu entrega ayudarar lograr ...,conocer alfa espiritu entrega ayudarar lograr ...,conoce nuestros alfas.! quienes con su espírit...,0.088345,-0.062255,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1996,2b400f44f0ca,whatsapp escondite comunidad,whatsapp escondite comunidad anorexia bulimia,whatsapp el nuevo escondite para las comunidad...,-0.694006,-0.945310,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1997,7843bfa17219,querer cuerpo,querer cuerpo,": "": quisiera tener un cuerpo como este :'( """,-1.145362,-1.166073,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1998,b6c4ee703ad3,desayuno martes empezar acabar ing,desayuno martes empezar acabar ing adelgazar l...,"desayuno de martes, lo que bien empieza bien a...",-0.513463,-0.503783,2,3,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


9. Guardar dataset con TODAS las columnas

In [43]:
print("Guardando resultados...")
df_final_train.to_csv("../../data/NO_USAR_tweets_procesados_TRAIN.csv", index=False, encoding="utf-8")
df_final_test.to_csv("../../data/NO_USAR_tweets_procesados_TEST.csv", index=False, encoding="utf-8")
df_final_test_final.to_csv("../../data/NO_USAR_tweets_procesados_TEST_FINAL.csv", index=False, encoding="utf-8")

Guardando resultados...


10. Guardar dataset para los modelos tradicionales

In [44]:
print("Guardando resultados para modelos tradicionales...")
ds_tradicional_train = df_final_train.drop(columns=["tweet_id", "tweet_text", "texto_completo", "texto_bert"])
ds_tradicional_test = df_final_test.drop(columns=["tweet_id", "tweet_text", "texto_completo", "texto_bert"])
ds_tradicional_test_final = df_final_test_final.drop(columns=["tweet_id", "tweet_text", "texto_completo", "texto_bert"])

ds_tradicional_train.to_csv("../../data/ds_tradicional.csv", index=False, encoding="utf-8")
ds_tradicional_test.to_csv("../../data/ds_tradicional_TEST.csv", index=False, encoding="utf-8")
ds_tradicional_test_final.to_csv("../../data/ds_tradicional_TEST_FINAL.csv", index=False, encoding="utf-8")

Guardando resultados para modelos tradicionales...


11. Guardar dataset para modelo BERT

In [45]:
print("Guardando resultados para el modelo BETO...")
ds_BERT_train = df_final_train[["texto_bert", "class"]]
ds_BERT_test = df_final_test[["texto_bert", "class"]]
ds_BERT_test_final = df_final_test_final[["texto_bert", "class"]]

ds_BERT_train.to_csv("../../data/ds_BETO.csv", index=False, encoding="utf-8")
ds_BERT_test.to_csv("../../data/ds_BETO_TEST.csv", index=False, encoding="utf-8")
ds_BERT_test_final.to_csv("../../data/ds_BETO_TEST_FINAL.csv", index=False, encoding="utf-8")

Guardando resultados para el modelo BETO...


#### Información final

In [47]:
print("\nDataset final guardado con las siguientes columnas:")
print("\nColumnas base:", columnas_base)
print("\nMétricas estilísticas:", columnas_metricas)
print("\nPalabras clave:", columnas_palabras_clave)
print("\nAnálisis de sentimiento:", columnas_sentimiento)
print("\nHashtags frecuentes:", list(hashtags_frecuentes))
print("\nTotal de características TF-IDF:", X_tfidf.shape[1])
print("\nDistribución de clases de entrenamiento:")
print(df_final_train['class'].value_counts())
print("\nDistribución de clases de prueba:")
print(df_final_test['class'].value_counts())
print("\nDistribución de clases de prueba final:")
print(df_final_test_final['class'].value_counts())


Dataset final guardado con las siguientes columnas:

Columnas base: ['tweet_id', 'tweet_text', 'texto_completo', 'texto_bert']

Métricas estilísticas: ['longitud_texto', 'num_palabras']

Palabras clave: ['comida', 'restriccion', 'purga', 'imagen_corporal', 'ejercicio']

Análisis de sentimiento: ['polaridad', 'subjetividad']

Hashtags frecuentes: ['tag_Anorexia', 'tag_Bulimia', 'tag_ED', 'tag_RexyBill', 'tag_Salud', 'tag_TCA', 'tag_Thinspo', 'tag_adelgazar', 'tag_alimentacionsaludable', 'tag_ana', 'tag_anamia', 'tag_anorexia', 'tag_anorexiaeetclub', 'tag_anorexic', 'tag_bulimia', 'tag_bulimianervosa', 'tag_bulimic', 'tag_bulimicgirl', 'tag_comida', 'tag_comidasaludable', 'tag_comidasana', 'tag_deporte', 'tag_desayuno', 'tag_dieta', 'tag_eatclean', 'tag_ed', 'tag_ejercicio', 'tag_entrenamiento', 'tag_fat', 'tag_fit', 'tag_food', 'tag_foodie', 'tag_foodporn', 'tag_gorda', 'tag_gym', 'tag_healthy', 'tag_healthyfood', 'tag_instafood', 'tag_lifestyle', 'tag_mia', 'tag_motivaciÃ³n', 'tag_mot