# Preprocesamiento de Datos

## I. Importaciones

**Autor:** Pablo Spínola López

**Descripción:** Este notebook realiza la limpieza y preprocesamiento de los datos crudos. 

In [1]:
import sys
import os
import pandas as pd
import swifter
from nltk.corpus import stopwords
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

## II. Definición de stopwords para vectorización

In [2]:
# Cargar stopwords en español como lista
stop_words_es = list(stopwords.words('spanish'))
# Agregar stopwords adicionales específicas del dominio
stop_words_adicionales = [
    'rt', 'https', 'http', 'tco', 'twitter', 'com',  # URLs y términos de Twitter
    't', 'co',  # Partes de 't.co'
    'q', 'k', 'd', 'tb', 'tmb', 'pq', 'xq', 'dnd', 'kien', 'salu2', 'aki', 'tqm'  # Abreviaturas comunes
]
stop_words_es.extend(stop_words_adicionales)
# Eliminar duplicados y ordenar
stop_words_es = sorted(list(set(stop_words_es)))

## III. Importación de función extracción de características

In [3]:
sys.path.append(os.path.abspath("../funciones"))

from funcionesPreprocesamiento import (
    obtener_hashtags_frecuentes_individuales,
    extraer_caracteristicas
)

# Pipeline principal

1. Cargar datos

In [4]:
print("Cargando datos...")
df_train = pd.read_csv("../../data/data_train.csv", encoding="latin1", header=0)
df_test = pd.read_csv("../../data/data_test_fold1.csv", encoding="latin1", header=0)

df = pd.concat([df_train, df_test], ignore_index=True)

print(df.shape)
df.head(-5)

Cargando datos...
(1750, 4)


Unnamed: 0,user_id,tweet_id,tweet_text,class
0,user0001,0d3ed29586ce,Cheesecake saludable sin azÃºcar y sin lactosa...,control
1,user0002,c3cf897a495b,ser como ellas â¡â¡\n #HastaLosHuesos,anorexia
2,user0003,5041d85c45c6,"Comida Real o , la clave para estar mÃ¡s sana,...",control
3,user0004,d18285d3c7ec,Entre el cambio de hora y la bajada de las #te...,control
4,user0005,4d81892f3217,Hace mucho tiempo no sentÃ­a mi cuerpo tan frÃ­o,anorexia
...,...,...,...,...
1740,user1864,90c3b14b843c,Chuleta de Cerdo al horno sobre cama de ensala...,control
1741,user1865,a25a7fd51d01,Intentando comer cada dÃ­a mejor,control
1742,user1866,9a308565186a,Omg!!!!! Llevo una semana en ayuno y me siento...,anorexia
1743,user1868,ec90850e026b,CÃ³mo conseguir un ejercicio intenso para el g...,control


2. Preprocesamiento inicial

In [5]:
df["tweet_text"] = df["tweet_text"].fillna("")
print("Preprocesamiento inicial exitoso :)")

Preprocesamiento inicial exitoso :)


3. Extraer características iniciales

In [6]:
print("Extrayendo características iniciales...")
features_df = df["tweet_text"].swifter.apply(extraer_caracteristicas).apply(pd.Series)

Extrayendo características iniciales...


Pandas Apply:   0%|          | 0/1750 [00:00<?, ?it/s]

In [7]:
print("Características iniciales:")
for caracteristica in features_df.columns:
    print(f'\t- {caracteristica}: {type(caracteristica)}')

Características iniciales:
	- tweet_text: <class 'str'>
	- hashtags: <class 'str'>
	- texto_completo: <class 'str'>
	- texto_bert: <class 'str'>
	- longitud_texto: <class 'str'>
	- num_palabras: <class 'str'>
	- comida: <class 'str'>
	- restriccion: <class 'str'>
	- purga: <class 'str'>
	- imagen_corporal: <class 'str'>
	- ejercicio: <class 'str'>
	- polaridad: <class 'str'>
	- subjetividad: <class 'str'>


In [8]:
# Reemplazar columnas duplicadas
df["tweet_text"] = features_df["tweet_text"]
df["hashtags"] = features_df["hashtags"]

# Agregar columnas nuevas
columnas_nuevas = features_df.drop(columns=["tweet_text", "hashtags"])
df = pd.concat([df, columnas_nuevas], axis=1)

print(f'Cantidad de columnas hasta este punto: {df.shape[1]}')
print("Columnas de nuestro dataset hasta este punto:")
print(f'--> ', end='')
for col in df.columns:
    print(f', {col}', end='')
print('\n')


Cantidad de columnas hasta este punto: 16
Columnas de nuestro dataset hasta este punto:
--> , user_id, tweet_id, tweet_text, class, hashtags, texto_completo, texto_bert, longitud_texto, num_palabras, comida, restriccion, purga, imagen_corporal, ejercicio, polaridad, subjetividad



4. Procesar hashtags

In [9]:
print("Procesando hashtags...")
mlb = MultiLabelBinarizer()
hashtags_df = pd.DataFrame(mlb.fit_transform(df["hashtags"]), 
                            columns=[f"tag_{tag}" for tag in mlb.classes_])

Procesando hashtags...


In [10]:
# Primera separación, para resguardar los hashtags de prueba y no influenciarlos con los vistos en entrenamiento:
hashtags_df_train = hashtags_df.iloc[:-250].copy()
hashtags_df_test = hashtags_df.iloc[-250:].copy()

print(f'Hashtags de entrenamiento: {hashtags_df_train.shape}')
print(f'Hashtags de prueba: {hashtags_df_test.shape}')

Hashtags de entrenamiento: (1500, 2312)
Hashtags de prueba: (250, 2312)


In [11]:
# Filtrar hashtags frecuentes
umbral = 10
hashtags_frecuentes = hashtags_df_train.columns[hashtags_df_train.sum() >= umbral]
hashtags_frecuentes_df_train = hashtags_df_train[hashtags_frecuentes]
print(f'{len(list(hashtags_frecuentes_df_train))} hashtags frecuentes de entrenamiento: {list(hashtags_frecuentes_df_train)}')

59 hashtags frecuentes de entrenamiento: ['tag_Anorexia', 'tag_Bulimia', 'tag_ED', 'tag_RexyBill', 'tag_Salud', 'tag_TCA', 'tag_Thinspo', 'tag_adelgazar', 'tag_alimentacionsaludable', 'tag_ana', 'tag_anamia', 'tag_anorexia', 'tag_anorexiaeetclub', 'tag_anorexic', 'tag_bulimia', 'tag_bulimianervosa', 'tag_bulimic', 'tag_bulimicgirl', 'tag_comida', 'tag_comidasaludable', 'tag_comidasana', 'tag_deporte', 'tag_desayuno', 'tag_dieta', 'tag_eatclean', 'tag_ed', 'tag_ejercicio', 'tag_entrenamiento', 'tag_fat', 'tag_fit', 'tag_food', 'tag_foodie', 'tag_foodporn', 'tag_gorda', 'tag_gym', 'tag_healthy', 'tag_healthyfood', 'tag_instafood', 'tag_lifestyle', 'tag_mia', 'tag_motivaciÃ³n', 'tag_motivation', 'tag_nutricion', 'tag_nutricionista', 'tag_perdergrasa', 'tag_perderpeso', 'tag_proana', 'tag_promia', 'tag_recetas', 'tag_salud', 'tag_saludable', 'tag_skinny', 'tag_tca', 'tag_thin', 'tag_thinspiration', 'tag_thinspo', 'tag_training', 'tag_vegan', 'tag_workout']


In [12]:
# Filtrarlos para el conjunto de prueba
hashtags_frecuentes_df_test = hashtags_df_test[hashtags_frecuentes]
print(f'{len(list(hashtags_frecuentes_df_test))} filtro aplicado para el conjunto de prueba: {list(hashtags_frecuentes_df_test)}')

print("Prueba:", list(hashtags_frecuentes_df_train) == list(hashtags_frecuentes_df_test))

59 filtro aplicado para el conjunto de prueba: ['tag_Anorexia', 'tag_Bulimia', 'tag_ED', 'tag_RexyBill', 'tag_Salud', 'tag_TCA', 'tag_Thinspo', 'tag_adelgazar', 'tag_alimentacionsaludable', 'tag_ana', 'tag_anamia', 'tag_anorexia', 'tag_anorexiaeetclub', 'tag_anorexic', 'tag_bulimia', 'tag_bulimianervosa', 'tag_bulimic', 'tag_bulimicgirl', 'tag_comida', 'tag_comidasaludable', 'tag_comidasana', 'tag_deporte', 'tag_desayuno', 'tag_dieta', 'tag_eatclean', 'tag_ed', 'tag_ejercicio', 'tag_entrenamiento', 'tag_fat', 'tag_fit', 'tag_food', 'tag_foodie', 'tag_foodporn', 'tag_gorda', 'tag_gym', 'tag_healthy', 'tag_healthyfood', 'tag_instafood', 'tag_lifestyle', 'tag_mia', 'tag_motivaciÃ³n', 'tag_motivation', 'tag_nutricion', 'tag_nutricionista', 'tag_perdergrasa', 'tag_perderpeso', 'tag_proana', 'tag_promia', 'tag_recetas', 'tag_salud', 'tag_saludable', 'tag_skinny', 'tag_tca', 'tag_thin', 'tag_thinspiration', 'tag_thinspo', 'tag_training', 'tag_vegan', 'tag_workout']
Prueba: True


In [13]:
hashtags_frecuentes_df = pd.concat([hashtags_frecuentes_df_train, hashtags_frecuentes_df_test], ignore_index=True)
print(hashtags_frecuentes_df.shape)
hashtags_frecuentes_df

(1750, 59)


Unnamed: 0,tag_Anorexia,tag_Bulimia,tag_ED,tag_RexyBill,tag_Salud,tag_TCA,tag_Thinspo,tag_adelgazar,tag_alimentacionsaludable,tag_ana,...,tag_salud,tag_saludable,tag_skinny,tag_tca,tag_thin,tag_thinspiration,tag_thinspo,tag_training,tag_vegan,tag_workout
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1745,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1746,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1747,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1748,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


5. Preparar texto completo

In [14]:
umbral_bajo = 5
hashtags_vectorizacion = hashtags_df.columns[hashtags_df.sum() >= umbral_bajo]
hashtags_validos = {col.replace('tag_', '') for col in hashtags_vectorizacion}
df["hashtags_frecuentes_bajos"] = df["hashtags"].apply(lambda h: obtener_hashtags_frecuentes_individuales(h, hashtags_validos))
df["texto_completo"] = df["tweet_text"] + " " + df["hashtags_frecuentes_bajos"]

6. Vectorización TF-IDF

In [15]:
print("Aplicando vectorización TF-IDF...")
vectorizer = TfidfVectorizer(
    max_features=1300,
    ngram_range=(1,3),
    stop_words=stop_words_es,
    min_df=2,
    max_df=0.85,
    sublinear_tf=True,
    norm='l2'
)
X_tfidf = vectorizer.fit_transform(df["texto_completo"])
y = df["class"]

Aplicando vectorización TF-IDF...


In [16]:
print("\nInformación sobre las características TF-IDF:")
print(f"Número total de características: {X_tfidf.shape[1]}")
print(f"Número de muestras: {X_tfidf.shape[0]}")
print("\nTop 10 términos más importantes:")
feature_names = vectorizer.get_feature_names_out()
idf_values = vectorizer.idf_
top_terms = sorted(zip(feature_names, idf_values), key=lambda x: x[1], reverse=True)[:10]
for term, idf in top_terms:
    print(f"{term}: {idf:.2f}")


Información sobre las características TF-IDF:
Número total de características: 1300
Número de muestras: 1750

Top 10 términos más importantes:
3x8: 7.37
antojo: 7.37
arte: 7.37
be: 7.37
cabra: 7.37
camarón: 7.37
dietar: 7.37
empresa: 7.37
fortalecer: 7.37
monstruo: 7.37


7. Construcción de dataset final

In [17]:
print("Creando DataFrame final...")
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=[f"tfidf_{i}" for i in range(X_tfidf.shape[1])])

tfidf_df

Creando DataFrame final...


Unnamed: 0,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,tfidf_6,tfidf_7,tfidf_8,tfidf_9,...,tfidf_1290,tfidf_1291,tfidf_1292,tfidf_1293,tfidf_1294,tfidf_1295,tfidf_1296,tfidf_1297,tfidf_1298,tfidf_1299
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1747,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1748,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Seleccionar columnas para el DataFrame final
columnas_base = ["tweet_id", "tweet_text", "texto_completo", "texto_bert"]
columnas_metricas = ["longitud_texto", "num_palabras"]
columnas_palabras_clave = ["comida", "restriccion", "purga", "imagen_corporal", "ejercicio"]
columnas_sentimiento = ["polaridad", "subjetividad"]

In [19]:
# Crear el DataFrame final sin la columna class
df_final = pd.concat([
    df[columnas_base],
    df[columnas_metricas],
    df[columnas_palabras_clave],
    df[columnas_sentimiento],
    hashtags_frecuentes_df,
    tfidf_df
], axis=1)

print(f'Número total de columnas finales: {df_final.shape[1]}')
df_final.head(5)

Número total de columnas finales: 1372


Unnamed: 0,tweet_id,tweet_text,texto_completo,texto_bert,longitud_texto,num_palabras,comida,restriccion,purga,imagen_corporal,...,tfidf_1290,tfidf_1291,tfidf_1292,tfidf_1293,tfidf_1294,tfidf_1295,tfidf_1296,tfidf_1297,tfidf_1298,tfidf_1299
0,0d3ed29586ce,cheesecakir saludable azucar lactosa mermerlad...,cheesecakir saludable azucar lactosa mermerlad...,cheesecake saludable sin azúcar y sin lactosa ...,59,7,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,c3cf897a495b,,,ser como ellas. Etiquetas: HastaLosHuesos,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5041d85c45c6,comida real clave sano delgado feliz,comida real clave sano delgado feliz,"comida real o , la clave para estar más sana, ...",36,6,3,2,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,d18285d3c7ec,cambio hora bajada destemplado recomendar plat...,cambio hora bajada destemplado recomendar plat...,entre el cambio de hora y la bajada de las est...,62,8,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4d81892f3217,tiempo sentia cuerpo frio,tiempo sentia cuerpo frio,hace mucho tiempo no sentía mi cuerpo tan frío,25,4,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


8. Normalización de características

In [20]:
columnas_a_escalar = ["longitud_texto", "num_palabras", "polaridad", "subjetividad"]
scaler = StandardScaler()
df_final[columnas_a_escalar] = scaler.fit_transform(df_final[columnas_a_escalar])

In [21]:
# Convertir etiquetas a valores numéricos y agregar al final
df_final['class'] = df['class'].map({'control': 0, 'anorexia': 1})

df_final_train = df_final.iloc[:-250].copy()
df_final_test = df_final.iloc[-250:].copy()

print(f'Tamaño de dataset de entrenamiento: {df_final_train.shape}')
df_final_train

Tamaño de dataset de entrenamiento: (1500, 1373)


Unnamed: 0,tweet_id,tweet_text,texto_completo,texto_bert,longitud_texto,num_palabras,comida,restriccion,purga,imagen_corporal,...,tfidf_1291,tfidf_1292,tfidf_1293,tfidf_1294,tfidf_1295,tfidf_1296,tfidf_1297,tfidf_1298,tfidf_1299,class
0,0d3ed29586ce,cheesecakir saludable azucar lactosa mermerlad...,cheesecakir saludable azucar lactosa mermerlad...,cheesecake saludable sin azúcar y sin lactosa ...,0.242153,-0.059671,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,c3cf897a495b,,,ser como ellas. Etiquetas: HastaLosHuesos,-1.549998,-1.624910,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,5041d85c45c6,comida real clave sano delgado feliz,comida real clave sano delgado feliz,"comida real o , la clave para estar más sana, ...",-0.456482,-0.283276,3,2,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,d18285d3c7ec,cambio hora bajada destemplado recomendar plat...,cambio hora bajada destemplado recomendar plat...,entre el cambio de hora y la bajada de las est...,0.333279,0.163935,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,4d81892f3217,tiempo sentia cuerpo frio,tiempo sentia cuerpo frio,hace mucho tiempo no sentía mi cuerpo tan frío,-0.790612,-0.730488,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,996e4fcb00b3,desayuno sano rico,desayuno sano rico comidasana,desayuno sano y rico!!!. Etiquetas: comidasana...,-1.003240,-0.954093,1,3,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1496,c84e209a708f,perfeccion sabado ana abandona,perfeccion sabado ana abandona Thinspo Thinspi...,perfección para este sábado ana y mía nunca la...,-0.638735,-0.730488,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1497,2aad68408fcd,comer devolver caloria,comer devolver caloria,comer y devolver las calorías.. Etiquetas: Ten...,-0.881739,-0.954093,1,2,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1498,a495c0188dc6,entender sigo soltero cocino,entender sigo soltero cocino realfooder,"yo no entiendo cómo sigo soltero, con lo bien ...",-0.699486,-0.730488,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [22]:
print(f'Tamaño de dataset de prueba: {df_final_test.shape}')
df_final_test

Tamaño de dataset de prueba: (250, 1373)


Unnamed: 0,tweet_id,tweet_text,texto_completo,texto_bert,longitud_texto,num_palabras,comida,restriccion,purga,imagen_corporal,...,tfidf_1291,tfidf_1292,tfidf_1293,tfidf_1294,tfidf_1295,tfidf_1296,tfidf_1297,tfidf_1298,tfidf_1299,class
1500,a2ffa2b1425c,empezar anorexia bulimia facil acceder,empezar anorexia bulimia facil acceder,a ver cuando se empiezan con las de anorexia y...,-0.395731,-0.506882,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1501,bfdce07b546c,7 7 razón entrenar llegar domingo wow acabo se...,7 7 razón entrenar llegar domingo wow acabo se...,"¡7 días, 7 razones para entrenar! - llegamos a...",2.338059,2.399991,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1502,fdcb32f44ce7,bajar peso comer parar,bajar peso comer parar,como bajar peso comiendo sin parar.. Etiquetas...,-0.881739,-0.730488,2,2,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1503,17203fc43352,importar platillo lucir biendeliciosoricoeso m...,importar platillo lucir biendeliciosoricoeso m...,"no importa si ese platillo luce bien,delicioso...",0.181402,-0.283276,1,3,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1504,3947b6578c8e,polo fruta casero frase patatar sano cuidar él...,polo fruta casero frase patatar sano cuidar él...,por supuesto el polo es de fruta y casero más ...,2.338059,3.294413,1,3,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1745,2b400f44f0ca,whatsapp escondite comunidad,whatsapp escondite comunidad anorexia bulimia,whatsapp el nuevo escondite para las comunidad...,-0.699486,-0.954093,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1746,7843bfa17219,querer cuerpo,querer cuerpo,": "": quisiera tener un cuerpo como este :'( """,-1.155118,-1.177699,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1747,b6c4ee703ad3,desayuno martes empezar acabar ing,desayuno martes empezar acabar ing adelgazar l...,"desayuno de martes, lo que bien empieza bien a...",-0.517233,-0.506882,2,3,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1748,910ab22d43f7,querer necesito desear,querer necesito desear Thinspiration,"quiero, necesito, deseo y sé que puedo.. Etiqu...",-0.881739,-0.954093,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


9. Guardar dataset con TODAS las columnas

In [23]:
print("Guardando resultados...")
df_final_train.to_csv("../../data/NO_USAR_tweets_procesados_TRAIN.csv", index=False, encoding="utf-8")
df_final_test.to_csv("../../data/NO_USAR_tweets_procesados_TEST.csv", index=False, encoding="utf-8")

Guardando resultados...


10. Guardar dataset para los modelos tradicionales

In [24]:
print("Guardando resultados para modelos tradicionales...")
ds_tradicional_train = df_final_train.drop(columns=["tweet_id", "tweet_text", "texto_completo", "texto_bert"])
ds_tradicional_test = df_final_test.drop(columns=["tweet_id", "tweet_text", "texto_completo", "texto_bert"])

# ds_tradicional_train.to_csv("../../data/ds_tradicional.csv", index=False, encoding="utf-8")
# ds_tradicional_test.to_csv("../../data/ds_tradicional_TEST.csv", index=False, encoding="utf-8")

Guardando resultados para modelos tradicionales...


11. Guardar dataset para modelo BERT

In [25]:
print("Guardando resultados para el modelo BETO...")
ds_BERT_train = df_final_train[["texto_bert", "class"]]
ds_BERT_test = df_final_test[["texto_bert", "class"]]

ds_BERT_train.to_csv("../../data/ds_BETO.csv", index=False, encoding="utf-8")
ds_BERT_test.to_csv("../../data/ds_BETO_TEST.csv", index=False, encoding="utf-8")

Guardando resultados para el modelo BETO...


#### Información final

In [26]:
print("\nDataset final guardado con las siguientes columnas:")
print("\nColumnas base:", columnas_base)
print("\nMétricas estilísticas:", columnas_metricas)
print("\nPalabras clave:", columnas_palabras_clave)
print("\nAnálisis de sentimiento:", columnas_sentimiento)
print("\nHashtags frecuentes:", list(hashtags_frecuentes))
print("\nTotal de características TF-IDF:", X_tfidf.shape[1])
print("\nDistribución de clases de entrenamiento:")
print(df_final_train['class'].value_counts())
print("\nDistribución de clases de prueba:")
print(df_final_test['class'].value_counts())


Dataset final guardado con las siguientes columnas:

Columnas base: ['tweet_id', 'tweet_text', 'texto_completo', 'texto_bert']

Métricas estilísticas: ['longitud_texto', 'num_palabras']

Palabras clave: ['comida', 'restriccion', 'purga', 'imagen_corporal', 'ejercicio']

Análisis de sentimiento: ['polaridad', 'subjetividad']

Hashtags frecuentes: ['tag_Anorexia', 'tag_Bulimia', 'tag_ED', 'tag_RexyBill', 'tag_Salud', 'tag_TCA', 'tag_Thinspo', 'tag_adelgazar', 'tag_alimentacionsaludable', 'tag_ana', 'tag_anamia', 'tag_anorexia', 'tag_anorexiaeetclub', 'tag_anorexic', 'tag_bulimia', 'tag_bulimianervosa', 'tag_bulimic', 'tag_bulimicgirl', 'tag_comida', 'tag_comidasaludable', 'tag_comidasana', 'tag_deporte', 'tag_desayuno', 'tag_dieta', 'tag_eatclean', 'tag_ed', 'tag_ejercicio', 'tag_entrenamiento', 'tag_fat', 'tag_fit', 'tag_food', 'tag_foodie', 'tag_foodporn', 'tag_gorda', 'tag_gym', 'tag_healthy', 'tag_healthyfood', 'tag_instafood', 'tag_lifestyle', 'tag_mia', 'tag_motivaciÃ³n', 'tag_mot