In [1]:
import pandas as pd
from gemini import tag_topics
from models import variable_to_question, UserTaggedAnswer
from typing import List


In [2]:
# Load the survey export that the rest of the notebook enriches.
data = pd.read_parquet('data/encuesta.parquet')

# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the cell output.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109 entries, 0 to 108
Data columns (total 26 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   response_id                            109 non-null    object        
 1   start_response                         109 non-null    datetime64[ns]
 2   complete_response                      109 non-null    datetime64[ns]
 3   age                                    109 non-null    category      
 4   sex                                    109 non-null    category      
 5   years_studying                         109 non-null    int64         
 6   degree                                 109 non-null    category      
 7   infrastructure_score                   109 non-null    category      
 8   notice_infrastructure_improvement      109 non-null    category      
 9   frecuency_digital_systems              109 non-null    category  

In [3]:
# Open-ended survey columns whose answers get a topic label from tag_topics.
open_question_variables = [
    'infrastructure_changes',
    'most_important_technologies',
    'improved_aspects',
    'challenges',
    'recomendations'
]
# Accumulator keyed by response_id; it gains one "<variable>_topic" column per
# question as the loop below runs.
open_questions_df = data.loc[:, ["response_id"]]
# Maximum number of answers handed to tag_topics in a single call.
BATCH_SIZE = 110

for open_question_variable in open_question_variables:
    print("Starting formatting of variable: ", open_question_variable)
    topic_column = open_question_variable + '_topic'
    items_ready: List[UserTaggedAnswer] = []

    # Only rows with a non-empty answer are sent for tagging; everyone else
    # ends up with a missing topic through the left merge further down.
    has_answer = data[open_question_variable].apply(lambda x: len(x) > 0)
    # Explicit copy so the column rewrite below never touches `data`.
    classification_input = data.loc[has_answer, ['response_id', open_question_variable]].copy()
    # Each answer exposes .tolist() (presumably a numpy array read from parquet
    # -- TODO confirm); convert so to_dict('records') yields plain lists.
    classification_input[open_question_variable] = classification_input[open_question_variable].apply(lambda x: x.tolist())

    # Step over the *filtered* frame. Stepping over len(data) would produce
    # empty trailing batches (and pointless tag_topics calls) as soon as
    # BATCH_SIZE is smaller than the number of survey rows.
    batch_starts = range(0, len(classification_input), BATCH_SIZE)
    for batch_number, start in enumerate(batch_starts, start=1):
        batch = classification_input.iloc[start : start + BATCH_SIZE]
        items_ready.extend(
            tag_topics(variable_to_question[open_question_variable], batch.to_dict('records'))
        )
        print("Finished batch: ", batch_number)

    if not items_ready:
        # Nothing was tagged for this question: keep the output schema stable
        # with an all-null topic column instead of failing on a missing
        # "topic" key in an empty frame.
        open_questions_df = open_questions_df.assign(**{topic_column: None})
        continue

    result_df = pd.DataFrame([x.model_dump() for x in items_ready])
    result_df["topic"] = result_df["topic"].astype('category')

    # Positional rename: assumes model_dump() yields exactly two fields, the
    # response identifier followed by "topic" -- TODO confirm against
    # UserTaggedAnswer.
    result_df.columns = ['response_id', topic_column]

    open_questions_df = open_questions_df.merge(result_df, how='left', on='response_id')


# Drop topic columns left over from a previous run of this cell before merging,
# so re-executing it does not create "_x"/"_y" suffixed duplicates.
topic_columns = [column for column in open_questions_df.columns if column != "response_id"]
data = data.drop(columns=topic_columns, errors="ignore").merge(open_questions_df, on="response_id")


Starting formatting of variable:  infrastructure_changes
Finished batch:  1
Starting formatting of variable:  most_important_technologies
Finished batch:  1
Starting formatting of variable:  improved_aspects
Finished batch:  1
Starting formatting of variable:  challenges
Finished batch:  1
Starting formatting of variable:  recomendations
Finished batch:  1


In [4]:
# Show the survey frame now that the per-question "*_topic" columns have been merged in.
data

Unnamed: 0,response_id,start_response,complete_response,age,sex,years_studying,degree,infrastructure_score,notice_infrastructure_improvement,frecuency_digital_systems,...,improved_aspects_sentiment,challenges,challenges_sentiment,recomendations,recomendations_sentiment,infrastructure_changes_topic,most_important_technologies_topic,improved_aspects_topic,challenges_topic,recomendations_topic
0,2,2025-10-27 09:32:00,2025-10-27 09:35:00,Menos de 17 años,Femenino,1,Licenciatura en Enfermería,Regular,Si,A veces,...,Positive,[],Non-Response,[],Non-Response,,,acceso_a_expedientes,,
1,3,2025-10-27 09:33:00,2025-10-27 09:35:00,17–19 años,Femenino,1,Licenciatura en Bioanálisis Clínico,Buena,No estoyy seguro/a,Frecuentemente,...,Positive,[calidad en el trato al paciente],Negative,[],Non-Response,,equipos_de_diagnostico_y_tratamiento,experiencia_paciente,calidad_de_atencion_y_trato,
2,4,2025-10-27 09:30:00,2025-10-27 09:35:00,17–19 años,Femenino,3,Licenciatura en Bioanálisis Clínico,Buena,Si,Nunca,...,Positive,[],Non-Response,[],Non-Response,mejoras_generales_instalaciones,gestion_de_informacion_medica,acceso_a_expedientes,,
3,5,2025-10-27 09:59:00,2025-10-27 10:03:00,17–19 años,Femenino,3,Medicina y Cirugía,Buena,Si,A veces,...,Positive,[tiempo de espera],Negative,[mas tecnologia medica],Positive,ampliacion_y_especializacion_de_areas,gestion_de_informacion_medica,acceso_a_expedientes,tiempos_de_espera,tecnologia_medica
4,6,2025-10-27 10:05:00,2025-10-27 10:11:00,17–19 años,Femenino,1,Licenciatura en Bioanálisis Clínico,Regular,Si,Frecuentemente,...,Positive,[],Non-Response,[mejores equipos],Positive,comodidad_del_paciente,equipos_de_diagnostico_y_tratamiento,infraestructura_y_acceso_a_expedientes,,tecnologia_medica
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104,106,2025-11-04 23:09:00,2025-11-04 23:20:00,17–19 años,Masculino,3,Medicina y Cirugía,Buena,Si,Rara vez,...,Positive,[Falta de empatia],Negative,[Implementar proyectos de mejora de infraestru...,Positive,ampliacion_y_salas_nuevas_en_hospitales_existe...,infraestructura_y_equipamiento_hospitalario,disponibilidad_de_equipos,calidad_de_atencion_y_trato,infraestructura_hospitalaria
105,107,2025-11-05 09:53:00,2025-11-05 09:59:00,17–19 años,Femenino,2,Licenciatura en Anestesia y Reanimación,Muy buena,No estoyy seguro/a,Frecuentemente,...,Positive,[],Non-Response,[],Non-Response,,equipos_de_diagnostico_y_tratamiento,disponibilidad_de_equipos,,
106,108,2025-11-05 10:57:00,2025-11-05 11:03:00,23–25 años,Masculino,2,Medicina y Cirugía,Buena,No estoyy seguro/a,Nunca,...,Positive,[Adopcion de tecnologia],Negative,[],Non-Response,,telemedicina,disponibilidad_de_equipos,tecnologia_y_digitalizacion,
107,109,2025-11-05 15:18:00,2025-11-05 15:24:00,23–25 años,Masculino,2,Licenciatura en Anestesia y Reanimación,Muy buena,Si,Frecuentemente,...,Positive,[Caracter del personal],Negative,[Personal mas empatico],Positive,nuevos_edificios_y_ampliacion_de_salas,gestion_de_informacion_medica,disponibilidad_de_equipos,calidad_de_atencion_y_trato,trato_humano_y_atencion_al_paciente


# Save data

In [5]:
# Write the topic-enriched frame to its own parquet file; the input file
# data/encuesta.parquet is left untouched.
data.to_parquet('data/02_Encuesta.parquet')