In [1]:
import pandas as pd
from gemini import tag_topics
from models import variable_to_question, UserTaggedAnswer
from typing import List

In [2]:
# Read the stage-01 parquet file into `data`; every later cell works from this frame.
data = pd.read_parquet('data/01_Encuesta.parquet')

# Print column names, non-null counts and dtypes as a quick schema check.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110 entries, 0 to 109
Data columns (total 25 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   response_id                            110 non-null    object        
 1   start_response                         110 non-null    datetime64[ns]
 2   complete_response                      110 non-null    datetime64[ns]
 3   age                                    110 non-null    category      
 4   sex                                    110 non-null    category      
 5   years_studying                         110 non-null    int64         
 6   degree                                 110 non-null    category      
 7   infrastructure_score                   110 non-null    category      
 8   notice_infrastructure_improvement      110 non-null    category      
 9   frecuency_digital_systems              110 non-null    category  

In [3]:
# Columns of `data` holding the open-ended answers; each one gets a companion
# `<variable>_topic` column produced by `tag_topics`.
open_question_variables = [
    'infrastructure_changes',
    'most_important_technologies',
    'challenges',
    'recomendations'
]
# Accumulator keyed by response_id: one topic column is merged into it per variable.
open_questions_df = data.loc[:, ["response_id"]]
# Maximum number of answers handed to `tag_topics` in a single call.
BATCH_SIZE = 110

for open_question_variable in open_question_variables:
    print("Starting formatting of variable: ", open_question_variable)
    topic_column = open_question_variable + '_topic'
    items_ready: List[UserTaggedAnswer] = []

    # Only rows with a non-empty answer are sent for tagging. `.copy()` makes the
    # column assignment below act on an independent frame rather than a slice of `data`.
    has_answer = data[open_question_variable].apply(lambda x: len(x) > 0)
    classification_input = data.loc[has_answer, ['response_id', open_question_variable]].copy()
    # `.tolist()` presumes each cell is array-like (e.g. a numpy array read from
    # parquet) so the records are plain Python lists -- TODO confirm.
    classification_input[open_question_variable] = classification_input[open_question_variable].apply(lambda x: x.tolist())

    # Step over the *filtered* frame: ranging over len(data) would run past its end
    # and send empty batches to `tag_topics` whenever some rows were filtered out.
    for batch_number, start in enumerate(range(0, len(classification_input), BATCH_SIZE), start=1):
        batch = classification_input.iloc[start : start + BATCH_SIZE]
        fixed_batched = tag_topics(variable_to_question[open_question_variable], batch.to_dict('records'))

        items_ready += fixed_batched
        print("Finished batch: ", batch_number)

    if not items_ready:
        # Nothing was tagged (no answers, or the tagger returned nothing): an empty
        # DataFrame has no "topic" column, so add an all-missing column instead.
        open_questions_df = open_questions_df.assign(**{topic_column: None})
        continue

    result_df = pd.DataFrame([x.model_dump() for x in items_ready])

    result_df["topic"] = result_df["topic"].astype('category')

    # Positional rename: assumes `model_dump()` yields exactly two fields, the
    # response id first and the topic second -- TODO confirm against UserTaggedAnswer.
    result_df.columns = ['response_id', topic_column]

    # Left merge keeps every response_id, including those that were not tagged.
    open_questions_df = open_questions_df.merge(result_df, how='left', on='response_id')


# Drop topic columns left over from a previous run of this cell so that re-running
# it replaces them instead of producing `_x` / `_y` suffixed duplicates.
topic_columns = [variable + '_topic' for variable in open_question_variables]
data = data.drop(columns=topic_columns, errors='ignore').merge(open_questions_df, on="response_id")


Starting formatting of variable:  infrastructure_changes
Finished batch:  1
Starting formatting of variable:  most_important_technologies
Finished batch:  1
Starting formatting of variable:  challenges
Finished batch:  1
Starting formatting of variable:  recomendations
Finished batch:  1


# Save data

In [4]:
# Write `data` to the stage-02 parquet file.
data.to_parquet('data/02_Encuesta.parquet')