## Read Data

In [1]:
import pandas as pd
from datetime import datetime

# Load the raw survey answers from results.csv (path is relative to the
# notebook's working directory). Columns are renamed by position further down.
results = pd.read_csv("results.csv")

## Helper functions

In [2]:

def string_to_number(number_string: str):
    """Parse an integer out of a free-text answer.

    Args:
        number_string: Raw cell value. Normally a string; a missing cell
            (``None`` or a float NaN) is also accepted.

    Returns:
        The parsed ``int``, or ``None`` when the value is missing or is not
        a plain integer literal once surrounding whitespace is removed.
    """
    # An int needs no parsing. bool is excluded because it subclasses int.
    if isinstance(number_string, int) and not isinstance(number_string, bool):
        return number_string

    # Missing cells arrive as None / float NaN and have no .strip().
    if not isinstance(number_string, str):
        return None

    try:
        return int(number_string.strip())
    except ValueError:
        # Not an integer literal (e.g. "veinte", "20 años", "").
        return None

def closed_question_serializer(ans):
    """Turn a yes/no answer into a boolean: True only for exactly "Sí"."""
    return ans == "Sí"

def range_question_serialize(ans: str):
    """Map a frequency answer onto a symmetric integer scale.

    "Rara vez" -> -2, "A veces" -> -1, "Frecuentemente" -> 1,
    "Siempre" -> 2. Any other value yields 0.
    """
    scale = (
        ("Rara vez", -2),
        ("A veces", -1),
        ("Frecuentemente", 1),
        ("Siempre", 2),
    )
    for label, score in scale:
        if ans == label:
            return score

    # Unrecognised answer: neutral score.
    return 0

def range_question_serialize_authority(ans: str):
    """Map an effectiveness answer onto a symmetric integer scale.

    "Nada efectiva" -> -2, "Poca efectiva" -> -1, "Algo efectiva" -> 1,
    "Muy efectiva" -> 2. Any other value yields 0.
    """
    # NOTE(review): labels are compared verbatim; confirm "Poca efectiva"
    # matches the option text in the data, otherwise it silently scores 0.
    scale = (
        ("Nada efectiva", -2),
        ("Poca efectiva", -1),
        ("Algo efectiva", 1),
        ("Muy efectiva", 2),
    )
    return next((score for label, score in scale if ans == label), 0)

def string_list_serializer(ans: str):
    """Split a multi-choice answer into its individual options.

    Args:
        ans: Options joined with ";". A missing cell (``None`` or a float
            NaN) is also accepted.

    Returns:
        A list with one string per option, or an empty list when the
        answer is missing.
    """
    # Missing answers are not strings and have no .split().
    if not isinstance(ans, str):
        return []
    return ans.split(";")

## Change column names

In [3]:

# Short snake_case names for the columns, listed in the same order as the
# columns of the loaded CSV: the i-th name replaces the i-th column.
new_name_columns = [
    "date",
    "age",
    "gender",
    "has_access_smartphone",
    "would_use_app",
    "uses_social_mobile_apps",
    "years_lived_in_dimitrov",  # dropped again during serialization
    "frequency_security_problems_dimitrov",
    "frequency_waste_problems_dimitrov",
    "security_problems",
    "waste_problems",
    "knows_where_report",
    "are_authorities_efective",  # spelling is reused by the Spanish rename map
    "likes_features_app"
]

# Positional rename; pandas raises if the list length differs from the
# number of columns in `results`.
results.columns = new_name_columns

## Serialize Data

In [4]:

# The last 4 characters of each timestamp are sliced off before parsing
# (presumably a space plus a 3-letter timezone abbreviation; TODO confirm).
results["date"] = results["date"].apply(
    lambda ds: datetime.strptime(ds[:-4], "%Y/%m/%d %I:%M:%S %p")
)

# Some ages are not plain digit strings; string_to_number returns None for
# those, so they are left out instead of being guessed.
results["age"] = results["age"].apply(string_to_number)

# NOTE(review): the comparison uses the plural "Masculinos"; confirm this is
# the exact option text in the data, otherwise every row becomes False.
results["gender"] = results["gender"].apply(lambda gs: gs == "Masculinos")

results["has_access_smartphone"] = results["has_access_smartphone"].apply(closed_question_serializer)

# Three-way answer: "Sí" -> 1, "Tal vez" -> 0, anything else -> -1.
results["would_use_app"] = results["would_use_app"].apply(
    lambda ws: 1 if ws == "Sí" else (0 if ws == "Tal vez" else -1)
)
results["uses_social_mobile_apps"] = results["uses_social_mobile_apps"].apply(closed_question_serializer)

# Drop years_lived_in_dimitrov because the data has too much noise.
results = results.drop(columns="years_lived_in_dimitrov")

results["frequency_security_problems_dimitrov"] = results["frequency_security_problems_dimitrov"].apply(range_question_serialize)
results["frequency_waste_problems_dimitrov"] = results["frequency_waste_problems_dimitrov"].apply(range_question_serialize)
results["security_problems"] = results["security_problems"].apply(string_list_serializer)
results["waste_problems"] = results["waste_problems"].apply(string_list_serializer)
results["knows_where_report"] = results["knows_where_report"].apply(closed_question_serializer)

# This column was previously left as raw text even though a dedicated
# serializer exists for it; encode it on the same -2..2 scale as the other
# range questions.
results["are_authorities_efective"] = results["are_authorities_efective"].apply(range_question_serialize_authority)

results["likes_features_app"] = results["likes_features_app"].apply(string_list_serializer)


## Save Serialized Data

In [5]:
# Reduce the timestamp to a calendar date for the exported files. The
# conversion used to be computed and thrown away, so the CSVs still carried
# the full timestamp. Working on a copy keeps `results` untouched, which
# makes this cell safe to re-run.
export_results = results.assign(date=results["date"].dt.date)
export_results.to_csv("output/serialized_results.csv", index=False)


# Same data with Spanish column names. Keys that are not present in the
# frame (years_lived_in_dimitrov was dropped earlier) are ignored by rename.
spanish_results = export_results.rename(columns={
    "date": "fecha",
    "age": "edad",
    "gender": "genero",
    "has_access_smartphone": "tiene_acceso_smartphone",
    "would_use_app": "usaria_app",
    "uses_social_mobile_apps": "usa_apps_sociales",
    "years_lived_in_dimitrov": "anios_en_dimitrov",
    "frequency_security_problems_dimitrov": "frecuencia_problemas_seguridad",
    "frequency_waste_problems_dimitrov": "frecuencia_problemas_basura",
    "security_problems": "tipos_problemas_seguridad",
    "waste_problems": "tipos_problemas_basura",
    "knows_where_report": "sabe_donde_reportar",
    "are_authorities_efective": "autoridades_efectivas",
    "likes_features_app": "funciones_deseadas_app"
})
# index=False matches the English export; without it the file gains an
# unnamed leading column holding the row index.
spanish_results.to_csv("output/datos_serializados.csv", index=False)
