In [1]:
from os import chdir, getcwd, listdir, makedirs, path
from pandas import read_excel
from re import compile
from unidecode import unidecode

In [2]:
def cambiar_ruta_trabajo(path=".."):
    """Change the process working directory.

    Args:
        path: Directory to switch to; defaults to the parent of the current
            working directory.
    """
    # Note: inside this function the parameter hides the ``os.path`` import.
    chdir(path)

In [3]:
def eliminar_caracteres(df_data, na_action="ignore"):
    """Transliterate every cell to plain ASCII text with ``unidecode``.

    Each cell is converted with ``str()`` first, so non-string cells come back
    as strings too.

    Args:
        df_data: DataFrame whose cells should be transliterated; it is not
            modified.
        na_action: Forwarded to pandas. With ``"ignore"`` (the default)
            missing values are left as they are instead of being turned into
            text.

    Returns:
        A new DataFrame with the transliterated cells.
    """
    def _transliterar(valor):
        return unidecode(str(valor))

    # ``applymap`` is deprecated since pandas 2.1 in favour of ``map``.
    # The check is done on the class because, on older pandas, attribute
    # access on the instance could resolve to a column named "map".
    if hasattr(type(df_data), "map"):
        return df_data.map(_transliterar, na_action=na_action)
    return df_data.applymap(_transliterar, na_action=na_action)

In [4]:
def reemplazar_nulos(df_data, value_to_replace=""):
    """Fill the missing values of a DataFrame with a fixed value.

    Args:
        df_data: DataFrame to fill; it is not modified.
        value_to_replace: Value written wherever a cell is missing. Defaults
            to the empty string.

    Returns:
        A new DataFrame without missing values.
    """
    filled = df_data.fillna(value=value_to_replace)
    return filled

In [5]:
def reemplazar_valores(df_data, pattern, value=None, regex=False):
    """Replace values in a DataFrame literally, by regex, or through a mapping.

    Args:
        df_data: DataFrame to search; it is not modified.
        pattern: What to look for. A string (or, with ``regex=True``, an
            already compiled pattern) is replaced by ``value``. Any other
            object, such as a dict, is handed to ``DataFrame.replace`` as
            ``to_replace`` on its own.
        value: Replacement used for string and regex patterns.
        regex: When true, ``pattern`` is compiled and matched as a regular
            expression.

    Returns:
        A new DataFrame with the replacements applied.

    Raises:
        re.error: If ``regex`` is true and ``pattern`` is not a valid
            regular expression.
    """
    if regex:
        # Compiling turns the pattern into a non-str object, so this case
        # must be handled before the str check below; otherwise ``value``
        # would be dropped and nothing would be replaced.
        return df_data.replace(to_replace=compile(pattern), value=value, regex=True)
    if isinstance(pattern, str):
        return df_data.replace(to_replace=pattern, value=value, regex=regex)
    # Non-string patterns (e.g. a dict) carry their own replacements.
    return df_data.replace(to_replace=pattern)

In [6]:
def remover_publicaciones(df_data, column_1="descripcion", column_2="titulo_marketplace", pattern="#adi"):
    """Drop rows that have no description and whose title matches a pattern.

    Args:
        df_data: DataFrame to filter; it is not modified.
        column_1: Column that must be missing for a row to be dropped.
        column_2: Text column searched, after lower-casing, for ``pattern``.
        pattern: Pattern looked for in ``column_2``.

    Returns:
        A new DataFrame without the matching rows.
    """
    missing_description = df_data[column_1].isna()
    title_matches = df_data[column_2].str.lower().str.contains(pattern)
    # Rows are removed by index label, so both conditions must hold at once.
    unwanted_rows = df_data[missing_description & title_matches]
    return df_data.drop(unwanted_rows.index)

In [7]:
def remover_duplicados(df_data, columns):
    """Drop rows that repeat an earlier row on the given columns.

    Args:
        df_data: DataFrame to filter; it is not modified.
        columns: Column labels compared to decide whether a row is a repeat.

    Returns:
        A new DataFrame keeping the first occurrence of each combination.
    """
    is_repeat = df_data[columns].duplicated()
    repeated_labels = df_data[is_repeat].index
    return df_data.drop(repeated_labels)

In [8]:
def remover_columnas_innecesarias(df_data, columns):
    """Return the DataFrame without the given columns.

    Args:
        df_data: DataFrame to trim; it is not modified.
        columns: A single column label or a list of labels to remove.

    Returns:
        A new DataFrame that lacks the requested columns.
    """
    trimmed = df_data.drop(columns=columns)
    return trimmed

In [9]:
def cambiar_tipo_dato(df_data, datatype="str"):
    """Cast a DataFrame to another dtype.

    Args:
        df_data: DataFrame to convert; it is not modified.
        datatype: Target dtype understood by pandas. Defaults to ``"str"``.

    Returns:
        A new DataFrame whose columns have the requested dtype.
    """
    converted = df_data.astype(dtype=datatype)
    return converted

In [10]:
def procesar_data(df_data):
    """Run the cleaning steps on a table of marketplace listings.

    The frame must contain the columns ``titulo_marketplace``,
    ``descripcion``, ``locacion``, ``disponible``, ``vendido`` and
    ``id_vendedor``.

    Steps, in order:
        1. Transliterate the text columns to ASCII.
        2. Ask ``reemplazar_valores`` to swap line breaks in the text columns
           for a single space.
        3. Drop listings selected by ``remover_publicaciones`` (defaults).
        4. Fill every missing value with the string ``"null"``.
        5. Cast the ``disponible`` / ``vendido`` columns with
           ``cambiar_tipo_dato`` (default dtype).
        6. Drop rows repeating an earlier ``id_vendedor`` /
           ``titulo_marketplace`` pair.

    Args:
        df_data: Raw listings; it is not modified.

    Returns:
        A new, cleaned DataFrame.
    """
    cols_str = ['titulo_marketplace', 'descripcion', 'locacion']
    cols_bool = ["disponible", "vendido"]
    # Work on a copy: the column assignments below would otherwise write into
    # the caller's frame and leave it half-processed.
    df_data = df_data.copy()
    df_data[cols_str] = eliminar_caracteres(df_data[cols_str])
    df_data[cols_str] = reemplazar_valores(df_data[cols_str], r"\r?\n", " ", regex=True)
    # Must run before nulls are filled, since it relies on missing descriptions.
    df_data = remover_publicaciones(df_data)
    df_data = reemplazar_nulos(df_data, "null")
    df_data[cols_bool] = cambiar_tipo_dato(df_data[cols_bool])
    df_data = remover_duplicados(df_data, ["id_vendedor", "titulo_marketplace"])
    return df_data

In [11]:
def main():
    """Clean the last raw export and save it under the processed-data folder.

    Looks inside ``Data/datos_obtenidos`` (relative to the current working
    directory), takes the sub-folder that sorts last, loads the first
    ``.xlsx`` / ``.csv`` file in it, cleans it with ``procesar_data`` and
    writes the result to ``Data/datos_depurados/<sub-folder>/<file>``.

    The working directory is changed while running and is always restored
    before returning, even when a step fails.

    Raises:
        FileNotFoundError: If there is no sub-folder to process, or the
            chosen sub-folder holds no ``.xlsx`` / ``.csv`` file.
    """
    root_path = getcwd()
    raw_data_folder = "Data/datos_obtenidos"
    processed_data_folder = "../datos_depurados"
    try:
        cambiar_ruta_trabajo(raw_data_folder)

        # listdir() returns entries in arbitrary order, so sort them to make
        # "the last one" deterministic, and skip stray files.
        # NOTE(review): assumes folder names sort chronologically - confirm.
        data_folders = sorted(filter(path.isdir, listdir()))
        if not data_folders:
            raise FileNotFoundError(
                "No data folders found in " + raw_data_folder
            )
        data_folder = data_folders[-1]

        data_files = sorted(
            name
            for name in listdir(data_folder)
            if name.split(".")[-1] in ("xlsx", "csv")
        )
        if not data_files:
            raise FileNotFoundError(
                "No .xlsx or .csv file found in " + data_folder
            )
        filename = data_files[0]
        data_path = data_folder + "/" + filename

        # Pick the reader/writer that matches the file type: read_excel and
        # to_excel cannot handle .csv files.
        is_csv = filename.split(".")[-1] == "csv"
        if is_csv:
            from pandas import read_csv

            df_ropa = read_csv(data_path)
        else:
            df_ropa = read_excel(data_path)

        df_ropa = procesar_data(df_ropa)

        cambiar_ruta_trabajo(processed_data_folder)
        makedirs(data_folder, exist_ok=True)
        if is_csv:
            df_ropa.to_csv(data_path)
        else:
            df_ropa.to_excel(data_path)
    finally:
        # Leave the process where it started, whatever happened above.
        cambiar_ruta_trabajo(root_path)

In [12]:
# Run the pipeline only when this code is executed directly (a notebook cell
# or a script both run as "__main__"), not when it is imported.
if __name__ == "__main__":
    main()