In [1]:
import re
import os
import json
import pprint

In [7]:
folder_path = 'data_cleaned'
cars = []
processed_files = 0  # number of JSON files that were actually parsed and loaded

# os.listdir returns entries in arbitrary order; sorting keeps the order of
# `cars` (and therefore of the exported file) reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.json'):
        continue

    file_path = os.path.join(folder_path, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError as e:
            # A malformed file is reported and skipped; it must not count as processed.
            print(f"Error al procesar {filename}: {e}")
            continue

    # assumes each file holds a JSON array of car dicts — TODO confirm
    cars.extend(data)
    processed_files += 1

# Report the files that were really loaded, not every entry found in the folder.
print(f'ARCHIVOS PROCESADOS: {processed_files} TOTAL: {len(cars)}')

ARCHIVOS PROCESADOS: 20 TOTAL: 17741


In [21]:
def extract_word(pattern, text, default_value = None):
    """Return capture group 1 of the first match of `pattern` in `text`.

    Args:
        pattern: Regular expression containing at least one capture group.
        text: String to search.
        default_value: Value returned when `pattern` does not match at all.

    Returns:
        The text captured by group 1, or `default_value` when there is no match.
    """
    found = re.search(pattern, text)
    return found.group(1) if found is not None else default_value

def exist_word(word, text):
    """Tell whether `word` occurs in `text` (plain, case-sensitive containment test)."""
    is_present = word in text
    return is_present

def extract_phrase (pattern, text, group_index = 1, default_value = None):
    """Return one group of the first match of `pattern` in `text`, stripped of surrounding whitespace.

    Args:
        pattern: Regular expression to search for.
        text: String to search.
        group_index: Group to return; 0 selects the whole match.
        default_value: Value returned when nothing could be extracted.

    Returns:
        The stripped group text, or `default_value` when the pattern does not
        match or the requested group did not take part in the match.
    """
    match = re.search(pattern, text)
    if match is None:
        return default_value

    phrase = match.group(group_index)
    # An optional group that did not participate yields None; calling .strip()
    # on it would raise AttributeError, so it is treated as "nothing found".
    if phrase is None:
        return default_value
    return phrase.strip()

def extract_list_phrases (car, keyWord, categorie):
    """Collect the entries of `car[categorie]` that contain `keyWord`.

    Args:
        car: Mapping that holds the section to scan.
        keyWord: Substring looked for in every entry (case-sensitive).
        categorie: Key of the section inside `car`.

    Returns:
        A new list with the matching entries, in their original order.
    """
    return [entry for entry in car[categorie] if keyWord in entry]

'''
Traducción de las claves:
traccion → traction
frenos → brakes
suspension_delantera → front_suspension
suspension_trasera → rear_suspension
potencia_cv → power_cv
potencia_kw → power_kw
rpm_potencia_max → rpm_max_power
par_maximo_nm → max_torque_nm
rpm_par_max → rpm_max_torque
motor_description → motor_description
transmission_description → transmission_description
'''
def process_technical_details(car):
    """Flatten the "Ficha Técnica" section of `car` into individual keys.

    The section is joined with '#' into a single string and searched with one
    regular expression per output key. Every extracted value is a string, or
    None when its pattern is not found. The raw section is removed afterwards.

    When the section is absent, only the keys that are still missing are
    created (as None). Existing values are kept, so running this function
    again on an already processed car does not wipe the extracted data.

    Args:
        car: Car dict; it is modified in place.

    Returns:
        The same `car` dict.
    """
    detail_keys = (
        "jato_classification",
        "traction",
        "brakes",
        "front_suspension",
        "rear_suspension",
        "power_cv",
        "power_kw",
        "rpm_max_power",
        "max_torque_nm",
        "rpm_max_torque",
        "motor_description",
        "transmission_description",
    )

    if "Ficha Técnica" not in car:
        # setdefault (instead of plain assignment) preserves values produced
        # by an earlier pass, after which the raw section no longer exists.
        for key in detail_keys:
            car.setdefault(key, None)
        return car

    # '#' separates the original entries so the [^#]* patterns stay inside one entry.
    text = "#".join(car["Ficha Técnica"])

    car["jato_classification"] = extract_word(r'clasificación JATO: .*?(\w+\d+)', text)
    car["traction"] = extract_word(r'Tracción\s+(\w+)', text)
    car["brakes"] = extract_phrase(r'[^#]*frenos[^#]*', text, 0)
    car["front_suspension"] = extract_phrase(r'Suspensión delantera([^,]*)', text)
    car["rear_suspension"] = extract_phrase(r'suspensión trasera([^#]*)', text)
    car["power_cv"] = extract_word(r'(\d+)\s*CV', text)
    car["power_kw"] = extract_word(r'(\d+)\s*kW', text)
    car["rpm_max_power"] = extract_word(r'([\d.,]+)\s*rpm\s*\(potencia max\)', text)
    car["max_torque_nm"] = extract_word(r'(\d+)\s*Nm', text)
    car["rpm_max_torque"] = extract_word(r'([\d.,]+)\s*rpm\s*\(par max\)', text)
    car["motor_description"] = extract_phrase(r'Motor de[^#]*', text, 0)
    car["transmission_description"] = extract_phrase(r'Transmisión de tipo[^#]*', text, 0)
    car.pop("Ficha Técnica")
    return car

'''
bodyType None = Undetermined (can be any of the types below)
bodyType 1 = Berlina
bodyType 2 = Coupe
bodyType 3 = Cabrio
bodyType 4 = Familiar
bodyType 5 = Monovolumen
bodyType 6 = SUV
bodyType 7 = Pick Up
bodyType 8 = Furgoneta
'''
def assign_body_type(car):
    """Replace the numeric body type code in `car["bodyTypeId"]` with its name.

    Codes 1 to 8 map, in order, to the names listed in `labels`. None and any
    other value (including a name that was already assigned) are left as is.

    Args:
        car: Car dict holding a "bodyTypeId" key; it is modified in place.

    Returns:
        The same `car` dict.
    """
    labels = (
        "Berlina",
        "Coupe",
        "Cabrio",
        "Familiar",
        "Monovolumen",
        "SUV",
        "Pick Up",
        "Furgoneta",
    )
    current = car["bodyTypeId"]
    if current is None:
        return car

    # Codes start at 1; compare by equality and stop at the first hit.
    for code, label in enumerate(labels, start=1):
        if current == code:
            car["bodyTypeId"] = label
            break
    return car

def process_audio(car):
    """Flatten the "Multimedia y Audio" section of `car` into individual keys.

    Most equipment is reduced to a presence flag. Rationale kept from the
    original notes: details such as the screen size vary so little (6.5 vs
    7 inches) that they are treated as negligible, and a trip computer is
    assumed to imply a touchscreen display.

    When the section is absent, only the keys that are still missing are
    created (as None). Existing values are kept, so running this function
    again on an already processed car does not wipe the extracted data.

    Args:
        car: Car dict; it is modified in place.

    Returns:
        The same `car` dict.
    """
    if "Multimedia y Audio" not in car:
        for key in (
            "speakers",
            "trip_computer",
            "remote_audio_control_on_steering_wheel",
            "dvd_navigation",
            "antenna",
        ):
            # setdefault keeps values produced by an earlier pass.
            car.setdefault(key, None)
        return car

    text = "#".join(car["Multimedia y Audio"])
    # Captures the word right before "altavoces" (a string); falls back to the integer 0.
    car["speakers"] = extract_word(r'(\w+)\s+altavoces', text, default_value=0)
    car["trip_computer"] = exist_word("Ordenador de viaje", text)
    car["remote_audio_control_on_steering_wheel"] = exist_word("Control remoto de audio", text)
    car["dvd_navigation"] = exist_word("Navegador DVD", text)
    # The plural forms contain the singular as a substring, so two checks cover all four spellings.
    car["antenna"] = exist_word("Antena", text) or exist_word("antena", text)
    car.pop("Multimedia y Audio")
    return car

def process_segurity(car):
    """Flatten the "Seguridad" (safety) section of `car` into individual keys.

    Most features become presence flags. "parking_sensors" keeps the matching
    entry text (or "No tiene"), and "airbag" keeps the list of entries that
    mention "Airbag" (or ["No tiene"]). The raw section is removed afterwards.

    When the section is absent, only the keys that are still missing are
    created (as None). Existing values are kept, so running this function
    again on an already processed car does not wipe the extracted data.

    Args:
        car: Car dict; it is modified in place.

    Returns:
        The same `car` dict.
    """
    if "Seguridad" not in car:
        for key in (
            "abs",
            "electronic_traction_control",
            "parking_sensors",
            "airbag",
            "stability_control",
            "curve_braking_control",
            "isofix_system",
            "start_stop_automatic",
        ):
            # setdefault keeps values produced by an earlier pass.
            car.setdefault(key, None)
        return car

    text = "#".join(car["Seguridad"])
    car["abs"] = exist_word("ABS", text)  # Anti-lock braking system
    car["electronic_traction_control"] = exist_word("Control electrónico de tracción", text)
    # Whole entry starting at "Sensores", up to the next '#' separator.
    car["parking_sensors"] = extract_phrase(r'Sensores[^#]*', text, 0, default_value="No tiene")

    if exist_word("Airbag", text):
        car["airbag"] = extract_list_phrases(car, "Airbag", "Seguridad")
    else:
        car["airbag"] = ["No tiene"]

    car["stability_control"] = exist_word("Control de estabilidad", text)
    car["curve_braking_control"] = exist_word("Control de frenada en curva", text)
    car["isofix_system"] = exist_word("Isofix", text)
    car["start_stop_automatic"] = exist_word("Start/Stop parada y arranque automático", text)
    car.pop("Seguridad")
    return car

def process_dimension(car):
    """Derive "cubicCapacity_seat_folded" from the "Dimensiones" section of `car`.

    The value is the number that follows " y " in the cargo capacity phrase
    ("Capacidad del compartimento de carga ..."). Whenever that number cannot
    be extracted, the value of `car["cubicCapacity"]` is used instead (None
    if that key is missing too), so the output key is always present.

    When the section is absent, an existing "cubicCapacity_seat_folded" is
    kept, so running this function again on an already processed car does not
    overwrite the extracted value.

    Args:
        car: Car dict; it is modified in place.

    Returns:
        The same `car` dict.
    """
    base_capacity = car.get("cubicCapacity")

    if "Dimensiones" not in car:
        car.setdefault("cubicCapacity_seat_folded", base_capacity)
        return car

    text = "#".join(car["Dimensiones"])
    phrase = extract_phrase(r'Capacidad del compartimento de carga([^,]*)', text)

    folded_capacity = base_capacity
    if phrase is not None:
        # NOTE(review): the extracted number is a string while the fallback keeps
        # whatever type "cubicCapacity" has; confirm downstream handling.
        folded_capacity = extract_word(r'\by\s+(\d+)', phrase, default_value=base_capacity)

    car["cubicCapacity_seat_folded"] = folded_capacity
    car.pop("Dimensiones")
    return car

def process_electrical_features(car):
    """Placeholder for the electric-vehicle fields; does nothing yet.

    Keys noted as candidates for future handling:
    ['powerSource', 'power', 'chargingInformation', 'motorType',
     'chargingConnector', 'combinedConsumption', 'range']

    Args:
        car: Car dict; currently left untouched.

    Returns:
        None.
    """
    return None

# Flatten every raw section of each car into individual keys.
# Each helper modifies the dict in place (and returns the same object),
# so `cars` itself ends up holding the processed records.
for car in cars:
    process_technical_details(car)
    assign_body_type(car)
    process_audio(car)
    process_segurity(car)
    process_dimension(car)

    # This raw section is not read by any of the helpers above; drop it when present.
    car.pop("Información Básica", None)

In [5]:
output_path = "data_processed/json_test.json"

# Create the output folder when it does not exist yet (e.g. on a fresh checkout),
# otherwise open() fails with FileNotFoundError before anything is written.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# ensure_ascii=False keeps accented characters readable in the exported file.
with open(output_path, 'w', encoding='utf-8') as outfile:
    json.dump(cars, outfile, ensure_ascii=False, indent=4)

Los datos que están en formato de lista o diccionario hay que solucionarlos, pero la mayoría de ellos se solucionan de forma iterativa.