# Notebook configuration

In [195]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Neoauto

In [198]:
neoauto = pd.read_csv('datasets/raw/neoauto.csv')
print(neoauto.shape)
neoauto.head()

(5362, 15)


Unnamed: 0,Brands,Models,Version,Currency,Price,Urlpic,Year,KM,Fuel_type,Transmission,Location,Color,Cilinder,Upholstery,Engine
0,SUZUKI,S-PRESSO,4x2,USD,"US$ 9,800",https://cde.neoauto.pe/autos_usados/360x240/66...,2023.0,16684.0,Gasolina,MecÃ¡nica,"Lima, Lima",Rojo,3.0,tela,"1,000 cc"
1,DFSK,GLORY 500,Delantera,USD,"US$ 12,000",https://cde.neoauto.pe/autos_usados/360x240/70...,2023.0,11054.0,Gas GLP,MecÃ¡nica,"Lima, Lima",Blanco,4.0,tela,"1,500 cc"
2,TOYOTA,YARIS,Delantera,USD,"US$ 15,500",https://cde.neoauto.pe/autos_usados/360x240/70...,2023.0,15500.0,Gas GNV,MecÃ¡nica,"Lima, Lima",Rojo,4.0,tela,"1,300 cc"
3,FORD,RAPTOR,4x4,USD,"US$ 75,900",https://cde.neoauto.pe/autos_usados/360x240/30...,2020.0,12500.0,Gasolina,AutomÃ¡tica - Secuencial,"Lima, Lima",Blanco,6.0,tela,"3,500 cc"
4,KIA,SPORTAGE,4x2,USD,"US$ 27,900",https://cde.neoauto.pe/autos_usados/360x240/70...,2023.0,52130.0,Gasolina,MecÃ¡nica,"Lima, Lima",Gris Oscuro,4.0,Asientos de cuero,"1,999 cc"


Declare paths and constants

In [180]:
CURRENT_YEAR = 2024

## Clean numerical features

Change

In [199]:
neoauto[['Price', 'Engine']] = neoauto[['Price', 'Engine']].replace(to_replace = r'[ ,a-zA-Z$]', value = '', regex = True)\
                                                           .replace(to_replace = '', value = np.nan)\
                                                           .astype(float)

Validate

In [200]:
neoauto[['Price', 'Engine']].dtypes

Price     float64
Engine    float64
dtype: object

## Clean categorical features

Check unique values

In [201]:
neoauto.describe(include = object)

Unnamed: 0,Brands,Models,Version,Currency,Urlpic,Fuel_type,Transmission,Location,Color,Upholstery
count,5361,5361,4900,5362,5334,5339,5361,5362,5072,5361
unique,81,846,541,1,5326,9,3,42,53,2
top,TOYOTA,RAV4,4x2,USD,https://cde.neoauto.pe/autos_usados/360x240/68...,Gasolina,AutomÃ¡tica - Secuencial,"Lima, Lima",Blanco,tela
freq,495,91,1617,5362,2,4123,2179,4908,849,4125


### Transmission

Create

In [202]:
neoauto['Transmission'] = neoauto['Transmission'].map({'MecÃ¡nica': 'mecanica',
                                                       'AutomÃ¡tica': 'automatica',
                                                       'AutomÃ¡tica - Secuencial': 'automatica_secuencial'})

Validate

In [203]:
neoauto['Transmission'].value_counts()

Transmission
automatica_secuencial    2179
mecanica                 1619
automatica               1563
Name: count, dtype: int64

### Version

Create

In [204]:
neoauto['Version'] = np.select([neoauto['Version'].str.lower().str.strip().str.contains('4x2', na = False),
                                neoauto['Version'].str.lower().str.strip().str.contains('4x4', na = False),
                                neoauto['Version'].str.lower().str.strip().str.contains('delantera', na = False),
                                neoauto['Version'].str.lower().str.strip().str.contains('posterior', na = False)],
                               ['4x2', '4x4', 'delantera', 'posterior'],
                               default = 'otros')

Validate

In [205]:
neoauto['Version'].value_counts()

Version
4x2          1919
otros        1856
4x4           896
delantera     469
posterior     222
Name: count, dtype: int64

### Upholstery

Create

In [206]:
neoauto['Upholstery'] = np.where(neoauto['Upholstery'] == 'Asientos de cuero', 'cuero', neoauto['Upholstery'])

Validate

In [207]:
neoauto['Upholstery'].value_counts()

Upholstery
tela     4125
cuero    1236
Name: count, dtype: int64

### Color

Create

In [208]:
neoauto['Color'] = neoauto['Color'].astype(str)

def extract_base_color(color):
    # Definir patrones de regex para los colores base
    patterns = {
        'blanco': r'^Blanco',
        'plata': r'^Plata',
        'gris': r'^Gris',
        'rojo': r'^Rojo',
        'azul': r'^Azul',
        'negro': r'^Negro',
        'marron': r'^(Marrón|Chocolate|Beige)',
    }
    
    for base_color, pattern in patterns.items():
        if pd.Series(color).str.contains(pattern,  na = False).any():
            return base_color
    
    return 'Otro'

neoauto['Color'] = neoauto['Color'].apply(extract_base_color)

Validate

In [209]:
neoauto['Color'].value_counts()

Color
gris      1119
blanco     931
negro      838
Otro       810
plata      555
rojo       532
azul       507
marron      70
Name: count, dtype: int64

### Location

Create

In [210]:
neoauto['Location'] = neoauto['Location'].str.split(',', expand = True).iloc[:, 1].str.strip()

In [212]:
neoauto['Location'] = np.where(neoauto['Location'] != 'Lima', 'Provincias', neoauto['Location'])

Validate

In [213]:
neoauto['Location'].value_counts()

Location
Lima          4926
Provincias     436
Name: count, dtype: int64

### Fuel_type

Create

In [214]:
neoauto['Fuel_type'] = neoauto['Fuel_type'].map({'Gasolina': 'gasolina',
                                                 'Gas GLP': 'gas_glp',
                                                 'Diesel': 'diesel',
                                                 'Dual': 'dual',
                                                 'Gas GNV': 'gas_gnv',
                                                 'Gasolina-HÃ­brido': 'otros',
                                                 'ElÃ©ctrico': 'electrico'})

Validate

In [215]:
neoauto['Fuel_type'].value_counts()

Fuel_type
gasolina     4123
dual          441
diesel        423
gas_glp       148
gas_gnv        76
otros          69
electrico      12
Name: count, dtype: int64

## Feature Engineering

In [216]:
neoauto['Age'] = CURRENT_YEAR - neoauto['Year']

Save csv

In [217]:
neoauto.to_csv('datasets/clean/neoauto_clean.csv', index = False)

# Autocosmos

Read data

In [219]:
autocosmos = pd.read_csv('datasets/raw/autocosmos.csv')
print(autocosmos.shape)
autocosmos.head()

(1576, 15)


Unnamed: 0,Brands,Models,Version,Currency,Price,Urlpic,Year,KM,Fuel_type,Transmission,Location,Color,Cilinders,Upholstery,Engine
0,MG,3,1.5 Std,USD,"u$s4,800",https://acroadtrip.blob.core.windows.net/publi...,2014,85000 km,gasolina,manual 5 velocidades,Lima |Lima,Blanco,4 en lÃ­nea,tela,1498 cc
1,KIA,Cerato,1.6L EX Aut,USD,"u$s7,999",https://acroadtrip.blob.core.windows.net/publi...,2014,156000 km,gasolina,automÃ¡tica 6 velocidades,Lima |Lima,Azul Galaxia,4 en lÃ­nea,tela,1591 cc
2,Toyota,Rav4,2.0L Full 4x2,USD,"u$s8,800",https://acroadtrip.blob.core.windows.net/publi...,2018,17000 km,gasolina,manual 6 velocidades,Trujillo |La Libertad,Plata MetÃ¡lico,4 en lÃ­nea,tela,1987 cc
3,Mazda,3 Sport,2.5L High Aut,USD,"u$s5,390",https://acroadtrip.blob.core.windows.net/publi...,2015,36000 km,gasolina,automÃ¡tica 6 velocidades,La UniÃ³n |Arequipa,Rojo,4 en lÃ­nea,cuero,2497 cc
4,Ford,Territory,Titanium Plus,USD,"u$s24,900",https://acroadtrip.blob.core.windows.net/publi...,2021,29300 km,gasolina,automÃ¡tica 8 velocidades,Lima |Lima,Blanco,4 en lÃ­nea,cuero,1500 cc


## Clean numerical features

Create

In [222]:
autocosmos[['Price', 'Engine', 'KM']] = autocosmos[['Price', 'Engine', 'KM']].replace(to_replace = r'[ ,a-zA-Z$]', value = '', regex = True)\
                                                                             .replace(to_replace = '', value = np.nan)\
                                                                             .astype(float)

Validate

In [224]:
autocosmos[['Price', 'Engine', 'KM']].dtypesS

Price     float64
Engine    float64
KM        float64
dtype: object

Validate

In [None]:
neoauto[['Price', 'Engine']].dtypes

Price     float64
Engine    float64
dtype: object