In [1]:
import pandas as pd
import numpy as np

# 1. Read cars.csv from the working directory into a DataFrame and render it
df = pd.read_csv(filepath_or_buffer='cars.csv')

display(df)

Unnamed: 0,stock_id,km,price,make,model,year,version,bluetooth,largo,ancho,altura,car_play
0,243587,77400,461999.0,Volkswagen,Touareg,2018,3.0 V6 TDI WOLFSBURG EDITION AUTO 4WD,Sí,4801.0,1940.0,1709.0,
1,229702,102184,660999.0,Land Rover,Discovery Sport,2018,2.0 HSE LUXURY AUTO 4WD,Sí,4599.0,2069.0,1724.0,
2,160422,56419,866999.0,BMW,Serie 2,2018,3.0 M2 DCT,Sí,4468.0,1854.0,1410.0,Sí
3,308634,76000,238999.0,Toyota,Avanza,2018,1.5 XLE AT,Sí,4140.0,1660.0,1695.0,
4,305016,29377,313999.0,Toyota,Corolla,2020,1.8 LE AUTO,Sí,4650.0,1776.0,1475.0,Sí
...,...,...,...,...,...,...,...,...,...,...,...,...
95,319698,97000,168999.0,Seat,Toledo,2016,1.6 ENTRY MT,,4482.0,1706.0,1461.0,
96,315497,52810,252999.0,Nissan,Altima,2018,2.5 SENSE AUTO,Sí,4874.0,1830.0,1488.0,
97,179859,98570,375999.0,Toyota,RAV4,2018,2.5 LIMITED AWD AT,Sí,4600.0,1845.0,1705.0,
98,181736,119813,464999.0,BMW,X5,2016,4.4 XDRIVE50IA EXCELLENCE AT 4WD,Sí,4886.0,1938.0,1762.0,


In [2]:
# 2. Show the dtype pandas inferred for every column
column_types = df.dtypes
print(column_types)

stock_id       int64
km             int64
price        float64
make          object
model         object
year           int64
version       object
bluetooth     object
largo        float64
ancho        float64
altura       float64
car_play      object
dtype: object


In [3]:
# 3. Count the missing (NaN) entries in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

stock_id      0
km            0
price         0
make          0
model         0
year          0
version       0
bluetooth     3
largo         0
ancho         2
altura        1
car_play     60
dtype: int64


In [4]:
# 3.1 In both flag columns, fill missing entries with 'No' and turn 'Sí' into 'Yes'
for flag_col in ('car_play', 'bluetooth'):
    df[flag_col] = df[flag_col].replace({np.nan: 'No', 'Sí': 'Yes'})

In [5]:
# 4. Convert the 'Yes'/'No' flag columns to boolean
# A column that is already boolean is skipped, so re-running this cell does not
# push the existing True/False values through the mapping and turn them into NaN.
yes_no_to_bool = {'Yes': True, 'No': False}
for col in ['car_play', 'bluetooth']:
    if pd.api.types.is_bool_dtype(df[col]):
        continue
    converted = df[col].map(yes_no_to_bool)
    # Series.map yields NaN for every value that is not a key of the mapping;
    # fail loudly instead of silently storing NaN in a column meant to be boolean.
    unmapped = converted.isna()
    if unmapped.any():
        raise ValueError(
            f"Unexpected values in column {col!r}: "
            f"{df.loc[unmapped, col].unique().tolist()}"
        )
    # Explicit cast so the column dtype is bool rather than object.
    df[col] = converted.astype(bool)

In [6]:
# 5. Coerce the numeric columns to numeric dtypes
# errors='coerce' turns any value that cannot be parsed into NaN.
numeric_cols = ['km', 'price', 'largo', 'ancho', 'altura', 'year']
df = df.assign(
    **{name: pd.to_numeric(df[name], errors='coerce') for name in numeric_cols}
)

In [7]:
# 6. Show the column dtypes again after the conversions above
column_types = df.dtypes
print(column_types)

stock_id       int64
km             int64
price        float64
make          object
model         object
year           int64
version       object
bluetooth       bool
largo        float64
ancho        float64
altura       float64
car_play        bool
dtype: object


In [8]:
# 7. Strip leading and trailing whitespace from the text columns
text_cols = ['make', 'model', 'version']
df = df.assign(**{name: df[name].str.strip() for name in text_cols})

In [9]:
# 8. Rename the columns
# Only the columns whose name changes are listed; rename leaves every other
# column untouched.
new_column_names = {
    'km': 'kilometers',
    'largo': 'length',
    'ancho': 'width',
    'altura': 'height',
    'car_play': 'carplay',
}
df = df.rename(columns=new_column_names)

In [10]:
# Preview the first rows of the frame after cleaning and renaming
preview_rows = df.head()
display(preview_rows)

Unnamed: 0,stock_id,kilometers,price,make,model,year,version,bluetooth,length,width,height,carplay
0,243587,77400,461999.0,Volkswagen,Touareg,2018,3.0 V6 TDI WOLFSBURG EDITION AUTO 4WD,True,4801.0,1940.0,1709.0,False
1,229702,102184,660999.0,Land Rover,Discovery Sport,2018,2.0 HSE LUXURY AUTO 4WD,True,4599.0,2069.0,1724.0,False
2,160422,56419,866999.0,BMW,Serie 2,2018,3.0 M2 DCT,True,4468.0,1854.0,1410.0,True
3,308634,76000,238999.0,Toyota,Avanza,2018,1.5 XLE AT,True,4140.0,1660.0,1695.0,False
4,305016,29377,313999.0,Toyota,Corolla,2020,1.8 LE AUTO,True,4650.0,1776.0,1475.0,True


In [None]:
# 9. Write the cleaned frame to a new CSV file, leaving out the row index
df.to_csv(path_or_buf='cars_cleaned.csv', index=False)