In [59]:
# Exploratory Data Analysis - Vehicles US Dataset

import pandas as pd
import plotly.express as px

In [60]:
# Load the vehicles dataset from the CSV file sitting next to the notebook
# and print its first five rows as a quick sanity check.
car_df = pd.read_csv(filepath_or_buffer='vehicles_us.csv')
print(car_df.head(n=5))

   price  model_year           model  condition  cylinders fuel  odometer  \
0   9400      2011.0          bmw x5       good        6.0  gas  145000.0   
1  25500         NaN      ford f-150       good        6.0  gas   88705.0   
2   5500      2013.0  hyundai sonata   like new        4.0  gas  110000.0   
3   1500      2003.0      ford f-150       fair        8.0  gas       NaN   
4  14900      2017.0    chrysler 200  excellent        4.0  gas   80903.0   

  transmission    type paint_color  is_4wd date_posted  days_listed  
0    automatic     SUV         NaN     1.0  2018-06-23           19  
1    automatic  pickup       white     1.0  2018-10-19           50  
2    automatic   sedan         red     NaN  2019-02-07           79  
3    automatic  pickup         NaN     NaN  2019-03-22            9  
4    automatic   sedan       black     NaN  2019-04-02           28  


In [61]:
# General overview of the dataset: prints the index range, each column's
# non-null count and dtype, and the frame's memory usage. Useful here to spot
# the columns that contain missing values before plotting them.
car_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


In [62]:
# Histogram of the odometer readings, requesting at most 50 bins.
fig_km = px.histogram(
    data_frame=car_df,
    x='odometer',
    nbins=50,
    title='Distribución de Kilometraje',
)
fig_km.show()

In [63]:
# Histogram of model year with one colored series per vehicle condition.
# barmode='overlay' draws the series on top of each other instead of stacking.
fig_condition = px.histogram(
    data_frame=car_df,
    x='model_year',
    color='condition',
    barmode='overlay',
    title='Distribución de Condición vs Año de Modelo',
)
fig_condition.show()

In [64]:
# Scatter plot of price (y) against odometer reading (x), with points
# colored by vehicle type.
fig_scatter = px.scatter(
    data_frame=car_df,
    x='odometer',
    y='price',
    color='type',
    title='Precio vs Kilometraje por Tipo de Vehículo',
)
fig_scatter.show()

In [68]:
# Average price by model year.
#
# car_df keeps its original column names ("price", "model_year"); the Spanish
# display names are supplied through `labels` instead of renaming the columns
# in place. Mutating car_df here would leave the earlier cells that read
# 'price' and 'model_year' (condition histogram, price-vs-odometer scatter)
# broken when re-run in the same kernel, and `labels` already controls the
# axis and hover text, so the rename added nothing to the figure.
# NOTE(review): if any later cell relied on the renamed columns
# ("precio", "año_modelo"), point it back at the original names.
fig_hist = px.histogram(
    car_df,
    x="model_year",
    y="price",
    histfunc="avg",  # aggregate each bin as the mean price instead of a row count
    labels={"model_year": "Año del Modelo", "price": "Precio Promedio (USD)"},
    title="Precio Promedio por Año del Modelo"
)

fig_hist.show()