# Análisis exploratorio

In [20]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import numpy as np

In [21]:
car_data = pd.read_csv('/Users/hectormoralesosorio/Tripleten/appweb/vehicles_us.csv') # leer los datos
car_data.head() # vista parcial

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28


In [22]:
car_data.describe() # medidas descriptivas

Unnamed: 0,price,model_year,cylinders,odometer,is_4wd,days_listed
count,51525.0,47906.0,46265.0,43633.0,25572.0,51525.0
mean,12132.46492,2009.75047,6.125235,115553.461738,1.0,39.55476
std,10040.803015,6.282065,1.66036,65094.611341,0.0,28.20427
min,1.0,1908.0,3.0,0.0,1.0,0.0
25%,5000.0,2006.0,4.0,70000.0,1.0,19.0
50%,9000.0,2011.0,6.0,113000.0,1.0,33.0
75%,16839.0,2014.0,8.0,155000.0,1.0,53.0
max,375000.0,2019.0,12.0,990000.0,1.0,271.0


In [23]:
car_data.info() # registros nulos y tipos de datos]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


## Tipos de vehículos por condición


In [24]:
car_data_type_condition = car_data.dropna(subset=['price'])
car_data_type_condition_price = car_data_type_condition.groupby(['type', 'condition']).mean().reset_index() # Agrupar datos por las variables "type" y "condition"

In [25]:
# Gráfico de diagrama de árbol
fig = px.treemap(car_data_type_condition_price, path=[px.Constant("Tipos de carros por precio"), 'type', 'condition'], values='days_listed',
                  color='price',
                  color_continuous_scale='RdBu')
fig.show()

## Precios de los  coupes, convertibles, pick ups y trucks nuevos

In [26]:
car_new_coupe_price = car_data[(car_data["type"]=="coupe")  & (car_data["condition"]=="new")]["price"]
car_new_convertible_price = car_data[(car_data["type"]=="convertible")  & (car_data["condition"]=="new")]["price"]
car_new_pickup_price = car_data[(car_data["type"]=="pickup")  & (car_data["condition"]=="new")]["price"]
car_new_truck_price = car_data[(car_data["type"]=="truck")  & (car_data["condition"]=="new")]["price"]

In [27]:
# Gráfico de caja y brazos
x_data = ['Coupe', 'Pickup',
          'Truck', 'Convertible']


y_data = [car_new_coupe_price,  car_new_pickup_price, car_new_truck_price, car_new_convertible_price]

colors = ['rgba(93, 164, 214, 0.5)', 'rgba(255, 144, 14, 0.5)', 'rgba(44, 160, 101, 0.5)',
          'rgba(255, 65, 54, 0.5)']

fig = go.Figure()

for xd, yd, cls in zip(x_data, y_data, colors):
        fig.add_trace(go.Box(
            y=yd,
            name=xd,
            boxpoints='all',
            jitter=0.5,
            whiskerwidth=0.2,
            fillcolor=cls,
            marker_size=2,
            line_width=1)
        )

fig.update_layout(
    title='Tipos de carros más caros',
    yaxis=dict(
        autorange=True,
        showgrid=True,
        zeroline=True,
        dtick=5,
        gridcolor='rgb(255, 255, 255)',
        gridwidth=1,
        zerolinecolor='rgb(255, 255, 255)',
        zerolinewidth=2,
    ),
    margin=dict(
        l=40,
        r=30,
        b=80,
        t=100,
    ),
    paper_bgcolor='rgb(243, 243, 243)',
    plot_bgcolor='rgb(243, 243, 243)',
    showlegend=False
)

fig.show()

## Precio de la pickup con más anuncios de venta 

In [28]:
car_new_pickup_model = car_data[(car_data["type"]=="pickup")]["model"]

In [29]:
car_new_pickup_model.value_counts()

ford f-150                       902
chevrolet silverado 1500         671
ram 1500                         614
chevrolet silverado              511
ram 2500                         352
toyota tacoma                    347
nissan frontier crew cab sv      340
chevrolet silverado 1500 crew    301
ford f150 supercrew cab xlt      300
gmc sierra 1500                  269
chevrolet silverado 2500hd       260
ford f150                        242
ford ranger                      216
toyota tundra                    184
ford f-250                       168
ram 3500                         146
gmc sierra                       141
ford f250 super duty             140
ford f250                        139
gmc sierra 2500hd                124
nissan frontier                  117
dodge dakota                     104
ford f350 super duty              95
chevrolet colorado                94
ford f-250 super duty             82
ford f350                         66
chevrolet silverado 3500hd        39
c

In [30]:
ford_f150_price = car_data[car_data["model"]=="ford f-150"]["price"]

In [31]:
ford_f150_price.describe()

count      2796.000000
mean      14105.922747
std       12502.771873
min           1.000000
25%        5995.000000
50%       12294.000000
75%       19900.000000
max      189000.000000
Name: price, dtype: float64

In [32]:
# Histograma y curva de la densidad del precio de la camionetas más caras
hist_data = [ford_f150_price]
group_labels = ['Precio Ford f150']
colors = ['#F66095']

fig = ff.create_distplot(hist_data, group_labels, colors=colors, bin_size=1000)
fig.show()

## Relación entre el precio y el odometro

In [33]:
ford_f150 = car_data[car_data["model"]=="ford f150"]

In [34]:
ford_f150.head()

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
139,19500,2014.0,ford f150,excellent,8.0,gas,116000.0,automatic,truck,grey,1.0,2018-06-26,40
247,13499,2014.0,ford f150,good,6.0,gas,,automatic,pickup,,1.0,2019-02-13,20
267,3250,2001.0,ford f150,fair,8.0,gas,296000.0,automatic,truck,black,1.0,2018-12-07,24
339,5500,2006.0,ford f150,excellent,8.0,gas,,automatic,truck,white,,2018-05-16,56
439,1000,2008.0,ford f150,salvage,8.0,gas,285000.0,automatic,pickup,white,,2018-10-19,19


In [35]:
fig = px.scatter(ford_f150, x="price", y="odometer", color="condition", trendline="ols", title="Relación inversa entre las variables odómetro y precio")
fig.show()