# Import Libraries

In [1]:
import pandas as pd
import plotly.express as px
import streamlit as st
from pathlib import Path

# Reading Data Set

In [2]:
# Load the vehicle listings; the path is resolved relative to the current working directory.
df = pd.read_csv(filepath_or_buffer='../vehicles.csv')

# Analysis

In [3]:
# Print the frame summary: index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


In [11]:
# Preview five random rows. A fixed random_state makes the draw deterministic, so the
# rows shown here stay the same on every Restart & Run All instead of changing per run.
df.sample(n=5, random_state=42)

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
2037,34990,2019.0,gmc sierra 1500,good,8.0,gas,16989.0,automatic,truck,blue,1.0,2019-04-08,14
24800,11900,2011.0,jeep grand cherokee,good,6.0,gas,128423.0,automatic,SUV,white,1.0,2018-06-21,17
17756,18995,,honda civic,excellent,4.0,gas,31360.0,manual,hatchback,blue,,2018-10-24,30
11763,1,2017.0,toyota tundra,excellent,8.0,gas,41512.0,automatic,truck,black,1.0,2018-08-09,24
46691,3725,2000.0,ford f-150,good,8.0,gas,166500.0,automatic,pickup,silver,1.0,2018-08-28,35


In [14]:
# Cast the float64 columns to pandas' nullable 'Int64' dtype so values are stored as
# integers while missing entries remain missing (<NA>) rather than being filled or dropped.
for nullable_int_col in ('model_year', 'cylinders', 'odometer', 'is_4wd'):
    df[nullable_int_col] = df[nullable_int_col].astype('Int64')

# Parse the 'YYYY-MM-DD' strings in date_posted into datetime values.
df['date_posted'] = pd.to_datetime(df['date_posted'], format='%Y-%m-%d')

In [15]:
# Show the lowest and the highest listing price, one value per line.
print(df['price'].min(), df['price'].max(), sep='\n')

1
375000


In [16]:
# Number of listings per vehicle type, most frequent first.
df.loc[:, 'type'].value_counts()

type
SUV            12405
truck          12353
sedan          12154
pickup          6988
coupe           2303
wagon           1541
mini-van        1161
hatchback       1047
van              633
convertible      446
other            256
offroad          214
bus               24
Name: count, dtype: int64

In [17]:
# Histogram of listing prices (40 bins), coloured by vehicle type.
# Each type gets a fixed colour so it is easy to tell the categories apart.
type_colors = {
    'SUV': 'green',
    'truck': 'red',
    'sedan': 'deepskyblue',
    'pickup': 'yellow',
    'coupe': 'chocolate',
    'wagon': 'cyan',
    'mini-van': 'lightblue',
    'hatchback': 'turquoise',
    'van': 'springgreen',
    'convertible': 'lime',
    'other': 'goldenrod',
    'offroad': 'violet',
    'bus': 'purple',
}
fig1 = px.histogram(
    df,
    x='price',
    color='type',
    nbins=40,
    title='Prices by Types',
    color_discrete_map=type_colors,
)
fig1.show()

In [18]:
# Show the smallest and the largest odometer reading, one value per line.
print(df['odometer'].min(), df['odometer'].max(), sep='\n')

0
990000


In [19]:
# Number of listings per condition label, most frequent first.
df.loc[:, 'condition'].value_counts()

condition
excellent    24773
good         20145
like new      4742
fair          1607
new            143
salvage        115
Name: count, dtype: int64

In [25]:
# Scatter plot of odometer reading against model year, coloured by listing condition.
condition_colors = {
    'good': 'green',
    'like new': 'deepskyblue',
    'fair': 'crimson',
    'excellent': 'goldenrod',
    'salvage': 'red',
    'new': 'mediumseagreen',
}
fig2 = px.scatter(
    df,
    x='model_year',
    y='odometer',
    title='Odometer by Model Year',
    color='condition',
    color_discrete_map=condition_colors,
)
fig2.show()

In [22]:
# Show the earliest and the latest model year, one value per line.
print(df['model_year'].min(), df['model_year'].max(), sep='\n')

1908
2019


In [24]:
# Histogram of model years (50 bins), coloured by vehicle type.
# Each type gets a fixed colour so it is easy to tell the categories apart.
type_colors = {
    'SUV': 'green',
    'truck': 'red',
    'sedan': 'deepskyblue',
    'pickup': 'yellow',
    'coupe': 'chocolate',
    'wagon': 'cyan',
    'mini-van': 'lightblue',
    'hatchback': 'turquoise',
    'van': 'springgreen',
    'convertible': 'lime',
    'other': 'goldenrod',
    'offroad': 'violet',
    'bus': 'purple',
}
fig3 = px.histogram(
    df,
    x='model_year',
    title='Model Year per Type',
    nbins=50,
    color='type',
    color_discrete_map=type_colors,
)
fig3.show()

In [26]:
# Number of listings per model name, most frequent first.
df.loc[:, 'model'].value_counts()

model
ford f-150                           2796
chevrolet silverado 1500             2171
ram 1500                             1750
chevrolet silverado                  1271
jeep wrangler                        1119
                                     ... 
ford f-250 super duty                 241
acura tl                              236
kia sorento                           236
nissan murano                         235
mercedes-benz benze sprinter 2500      41
Name: count, Length: 100, dtype: int64