In [120]:
### included sections for data preprocessing, analysis, and findings of the set .csv file
### this is an SDA of the vehicles_us.csv, where we'll clear duplicates, clean the dataset, and reprocess it for analysis

In [121]:
import pandas as pd
import plotly.express as px
import streamlit as st

In [122]:
data = pd.read_csv('C:\\Users\\micha\\TripleTen\\Project4\\vehicles_us.csv')

In [123]:
duplicates = data.duplicated().sum()
data = data.drop_duplicates()

In [124]:
missing_values = data.isnull().sum()
data = data.fillna(method='ffill')
display(missing_values)


DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.



price               0
model_year       3619
model               0
condition           0
cylinders        5260
fuel                0
odometer         7892
transmission        0
type                0
paint_color      9267
is_4wd          25953
date_posted         0
days_listed         0
dtype: int64

In [125]:
display(data)

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,2011.0,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,1.0,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,110000.0,automatic,pickup,red,1.0,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,1.0,2019-04-02,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...
51520,9249,2013.0,nissan maxima,like new,6.0,gas,88136.0,automatic,sedan,black,1.0,2018-10-03,37
51521,2700,2002.0,honda civic,salvage,4.0,gas,181500.0,automatic,sedan,white,1.0,2018-11-14,22
51522,3950,2009.0,hyundai sonata,excellent,4.0,gas,128000.0,automatic,sedan,blue,1.0,2018-11-15,32
51523,7455,2013.0,toyota corolla,good,4.0,gas,139573.0,automatic,sedan,black,1.0,2018-07-02,71


In [126]:
data['model_year'] = data['model_year'].fillna(data['model_year'].median())
data['odometer'] = data['odometer'].fillna(data['odometer'].median())
data['cylinders'] = data['cylinders'].fillna(data['cylinders'].mode()[0])

In [127]:
missing_paint = data['paint_color'].isnull().sum()
print(f"Missing values in 'paint_color': {missing_paint}")

data['paint_color'] = data['paint_color'].fillna('Unknown')

Missing values in 'paint_color': 1


In [128]:
print(data.info())
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    51525 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     51525 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      51525 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   51525 non-null  object 
 10  is_4wd        51525 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB
None
               price    model_year     cylinders       odometer   is_4wd  \
count   51525.000000  51525.000000  51525.000000   51525.000000  51525.0   
mean    12132.464920   2009

In [129]:
scatter_fig = px.scatter(data, x='is_4wd', y='price', title='Scatterplot of 4WD Status vs Price',
                          labels={'is_4wd': '4WD (1 = Yes, 0 = No)', 'price': 'Price'})
st.plotly_chart(scatter_fig)



DeltaGenerator()

In [130]:
hist_fig = px.histogram(data, x='is_4wd', nbins=2, title='Distribution of 4WD Vehicles',
                         labels={'is_4wd': '4WD Status (1 = Yes, 0 = No)'})
st.plotly_chart(hist_fig)



DeltaGenerator()

In [131]:
fourwd_counts = data['is_4wd'].value_counts()
print(fourwd_counts)

is_4wd
1.0    51525
Name: count, dtype: int64


In [132]:
summary_df = fourwd_counts.reset_index()
summary_df.columns = ['4WD Status', 'Count']

In [133]:
bar_fig = px.bar(summary_df, x='4WD Status', y='Count', 
                  title='Distribution of 4WD Vehicles', 
                  labels={'4WD Status': '4WD Status (0 = Non-4WD, 1 = 4WD)', 'Count': 'Count'})
bar_fig.update_xaxes(tickvals=[0, 1], ticktext=['Non-4WD', '4WD'])
st.plotly_chart(bar_fig)





DeltaGenerator()

In [134]:
missing_values = data.isnull().sum()
print("Missing values after filling:\n", missing_values)

Missing values after filling:
 price           0
model_year      0
model           0
condition       0
cylinders       0
fuel            0
odometer        0
transmission    0
type            0
paint_color     0
is_4wd          0
date_posted     0
days_listed     0
dtype: int64


In [135]:
# Check for missing values after filling
missing_values = data.isnull().sum()
st.write("Missing values after filling:\n", missing_values)

if 'paint_color' in data.columns:
    # Example: Boxplot for price distribution by paint color
    box_fig = px.box(data, x='paint_color', y='price', title='Price Distribution by Paint Color',
                     labels={'paint_color': 'Paint Color', 'price': 'Price'})
    box_fig.update_xaxes(tickangle=45)
    st.plotly_chart(box_fig)  # Use st.plotly_chart to display the box plot in Streamlit

    # Count of vehicles by paint color
    paint_color_counts = data['paint_color'].value_counts().reset_index()
    paint_color_counts.columns = ['paint_color', 'count']

    st.write("Counts of vehicles by paint color:\n", paint_color_counts)

    # Bar plot for the distribution of vehicles by paint color
    paint_bar_fig = px.bar(paint_color_counts, 
                            x='paint_color', y='count', 
                            title='Distribution of Vehicles by Paint Color', 
                            labels={'paint_color': 'Paint Color', 'count': 'Count'})
    paint_bar_fig.update_xaxes(tickangle=45)  # Correctly indented under the bar plot creation
    st.plotly_chart(paint_bar_fig)  # Use st.plotly_chart to display the bar plot in Streamlit

else:
    st.write("'paint_color' column is not found in the DataFrame.")

