In [None]:
#Imports
import pandas as pd
import plotly.express as px

#Reading the file
# NOTE: this is an absolute, machine-specific path; edit DATA_PATH when running
# the notebook on another machine.
DATA_PATH = '/Users/nathanmerlino/my_project/vehicles_us.csv'
df = pd.read_csv(DATA_PATH)

#Setting date posted to datetime
df['date_posted'] = pd.to_datetime(df['date_posted'])

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()
df.head()

In [None]:
#Wanted to find the mean and median of a few columns to fill in missing values.
df.describe()

In [None]:
#Checking the number of missing values in each column of the dataframe
df.isnull().sum()

In [None]:
# Median of each numeric column that is about to be gap-filled.
median_model_year, median_cylinders, median_odometer = (
    df[column].median() for column in ('model_year', 'cylinders', 'odometer')
)

# Replace the missing entries of each column with that column's median.
# The result of .fillna() is assigned back instead of using inplace=True.
column_medians = {
    'model_year': median_model_year,
    'cylinders': median_cylinders,
    'odometer': median_odometer,
}
for column, median_value in column_medians.items():
    df[column] = df[column].fillna(median_value)

# Show the remaining missing-value count per column.
df.isnull().sum()

In [None]:
# Encode is_4wd as a boolean flag: True = 4wd, False = no 4wd.
# Missing entries are filled with False before the cast.
four_wd_filled = df['is_4wd'].fillna(False)
df['is_4wd'] = four_wd_filled.astype(bool)

# Show the remaining missing-value count per column.
df.isnull().sum()

In [None]:
# Listings with a missing paint colour get the placeholder label 'no color'.
missing_color_label = 'no color'
paint_color_filled = df['paint_color'].fillna(missing_color_label)
df['paint_color'] = paint_color_filled

df.head(15)

In [None]:
# Check for duplicated values in type, transmission, model and condition
# by printing the unique values of each column.
columns_to_check = ['type', 'transmission', 'model', 'condition']
for position, column_name in enumerate(columns_to_check):
    if position > 0:
        print()  # blank line between two columns' output
    print(df[column_name].unique())

In [None]:
# Print the column dtypes and non-null counts of the dataframe at this point.
df.info()

In [None]:
def extract_manufacturer(model):
    """Return the first whitespace-separated word of ``model``.

    The first word of the model string is taken to be the manufacturer,
    e.g. ``'ford f-150'`` -> ``'ford'``.

    Parameters
    ----------
    model : str
        Model string to take the first word from.

    Returns
    -------
    str or None
        The first word of ``model``; ``None`` when ``model`` is not a string
        (for example a float NaN from a missing cell) or contains no
        non-whitespace text.
    """
    # Non-string values (such as NaN) have no .split(); treat them as unknown.
    if not isinstance(model, str):
        return None
    # Only the first token is needed, so stop splitting after it.
    words = model.split(maxsplit=1)
    # An empty or whitespace-only string yields no tokens at all.
    return words[0] if words else None

# Derive the manufacturer column from each listing's model string.
df['manufacturer'] = df['model'].apply(extract_manufacturer)

# Reorder the columns so that 'manufacturer' sits right before 'model'.
# The explicit .copy() makes the result an independent frame, so that assigning
# columns on df afterwards does not raise SettingWithCopyWarning.
df = df[['price', 'model_year', 'manufacturer', 'model', 'condition', 'cylinders', 'fuel',
         'odometer', 'transmission', 'type', 'paint_color', 'is_4wd', 'date_posted',
         'days_listed']].copy()


df.head()

In [None]:
# Histogram of listing prices, coloured by condition.
# The x-axis is restricted to the 0-80000 range.
fig = px.histogram(
    df,
    x='price',
    color='condition',
    range_x=[0, 80000],
    title='Histogram of Price by Condition',
)
fig.show()

In [None]:
# Condition labels ordered by their median days listed, smallest median first.
# Note: despite its name, median_days_listed holds the ordered labels (the index
# of the sorted medians), not the median values themselves.
median_days_listed = df.groupby('condition')['days_listed'].median().sort_values().index

# plotly's category_orders is documented as mapping column names to lists,
# so the ordering is materialised as a plain list and reused below.
condition_order = list(median_days_listed)

# Convert to categorical with sorted order
df['condition'] = pd.Categorical(df['condition'], categories=condition_order, ordered=True)

# Plot histogram with sorted condition order
fig = px.histogram(df, x='days_listed', color='transmission', facet_col='condition',
                   title='Histogram of Days Listed by Transmission and Condition',
                   category_orders={'condition': condition_order})
fig.show()

In [None]:
# The 25 models with the most listings (Series indexed by model name).
top_models = df['model'].value_counts().nlargest(25)

# Build an explicit two-column frame so the chart refers to named columns.
# Passing the bare index/values arrays leaves the column names (and therefore
# which `labels` keys apply) up to plotly's automatic naming.
top_models_df = top_models.rename_axis('model').reset_index(name='count')

# Create a bar chart using Plotly Express
fig = px.bar(top_models_df, x='model', y='count',
             labels={'model': 'Car Model', 'count': 'Number of Listings'},
             title='Top 25 Car Models by Number of Listings')
fig.show()

In [None]:
# Number of listings per manufacturer, as a frame with the columns
# 'manufacturer' and 'count'.
manufacturer_counts = (
    df['manufacturer']
    .value_counts()
    .rename_axis('manufacturer')
    .reset_index(name='count')
)

# Bar chart of the counts with explicit axis titles.
fig1 = px.bar(
    manufacturer_counts,
    x='manufacturer',
    y='count',
    title='Number of Listings by Manufacturer',
)
fig1.update_xaxes(title='Manufacturer').update_yaxes(title='Number of Listings')
fig1.show()