# installing required libraries

In [None]:
! pip install --user pandas numpy matplotlib bokeh

# Adding Required libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import ticker
from bokeh.io import output_file
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource

# Load the raw listings; the following cells clean and plot this frame.
df = pd.read_csv("used_car_dataset.csv")

# Shared y-axis formatter for the matplotlib charts: plain numbers instead of
# scientific notation.
formatter = ticker.ScalarFormatter()
formatter.set_scientific(False)

# Presumably bar-chart layout helpers.
# NOTE(review): neither name is referenced by the cells shown here - confirm
# they are still needed.
width = 0.40
x = np.arange(5)

df

# sanitizing the columns

In [None]:
# Prices further than this many standard deviations from the mean are dropped.
PRICE_OUTLIER_STD = 3


def _keep_digit_rows(frame, column, strip_commas=False):
    """Return a copy of ``frame`` with ``column`` parsed to int64.

    A value is accepted only if, once converted to text (and with commas
    removed when ``strip_commas`` is true), it consists solely of digits.
    Every other value is treated as missing.  Rows that then contain a missing
    value in *any* column are dropped.

    Values are converted to text before the digit check, so missing values
    and already-numeric integers are handled instead of raising
    ``AttributeError`` (which also makes the cell safe to re-run).
    """
    text = frame[column].astype(str)
    if strip_commas:
        text = text.str.replace(",", "", regex=False)
    # Non-digit text is masked out; anything to_numeric still cannot parse is
    # coerced to NaN as well.
    numbers = pd.to_numeric(text.where(text.str.isdigit()), errors="coerce")

    # Work on an explicit copy so the column assignment never targets a slice
    # of another frame (avoids SettingWithCopyWarning).
    cleaned = frame.copy()
    cleaned[column] = numbers
    cleaned = cleaned.dropna(axis=0)
    # The NaN placeholders forced a float column; restore integers.
    return cleaned.astype({column: "int64"})


# Year: plain digit strings only.
df = _keep_digit_rows(df, "Year")

# Price: digits with optional thousands separators.
df = _keep_digit_rows(df, "Price", strip_commas=True)

# Removing outliers: keep only the rows whose 'Price' lies within
# PRICE_OUTLIER_STD standard deviations of the mean price.
price_deviation = (df["Price"] - df["Price"].mean()).abs()
df = df[price_deviation <= PRICE_OUTLIER_STD * df["Price"].std()]

# Kms driven: digits with optional thousands separators.
df = _keep_digit_rows(df, "Kms driven", strip_commas=True)

# Fuel type: keep Diesel and Petrol listings only.  No missing values are left
# at this point, so a plain filter is enough; .copy() detaches the result so
# later cells can add columns to it without warnings.
df = df[df["Fuel_type"].isin(["Diesel", "Petrol"])].copy()

df

# adding a price/km column

In [None]:
# A listing with 0 km would make the ratio infinite; mask those distances so
# the ratio is reported as missing (NaN) instead.
kms_nonzero = df['Kms driven'].where(df['Kms driven'] != 0)

# assign() builds a new frame, so nothing is written into a filtered slice.
df = df.assign(Price_per_km=(df['Price'] / kms_nonzero).round(2))
df

# Price of used cars over the years


In [None]:
# Write the bokeh chart to a standalone HTML page.
output_file(
    'Price_of_used_car_over_the_years.html',
    title="Price of used car over the years",
)

# Mean price of the listings for each year.
avg_price = df['Price'].groupby(df['Year']).mean()

source = ColumnDataSource(data=dict(
    years=list(avg_price.index),
    avg_price=list(avg_price.values),
))

fig = figure(
    title='Average Car Price per Year',
    x_axis_label='Year',
    y_axis_label='Average Price',
    width=600,
    height=400,
)

# One bar per year, as tall as that year's mean price.
fig.vbar(x='years', top='avg_price', width=0.5, source=source)

show(fig)

# Price of used cars over the years, separated by diesel and petrol


In [None]:
# Mean price per year, with one column per fuel type.
# The aggregation is passed by name: handing np.mean to pandas raises a
# FutureWarning in recent versions, which recommend the string "mean".
df_piv = pd.pivot_table(
    df,
    values="Price",
    index="Year",
    columns="Fuel_type",
    aggfunc="mean",
)

# Create the figure explicitly and let pandas draw the grouped bars on it.
fig, ax = plt.subplots(figsize=(7, 6))
df_piv.plot(kind="bar", ax=ax)

# Change the axes labels
ax.set_xlabel("Years")
ax.set_ylabel("Average Price")
# Show prices as plain numbers rather than in scientific notation.
ax.yaxis.set_major_formatter(formatter)

plt.show()

# What is the average price for each manufacturer?

In [None]:
# Write the bokeh chart to a standalone HTML page.
output_file(
    'Price_of_used_car_seperated_by_manufacturer.html',
    title="Average price of used car seperated by manufacturer",
)

# Mean price per manufacturer, sorted in ascending order of price.
avg_price = df['Price'].groupby(df['Company name']).mean().sort_values()

source = ColumnDataSource(data=dict(
    manufacturers=list(avg_price.index),
    avg_price=list(avg_price.values),
))

# The manufacturer names double as the categorical x range, in sorted order.
fig = figure(
    title='Average Car Price per Manufacturer',
    x_axis_label='Manufacturer',
    y_axis_label='Average Price',
    x_range=list(avg_price.index),
    width=600,
    height=400,
)

# Angle the x tick labels so neighbouring names do not overlap.
fig.xaxis.major_label_orientation = 1

fig.vbar(x='manufacturers', top='avg_price', width=0.5, source=source)

show(fig)

# How does the price relate to the kilometers driven?

In [None]:
# Scatter of price against distance driven, one point per row of df.
kms = df["Kms driven"]
price = df["Price"]

fig = plt.figure()
ax = fig.add_axes([0.1, 0.1, 1, 1])
ax.scatter(kms, price)

ax.set(
    title="Correlation between kms driven and price",
    xlabel="kms driven",
    ylabel="Price",
)
# Show prices as plain numbers rather than in scientific notation.
ax.yaxis.set_major_formatter(formatter)

plt.show()

# How many cars did each manufacturer sell?

In [None]:
# Write the bokeh chart to a standalone HTML page.
output_file('amount_of_cars_each_manufacturer_sold.html',
            title="Amount of cars sold by each manufacturer")


# Number of rows with a Price for each manufacturer, sorted in ascending order.
company_sold = df.groupby('Company name')["Price"].count().sort_values()

# The column holds counts, so it is named 'cars_sold' rather than reusing the
# 'avg_price' key of the price charts.
source = ColumnDataSource(data={
    'manufacturers': list(company_sold.index),
    'cars_sold': list(company_sold.values)
})

fig = figure(
    x_range=list(company_sold.index),
    height=400,
    width=600,
    title='Amount of cars sold by manufacturer',
    y_axis_label='# of cars sold',
    x_axis_label='Manufacturer'
)

# make the orientation of the x ticks angled to remove the overlap
fig.xaxis.major_label_orientation = 1

fig.vbar(
    x='manufacturers',
    top='cars_sold',
    source=source,
    width=0.5
)

show(fig)