Import libraries and load the car data file

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Read the car data CSV into a DataFrame and preview its first rows.
csv_path = 'data/auto.csv'
df = pd.read_csv(csv_path)
df.head()

Print column types

In [None]:
# Show the dtype of every column in the DataFrame.
df.dtypes

Drop unwanted columns

In [None]:
# Columns that are not used in the rest of this analysis.
unwanted_columns = [
    'Engine Fuel Type',
    'Market Category',
    'Vehicle Style',
    'Popularity',
    'Number of Doors',
    'Vehicle Size',
]

# Drop them in a single, non-in-place call. errors='ignore' keeps the cell
# re-runnable: on a second execution the columns are already gone, and a
# plain drop would raise KeyError.
df = df.drop(columns=unwanted_columns, errors='ignore')
df.head()

Rename columns

In [None]:
# Map the original column headers to the shorter labels used from here on.
column_labels = {
    'Engine HP': 'HP',
    'Engine Cylinders': 'Cylinders',
    'Transmission Type': 'Transmission',
    'Driven_Wheels': 'Drive Mode',
    'highway MPG': 'MPG-H',
    'city mpg': 'MPG-C',
    'MSRP': 'Price',
}
df = df.rename(columns=column_labels)
df.head()

Remove duplicates

In [None]:
# Flag every row that repeats an earlier row in all columns, report how many
# there are, then keep only the rows that are not flagged.
is_duplicate = df.duplicated()
duplicate_rows_df = df[is_duplicate]
print('Amount of duplicate rows (duplicate rows, columns): ', duplicate_rows_df.shape)
df = df[~is_duplicate]

Drop rows with missing values

In [None]:
# Report the number of nulls per column, then keep only the rows that have
# a value in every column.
null_counts = df.isnull().sum()
print(null_counts)
df = df.dropna(axis=0, how='any')

Visualize outliers

In [None]:
# Box plot of the Price column; points beyond the whiskers are the outliers.
sns.boxplot(data=df, x='Price')

Remove outliers (but only in numeric columns, otherwise we get an error)

In [None]:
# Restrict the outlier test to numeric columns; quantiles and comparisons
# are not defined for the text columns.
numeric_cols = df.select_dtypes(include=[np.number]).columns
numeric_values = df[numeric_cols]

# Per-column quartiles and interquartile range.
Q1 = numeric_values.quantile(q=0.25)
Q3 = numeric_values.quantile(q=0.75)
IQR = Q3 - Q1

# 1.5 * IQR rule: a value is an outlier when it lies below the lower fence
# or above the upper fence of its column.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = numeric_values.lt(lower_fence) | numeric_values.gt(upper_fence)

# Discard every row that has an outlier in at least one numeric column.
has_outlier = is_outlier.any(axis=1)
df = df[~has_outlier]

Visualize the 20 brands with the most cars

In [None]:
# Count the cars per brand and plot the 20 largest counts as a bar chart.
top_brands = df['Make'].value_counts().nlargest(20)
ax = top_brands.plot(kind='bar', figsize=(10, 5))
ax.set_title('Amount of cars for each brand')
ax.set_ylabel('Amount of cars')
ax.set_xlabel('Brand')

Create a heat map to visualize the variable dependencies (again, only for numeric columns!)

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated
# heat map on a 10x5 figure.
corr = df.corr(numeric_only=True)
plt.figure(figsize=(10, 5))
sns.heatmap(data=corr, annot=True, cmap='BrBG')

Create a scatter plot to show that the price rises for cars with higher HP (horsepower)

In [None]:
# Scatter plot of Price (y-axis) against HP (x-axis).
horsepower = df['HP']
price = df['Price']

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(horsepower, price)
ax.set(xlabel='HP', ylabel='Price')
plt.show()