#Vehicle Sales EDA

Importing Libraries and Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

# Silence all Python warnings for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings raised by pandas and seaborn - confirm that is intended.
warnings.filterwarnings('ignore')
# NOTE(review): called without arguments; presumably leaves seaborn's current style in place - confirm.
sns.set_style()

In [None]:
# Mount Google Drive under /content/drive; the CSV read in the next cell lives below that path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the car price CSV from the mounted Drive folder into `data` and preview the first rows.
data = pd.read_csv('/content/drive/MyDrive/ML_Python/car_price.csv')
data.head()

#Understanding Data

In [None]:
# (rows, columns) of the DataFrame.
data.shape

In [None]:
# Column labels.
data.columns

In [None]:
# Summary printed by DataFrame.info().
data.info()

In [None]:
# describe() with default settings, transposed so each column becomes a row.
data.describe().T

In [None]:
# describe() restricted to the non-numeric columns, transposed the same way.
data.describe(exclude='number').T

#Handling Missing Values

In [None]:
# Percentage of missing values per column, relative to the total number of rows.
# isnull().mean() divides by len(data); dividing by data.count() (the non-null
# rows only) would overstate the share of missing values.
data.isnull().mean() * 100

There are many missing values in the data. There are different ways to handle them; for example, the missing 'Make' values could be filled in by researching the corresponding 'Vin' values.
But for now I'm going to remove the missing values. We have a good number of data points, and less than 5% of the values are missing in almost all of the columns; the exception is the 'Transmission' column, where about 13% of the values are missing.

In [None]:
# We are filling missing values with 'automatic' because it is the most used transmission type.
# The result is assigned back to the column: calling fillna(inplace=True) on the
# selection data['transmission'] is chained assignment, which pandas deprecates
# and which may not update `data` under Copy-on-Write.

data['transmission'] = data['transmission'].fillna('automatic')

In [None]:
# Drop every row that still contains at least one missing value.
data = data.dropna(axis='index')

In [None]:
# Re-check the percentage of missing values per column, relative to the total
# number of rows (isnull().mean() divides by len(data), not by the non-null count).
data.isnull().mean() * 100

We have successfully removed all the missing values either by removing them or replacing them with the alternative.

#Data Cleaning & Transforming

In [None]:
# Parse `year` with the '%Y' format, then keep only the `.dt.year` component,
# so the column ends up holding the year number rather than datetime values.

data['year'] = pd.to_datetime(data['year'], format='%Y').dt.year

In [None]:
# Distinct values present in the condition column.
data['condition'].unique()

The condition column has values ranging from 1 to around 50, which does not look correct. It should be a rating between 1 and 5; a condition of 45 does not make sense to me.

Let's change it into 1 to 5

In [None]:
# Collapse the 10-50 condition scores into coarse ratings 1-4.
# One mapping is built up front and applied in a single replace(), so the result
# no longer depends on the order of successive in-place replacements on a column
# selection (chained assignment, deprecated by pandas).
# setdefault keeps the first bucket for the shared boundary scores (20, 30, 40),
# which is what the original sequence of calls produced:
#   10-20 -> 1, 21-30 -> 2, 31-40 -> 3, 41-50 -> 4
# NOTE(review): scores below 10 are left unchanged - confirm this is intended.
condition_map = {}
for rating, lower in enumerate(range(10, 50, 10), start=1):
    for score in range(lower, lower + 11):
        condition_map.setdefault(score, rating)
data['condition'] = data['condition'].replace(condition_map)

In [None]:
# Distinct exterior colour values.
data['color'].unique()

In [None]:
# Distinct interior colour values.
data['interior'].unique()

We can see that the 'Color' and 'Interior' columns contain the value '—', which is not a valid color. The real color might be something else, but since I have no information about it, I'll use 'multicolor' as the value. This way we know it is our own placeholder value, yet it still gives meaning to the data.

In [None]:
# Replace the '—' placeholder in both colour columns with 'multicolor'.
# The result is assigned back to each column: replace(inplace=True) on a column
# selection is chained assignment, which pandas deprecates and which may not
# update `data` under Copy-on-Write.
data['color'] = data['color'].replace('—', 'multicolor')
data['interior'] = data['interior'].replace('—', 'multicolor')

In [None]:
# Parse the sale timestamps as UTC (formats inferred per value), then keep only the calendar date.
sale_timestamps = pd.to_datetime(data['saledate'], utc=True, format='mixed')
data['saledate'] = sale_timestamps.dt.date

Saledate is also a datetime column, but it is stored with the object data type. We convert it to datetime and keep only the date part, discarding the time part.

In [None]:
# Lower-case the body type strings so that differently-cased spellings become one value.
data['body'] = data['body'].str.lower()

In [None]:
# Preview the first rows after the cleaning steps above.
data.head()

In [None]:
# @title transmission

from matplotlib import pyplot as plt
import seaborn as sns

# Horizontal bar chart of the number of rows per transmission type.
transmission_counts = data.groupby('transmission').size()
bar_colors = sns.palettes.mpl_palette('Dark2')
transmission_counts.plot(kind='barh', color=bar_colors)

# Hide the top and right borders of the current axes.
current_axes = plt.gca()
current_axes.spines[['top', 'right']].set_visible(False)

#Data Visualization

Let's define a function to create a Count Plot so we don't have to write all the necessary lines again and again.

In [None]:
# NOTE: this first definition contains only a docstring, so calling it does nothing
# and returns None; a complete `countplot` with the same signature follows in the next cell.
def countplot(col, label, rotation=0, size=(12,6)):
    '''
    Create Count Plot using seaborn library.
    It takes four arguments.

    col (str): The column used to create the count plot.
    label (str): Name of the x-label; it is also used in the title.
    rotation (int): How much to rotate the x-ticks.
    size (tuple): Figure size in (width, height) format.
    '''

In [None]:
def countplot(col, label, rotation=0, size=(12,6)):
    '''
    Draw a seaborn count plot for one column of the global DataFrame `data`.

    col (str): Column of `data` whose values are counted.
    label (str): Text used as the x-axis label and inside the title.
    rotation (int): Rotation applied to the x-tick labels.
    size (tuple): Figure size as (width, height).
    '''
    plt.figure(figsize=size)
    # seaborn draws on the current axes and hands them back.
    ax = sns.countplot(x=data[col], palette='viridis')
    ax.set_title(f'Count by Car {label}', fontsize=20)
    ax.set_xlabel(label, fontsize=16)
    ax.set_ylabel('Count', fontsize=16)
    plt.xticks(rotation=rotation)

In [None]:
# Count of rows per brand; x-tick labels rotated 90 degrees.
countplot('make', 'Brands', 90)

In [None]:
# Count of rows per body type on a wider (18, 6) figure.
countplot('body', 'Body', 90, (18,6))

In [None]:
# Count of rows per exterior colour.
countplot('color', 'Color', 20)

We have checked all the necessary categorical columns. Some columns are left, such as 'Model', 'Trim', 'Vin', and 'Seller'. These columns have so many distinct values that plotting them would not help us understand the data much.

In [None]:
def boxplot(col, label):
    '''
        Draw a seaborn box plot for one column of the global DataFrame `data`.

        col (string): Column of `data` to plot.

        label (string): Text used as the chart title and the y-axis label.

    '''

    plt.figure(figsize=(10,6))
    # seaborn draws on the current axes and hands them back.
    ax = sns.boxplot(data[col], palette='viridis')
    ax.set_title(label, fontsize=20)
    ax.set_ylabel(label, fontsize=16)

In [None]:
def histplot(col, label):
    '''
        Create a histogram from one column of the global DataFrame `data`.

        col (string): Column of `data` to plot.

        label (string): Used as the title and the x-axis label of the chart.

    '''

    plt.figure(figsize=(10,6))
    # No `palette` argument: seaborn ignores a palette when no `hue` is given.
    sns.histplot(data[col])
    plt.title(label, fontsize=20)
    plt.xlabel(label, fontsize=16)

In [None]:
# Box plot of the year column.
boxplot('year', 'Year')

In [None]:
# Box plot of the condition column.
boxplot('condition', 'Condition')

In [None]:
# Box plot of the odometer column.
boxplot('odometer', 'Odometer')

In [None]:
# Histogram of the mmr column.
histplot('mmr', 'Manheim Market Report (MMR)')

In [None]:
# Histogram of the sellingprice column.
histplot('sellingprice', 'Selling Price')

Looking at these results, we can see that all of these columns have a number of outliers, and the 'MMR' and 'Selling Price' columns are highly skewed to the right.

In [None]:
# Column names treated as categorical.
cat_col = ['make', 'model', 'trim', 'body', 'transmission', 'vin', 'state', 'color', 'interior', 'seller']
# Column names treated as numeric; used for the correlation heatmap.
num_col = ['year', 'condition', 'odometer', 'mmr', 'sellingprice']

In [None]:
# Pairwise correlations of the numeric columns, drawn as an annotated heatmap.
corr_matrix = data[num_col].corr()
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Mean selling price and condition per brand, highest selling price first
# (condition breaks ties).
brand_means = data.groupby('make')[['sellingprice', 'condition']].mean()
brand_means.sort_values(by=['sellingprice', 'condition'], ascending=False)

In [None]:
# Bar chart of selling price per brand, split by transmission type.
plt.figure(figsize=(20,6))
ax = sns.barplot(x=data['make'], y=data['sellingprice'], hue=data['transmission'])
ax.set_title('Brands vs Selling Price by Transmission', fontsize=20)
ax.set_xlabel('Brands', fontsize=16)
plt.xticks(rotation=90)
ax.set_ylabel('Selling Price', fontsize=16)

In [None]:
# Bar chart of MMR per state, drawn without error bars.
plt.figure(figsize=(18,6))
ax = sns.barplot(x=data['state'], y=data['mmr'], errorbar=None, palette='viridis')
ax.set_title('Car MMR vs State', fontsize=20)
ax.set_xlabel('State', fontsize=16)
plt.xticks(rotation=45)
ax.set_ylabel('MMR', fontsize=16)

In [None]:
# Scatter plot with selling price on the x-axis and odometer reading on the y-axis.
plt.figure(figsize=(18,6))
plt.title('Car Travelled vs Selling Price', fontsize=20)
# alpha=0.5 keeps overlapping points distinguishable.
# No `palette` argument: seaborn ignores a palette when no `hue` is given.
sns.scatterplot(x=data['sellingprice'], y=data['odometer'], alpha=0.5)
plt.xlabel('Selling Price', fontsize=16)
plt.xticks(rotation=45)
plt.ylabel('Car Travelled', fontsize=16)

#Report

After finishing the EDA let's see what we have.

1. We have 558837 rows and 16 columns at first after importing the data. These 16 columns have 5 integer type columns and rest 11 are object or we can say string type columns.

2. As per the statistical analysis we can say that the Manheim Market Report (MMR) and Selling Price are quite close. The max of odometer is 999999 which seems very high but it is not impossible.

3. Some cars were manufactured in 1982, which suggests the dataset contains some vintage cars.

4. There are lots of missing values in the dataset. Transmission has the most missing values, about 13%, which is quite high. Other than this, Make, Model, Trim, Body, and Condition have 1%-2% missing values. The remaining columns have less than 1% missing values.

5. We have Year and Sale date columns which should be of datetime data type, but they are stored as int and object types, so we convert them.
6. The condition column has values such as 45, 59, 34, etc., but this is not correct, as it is a column which should hold a rating between 1 and 5.
7. The 'Color' and 'Interior' columns have data points with the value '—', which we replaced with 'multicolor' because we don't want to add more values to a particular color.
8. The 'Body' column values have some inconsistencies, like Sedan vs. sedan or SUV vs. suv.
9. Ford, Chevrolet, Nissan, Toyota, and Dodge are the top 5 brands by the count.
10. Sedan and SUV are the top 2 body types used in cars.

In [None]:
# Example data - replace this with your actual data
# NOTE(review): these four toy samples are unrelated to the car DataFrame `data`;
# every model in the cells below is fitted on them.
X_ML2 = [[1, 2], [3, 4], [5, 6], [7, 8]]
y_ML2 = [0, 1, 1, 0]

In [None]:
# Import the necessary function to split the dataset
from sklearn.model_selection import train_test_split

# Split into train and test parts with test_size=0.2; random_state=42 fixes the shuffle.
X_train_ML2, X_test_ML2, y_train_ML2, y_test_ML2 = train_test_split(X_ML2, y_ML2, test_size=0.2, random_state=42)

In [None]:
# Same split call as in the previous cell, with identical arguments and random_state.
X_train_ML2, X_test_ML2, y_train_ML2, y_test_ML2 = train_test_split(X_ML2, y_ML2, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier with scikit-learn's default settings.
lr_car = LogisticRegression()

In [None]:
# Fit the logistic regression on the training part produced by train_test_split.
lr_car.fit(X_train_ML2, y_train_ML2)

In [None]:
from sklearn.metrics import accuracy_score # Import accuracy_score

# Baseline: accuracy of a constant model that predicts 1 for every test sample.
# The score is computed once and reused in the message (the original computed it
# twice and discarded the first result).
y_pred_ones = np.ones(len(y_test_ML2))
acc_all_ones = accuracy_score(y_test_ML2, y_pred_ones)
print(f'Accuracy of predicting all 1s: {acc_all_ones}')

In [None]:
# Baseline: constant prediction of 2 for every test sample; the score is
# computed once and reused in the message.
# NOTE(review): the labels in y_ML2 are only 0 and 1, so this prediction can never match - confirm this baseline is wanted.
y_pred_twos = np.ones(len(y_test_ML2)) * 2
acc_all_twos = accuracy_score(y_test_ML2, y_pred_twos)
print(f'Accuracy of predicting all 2s: {acc_all_twos}')

In [None]:
# Baseline: constant prediction of 3 for every test sample; the score is
# computed once and reused in the message.
# NOTE(review): the labels in y_ML2 are only 0 and 1, so this prediction can never match - confirm this baseline is wanted.
y_pred_threes = np.ones(len(y_test_ML2)) * 3
acc_all_threes = accuracy_score(y_test_ML2, y_pred_threes)
print(f'Accuracy of predicting all 3s: {acc_all_threes}')

In [None]:
# Baseline: constant prediction of 4 for every test sample; the score is
# computed once and reused in the message.
# NOTE(review): the labels in y_ML2 are only 0 and 1, so this prediction can never match - confirm this baseline is wanted.
y_pred_fours = np.ones(len(y_test_ML2)) * 4
acc_all_fours = accuracy_score(y_test_ML2, y_pred_fours)
print(f'Accuracy of predicting all 4s: {acc_all_fours}')

In [None]:
# Baseline: constant prediction of 5 for every test sample; the score is
# computed once and reused in the message.
# NOTE(review): the labels in y_ML2 are only 0 and 1, so this prediction can never match - confirm this baseline is wanted.
y_pred_fives = np.ones(len(y_test_ML2)) * 5
acc_all_fives = accuracy_score(y_test_ML2, y_pred_fives)
print(f'Accuracy of predicting all 5s: {acc_all_fives}')

In [None]:
# Report how many samples ended up in the test split.
# y_test_ML2 holds one label per test sample, so its length is the sample count.
num_test_samples = len(y_test_ML2)
print(f'The test set has {num_test_samples} samples')

#Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier # Import DecisionTreeClassifier from sklearn.tree

# Decision tree limited to depth 3, with a fixed random_state, fitted on the training split.
dt_model = DecisionTreeClassifier(random_state=42, max_depth=3)
dt_model.fit(X_train_ML2, y_train_ML2)

In [None]:
# Decision tree predictions on the training split and on the test split.
y_train_ML2_pred = dt_model.predict(X_train_ML2)
y_test_ML2_pred = dt_model.predict(X_test_ML2)

In [None]:

# Accuracy of the decision tree predictions against the true labels of each split.
train_accuracy_dt = accuracy_score(y_train_ML2, y_train_ML2_pred)
test_accuracy_dt = accuracy_score(y_test_ML2, y_test_ML2_pred)

In [None]:
# Print the decision tree accuracies computed in the previous cell.
print(f'Train Accuracy: {train_accuracy_dt}')
print(f'Test Accuracy: {test_accuracy_dt}')
# NOTE(review): this verdict is printed unconditionally, whatever the accuracies are.
print(f'Therefore the Model is not good')

In [None]:
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [None]:
# Draw the fitted decision tree with filled nodes, using placeholder feature and class names.
plt.figure(figsize=(20, 10))
plot_tree(dt_model, filled=True, feature_names=['feature1', 'feature2'], class_names=['class1', 'class2'])
plt.show()

#Random forest

In [None]:
# Import the necessary class
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees, each limited to depth 3, with a fixed random_state,
# fitted on the training split.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=3)
rf_model.fit(X_train_ML2, y_train_ML2)

In [None]:

# Random forest predictions on the training split and on the test split.
y_train_pred = rf_model.predict(X_train_ML2)
y_test_pred = rf_model.predict(X_test_ML2)

In [None]:
# Accuracy of the random forest predictions against the true labels of each split.
train_accuracy = accuracy_score(y_train_ML2, y_train_pred)
test_accuracy = accuracy_score(y_test_ML2, y_test_pred)

In [None]:
# Print the random forest accuracies computed in the previous cell.
print(f'Train Accuracy: {train_accuracy}')
print(f'Test Accuracy: {test_accuracy}')

#Support vector machine

In [None]:
from sklearn.svm import SVC

# Implementation of SVM: RBF-kernel classifier with gamma=0.10 and C=1.0,
# fitted on the training split.
svc_rest = SVC(kernel='rbf', random_state=0, gamma=.10, C=1.0)
svc_rest.fit(X_train_ML2,y_train_ML2)

In [None]:
# Implementation of SVM
# NOTE: this cell repeats the previous one with identical arguments; it rebuilds
# and refits `svc_rest` on the same training split.
svc_rest = SVC(kernel='rbf', random_state=0, gamma=.10, C=1.0)
svc_rest.fit(X_train_ML2,y_train_ML2)

In [None]:
# SVM predictions for the test split.
predict_rest = svc_rest.predict(X_test_ML2)

In [None]:
# Count the test samples whose predicted label differs from the true label.
misclassified = (y_test_ML2 != predict_rest).sum()
print(f'misclassified samples {misclassified:d}')

In [None]:
# Score of the SVM on the test split.
svc_rest.score(X_test_ML2,y_test_ML2)

In [None]:
# Score of the SVM on the training split, for comparison with the test score.
svc_rest.score(X_train_ML2, y_train_ML2)

#Thank You